def setup_investigation(content):
    """Create the investigation record for an SRA experiment package.

    Currently a placeholder: no investigation-level fields are extracted yet.

    :param content: xmltodict-parsed EXPERIMENT_PACKAGE dict
    :return: dict of investigation-level metadata (empty for now)
    """
    investigation = {}
    return investigation


def setup_study(content):
    """Collect study-level metadata from an SRA EXPERIMENT_PACKAGE dict.

    Gathers submitter, organization/contact and study-descriptor fields into
    one flat dict whose keys double as spreadsheet column names.

    :param content: xmltodict-parsed EXPERIMENT_PACKAGE dict
    :return: dict of study metadata
    """
    study = {}

    # SUBMITTER INFORMATION
    submission = content['SUBMISSION']
    study['submission_lab_name'] = submission['@lab_name']
    if '@center_name' in submission:
        study['submission_center_name'] = submission['@center_name']

    study['submission_accession'] = submission['@accession']
    study['submission_alias'] = submission['@alias']

    # ORGANIZATION INFORMATION
    organization = content['Organization']
    if 'Contact' in organization:
        contact = organization['Contact']
        if isinstance(contact, list):
            # xmltodict yields a list when several <Contact> elements exist.
            # Fall back to the first contact instead of crashing on
            # list['@email'] below.
            print("MULTIPLE CONTACTS DETECTED... -.-")
            print(contact)
            contact = contact[0]
        study['email'] = contact['@email']
        if 'Name' in contact:
            if 'Middle' in contact['Name']:
                study['middle_name'] = contact['Name']['Middle']
            study['first_name'] = contact['Name']['First']
            study['last_name'] = contact['Name']['Last']
        if 'Address' in contact:
            address = contact['Address']
            study['postal_code'] = address['@postal_code']
            study['department'] = address['Department']
            study['institution'] = address['Institution']
            study['street'] = address['Street']
            study['city'] = address['City']
            if 'Sub' in address:
                study['sub'] = address['Sub']
            if 'Country' in address:
                study['country'] = address['Country']

    # STUDY INFORMATION
    study_object = content['STUDY']
    study['StudyIdentifier'] = study_object['@accession']
    study['alias'] = study_object['@alias']
    study['StudyTitle'] = study_object['DESCRIPTOR']['STUDY_TITLE']
    study['StudyDescription'] = study_object['DESCRIPTOR']['STUDY_ABSTRACT']
    if 'STUDY_TYPE' in study_object['DESCRIPTOR']:
        study['type'] = study_object['DESCRIPTOR']['STUDY_TYPE']['@existing_study_type']

    if 'CENTER_PROJECT_NAME' in study_object['DESCRIPTOR']:
        study['center_project_name'] = study_object['DESCRIPTOR']['CENTER_PROJECT_NAME']

    return study
def setup_observation_unit(content):
    """Build the observation-unit record from the EXPERIMENT element.

    :param content: xmltodict-parsed EXPERIMENT_PACKAGE dict
    :return: dict of observation-unit metadata keyed by column names
    """
    experiment = content['EXPERIMENT']
    return {
        'ObservationUnitIdentifier': experiment['@accession'],
        'alias': experiment['@alias'],
        'ObservationUnitTitle': experiment['TITLE'],
        'StudyIdentifier': experiment['STUDY_REF']['@accession'],
        'ObservationUnitDescription': experiment['DESIGN']['DESIGN_DESCRIPTION'],
    }
def create_header(header, keys):
    # Append every key from `keys` that is not already in `header`, in sorted
    # order, so all pickles share one stable column layout.  Mutates `header`
    # in place and returns the same list.
    for value in sorted(keys):
        if value not in header:
            header.append(value)
    return header


def create_xlsx(pickle_list):
    """Merge the per-document pickle files into a single workbook 'sra.xlsx'.

    Every pickle file is expected to hold five dicts dumped in order:
    investigation, study, observation unit, sample, assay (as written by
    lookup_creation).  One worksheet per record type is filled; a record
    whose identifier was already written from an earlier pickle is skipped.

    :param pickle_list: iterable of paths to pickle files
    """
    # Preprocessing of pickle list: collect the union of dict keys per record
    # type so the worksheet headers cover every attribute that occurs.
    investigation_keys = set()
    study_keys = set()
    observation_unit_keys = set()
    sample_keys = set()
    assay_keys = set()

    # Create a workbook and add a worksheet per record type.
    # NOTE(review): the Project and Investigation sheets are created but never
    # filled below -- presumably placeholders; confirm.
    workbook = xlsxwriter.Workbook('sra.xlsx')
    project_worksheet = workbook.add_worksheet(name="Project")
    investigation_worksheet = workbook.add_worksheet(name="Investigation")
    study_worksheet = workbook.add_worksheet(name="Study")
    ou_worksheet = workbook.add_worksheet(name="ObservationUnit")
    sample_worksheet = workbook.add_worksheet(name="Sample")
    assay_worksheet = workbook.add_worksheet(name="Assay")

    # First pass over the pickles: gather all keys per record type.
    for pickle_file in pickle_list:
        with open(pickle_file, 'rb') as pkl:
            investigation = pickle.load(pkl)
            study = pickle.load(pkl)
            study_keys.update(study.keys())
            observation_unit = pickle.load(pkl)
            observation_unit_keys.update(observation_unit.keys())
            sample = pickle.load(pkl)
            sample_keys.update(sample.keys())
            # Sample attributes are flattened into regular columns; the
            # container key itself must not become a column.
            sample_keys.update(sample['attributes'].keys())
            sample_keys.remove("attributes")
            assay = pickle.load(pkl)
            assay_keys.update(assay.keys())

    # Creating the study header: fixed well-known columns first, then every
    # remaining observed key in sorted order.
    study_header = ["StudyIdentifier", "StudyDescription", "StudyTitle", "InvestigationIdentifier"]
    study_header = create_header(study_header, study_keys)
    observation_unit_header = ["ObservationUnitIdentifier", "ObservationUnitDescription", "ObservationUnitTitle",
                               "StudyIdentifier"]
    observation_unit_header = create_header(observation_unit_header, observation_unit_keys)

    sample_header = ["SampleIdentifier", "SampleDescription", "SampleName", "ObservationUnitIdentifier", "TaxonomyId",
                     "SampleOrganism"]
    sample_header = create_header(sample_header, sample_keys)

    assay_header = ["AssayIdentifier", "SampleIdentifier", "AssayTitle", "AssayDescription", "FileNameForward",
                    "FileNameReverse", "ForwardPrimer", "ReversePrimer", "PrimerNames", "IsolationProtocol",
                    "SequencingCenter", "SequencingPlatform", "SequencingDate"]

    assay_header = create_header(assay_header, assay_keys)

    # Write the header rows (row 0) of each sheet.
    for index, value in enumerate(study_header):
        study_worksheet.write(0, index, value)

    for index, value in enumerate(observation_unit_header):
        ou_worksheet.write(0, index, value)

    for index, value in enumerate(sample_header):
        sample_worksheet.write(0, index, value)

    for index, value in enumerate(assay_header):
        assay_worksheet.write(0, index, value)

    ###############################################################
    # Filling the study sheet
    ###############################################################
    # NOTE(review): data rows are indexed by the pickle's position in
    # pickle_list, so skipped duplicates leave blank rows in the sheet, and a
    # set's iteration order decides row order -- confirm both are acceptable.
    study_identifier = set()
    for row, pickle_file in enumerate(pickle_list):
        with open(pickle_file, 'rb') as pkl:
            investigation = pickle.load(pkl)
            study = pickle.load(pkl)
            observation_unit = pickle.load(pkl)
            sample = pickle.load(pkl)
            assay = pickle.load(pkl)

        # Skip studies that are already parsed from other pickles
        if study['StudyIdentifier'] in study_identifier: continue
        row = row + 1
        for key in study:
            column = study_header.index(key)
            study_worksheet.write(row, column, study[key])
        study_identifier.add(study['StudyIdentifier'])

    ###############################################################
    # Filling the observation unit sheet
    ###############################################################
    ou_identifier = set()
    for row, pickle_file in enumerate(pickle_list):
        with open(pickle_file, 'rb') as pkl:
            investigation = pickle.load(pkl)
            study = pickle.load(pkl)
            observation_unit = pickle.load(pkl)
            sample = pickle.load(pkl)
            assay = pickle.load(pkl)

        # Skip observation units that are already parsed from other pickles
        if observation_unit['ObservationUnitIdentifier'] in ou_identifier: continue
        row = row + 1
        for key in observation_unit:
            column = observation_unit_header.index(key)
            ou_worksheet.write(row, column, observation_unit[key])
        ou_identifier.add(observation_unit['ObservationUnitIdentifier'])

    ###############################################################
    # Filling the sample sheet
    ###############################################################
    sample_identifier = set()
    for row, pickle_file in enumerate(pickle_list):
        with open(pickle_file, 'rb') as pkl:
            investigation = pickle.load(pkl)
            study = pickle.load(pkl)
            observation_unit = pickle.load(pkl)
            sample = pickle.load(pkl)
            assay = pickle.load(pkl)

        # Skip samples that are already parsed from other pickles
        if sample['SampleIdentifier'] in sample_identifier: continue
        row = row + 1
        # Flatten the nested attribute dict into top-level columns.
        attributes = sample.pop("attributes")
        sample = {**sample, **attributes}

        for key in sample:
            column = sample_header.index(key)
            sample_worksheet.write(row, column, sample[key])
        sample_identifier.add(sample['SampleIdentifier'])

    ###############################################################
    # Filling the assay sheet
    ###############################################################
    assay_identifier = set()
    for row, pickle_file in enumerate(pickle_list):
        with open(pickle_file, 'rb') as pkl:
            investigation = pickle.load(pkl)
            study = pickle.load(pkl)
            observation_unit = pickle.load(pkl)
            sample = pickle.load(pkl)
            assay = pickle.load(pkl)

        # Skip assays that are already parsed from other pickles
        if assay['AssayIdentifier'] in assay_identifier: continue
        row = row + 1

        for key in assay:
            column = assay_header.index(key)
            assay_worksheet.write(row, column, assay[key])
        assay_identifier.add(assay['AssayIdentifier'])

    workbook.close()
    print("done")
def lookup_creation(doc, pickle_file):
    """Extract the five record dicts from a parsed SRA document and pickle them.

    Dumps investigation, study, observation unit, sample and assay dicts
    sequentially into `pickle_file` (the order create_xlsx relies on).

    :param doc: xmltodict-parsed SRA XML document
    :param pickle_file: path of the pickle file to (over)write
    """
    content = doc['EXPERIMENT_PACKAGE_SET']['EXPERIMENT_PACKAGE']

    # Spot-count / paired-read filters are disabled for the current
    # project-based selection (see selection()).
    # if int(content['RUN_SET']['RUN']['@total_spots']) < MINIMUM_NUMBER_OF_SPOTS:
    #     return None
    # if content['RUN_SET']['RUN']['Statistics']['@nreads'] != "2":
    #     return None

    investigation = setup_investigation(content)
    study = setup_study(content)
    observation_unit = setup_observation_unit(content)
    sample = setup_sample(content)
    assay = setup_assay(content)

    # Dump into file for merging with other xml files.  The context manager
    # guarantees the file is closed even if a dump fails (the previous code
    # rebound the `pickle_file` parameter to the handle and closed manually).
    with open(pickle_file, 'wb') as pkl:
        pickle.dump(investigation, pkl)
        pickle.dump(study, pkl)
        pickle.dump(observation_unit, pkl)
        pickle.dump(sample, pkl)
        pickle.dump(assay, pkl)


def selection(content):
    """Decide whether an SRA XML document should be processed.

    Selection criteria set by the developer: only documents that belong to one
    of the two hard-coded BioProjects are kept.

    :param content: raw XML text of the document
    :return: True when the document mentions a selected BioProject accession
    """
    return any(accession in content
               for accession in ("PRJNA527973", "PRJNA517152"))


def main():
    """Walk the download directory, pickle every selected paired-end Illumina
    document, then merge all pickles into one Excel workbook."""
    pickle_list = set()
    # 'entry' instead of 'dir': avoid shadowing the builtin.
    for entry in sorted(os.listdir(dirName)):
        path = dirName + "/" + entry
        print("Parsing ", path)

        # NOTE(review): listOfFiles is overwritten on every os.walk() step, so
        # only the last visited directory's files survive -- behavior kept
        # as-is; confirm this is intended.
        for (dirpath, dirnames, filenames) in os.walk(path):
            listOfFiles = set([os.path.join(dirpath, file) for file in filenames])

        for elem in listOfFiles:
            if not elem.endswith("xml"):
                continue
            with open(elem) as fd:
                content = fd.read()

            if selection(content):
                doc = xmltodict.parse(content)

                if not machine_filter(doc):
                    continue

                if not paired_check(doc):
                    continue

                # Create lookup file.  Derive the pickle name from the ".xml"
                # suffix only; str.replace would also clobber any "xml"
                # occurring elsewhere in the path.
                pickle_file = elem[:-3] + "pkl"
                # Always rebuild the pickle from scratch (the old
                # remove-then-check dance reduced to exactly this).
                if os.path.exists(pickle_file):
                    os.remove(pickle_file)
                print("Creating pickle")
                lookup_creation(doc, pickle_file)
                pickle_list.add(pickle_file)

    # Create EXCEL file
    create_xlsx(pickle_list)


if __name__ == '__main__':
    main()