diff --git a/entrez.py b/entrez.py index 8117eb1cea0d1cdba74e08d8c89c97c3ac6064ed..ca63cda02a8553250de59d5ca706b38781b4951c 100644 --- a/entrez.py +++ b/entrez.py @@ -65,7 +65,8 @@ while True: for index, identifier in enumerate(record['IdList']): folder = make_folder(identifier) path = folder + "/" + identifier + ".xml" - + if os.path.exists(path): + continue if not os.path.exists(folder): os.makedirs(folder) identifiers.add(int(identifier)) @@ -76,5 +77,5 @@ while True: results = Parallel(n_jobs=num_cores)(delayed(processInput)(i) for i in identifiers) # Stop when finished sys.exit(0) - except TerminatedWorkerError as e: + except Exception as e: pass diff --git a/xmlparser.py b/xmlparser.py index c20b16d61010cdc1be18765832a442651256cad4..224d458fb8856901b2b392415e6467e206d6b1bf 100644 --- a/xmlparser.py +++ b/xmlparser.py @@ -5,10 +5,6 @@ import re import xmltodict import xlsxwriter -# GLOBAL PARAMETERS -MINIMUM_NUMBER_OF_SPOTS = 10000 - - def myprint(d): """ Prints key, value for a nested dictionary @@ -43,6 +39,9 @@ def machine_filter(doc): :return: """ + + if 'EXPERIMENT_PACKAGE' not in doc['EXPERIMENT_PACKAGE_SET']: return False + if 'ILLUMINA' not in doc['EXPERIMENT_PACKAGE_SET']['EXPERIMENT_PACKAGE']['EXPERIMENT']['PLATFORM'].keys(): print('Platform is not illumina') return False @@ -159,7 +158,8 @@ def setup_study(content): study['StudyIdentifier'] = study_object['@accession'] study['alias'] = study_object['@alias'] study['StudyTitle'] = study_object['DESCRIPTOR']['STUDY_TITLE'] - study['StudyDescription'] = study_object['DESCRIPTOR']['STUDY_ABSTRACT'] + if 'STUDY_ABSTRACT' in study_object['DESCRIPTOR']: + study['StudyDescription'] = study_object['DESCRIPTOR']['STUDY_ABSTRACT'] if 'STUDY_TYPE' in study_object['DESCRIPTOR']: study['type'] = study_object['DESCRIPTOR']['STUDY_TYPE']['@existing_study_type'] @@ -229,10 +229,11 @@ def setup_observation_unit(content): def setup_assay(content): assay = {} - assay['SequencingCenter'] = 'Unknown' - assay['SequencingDate'] = 'Unknown' - assay['IsolationProtocol'] = 'Unknown' - + assay['Facility'] = 'Unknown' + assay['Method'] = 'Unknown' + assay['Date'] = 'Unknown' + assay['target_subfragment'] = "" + run = content['RUN_SET']['RUN'] assay['AssayIdentifier'] = run['@accession'] assay['alias'] = run['@alias'] @@ -243,7 +244,7 @@ def setup_assay(content): assay['published'] = run['@published'] assay['SampleIdentifier'] = content['SAMPLE']['@accession'] - assay['AssayTitle'] = 'Automatic assay title from SRA ' + assay['AssayIdentifier'] + assay['AssayName'] = 'Automatic assay title from SRA ' + assay['AssayIdentifier'] assay['AssayDescription'] = 'Automatic assay description from SRA ' + assay['AssayIdentifier'] # Files sra_file = run['SRAFiles']['SRAFile'] @@ -308,7 +309,7 @@ def create_xlsx(pickle_list): assay_keys = set() # Create a workbook and add a worksheet. - workbook = xlsxwriter.Workbook('sra.xlsx') + workbook = xlsxwriter.Workbook(EXCEL_FILE) project_worksheet = workbook.add_worksheet(name="Project") investigation_worksheet = workbook.add_worksheet(name="Investigation") study_worksheet = workbook.add_worksheet(name="Study") @@ -518,6 +519,7 @@ def lookup_creation(doc, pickle_file): # if content['RUN_SET']['RUN']['Statistics']['@nreads'] != "2": # return None + project = setup_project(content) investigation = setup_investigation(content) study = setup_study(content) @@ -535,22 +537,26 @@ def lookup_creation(doc, pickle_file): pickle.dump(assay, pickle_file) pickle_file.close() - def selection(content): """ Selection function based on criteria set by the developer :param file: :return: """ - if "ERP119217" in content: - print("Match detected") + if "" in content: + print("Filter disabled") return True - # if "PRJNA527973" in content or "PRJNA517152" in content: + # if "ERP119217" in content: + # print("Match detected") # return True + # if "PRJNA527973" in content or "PRJNA517152" in content: + # return True + # if "soil" not in content: # return False + print("Did not pass the selection filter") return False @@ -564,22 +570,21 @@ def main(): for (dirpath, dirnames, filenames) in os.walk(path): listOfFiles = set([os.path.join(dirpath, file) for file in filenames]) - for index, elem in enumerate(listOfFiles): - if not elem.endswith("xml"): continue + for elem in listOfFiles: + if not elem.endswith("xml"): + print("element not xml") + continue with open(elem) as fd: content = fd.read() - if selection(content): doc = xmltodict.parse(content) - status = machine_filter(doc) - - if not status: + if not machine_filter(doc): + print("Not passing machine filter", elem) continue - status = paired_check(doc) - - if not status: + if not paired_check(doc): + print("Not passing paired check", elem) continue # Create lookup file @@ -589,14 +594,16 @@ def main(): os.remove(pickle_file) if not os.path.exists(pickle_file): - # print("Creating pickle") + print("Creating pickle") lookup_creation(doc, pickle_file) - else: - pass + pickle_list.add(pickle_file) # Create EXCEL file create_xlsx(pickle_list) if __name__ == '__main__': + # GLOBAL PARAMETERS + # MINIMUM_NUMBER_OF_SPOTS = 10000 + EXCEL_FILE = 'sra.xlsx' main()