Improve error handling: skip existing XML files, guard missing XML fields, log skipped files

fd39af12 · Jasper Koehorst · 210c88d7 · fd39af12 · fd39af12
Commit fd39af12 authored 3 years ago by Jasper Koehorst
--- a/entrez.py
+++ b/entrez.py
@@ -65,7 +65,8 @@ while True:
    for index, identifier in enumerate(record['IdList']):
        folder = make_folder(identifier)
        path = folder + "/" + identifier + ".xml"
-
+        if os.path.exists(path):
+            continue
        if not os.path.exists(folder):
            os.makedirs(folder)
        identifiers.add(int(identifier))
@@ -76,5 +77,5 @@ while True:
        results = Parallel(n_jobs=num_cores)(delayed(processInput)(i) for i in identifiers)
        # Stop when finished
        sys.exit(0)
-    except TerminatedWorkerError as e:
+    except Exception as e:
        pass
--- a/xmlparser.py
+++ b/xmlparser.py
@@ -5,10 +5,6 @@ import re
 import xmltodict
 import xlsxwriter

-# GLOBAL PARAMETERS
-MINIMUM_NUMBER_OF_SPOTS = 10000
-
-
 def myprint(d):
    """
    Prints key, value for a nested dictionary
@@ -43,6 +39,9 @@ def machine_filter(doc):
    :return:
    """

+
+    if 'EXPERIMENT_PACKAGE' not in doc['EXPERIMENT_PACKAGE_SET']: return False
+
    if 'ILLUMINA' not in doc['EXPERIMENT_PACKAGE_SET']['EXPERIMENT_PACKAGE']['EXPERIMENT']['PLATFORM'].keys():
        print('Platform is not illumina')
        return False
@@ -159,7 +158,8 @@ def setup_study(content):
    study['StudyIdentifier'] = study_object['@accession']
    study['alias'] = study_object['@alias']
    study['StudyTitle'] = study_object['DESCRIPTOR']['STUDY_TITLE']
-    study['StudyDescription'] = study_object['DESCRIPTOR']['STUDY_ABSTRACT']
+    if 'STUDY_ABSTRACT' in study_object['DESCRIPTOR']:
+        study['StudyDescription'] = study_object['DESCRIPTOR']['STUDY_ABSTRACT']
    if 'STUDY_TYPE' in study_object['DESCRIPTOR']:
        study['type'] = study_object['DESCRIPTOR']['STUDY_TYPE']['@existing_study_type']

@@ -229,10 +229,11 @@ def setup_observation_unit(content):

 def setup_assay(content):
    assay = {}
-    assay['SequencingCenter'] = 'Unknown'
-    assay['SequencingDate'] = 'Unknown'
-    assay['IsolationProtocol'] = 'Unknown'
-    
+    assay['Facility'] = 'Unknown'
+    assay['Method'] = 'Unknown'
+    assay['Date'] = 'Unknown'
+    assay['target_subfragment'] = ""
+
    run = content['RUN_SET']['RUN']
    assay['AssayIdentifier'] = run['@accession']
    assay['alias'] = run['@alias']
@@ -243,7 +244,7 @@ def setup_assay(content):
    assay['published'] = run['@published']
    assay['SampleIdentifier'] = content['SAMPLE']['@accession']

-    assay['AssayTitle'] = 'Automatic assay title from SRA ' + assay['AssayIdentifier']
+    assay['AssayName'] = 'Automatic assay title from SRA ' + assay['AssayIdentifier']
    assay['AssayDescription'] = 'Automatic assay description from SRA ' + assay['AssayIdentifier']
    # Files
    sra_file = run['SRAFiles']['SRAFile']
@@ -308,7 +309,7 @@ def create_xlsx(pickle_list):
    assay_keys = set()

    # Create a workbook and add a worksheet.
-    workbook = xlsxwriter.Workbook('sra.xlsx')
+    workbook = xlsxwriter.Workbook(EXCEL_FILE)
    project_worksheet = workbook.add_worksheet(name="Project")
    investigation_worksheet = workbook.add_worksheet(name="Investigation")
    study_worksheet = workbook.add_worksheet(name="Study")
@@ -518,6 +519,7 @@ def lookup_creation(doc, pickle_file):

    # if content['RUN_SET']['RUN']['Statistics']['@nreads'] != "2":
    #     return None
+
    project = setup_project(content)
    investigation = setup_investigation(content)
    study = setup_study(content)
@@ -535,22 +537,26 @@ def lookup_creation(doc, pickle_file):
    pickle.dump(assay, pickle_file)
    pickle_file.close()

-
 def selection(content):
    """
    Selection function based on criteria set by the developer
    :param file:
    :return:
    """
-    if "ERP119217" in content:
-        print("Match detected")
+    if "" in content:
+        print("Filter disabled")
        return True
-    # if "PRJNA527973" in content or "PRJNA517152" in content:
+    # if "ERP119217" in content:
+    #     print("Match detected")
    #     return True

+    # if "PRJNA527973" in content or "PRJNA517152" in content:
+        # return True
+
    # if "soil" not in content:
    #     return False

+    print("Did not pass the selection filter")
    return False


@@ -564,22 +570,21 @@ def main():

        for (dirpath, dirnames, filenames) in os.walk(path):
            listOfFiles = set([os.path.join(dirpath, file) for file in filenames])
-            for index, elem in enumerate(listOfFiles):
-                if not elem.endswith("xml"): continue
+            for elem in listOfFiles:
+                if not elem.endswith("xml"):
+                    print("element not xml")
+                    continue
                with open(elem) as fd:
                    content = fd.read()
-
                    if selection(content):
                        doc = xmltodict.parse(content)

-                        status = machine_filter(doc)
-
-                        if not status:
+                        if not machine_filter(doc):
+                            print("Not passing machine filter", elem)
                            continue

-                        status = paired_check(doc)
-
-                        if not status:
+                        if not paired_check(doc):
+                            print("Not passing paired check", elem)
                            continue

                        # Create lookup file
@@ -589,14 +594,16 @@ def main():
                            os.remove(pickle_file)

                        if not os.path.exists(pickle_file):
-                            # print("Creating pickle")
+                            print("Creating pickle")
                            lookup_creation(doc, pickle_file)
-                        else:
-                            pass
+
                        pickle_list.add(pickle_file)
    # Create EXCEL file
    create_xlsx(pickle_list)


 if __name__ == '__main__':
+    # GLOBAL PARAMETERS
+    # MINIMUM_NUMBER_OF_SPOTS = 10000
+    EXCEL_FILE = 'sra.xlsx'
    main()