Sync metadata field names and add the Illumina NovaSeq 6000 machine

ff9b6a25 · Jasper Koehorst · e62ccb96 · ff9b6a25
Commit ff9b6a25 authored 3 years ago by Jasper Koehorst
--- a/study_creator.py
+++ b/study_creator.py
@@ -55,7 +55,7 @@ error = 0

 machines = set()
 illumina_machines = set(["Illumina HiSeq 1000", "Illumina HiSeq 1500", "Illumina HiSeq 2000", "Illumina HiSeq 2500",
-                         "Illumina HiSeq 3000", "Illumina HiSeq 4000", "Illumina HiSeq x ten", "Illumina MiSeq", "NextSeq 500"])
+                         "Illumina HiSeq 3000", "Illumina HiSeq 4000", "Illumina HiSeq x ten", "Illumina MiSeq", "NextSeq 500", "Illumina NovaSeq 6000"])


 def machine_filter(doc):
@@ -120,22 +120,22 @@ def paired_check(doc):

 def setup_project(content):
    project = {
-        "ProjectIdentifier": "UNLOCK",
-        "ProjectTitle": "UNLOCK Microbial Potential",
-        "ProjectDescription": "An open infrastructure for exploring new horizons for research on microbial communities."
+        "Project identifier": "UNLOCK",
+        "Project title": "UNLOCK Microbial Potential",
+        "Project description": "An open infrastructure for exploring new horizons for research on microbial communities."
    }
    return project


 def setup_investigation(content):
    investigation = {
-        "InvestigationIdentifier": "SRA_Amplicon",
-        "InvestigationTitle": "SRA amplicon data from various studies",
-        "InvestigationDescription": "Automatically generated metadata from SRA information",
-        "ProjectIdentifier":"UNLOCK",
-        "FirstName": "Jasper",
-        "LastName": "Koehorst",
-        "Email": "jasper.koehorst@wur.nl",
+        "Investigation identifier": "SRA_Amplicon",
+        "Investigation title": "SRA amplicon data from various studies",
+        "Investigation description": "Automatically generated metadata from SRA information",
+        "Project identifier":"UNLOCK",
+        "Firstname": "Jasper",
+        "Lastname": "Koehorst",
+        "Email address": "jasper.koehorst@wur.nl",
        "ORCID": "0000-0001-8172-8981",
        "Organization": "Wageningen University",
        "Department": "UNLOCK",
@@ -145,7 +145,7 @@ def setup_investigation(content):


 def setup_study(content):
-    study = {"InvestigationIdentifier":"SRA_Amplicon"}
+    study = {"Investigation identifier":"SRA_Amplicon"}

    # ORIGINAL RESEARCHER INFORMATION

@@ -188,7 +188,7 @@ def setup_study(content):

    # STUDY INFORMATION
    study_object = content['STUDY']
-    study['StudyIdentifier'] = study_object['@accession']
+    study['Study identifier'] = study_object['@accession']
    study['alias'] = study_object['@alias']
    study['StudyTitle'] = study_object['DESCRIPTOR']['STUDY_TITLE']
    if 'STUDY_ABSTRACT' in study_object['DESCRIPTOR']:
@@ -214,9 +214,9 @@ def setup_observation_unit(content):
    experiment = content['EXPERIMENT']
    sample = content['SAMPLE']

-    observation_unit['ObservationUnitIdentifier'] = "X" + sample['@accession']
+    observation_unit['Observational unit identifier'] = "X" + sample['@accession']
    observation_unit['alias'] = "X" + sample['@alias']
-    observation_unit['StudyIdentifier'] = experiment['STUDY_REF']['@accession']
+    observation_unit['Study identifier'] = experiment['STUDY_REF']['@accession']
    
    if "TITLE" in sample:
        observation_unit['ObservationUnitDescription'] = sample['TITLE']
@@ -231,16 +231,16 @@ def setup_sample(content):
    sample = {}
    sample_object = content['SAMPLE']
    sample['alias'] = sample_object['@alias']
-    sample['SampleIdentifier'] = sample_object['@accession']
-    sample['BioSafetyLevel'] = "0"
-    sample['collection_date'] = "1900-01-01T00:00:00"
+    sample['Sample identifier'] = sample_object['@accession']
+    sample['Biosafety level'] = "0"
+    sample['collection date'] = "1900-01-01T00:00:00"

    if type(sample_object['IDENTIFIERS']['EXTERNAL_ID']) == list:
        for entry in sample_object['IDENTIFIERS']['EXTERNAL_ID']:
            if entry['@namespace'] == 'BioSample':
                sample_object['IDENTIFIERS']['EXTERNAL_ID'] = entry

-    sample['ObservationUnitIdentifier'] = "X" + sample_object['@accession']
+    sample['Observational unit identifier'] = "X" + sample_object['@accession']
    sample['namespace'] = sample_object['IDENTIFIERS']['EXTERNAL_ID']['@namespace']
    sample['text'] = sample_object['IDENTIFIERS']['EXTERNAL_ID']['#text']

@@ -249,8 +249,8 @@ def setup_sample(content):
        sample['SampleTitle'] = sample_object['TITLE']
        sample['SampleName'] = sample_object['TITLE']
    else:
-        sample['SampleTitle'] = 'Automatic sample title from SRA ' + sample['SampleIdentifier']
-        sample['SampleName'] = 'Automatic sample title from SRA ' + sample['SampleIdentifier']
+        sample['SampleTitle'] = 'Automatic sample title from SRA ' + sample['Sample identifier']
+        sample['SampleName'] = 'Automatic sample title from SRA ' + sample['Sample identifier']

    sample['NCBI taxonomy ID'] = sample_object['SAMPLE_NAME']['TAXON_ID']

@@ -262,7 +262,7 @@ def setup_sample(content):
    if "DESCRIPTION" in sample_object:
        sample['SampleDescription'] = sample_object['DESCRIPTION']
    else:
-        sample['SampleDescription'] = 'Automatic sample description from SRA ' + sample['SampleIdentifier']
+        sample['SampleDescription'] = 'Automatic sample description from SRA ' + sample['Sample identifier']

    sample['attributes'] = {}
    if 'SAMPLE_ATTRIBUTES' in sample_object:
@@ -281,21 +281,21 @@ def setup_assay(content):
    assay['Facility'] = 'Unknown'
    assay['Method'] = 'Unknown'
    # assay['Date'] = 'Unknown'
-    assay['target_subfragment'] = ""
-    assay['IsolationProtocol'] = "Unknown"
+    assay['Target subfragment'] = ""
+    assay['Isolation protocol'] = "Unknown"

    run = content['RUN_SET']['RUN']
-    assay['AssayIdentifier'] = run['@accession']
+    assay['Assay identifier'] = run['@accession']
    assay['alias'] = run['@alias']
    assay['total_spots'] = run['@total_spots']
    assay['total_bases'] = run['@total_bases']
    assay['size'] = run['@size']
    assay['load_done'] = run['@load_done']
    assay['published'] = run['@published']
-    assay['SampleIdentifier'] = content['SAMPLE']['@accession']
-    assay['StudyIdentifier'] = content['STUDY']['@accession']
-    assay['AssayName'] = 'Automatic assay title from SRA ' + assay['AssayIdentifier']
-    assay['AssayDescription'] = 'Automatic assay description from SRA ' + assay['AssayIdentifier']
+    assay['Sample identifier'] = content['SAMPLE']['@accession']
+    assay['Study identifier'] = content['STUDY']['@accession']
+    assay['Assay name'] = 'Automatic assay title from SRA ' + assay['Assay identifier']
+    assay['Assay description'] = 'Automatic assay description from SRA ' + assay['Assay identifier']
    assay['experiment'] = content['EXPERIMENT']['@accession']
    
    # Files
@@ -314,8 +314,8 @@ def setup_assay(content):
                print(sra['@filename'])
            print("SOMETHING WENT WRONG!")

-    assay['FileNameForward'] = sra_file['@filename'] + "_1.fastq.gz"
-    assay['FileNameReverse'] = sra_file['@filename'] + "_2.fastq.gz"
+    assay['Forward filename'] = sra_file['@filename'] + "_1.fastq.gz"
+    assay['Reverse filename'] = sra_file['@filename'] + "_2.fastq.gz"
    assay['Date'] = sra_file['@date']
    assay['super_type'] = sra_file['@supertype']
    assay['sratoolkit'] = sra_file['@sratoolkit']
@@ -324,23 +324,23 @@ def setup_assay(content):
    library = content['EXPERIMENT']['DESIGN']['LIBRARY_DESCRIPTOR']

    if 'LIBRARY_NAME' in library:
-        assay['library_name'] = library['LIBRARY_NAME']
+        assay['Library name'] = library['LIBRARY_NAME']
    
-    assay['library_strategy'] = library['LIBRARY_STRATEGY']
-    assay['library_source'] = library['LIBRARY_SOURCE']
-    assay['library_selection'] = library['LIBRARY_SELECTION']
-    assay['library_layout'] = list(dict(library['LIBRARY_LAYOUT']).keys())
+    assay['Library strategy'] = library['LIBRARY_STRATEGY']
+    assay['Library source'] = library['LIBRARY_SOURCE']
+    assay['Library selection'] = library['LIBRARY_SELECTION']
+    assay['Library layout'] = list(dict(library['LIBRARY_LAYOUT']).keys())

    # Machine
-    assay['InstrumentModel'] = content['EXPERIMENT']['PLATFORM']['ILLUMINA']['INSTRUMENT_MODEL']
+    assay['Instrument model'] = content['EXPERIMENT']['PLATFORM']['ILLUMINA']['INSTRUMENT_MODEL']
    
    if 'ILLUMINA' in content['EXPERIMENT']['PLATFORM']:
        assay['Platform'] = 'Illumina'
    else:
        print("NEW SEQUENCING PLATFORM DETECTED!!!")

-    if len(assay['library_layout']) == 1:
-        assay['library_layout'] = assay['library_layout'][0]
+    if len(assay['Library layout']) == 1:
+        assay['Library layout'] = assay['Library layout'][0]

    return assay
    
@@ -396,13 +396,9 @@ def filtering(TAG, VALUE):
            else:
                print("New parser needed for " + [VALUE])

-            
-    
-    # if "+00:00" in VALUE:
-        # print("STOP!")
-
    return VALUE

+
 def create_xlsx(pickle_list):
    # Prepocessing of picke list
    project_keys = set()
@@ -439,25 +435,25 @@ def create_xlsx(pickle_list):
            assay_keys.update(assay.keys())

    # Creating the headers
-    project_header = ["ProjectIdentifier", "ProjectDescription", "ProjectTitle"]
+    project_header = ["Project identifier", "Project description", "Project title"]
    project_header = create_header(project_header, project_keys)

-    investigation_header = ["InvestigationIdentifier", "InvestigationDescription", "InvestigationTitle",
-                            "ProjectIdentifier", "FirstName", "LastName", "Email", "ORCID", "Organization",
+    investigation_header = ["Investigation identifier", "Investigation description", "Investigation title",
+                            "Project identifier", "Firstname", "Lastname", "Email address", "ORCID", "Organization",
                            "Department", "Role"]
    investigation_header = create_header(investigation_header, investigation_keys)

-    study_header = ["StudyIdentifier", "StudyDescription", "StudyTitle", "InvestigationIdentifier"]
+    study_header = ["Study identifier", "Study description", "Study title", "Investigation identifier"]
    study_header = create_header(study_header, study_keys)
-    observation_unit_header = ["ObservationUnitIdentifier", "ObservationUnitDescription", "ObservationUnitName", "StudyIdentifier"]
+    observation_unit_header = ["Observational unit identifier", "Observational unit description", "Observational unit name", "Study identifier"]
    observation_unit_header = create_header(observation_unit_header, observation_unit_keys)

-    sample_header = ["SampleIdentifier", "SampleDescription", "SampleName", "ObservationUnitIdentifier", "NCBI taxonomy ID", "SampleOrganism"]
+    sample_header = ["Sample identifier", "Sample description", "sample name", "Observational unit identifier", "NCBI taxonomy ID", "Sample organism"]
    sample_header = create_header(sample_header, sample_keys)

-    assay_header = ["AssayIdentifier", "SampleIdentifier", "AssayTitle", "AssayDescription", "FileNameForward",
-                    "FileNameReverse", "ForwardPrimer", "ReversePrimer", "PrimerNames", "IsolationProtocol",
-                    "SequencingCenter", "SequencingPlatform", "SequencingDate"]
+    assay_header = ["Assay identifier", "Sample identifier", "Assay description", "Forward filename",
+                    "Reverse filename", "Forward primer", "Reverse primer", "Primer names", "Isolation protocol",
+                    "Sequencing center", "Platform", "Date"]

    assay_header = create_header(assay_header, assay_keys)

@@ -494,11 +490,11 @@ def create_xlsx(pickle_list):
            assay = pickle.load(pkl)

            # Skip studies that are already parsed from other pickles
-            if project['ProjectIdentifier'] in project_identifier: continue
+            if project['Project identifier'] in project_identifier: continue
            for key in project:
                column = project_header.index(key)
                project_worksheet.write(row_number, column, project[key])
-                project_identifier.add(project['ProjectIdentifier'])
+                project_identifier.add(project['Project identifier'])
            row_number = row_number + 1

    ###############################################################
@@ -516,11 +512,11 @@ def create_xlsx(pickle_list):
            assay = pickle.load(pkl)

            # Skip studies that are already parsed from other pickles
-            if investigation['InvestigationIdentifier'] in investigation_identifier: continue
+            if investigation['Investigation identifier'] in investigation_identifier: continue
            for key in investigation:
                column = investigation_header.index(key)
                investigation_worksheet.write(row, column, investigation[key])
-                investigation_identifier.add(investigation['InvestigationIdentifier'])
+                investigation_identifier.add(investigation['Investigation identifier'])
            row_number = row_number + 1

    ###############################################################
@@ -538,11 +534,11 @@ def create_xlsx(pickle_list):
            assay = pickle.load(pkl)

            # Skip studies that are already parsed from other assay pickles
-            if study['StudyIdentifier'] in study_identifier: continue
+            if study['Study identifier'] in study_identifier: continue
            for key in study:
                column = study_header.index(key)
                study_worksheet.write(row_number, column, study[key])
-                study_identifier.add(study['StudyIdentifier'])
+                study_identifier.add(study['Study identifier'])
            row_number = row_number + 1

    ###############################################################
@@ -560,11 +556,11 @@ def create_xlsx(pickle_list):
            assay = pickle.load(pkl)

            # Skip studies that are already parsed from other pickles
-            if observation_unit['ObservationUnitIdentifier'] in ou_identifier: continue
+            if observation_unit['Observational unit identifier'] in ou_identifier: continue
            for key in observation_unit:
                column = observation_unit_header.index(key)
                ou_worksheet.write(row_number, column, observation_unit[key])
-                ou_identifier.add(observation_unit['ObservationUnitIdentifier'])
+                ou_identifier.add(observation_unit['Observational unit identifier'])
            row_number = row_number + 1
    ###############################################################
    # Filling the sample sheet
@@ -581,14 +577,14 @@ def create_xlsx(pickle_list):
            assay = pickle.load(pkl)

            # Skip studies that are already parsed from other pickles
-            if sample['SampleIdentifier'] in sample_identifier: continue
+            if sample['Sample identifier'] in sample_identifier: continue
            attributes = sample.pop("attributes")
            sample = {**sample, **attributes}

            for key in sample:
                column = sample_header.index(key)
                sample_worksheet.write(row_number, column, sample[key])
-                sample_identifier.add(sample['SampleIdentifier'])
+                sample_identifier.add(sample['Sample identifier'])
            row_number = row_number + 1

    ###############################################################
@@ -606,11 +602,11 @@ def create_xlsx(pickle_list):
            assay = pickle.load(pkl)

            # Skip studies that are already parsed from other pickles
-            if assay['AssayIdentifier'] in assay_identifier: continue
+            if assay['Assay identifier'] in assay_identifier: continue
            for key in assay:
                column = assay_header.index(key)
                assay_worksheet.write(row_number, column, assay[key])
-                assay_identifier.add(assay['AssayIdentifier'])
+                assay_identifier.add(assay['Assay identifier'])
            row_number = row_number + 1

    workbook.close()
@@ -669,7 +665,6 @@ def selection(content):
 def main(identifiers):
    pickle_list = set()
    
-
    for xml in identifiers.values():
        if not xml.endswith(".xml"): continue
        with open(xml) as fd:
@@ -714,6 +709,7 @@ if __name__ == '__main__':
    identifiers = {}
    
    for index, identifier in enumerate(record['IdList']):
+        print(">>>", identifier)
        folder = make_folder(identifier)
        path = folder + "/" + identifier + ".xml"
        identifiers[identifier] = path