Skip to content
Snippets Groups Projects
Commit ff9b6a25 authored by Jasper Koehorst
Browse files

sync including new/old illumina machine

parent e62ccb96
Branches
No related tags found
No related merge requests found
......@@ -55,7 +55,7 @@ error = 0
machines = set()
illumina_machines = set(["Illumina HiSeq 1000", "Illumina HiSeq 1500", "Illumina HiSeq 2000", "Illumina HiSeq 2500",
"Illumina HiSeq 3000", "Illumina HiSeq 4000", "Illumina HiSeq x ten", "Illumina MiSeq", "NextSeq 500"])
"Illumina HiSeq 3000", "Illumina HiSeq 4000", "Illumina HiSeq x ten", "Illumina MiSeq", "NextSeq 500", "Illumina NovaSeq 6000"])
def machine_filter(doc):
......@@ -120,22 +120,22 @@ def paired_check(doc):
def setup_project(content):
project = {
"ProjectIdentifier": "UNLOCK",
"ProjectTitle": "UNLOCK Microbial Potential",
"ProjectDescription": "An open infrastructure for exploring new horizons for research on microbial communities."
"Project identifier": "UNLOCK",
"Project title": "UNLOCK Microbial Potential",
"Project description": "An open infrastructure for exploring new horizons for research on microbial communities."
}
return project
def setup_investigation(content):
investigation = {
"InvestigationIdentifier": "SRA_Amplicon",
"InvestigationTitle": "SRA amplicon data from various studies",
"InvestigationDescription": "Automatically generated metadata from SRA information",
"ProjectIdentifier":"UNLOCK",
"FirstName": "Jasper",
"LastName": "Koehorst",
"Email": "jasper.koehorst@wur.nl",
"Investigation identifier": "SRA_Amplicon",
"Investigation title": "SRA amplicon data from various studies",
"Investigation description": "Automatically generated metadata from SRA information",
"Project identifier":"UNLOCK",
"Firstname": "Jasper",
"Lastname": "Koehorst",
"Email address": "jasper.koehorst@wur.nl",
"ORCID": "0000-0001-8172-8981",
"Organization": "Wageningen University",
"Department": "UNLOCK",
......@@ -145,7 +145,7 @@ def setup_investigation(content):
def setup_study(content):
study = {"InvestigationIdentifier":"SRA_Amplicon"}
study = {"Investigation identifier":"SRA_Amplicon"}
# ORIGINAL RESEARCHER INFORMATION
......@@ -188,7 +188,7 @@ def setup_study(content):
# STUDY INFORMATION
study_object = content['STUDY']
study['StudyIdentifier'] = study_object['@accession']
study['Study identifier'] = study_object['@accession']
study['alias'] = study_object['@alias']
study['StudyTitle'] = study_object['DESCRIPTOR']['STUDY_TITLE']
if 'STUDY_ABSTRACT' in study_object['DESCRIPTOR']:
......@@ -214,9 +214,9 @@ def setup_observation_unit(content):
experiment = content['EXPERIMENT']
sample = content['SAMPLE']
observation_unit['ObservationUnitIdentifier'] = "X" + sample['@accession']
observation_unit['Observational unit identifier'] = "X" + sample['@accession']
observation_unit['alias'] = "X" + sample['@alias']
observation_unit['StudyIdentifier'] = experiment['STUDY_REF']['@accession']
observation_unit['Study identifier'] = experiment['STUDY_REF']['@accession']
if "TITLE" in sample:
observation_unit['ObservationUnitDescription'] = sample['TITLE']
......@@ -231,16 +231,16 @@ def setup_sample(content):
sample = {}
sample_object = content['SAMPLE']
sample['alias'] = sample_object['@alias']
sample['SampleIdentifier'] = sample_object['@accession']
sample['BioSafetyLevel'] = "0"
sample['collection_date'] = "1900-01-01T00:00:00"
sample['Sample identifier'] = sample_object['@accession']
sample['Biosafety level'] = "0"
sample['collection date'] = "1900-01-01T00:00:00"
if type(sample_object['IDENTIFIERS']['EXTERNAL_ID']) == list:
for entry in sample_object['IDENTIFIERS']['EXTERNAL_ID']:
if entry['@namespace'] == 'BioSample':
sample_object['IDENTIFIERS']['EXTERNAL_ID'] = entry
sample['ObservationUnitIdentifier'] = "X" + sample_object['@accession']
sample['Observational unit identifier'] = "X" + sample_object['@accession']
sample['namespace'] = sample_object['IDENTIFIERS']['EXTERNAL_ID']['@namespace']
sample['text'] = sample_object['IDENTIFIERS']['EXTERNAL_ID']['#text']
......@@ -249,8 +249,8 @@ def setup_sample(content):
sample['SampleTitle'] = sample_object['TITLE']
sample['SampleName'] = sample_object['TITLE']
else:
sample['SampleTitle'] = 'Automatic sample title from SRA ' + sample['SampleIdentifier']
sample['SampleName'] = 'Automatic sample title from SRA ' + sample['SampleIdentifier']
sample['SampleTitle'] = 'Automatic sample title from SRA ' + sample['Sample identifier']
sample['SampleName'] = 'Automatic sample title from SRA ' + sample['Sample identifier']
sample['NCBI taxonomy ID'] = sample_object['SAMPLE_NAME']['TAXON_ID']
......@@ -262,7 +262,7 @@ def setup_sample(content):
if "DESCRIPTION" in sample_object:
sample['SampleDescription'] = sample_object['DESCRIPTION']
else:
sample['SampleDescription'] = 'Automatic sample description from SRA ' + sample['SampleIdentifier']
sample['SampleDescription'] = 'Automatic sample description from SRA ' + sample['Sample identifier']
sample['attributes'] = {}
if 'SAMPLE_ATTRIBUTES' in sample_object:
......@@ -281,21 +281,21 @@ def setup_assay(content):
assay['Facility'] = 'Unknown'
assay['Method'] = 'Unknown'
# assay['Date'] = 'Unknown'
assay['target_subfragment'] = ""
assay['IsolationProtocol'] = "Unknown"
assay['Target subfragment'] = ""
assay['Isolation protocol'] = "Unknown"
run = content['RUN_SET']['RUN']
assay['AssayIdentifier'] = run['@accession']
assay['Assay identifier'] = run['@accession']
assay['alias'] = run['@alias']
assay['total_spots'] = run['@total_spots']
assay['total_bases'] = run['@total_bases']
assay['size'] = run['@size']
assay['load_done'] = run['@load_done']
assay['published'] = run['@published']
assay['SampleIdentifier'] = content['SAMPLE']['@accession']
assay['StudyIdentifier'] = content['STUDY']['@accession']
assay['AssayName'] = 'Automatic assay title from SRA ' + assay['AssayIdentifier']
assay['AssayDescription'] = 'Automatic assay description from SRA ' + assay['AssayIdentifier']
assay['Sample identifier'] = content['SAMPLE']['@accession']
assay['Study identifier'] = content['STUDY']['@accession']
assay['Assay name'] = 'Automatic assay title from SRA ' + assay['Assay identifier']
assay['Assay description'] = 'Automatic assay description from SRA ' + assay['Assay identifier']
assay['experiment'] = content['EXPERIMENT']['@accession']
# Files
......@@ -314,8 +314,8 @@ def setup_assay(content):
print(sra['@filename'])
print("SOMETHING WENT WRONG!")
assay['FileNameForward'] = sra_file['@filename'] + "_1.fastq.gz"
assay['FileNameReverse'] = sra_file['@filename'] + "_2.fastq.gz"
assay['Forward filename'] = sra_file['@filename'] + "_1.fastq.gz"
assay['Reverse filename'] = sra_file['@filename'] + "_2.fastq.gz"
assay['Date'] = sra_file['@date']
assay['super_type'] = sra_file['@supertype']
assay['sratoolkit'] = sra_file['@sratoolkit']
......@@ -324,23 +324,23 @@ def setup_assay(content):
library = content['EXPERIMENT']['DESIGN']['LIBRARY_DESCRIPTOR']
if 'LIBRARY_NAME' in library:
assay['library_name'] = library['LIBRARY_NAME']
assay['Library name'] = library['LIBRARY_NAME']
assay['library_strategy'] = library['LIBRARY_STRATEGY']
assay['library_source'] = library['LIBRARY_SOURCE']
assay['library_selection'] = library['LIBRARY_SELECTION']
assay['library_layout'] = list(dict(library['LIBRARY_LAYOUT']).keys())
assay['Library strategy'] = library['LIBRARY_STRATEGY']
assay['Library source'] = library['LIBRARY_SOURCE']
assay['Library selection'] = library['LIBRARY_SELECTION']
assay['Library layout'] = list(dict(library['LIBRARY_LAYOUT']).keys())
# Machine
assay['InstrumentModel'] = content['EXPERIMENT']['PLATFORM']['ILLUMINA']['INSTRUMENT_MODEL']
assay['Instument model'] = content['EXPERIMENT']['PLATFORM']['ILLUMINA']['INSTRUMENT_MODEL']
if 'ILLUMINA' in content['EXPERIMENT']['PLATFORM']:
assay['Platform'] = 'Illumina'
else:
print("NEW SEQUENCING PLATFORM DETECTED!!!")
if len(assay['library_layout']) == 1:
assay['library_layout'] = assay['library_layout'][0]
if len(assay['Library layout']) == 1:
assay['Library layout'] = assay['Library layout'][0]
return assay
......@@ -396,13 +396,9 @@ def filtering(TAG, VALUE):
else:
print("New parser needed for " + [VALUE])
# if "+00:00" in VALUE:
# print("STOP!")
return VALUE
def create_xlsx(pickle_list):
# Preprocessing of pickle list
project_keys = set()
......@@ -439,25 +435,25 @@ def create_xlsx(pickle_list):
assay_keys.update(assay.keys())
# Creating the headers
project_header = ["ProjectIdentifier", "ProjectDescription", "ProjectTitle"]
project_header = ["Project identifier", "Project description", "Project title"]
project_header = create_header(project_header, project_keys)
investigation_header = ["InvestigationIdentifier", "InvestigationDescription", "InvestigationTitle",
"ProjectIdentifier", "FirstName", "LastName", "Email", "ORCID", "Organization",
investigation_header = ["Investigation identifier", "Investigation description", "Investigation title",
"Project identifier", "Firstname", "Lastname", "Email address", "ORCID", "Organization",
"Department", "Role"]
investigation_header = create_header(investigation_header, investigation_keys)
study_header = ["StudyIdentifier", "StudyDescription", "StudyTitle", "InvestigationIdentifier"]
study_header = ["Study identifier", "Study description", "Study title", "Investigation identifier"]
study_header = create_header(study_header, study_keys)
observation_unit_header = ["ObservationUnitIdentifier", "ObservationUnitDescription", "ObservationUnitName", "StudyIdentifier"]
observation_unit_header = ["Observational unit identifier", "Observational unit description", "Observational unit name", "Study identifier"]
observation_unit_header = create_header(observation_unit_header, observation_unit_keys)
sample_header = ["SampleIdentifier", "SampleDescription", "SampleName", "ObservationUnitIdentifier", "NCBI taxonomy ID", "SampleOrganism"]
sample_header = ["Sample identifier", "Sample description", "sample name", "Observational unit identifier", "NCBI taxonomy ID", "Sample organism"]
sample_header = create_header(sample_header, sample_keys)
assay_header = ["AssayIdentifier", "SampleIdentifier", "AssayTitle", "AssayDescription", "FileNameForward",
"FileNameReverse", "ForwardPrimer", "ReversePrimer", "PrimerNames", "IsolationProtocol",
"SequencingCenter", "SequencingPlatform", "SequencingDate"]
assay_header = ["Assay identifier", "Sample identifier", "Assay description", "Forward filename",
"Reverse filename", "Forward primer", "Reverse primer", "Primer names", "Isolation protocol",
"Sequencing center", "Platform", "Date"]
assay_header = create_header(assay_header, assay_keys)
......@@ -494,11 +490,11 @@ def create_xlsx(pickle_list):
assay = pickle.load(pkl)
# Skip projects that are already parsed from other pickles
if project['ProjectIdentifier'] in project_identifier: continue
if project['Project identifier'] in project_identifier: continue
for key in project:
column = project_header.index(key)
project_worksheet.write(row_number, column, project[key])
project_identifier.add(project['ProjectIdentifier'])
project_identifier.add(project['Project identifier'])
row_number = row_number + 1
###############################################################
......@@ -516,11 +512,11 @@ def create_xlsx(pickle_list):
assay = pickle.load(pkl)
# Skip investigations that are already parsed from other pickles
if investigation['InvestigationIdentifier'] in investigation_identifier: continue
if investigation['Investigation identifier'] in investigation_identifier: continue
for key in investigation:
column = investigation_header.index(key)
investigation_worksheet.write(row, column, investigation[key])
investigation_identifier.add(investigation['InvestigationIdentifier'])
investigation_identifier.add(investigation['Investigation identifier'])
row_number = row_number + 1
###############################################################
......@@ -538,11 +534,11 @@ def create_xlsx(pickle_list):
assay = pickle.load(pkl)
# Skip studies that are already parsed from other assay pickles
if study['StudyIdentifier'] in study_identifier: continue
if study['Study identifier'] in study_identifier: continue
for key in study:
column = study_header.index(key)
study_worksheet.write(row_number, column, study[key])
study_identifier.add(study['StudyIdentifier'])
study_identifier.add(study['Study identifier'])
row_number = row_number + 1
###############################################################
......@@ -560,11 +556,11 @@ def create_xlsx(pickle_list):
assay = pickle.load(pkl)
# Skip observation units that are already parsed from other pickles
if observation_unit['ObservationUnitIdentifier'] in ou_identifier: continue
if observation_unit['Observational unit identifier'] in ou_identifier: continue
for key in observation_unit:
column = observation_unit_header.index(key)
ou_worksheet.write(row_number, column, observation_unit[key])
ou_identifier.add(observation_unit['ObservationUnitIdentifier'])
ou_identifier.add(observation_unit['Observational unit identifier'])
row_number = row_number + 1
###############################################################
# Filling the sample sheet
......@@ -581,14 +577,14 @@ def create_xlsx(pickle_list):
assay = pickle.load(pkl)
# Skip samples that are already parsed from other pickles
if sample['SampleIdentifier'] in sample_identifier: continue
if sample['Sample identifier'] in sample_identifier: continue
attributes = sample.pop("attributes")
sample = {**sample, **attributes}
for key in sample:
column = sample_header.index(key)
sample_worksheet.write(row_number, column, sample[key])
sample_identifier.add(sample['SampleIdentifier'])
sample_identifier.add(sample['Sample identifier'])
row_number = row_number + 1
###############################################################
......@@ -606,11 +602,11 @@ def create_xlsx(pickle_list):
assay = pickle.load(pkl)
# Skip assays that are already parsed from other pickles
if assay['AssayIdentifier'] in assay_identifier: continue
if assay['Assay identifier'] in assay_identifier: continue
for key in assay:
column = assay_header.index(key)
assay_worksheet.write(row_number, column, assay[key])
assay_identifier.add(assay['AssayIdentifier'])
assay_identifier.add(assay['Assay identifier'])
row_number = row_number + 1
workbook.close()
......@@ -669,7 +665,6 @@ def selection(content):
def main(identifiers):
pickle_list = set()
for xml in identifiers.values():
if not xml.endswith(".xml"): continue
with open(xml) as fd:
......@@ -714,6 +709,7 @@ if __name__ == '__main__':
identifiers = {}
for index, identifier in enumerate(record['IdList']):
print(">>>", identifier)
folder = make_folder(identifier)
path = folder + "/" + identifier + ".xml"
identifiers[identifier] = path
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment