diff --git a/entrez.py b/entrez.py
index 36a83a67258eff6219d90db6da9d3065765cd99b..8117eb1cea0d1cdba74e08d8c89c97c3ac6064ed 100644
--- a/entrez.py
+++ b/entrez.py
@@ -40,6 +40,8 @@ def make_folder(identifier):
 
 while True:
     query = "(((((\"paired\"[Layout]) AND ((\"instrument illumina hiseq 1000\"[Properties] OR \"instrument illumina hiseq 1500\"[Properties] OR \"instrument illumina hiseq 2000\"[Properties] OR \"instrument illumina hiseq 2500\"[Properties] OR \"instrument illumina hiseq 3000\"[Properties] OR \"instrument illumina hiseq 4000\"[Properties] OR \"instrument illumina hiseq x ten\"[Properties] OR \"instrument illumina miseq\"[Properties]))) AND \"illumina\"[Platform]) AND \"amplicon\"[Strategy]) AND \"filetype fastq\"[Properties]) AND \"cluster public\"[Properties]"
+    query = "PRJNA527973"
+    # query = "PRJNA517152"
     Entrez.email = "A.N.Other@example.com"  # Always tell NCBI who you are
     handle = Entrez.esearch(db="sra",
                             term=query,
diff --git a/xmlparser.py b/xmlparser.py
index 9700af5fdbe64dca562f31e26e5b631176055c21..52d74582ebb285e4aa61c55ed1eae8239ee5f5d1 100644
--- a/xmlparser.py
+++ b/xmlparser.py
@@ -1,7 +1,9 @@
 import os
+import pickle
 import re
 
 import xmltodict
+import xlsxwriter
 
 # GLOBAL PARAMETERS
 MINIMUM_NUMBER_OF_SPOTS = 10000
@@ -85,56 +87,59 @@ def paired_check(doc):
 
 def setup_investigation(content):
     investigation = {}
+    return investigation
+
+
+def setup_study(content):
+    study = {}
+
+    # ORIGINAL RESEARCHER INFORMATION
 
     # SUBMITTER INFORMATION
     submission = content['SUBMISSION']
-    investigation['lab_name'] = submission['@lab_name']
+    study['submission_lab_name'] = submission['@lab_name']
     if '@center_name' in submission:
-        investigation['center_name'] = submission['@center_name']
-    investigation['submission_accession'] = submission['@accession']
-    investigation['submission_alias'] = submission['@alias']
+        study['submission_center_name'] = submission['@center_name']
+
+    study['submission_accession'] = submission['@accession']
+    study['submission_alias'] = submission['@alias']
 
     # ORGANIZATION INFORMATION
     organization = content['Organization']
 
     if 'Contact' in organization:
         if type(organization['Contact']) == list:
-
+            print("MULTIPLE CONTACTS DETECTED... -.-")
             print(organization['Contact'])
-        investigation['email'] = organization['Contact']['@email']
+        study['email'] = organization['Contact']['@email']
         # organization['Contact']['@sec_email']
 
         if 'Name' in organization['Contact']:
             if 'Middle' in organization['Contact']['Name']:
-                investigation['middle_name'] = organization['Contact']['Name']['Middle']
-            investigation['first_name'] = organization['Contact']['Name']['First']
-            investigation['last_name'] = organization['Contact']['Name']['Last']
+                study['middle_name'] = organization['Contact']['Name']['Middle']
+            study['first_name'] = organization['Contact']['Name']['First']
+            study['last_name'] = organization['Contact']['Name']['Last']
 
         if 'Address' in organization['Contact']:
-            investigation['postal_code'] = organization['Contact']['Address']['@postal_code']
-            investigation['department'] = organization['Contact']['Address']['Department']
-            investigation['institution'] = organization['Contact']['Address']['Institution']
-            investigation['street'] = organization['Contact']['Address']['Street']
-            investigation['city'] = organization['Contact']['Address']['City']
+            study['postal_code'] = organization['Contact']['Address']['@postal_code']
+            study['department'] = organization['Contact']['Address']['Department']
+            study['institution'] = organization['Contact']['Address']['Institution']
+            study['street'] = organization['Contact']['Address']['Street']
+            study['city'] = organization['Contact']['Address']['City']
 
             if 'Sub' in organization['Contact']['Address']:
-                investigation['sub'] = organization['Contact']['Address']['Sub']
+                study['sub'] = organization['Contact']['Address']['Sub']
             if 'Country' in organization['Contact']['Address']:
-                investigation['country'] = organization['Contact']['Address']['Country']
-
-    return investigation
-
-
-def setup_study(content):
-    study = {}
+                study['country'] = organization['Contact']['Address']['Country']
 
+    # STUDY INFORMATION
     study_object = content['STUDY']
-    study['accession'] = study_object['@accession']
+    study['StudyIdentifier'] = study_object['@accession']
     study['alias'] = study_object['@alias']
-    study['title'] = study_object['DESCRIPTOR']['STUDY_TITLE']
-
-    if '@existing_study_type' in study_object:
-        study['type'] = study_object['DESCRIPTOR']['@existing_study_type']
+    study['StudyTitle'] = study_object['DESCRIPTOR']['STUDY_TITLE']
+    study['StudyDescription'] = study_object['DESCRIPTOR']['STUDY_ABSTRACT']
+    if 'STUDY_TYPE' in study_object['DESCRIPTOR']:
+        study['type'] = study_object['DESCRIPTOR']['STUDY_TYPE']['@existing_study_type']
 
     if 'CENTER_PROJECT_NAME' in study_object['DESCRIPTOR']:
         study['center_project_name'] = study_object['DESCRIPTOR']['CENTER_PROJECT_NAME']
@@ -146,25 +151,30 @@ def setup_sample(content):
     sample = {}
     sample_object = content['SAMPLE']
     sample['alias'] = sample_object['@alias']
-    sample['accession'] = sample_object['@accession']
+    sample['SampleIdentifier'] = sample_object['@accession']
 
     if type(sample_object['IDENTIFIERS']['EXTERNAL_ID']) == list:
         for entry in sample_object['IDENTIFIERS']['EXTERNAL_ID']:
             if entry['@namespace'] == 'BioSample':
                 sample_object['IDENTIFIERS']['EXTERNAL_ID'] = entry
 
+    sample['ObservationUnitIdentifier'] = content['EXPERIMENT']['@accession']
     sample['namespace'] = sample_object['IDENTIFIERS']['EXTERNAL_ID']['@namespace']
     sample['text'] = sample_object['IDENTIFIERS']['EXTERNAL_ID']['#text']
 
     if "TITLE" in sample_object:
-        sample['title'] = sample_object['TITLE']
+        sample['SampleTitle'] = sample_object['TITLE']
+    else:
+        sample['SampleTitle'] = 'Automatic sample title from SRA ' + sample['SampleIdentifier']
 
-    sample['taxon'] = sample_object['SAMPLE_NAME']['TAXON_ID']
+    sample['TaxonID'] = sample_object['SAMPLE_NAME']['TAXON_ID']
     if 'SCIENTIFIC_NAME' in sample_object['SAMPLE_NAME']:
-        sample['scientific_name'] = sample_object['SAMPLE_NAME']['SCIENTIFIC_NAME']
+        sample['OrganismName'] = sample_object['SAMPLE_NAME']['SCIENTIFIC_NAME']
 
     if "DESCRIPTION" in sample_object:
-        sample['description'] = sample_object['DESCRIPTION']
+        sample['SampleDescription'] = sample_object['DESCRIPTION']
+    else:
+        sample['SampleDescription'] = 'Automatic sample description from SRA ' + sample['SampleIdentifier']
 
     sample['attributes'] = {}
     if 'SAMPLE_ATTRIBUTES' in sample_object:
@@ -175,20 +185,15 @@ def setup_sample(content):
 
 
 def setup_observation_unit(content):
+    observation_unit = {}
+
     experiment = content['EXPERIMENT']
-    ou_accession = experiment['@accession']
-    ou_alias = experiment['@alias']
-    ou_title = experiment['TITLE']
-    study_accession = experiment['STUDY_REF']['@accession']
-    ou_design_description = experiment['DESIGN']['DESIGN_DESCRIPTION']
-
-    observation_unit = {
-        "accession": ou_accession,
-        "alias": ou_alias,
-        "title": ou_title,
-        "study_id": study_accession,
-        "description": ou_design_description
-    }
+
+    observation_unit['ObservationUnitIdentifier'] = experiment['@accession']
+    observation_unit['alias'] = experiment['@alias']
+    observation_unit['ObservationUnitTitle'] = experiment['TITLE']
+    observation_unit['StudyIdentifier'] = experiment['STUDY_REF']['@accession']
+    observation_unit['ObservationUnitDescription'] = experiment['DESIGN']['DESIGN_DESCRIPTION']
 
     return observation_unit
 
@@ -196,16 +201,17 @@ def setup_observation_unit(content):
 def setup_assay(content):
     assay = {}
     run = content['RUN_SET']['RUN']
-    assay['accession'] = run['@accession']
+    assay['AssayIdentifier'] = run['@accession']
     assay['alias'] = run['@alias']
     assay['total_spots'] = run['@total_spots']
     assay['total_bases'] = run['@total_bases']
     assay['size'] = run['@size']
     assay['load_done'] = run['@load_done']
     assay['published'] = run['@published']
+    assay['SampleIdentifier'] = content['SAMPLE']['@accession']
 
-    ou_accession = run['EXPERIMENT_REF']['@accession']
-
+    assay['AssayTitle'] = 'Automatic assay title from SRA ' + assay['AssayIdentifier']
+    assay['AssayDescription'] = 'Automatic assay description from SRA ' + assay['AssayIdentifier']
     # Files
     sra_file = run['SRAFiles']['SRAFile']
 
@@ -222,8 +228,8 @@ def setup_assay(content):
                 print(sra['@filename'])
             print("SOMETHING WENT WRONG!")
 
-    assay['forward_fastq'] = sra_file['@filename'] + "_1.fastq.gz"
-    assay['reverse_fastq'] = sra_file['@filename'] + "_2.fastq.gz"
+    assay['FileNameForward'] = sra_file['@filename'] + "_1.fastq.gz"
+    assay['FileNameReverse'] = sra_file['@filename'] + "_2.fastq.gz"
     assay['date'] = sra_file['@date']
     assay['super_type'] = sra_file['@supertype']
     assay['sratoolkit'] = sra_file['@sratoolkit']
@@ -238,17 +244,182 @@ def setup_assay(content):
     assay['library_selection'] = library['LIBRARY_SELECTION']
     assay['library_layout'] = list(dict(library['LIBRARY_LAYOUT']).keys())
 
+    # Machine
+    assay['instrument_model'] = content['EXPERIMENT']['PLATFORM']['ILLUMINA']['INSTRUMENT_MODEL']
+    if 'ILLUMINA' in content['EXPERIMENT']['PLATFORM']:
+        assay['SequencingPlatform'] = 'Illumina'
+    else:
+        print("NEW SEQUENCING PLATFORM DETECTED!!!")
+
+    if len(assay['library_layout']) == 1:
+        assay['library_layout'] = assay['library_layout'][0]
+
     return assay
     # OPTIONAL FILTER FOR LENGTH?... run['Statistics']['READ'][0]['@average']
 
 
-def lookup_creation(doc):
+def create_header(header, keys):
+    for value in sorted(keys):
+        if value not in header:
+            header.append(value)
+    return header
+
+
+def create_xlsx(pickle_list):
+    # Preprocessing of pickle list
+    investigation_keys = set()
+    study_keys = set()
+    observation_unit_keys = set()
+    sample_keys = set()
+    assay_keys = set()
+
+    # Create a workbook and add a worksheet.
+    workbook = xlsxwriter.Workbook('sra.xlsx')
+    project_worksheet = workbook.add_worksheet(name="Project")
+    investigation_worksheet = workbook.add_worksheet(name="Investigation")
+    study_worksheet = workbook.add_worksheet(name="Study")
+    ou_worksheet = workbook.add_worksheet(name="ObservationUnit")
+    sample_worksheet = workbook.add_worksheet(name="Sample")
+    assay_worksheet = workbook.add_worksheet(name="Assay")
+
+    for pickle_file in pickle_list:
+        with open(pickle_file, 'rb') as pkl:
+            investigation = pickle.load(pkl)
+            study = pickle.load(pkl)
+            study_keys.update(study.keys())
+            observation_unit = pickle.load(pkl)
+            observation_unit_keys.update(observation_unit.keys())
+            sample = pickle.load(pkl)
+            sample_keys.update(sample.keys())
+            sample_keys.update(sample['attributes'].keys())
+            sample_keys.remove("attributes")
+            assay = pickle.load(pkl)
+            assay_keys.update(assay.keys())
+
+    # Creating the study header
+    study_header = ["StudyIdentifier", "StudyDescription", "StudyTitle", "InvestigationIdentifier"]
+    study_header = create_header(study_header, study_keys)
+    observation_unit_header = ["ObservationUnitIdentifier", "ObservationUnitDescription", "ObservationUnitTitle",
+                               "StudyIdentifier"]
+    observation_unit_header = create_header(observation_unit_header, observation_unit_keys)
+
+    sample_header = ["SampleIdentifier", "SampleDescription", "SampleName", "ObservationUnitIdentifier", "TaxonomyId",
+                     "SampleOrganism"]
+    sample_header = create_header(sample_header, sample_keys)
+
+    assay_header = ["AssayIdentifier", "SampleIdentifier", "AssayTitle", "AssayDescription", "FileNameForward",
+                    "FileNameReverse", "ForwardPrimer", "ReversePrimer", "PrimerNames", "IsolationProtocol",
+                    "SequencingCenter", "SequencingPlatform", "SequencingDate"]
+
+    assay_header = create_header(assay_header, assay_keys)
+
+    for index, value in enumerate(study_header):
+        study_worksheet.write(0, index, value)
+
+    for index, value in enumerate(observation_unit_header):
+        ou_worksheet.write(0, index, value)
+
+    for index, value in enumerate(sample_header):
+        sample_worksheet.write(0, index, value)
+
+    for index, value in enumerate(assay_header):
+        assay_worksheet.write(0, index, value)
+
+    ###############################################################
+    # Filling the study sheet
+    ###############################################################
+    study_identifier = set()
+    for row, pickle_file in enumerate(pickle_list):
+        with open(pickle_file, 'rb') as pkl:
+            investigation = pickle.load(pkl)
+            study = pickle.load(pkl)
+            observation_unit = pickle.load(pkl)
+            sample = pickle.load(pkl)
+            assay = pickle.load(pkl)
+
+            # Skip studies that are already parsed from other pickles
+            if study['StudyIdentifier'] in study_identifier: continue
+            row = row + 1
+            for key in study:
+                column = study_header.index(key)
+                study_worksheet.write(row, column, study[key])
+                study_identifier.add(study['StudyIdentifier'])
+
+    ###############################################################
+    # Filling the observation unit sheet
+    ###############################################################
+    ou_identifier = set()
+    for row, pickle_file in enumerate(pickle_list):
+        with open(pickle_file, 'rb') as pkl:
+            investigation = pickle.load(pkl)
+            study = pickle.load(pkl)
+            observation_unit = pickle.load(pkl)
+            sample = pickle.load(pkl)
+            assay = pickle.load(pkl)
+
+            # Skip observation units that are already parsed from other pickles
+            if observation_unit['ObservationUnitIdentifier'] in ou_identifier: continue
+            row = row + 1
+            for key in observation_unit:
+                column = observation_unit_header.index(key)
+                ou_worksheet.write(row, column, observation_unit[key])
+                ou_identifier.add(observation_unit['ObservationUnitIdentifier'])
+
+    ###############################################################
+    # Filling the sample sheet
+    ###############################################################
+    sample_identifier = set()
+    for row, pickle_file in enumerate(pickle_list):
+        with open(pickle_file, 'rb') as pkl:
+            investigation = pickle.load(pkl)
+            study = pickle.load(pkl)
+            observation_unit = pickle.load(pkl)
+            sample = pickle.load(pkl)
+            assay = pickle.load(pkl)
+
+            # Skip samples that are already parsed from other pickles
+            if sample['SampleIdentifier'] in sample_identifier: continue
+            row = row + 1
+            attributes = sample.pop("attributes")
+            sample = {**sample, **attributes}
+
+            for key in sample:
+                column = sample_header.index(key)
+                sample_worksheet.write(row, column, sample[key])
+                sample_identifier.add(sample['SampleIdentifier'])
+
+    ###############################################################
+    # Filling the assay sheet
+    ###############################################################
+    assay_identifier = set()
+    for row, pickle_file in enumerate(pickle_list):
+        with open(pickle_file, 'rb') as pkl:
+            investigation = pickle.load(pkl)
+            study = pickle.load(pkl)
+            observation_unit = pickle.load(pkl)
+            sample = pickle.load(pkl)
+            assay = pickle.load(pkl)
+
+            # Skip assays that are already parsed from other pickles
+            if assay['AssayIdentifier'] in assay_identifier: continue
+            row = row + 1
+
+            for key in assay:
+                column = assay_header.index(key)
+                assay_worksheet.write(row, column, assay[key])
+                assay_identifier.add(assay['AssayIdentifier'])
+
+    workbook.close()
+    print("done")
+
+
+def lookup_creation(doc, pickle_file):
     content = doc['EXPERIMENT_PACKAGE_SET']['EXPERIMENT_PACKAGE']
-    if int(content['RUN_SET']['RUN']['@total_spots']) < MINIMUM_NUMBER_OF_SPOTS:
-        return None
+    # if int(content['RUN_SET']['RUN']['@total_spots']) < MINIMUM_NUMBER_OF_SPOTS:
+    #     return None
 
-    if content['RUN_SET']['RUN']['Statistics']['@nreads'] != "2":
-        return None
+    # if content['RUN_SET']['RUN']['Statistics']['@nreads'] != "2":
+    #     return None
 
     investigation = setup_investigation(content)
     study = setup_study(content)
@@ -256,39 +427,69 @@ def lookup_creation(doc):
     sample = setup_sample(content)
     assay = setup_assay(content)
 
+    # Dump into file for merging with other xml files
+    pickle_file = open(pickle_file, 'wb')
+    pickle.dump(investigation, pickle_file)
+    pickle.dump(study, pickle_file)
+    pickle.dump(observation_unit, pickle_file)
+    pickle.dump(sample, pickle_file)
+    pickle.dump(assay, pickle_file)
+    pickle_file.close()
+
+
+def selection(content):
+    """
+    Select XML documents matching developer-defined criteria.
+    :param content: raw XML file content as a string
+    :return: True if the content passes the selection criteria, False otherwise
+    """
+    if "PRJNA527973" not in content and "PRJNA517152" not in content:
+        return False
+
+    # if "soil" not in content:
+    #     return False
+
+    return True
+
 
 def main():
+    pickle_list = set()
     for dir in sorted(os.listdir(dirName)):
         path = dirName + "/" + dir
         print("Parsing ", path)
-        if dir != "125": continue
+
         for (dirpath, dirnames, filenames) in os.walk(path):
             listOfFiles = set([os.path.join(dirpath, file) for file in filenames])
             for index, elem in enumerate(listOfFiles):
-                # print("Parsing", index, "of", len(listOfFiles))
+                if not elem.endswith("xml"): continue
                 with open(elem) as fd:
                     content = fd.read()
+                    if selection(content):
+                        doc = xmltodict.parse(content)
 
-                    # if "soil" not in content:
-                    #     continue
-
-                    doc = xmltodict.parse(content)
-
-                    status = machine_filter(doc)
+                        status = machine_filter(doc)
 
-                    if not status:
-                        continue
+                        if not status:
+                            continue
 
-                    status = paired_check(doc)
+                        status = paired_check(doc)
 
-                    if not status:
-                        continue
+                        if not status:
+                            continue
 
-                    # Create lookup file
-                    lookup_creation(doc)
+                        # Create lookup file
+                        pickle_file = elem.replace("xml", "pkl")
+                        if os.path.exists(pickle_file):
+                            os.remove(pickle_file)
 
-            # print(paired, single, error, ' '.join(machines))
-            # break
+                        if not os.path.exists(pickle_file):
+                            print("Creating pickle")
+                            lookup_creation(doc, pickle_file)
+                        else:
+                            pass
+                        pickle_list.add(pickle_file)
+    # Create EXCEL file
+    create_xlsx(pickle_list)
 
 
 if __name__ == '__main__':