Added script that takes in fastq files and converts them to fasta files with a...

Added script that takes in fastq files and converts them to fasta files with a filename that corresponds to the original fast5 read

Added script that takes in fastq files and converts them to fasta files with a...
13cbca63 · Noordijk, Ben · eddc8b67 · 13cbca63 · 13cbca63
Commit 13cbca63 authored 3 years ago by Noordijk, Ben
--- a/CnnParameterFile.yaml
+++ b/CnnParameterFile.yaml
@@ -6,8 +6,7 @@
 # CNN ARCHITECTURE
 nn_class: Cnn_test
 batch_size: 32
-eps_per_kmer_switch: 50
-max_sequence_length: 1000  # Only for example reads
+eps_per_kmer_switch: 20
 filter_width: 1000
 kernel_size: 10
 filters: 6

--- a/tools/get_one_barcode.py
+++ b/tools/get_one_barcode.py
 import itertools
 import multiprocessing
 import os
+from pathlib import Path
+import Bio.SeqIO
 import pandas as pd
+from Bio import SeqIO

+# Never truncate strings:
+pd.set_option('display.max_colwidth', None)

-def get_barcode_read(barcode_id, sequencing_summary_path, sequencing_dir,
-                     out_dir, worker_count):
-    """Get list of all files in directory that belong to a certain barcode
+def get_barcode_fast5_reads(barcode_id, sequencing_summary_path, sequencing_dir,
+                            out_dir, worker_count):
+    """Get list of all fast5 files that belong to a certain barcode

    :param barcode_id: barcode to extract, example 'barcode12'
    :type barcode_id: str
@@ -14,38 +19,98 @@ def get_barcode_read(barcode_id, sequencing_summary_path, sequencing_dir,
    :param sequencing_dir: path to directory containing fast5 reads
    :param out_dir: where to save output filenames and what to call the file
    :param worker_count: how many processes to run in parallel
-    :return:
+    :return: list of fast 5 filenames
    """
    # Index all files
    all_files = os.walk(sequencing_dir)

-    # Load all reads and their info from sequencing summary
-    df = pd.read_csv(sequencing_summary_path, sep='\t',
-                     usecols=['filename', 'read_id', 'run_id', 'batch_id',
-                              'barcode_arrangement'])
-    # Extract only reads that are of the target barcode
-    filtered_df = df[df['barcode_arrangement'] == barcode_id]
-    id_of_barcoded_reads = filtered_df['filename'].squeeze().to_list()
-    print('Finished indexing everything')
+    barcode_df = summary_df_one_barcode(barcode_id, sequencing_summary_path)
+    filename_of_barcoded_reads = barcode_df['filename'].squeeze().to_list()
+    print('Indexed everything')

    # Prepare the arguments for the multiprocessing
-    args = [[file_location, id_of_barcoded_reads]
+    args = [[file_location, filename_of_barcoded_reads]
            for file_location in all_files]
    # Get all files in parallel
    with multiprocessing.Pool(worker_count) as p:
        files_of_read = p.starmap(find_reads_in_dir, args)
    # Merge list of lists into one big list
    files_of_read = itertools.chain.from_iterable(files_of_read)
+
+    print('Saving to file')
    series = pd.Series(files_of_read)
    series.to_csv(out_dir, index=False, header=False)

    return files_of_read


+def fastq_to_fasta(fastq_filepath, df, outdir):
+    """Convert single fastq file to fasta file and write to file in outdir
+
+    :param fastq_filepath: string to single fastq file
+    :param df: dataframe of sequencing_summary.txt
+    :param outdir: directory to write fasta files to
+
+    :return:
+    """
+    seqs = SeqIO.parse(fastq_filepath, 'fastq')
+    for seq in seqs:
+        filename = df.loc[df['read_id'] == seq.id]['filename']
+        filename = filename.to_string(header=False, index=False)
+        # Convert .fastq to .fasta here
+        filename = filename[:-1] + 'a'
+        print(f'Saving {filename}')
+        SeqIO.write(seq, os.path.join(outdir, filename), 'fasta')
+
+
+def get_barcode_fastq_reads(barcode_id, sequencing_summary_path,
+                            sequencing_dir, out_dir, worker_count):
+    """Wrapper function to get all fastq reads and convert them to fasta files
+    with file names that correspond to the original fast5 files.
+
+    :param barcode_id: barcode to extract, example 'barcode12'
+    :type barcode_id: str
+    :param sequencing_summary_path: path to sequencing_summary.txt
+    :param sequencing_dir: path to directory containing fastq reads
+    :param out_dir: where to save output fasta files
+    :param worker_count: how many processes to run in parallel
+    :return:
+    """
+    all_files = Path(sequencing_dir).iterdir()
+
+    barcode_df = summary_df_one_barcode(barcode_id, sequencing_summary_path)
+
+    # Prepare the arguments for the multiprocessing
+    args = [[file, barcode_df, out_dir] for file in all_files]
+    # Get all files in parallel
+    with multiprocessing.Pool(worker_count) as p:
+        p.starmap(fastq_to_fasta, args)
+
+
+def summary_df_one_barcode(barcode_id, sequencing_summary_path):
+    """Load sequencing summary and return dataframe with only
+    reads of one barcode
+
+    :param barcode_id: barcode to extract, example 'barcode12'
+    :type barcode_id: str
+    :param sequencing_summary_path: path to sequencing_summary.txt
+    :return: dataframe
+    """
+    # Load all reads and their info from sequencing summary
+    df = pd.read_csv(sequencing_summary_path, sep='\t',
+                     usecols=['filename', 'read_id', 'run_id', 'batch_id',
+                              'barcode_arrangement'])
+    # Extract only reads that are of the target barcode
+    filtered_df = df[df['barcode_arrangement'] == barcode_id]
+    return filtered_df
+
+
 def find_reads_in_dir(directory_tuple, id_of_barcoded_reads):
    """Given a certain directory tuple and list of valid files, return only the
    files in the directory that are valid, i.e. contain the barcode

+    This is used for directories where not all reads belong to a barcode
+
    :param directory_tuple: tuple of (dirpath, dirnames, filenames)
                            output by os.walk
    :type directory_tuple: tuple
@@ -63,15 +128,24 @@ def find_reads_in_dir(directory_tuple, id_of_barcoded_reads):
    return files_of_read


+# def main():
+#     # TODO do not hardcode these in
+#     barcode = 'barcode12'
+#     read_path = '/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/guppy_output/workspace/'
+#     # read_path = '/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/guppy_output/workspace/16Srhizosphere_reads/fast5/69'
+#     seq_summary_path = '/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/guppy_output/sequencing_summary.txt'
+#     out_dir = '/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/demultiplexed_reads/files_of_barcode.csv'
+#     worker_count = 96
+#     get_barcode_fast5_reads(barcode, seq_summary_path, read_path, out_dir, worker_count)
+
 def main():
-    # TODO do not hardcode these in
+    """This one is for testing the fastq extraction"""
    barcode = 'barcode12'
-    read_path = '/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/guppy_output/workspace/'
    seq_summary_path = '/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/guppy_output/sequencing_summary.txt'
-    out_dir = '/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/demultiplexed_reads/files_of_barcode.csv'
+    read_path = '/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/guppy_output/pass/barcode12'
+    out_dir = '/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/demultiplexed_reads/fasta'
    worker_count = 96
-    get_barcode_read(barcode, seq_summary_path, read_path, out_dir, worker_count)
-
+    get_barcode_fastq_reads(barcode, seq_summary_path, read_path, out_dir, worker_count)

 if __name__ == '__main__':
    main()