Skip to content
Snippets Groups Projects
Commit 13cbca63 authored by Noordijk, Ben's avatar Noordijk, Ben
Browse files

Added script that takes in fastq files and converts them to fasta files with a...

Added script that takes in fastq files and converts them to fasta files with a filename that corresponds to the original fast5 read
parent eddc8b67
No related branches found
No related tags found
1 merge request!3Added data preparation, hyperparameter optimisation, benchmarking code and k-mer library visualisation
......@@ -6,8 +6,7 @@
# CNN ARCHITECTURE
nn_class: Cnn_test
batch_size: 32
eps_per_kmer_switch: 50
max_sequence_length: 1000 # Only for example reads
eps_per_kmer_switch: 20
filter_width: 1000
kernel_size: 10
filters: 6
......
import itertools
import multiprocessing
import os
from pathlib import Path
import Bio.SeqIO
import pandas as pd
from Bio import SeqIO
# Never truncate strings:
pd.set_option('display.max_colwidth', None)
def get_barcode_read(barcode_id, sequencing_summary_path, sequencing_dir,
out_dir, worker_count):
"""Get list of all files in directory that belong to a certain barcode
def get_barcode_fast5_reads(barcode_id, sequencing_summary_path, sequencing_dir,
out_dir, worker_count):
"""Get list of all fast5 files that belong to a certain barcode
:param barcode_id: barcode to extract, example 'barcode12'
:type barcode_id: str
......@@ -14,38 +19,98 @@ def get_barcode_read(barcode_id, sequencing_summary_path, sequencing_dir,
:param sequencing_dir: path to directory containing fast5 reads
:param out_dir: where to save output filenames and what to call the file
:param worker_count: how many processes to run in parallel
:return:
:return: list of fast 5 filenames
"""
# Index all files
all_files = os.walk(sequencing_dir)
# Load all reads and their info from sequencing summary
df = pd.read_csv(sequencing_summary_path, sep='\t',
usecols=['filename', 'read_id', 'run_id', 'batch_id',
'barcode_arrangement'])
# Extract only reads that are of the target barcode
filtered_df = df[df['barcode_arrangement'] == barcode_id]
id_of_barcoded_reads = filtered_df['filename'].squeeze().to_list()
print('Finished indexing everything')
barcode_df = summary_df_one_barcode(barcode_id, sequencing_summary_path)
filename_of_barcoded_reads = barcode_df['filename'].squeeze().to_list()
print('Indexed everything')
# Prepare the arguments for the multiprocessing
args = [[file_location, id_of_barcoded_reads]
args = [[file_location, filename_of_barcoded_reads]
for file_location in all_files]
# Get all files in parallel
with multiprocessing.Pool(worker_count) as p:
files_of_read = p.starmap(find_reads_in_dir, args)
# Merge list of lists into one big list
files_of_read = itertools.chain.from_iterable(files_of_read)
print('Saving to file')
series = pd.Series(files_of_read)
series.to_csv(out_dir, index=False, header=False)
return files_of_read
def fastq_to_fasta(fastq_filepath, df, outdir):
"""Convert single fastq file to fasta file and write to file in outdir
:param fastq_filepath: string to single fastq file
:param df: dataframe of sequencing_summary.txt
:param outdir: directory to write fasta files to
:return:
"""
seqs = SeqIO.parse(fastq_filepath, 'fastq')
for seq in seqs:
filename = df.loc[df['read_id'] == seq.id]['filename']
filename = filename.to_string(header=False, index=False)
# Convert .fastq to .fasta here
filename = filename[:-1] + 'a'
print(f'Saving {filename}')
SeqIO.write(seq, os.path.join(outdir, filename), 'fasta')
def get_barcode_fastq_reads(barcode_id, sequencing_summary_path,
sequencing_dir, out_dir, worker_count):
"""Wrapper function to get all fastq reads and convert them to fasta files
with file names that correspond to the original fast5 files.
:param barcode_id: barcode to extract, example 'barcode12'
:type barcode_id: str
:param sequencing_summary_path: path to sequencing_summary.txt
:param sequencing_dir: path to directory containing fastq reads
:param out_dir: where to save output fasta files
:param worker_count: how many processes to run in parallel
:return:
"""
all_files = Path(sequencing_dir).iterdir()
barcode_df = summary_df_one_barcode(barcode_id, sequencing_summary_path)
# Prepare the arguments for the multiprocessing
args = [[file, barcode_df, out_dir] for file in all_files]
# Get all files in parallel
with multiprocessing.Pool(worker_count) as p:
p.starmap(fastq_to_fasta, args)
def summary_df_one_barcode(barcode_id, sequencing_summary_path):
"""Load sequencing summary and return dataframe with only
reads of one barcode
:param barcode_id: barcode to extract, example 'barcode12'
:type barcode_id: str
:param sequencing_summary_path: path to sequencing_summary.txt
:return: dataframe
"""
# Load all reads and their info from sequencing summary
df = pd.read_csv(sequencing_summary_path, sep='\t',
usecols=['filename', 'read_id', 'run_id', 'batch_id',
'barcode_arrangement'])
# Extract only reads that are of the target barcode
filtered_df = df[df['barcode_arrangement'] == barcode_id]
return filtered_df
def find_reads_in_dir(directory_tuple, id_of_barcoded_reads):
"""Given a certain directory tuple and list of valid files, return only the
files in the directory that are valid, i.e. contain the barcode
This is used for directories where not all reads belong to a barcode
:param directory_tuple: tuple of (dirpath, dirnames, filenames)
output by os.walk
:type directory_tuple: tuple
......@@ -63,15 +128,24 @@ def find_reads_in_dir(directory_tuple, id_of_barcoded_reads):
return files_of_read
# def main():
# # TODO do not hardcode these in
# barcode = 'barcode12'
# read_path = '/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/guppy_output/workspace/'
# # read_path = '/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/guppy_output/workspace/16Srhizosphere_reads/fast5/69'
# seq_summary_path = '/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/guppy_output/sequencing_summary.txt'
# out_dir = '/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/demultiplexed_reads/files_of_barcode.csv'
# worker_count = 96
# get_barcode_fast5_reads(barcode, seq_summary_path, read_path, out_dir, worker_count)
def main():
# TODO do not hardcode these in
"""This one is for testing the fastq extraction"""
barcode = 'barcode12'
read_path = '/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/guppy_output/workspace/'
seq_summary_path = '/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/guppy_output/sequencing_summary.txt'
out_dir = '/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/demultiplexed_reads/files_of_barcode.csv'
read_path = '/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/guppy_output/pass/barcode12'
out_dir = '/home/noord087/lustre_link/HoiCarlos/16Sreads_mockcommunity/demultiplexed_reads/fasta'
worker_count = 96
get_barcode_read(barcode, seq_summary_path, read_path, out_dir, worker_count)
get_barcode_fastq_reads(barcode, seq_summary_path, read_path, out_dir, worker_count)
if __name__ == '__main__':
main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment