From f1323363c4b06a5630a26ccacf2926c33e892dec Mon Sep 17 00:00:00 2001 From: "Overduin, Sam" <sam.overduin@wur.nl> Date: Thu, 5 Mar 2020 17:01:49 +0100 Subject: [PATCH] Upload cut_barcode_from_reads.py --- scripts/cut_barcode_from_reads.py | 84 +++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 scripts/cut_barcode_from_reads.py diff --git a/scripts/cut_barcode_from_reads.py b/scripts/cut_barcode_from_reads.py new file mode 100644 index 00000000..bc123f89 --- /dev/null +++ b/scripts/cut_barcode_from_reads.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 +import gzip +import argparse +import datetime +import subprocess + +#./cut_barcode_from_reads.py -i ../data/barcoded/enriched.fastq.gz -o ../data/barcoded/enriched_trimmed.fastq.gz + + +parser = argparse.ArgumentParser() + +parser.add_argument('-i', '--infile', required=True) +#parser.add_argument('-p', '--threads', type=int, required=False) +parser.add_argument('-o', '--outfile', required=True) +args = parser.parse_args() + +def write_output(output_file, output_lines): + #compress_file = False + if output_file.endswith('.gz'): + compress_file = True + output_file = output_file[:-3] + with open(output_file, 'w+') as f: + for line in output_lines: + f.write(line) + print('Done writing') + if compress_file: + print('Starting compression') + cmd = 'pigz -p16 "'+output_file+'"' + subprocess.call(cmd, shell=True) + print('Done compression') + +new_fasta = [] +print(datetime.datetime.now(), 'started reading fastq.gz') +with gzip.open(args.infile, 'rt') as f: + i = 0 + n = 0 + corrected_instances = 0 + records = 0 + for line in f: + n += 1 + if n == 1: + start_bar = line.find('BX:Z:')+5 + if start_bar == 4: + new_fasta.append(line) + barcode = None + else: + new_fasta.append(line) + barcode = line[start_bar:start_bar+16] + #print(line, barcode) + elif n == 2: + if barcode: + bar_in_seq = line.find(barcode) + if bar_in_seq != -1: + cut_pos = bar_in_seq + 16 + #print(line, line[cut_pos:], barcode) + new_fasta.append(line[cut_pos:]) + corrected_instances += 1 + else: + cut_pos = 0 + new_fasta.append(line) + else: + cut_pos = 0 + new_fasta.append(line) + elif n == 3: + new_fasta.append(line) + elif n == 4: + new_fasta.append(line[cut_pos:]) + records += 1 + if records % 100000000 == 0: + print(datetime.datetime.now(), records, 'records parsed') + n = 0 + else: + print('this state should never be reached, check code') + print(line) + exit() + i += 1 + #if i == 100000: + #break + +print(datetime.datetime.now(), 'Trimmed', corrected_instances, 'out of', records, 'records') +print(datetime.datetime.now(), 'Writing file') +write_output(args.outfile, new_fasta) +print(datetime.datetime.now(), 'Done writing') + -- GitLab