From f1323363c4b06a5630a26ccacf2926c33e892dec Mon Sep 17 00:00:00 2001
From: "Overduin, Sam" <sam.overduin@wur.nl>
Date: Thu, 5 Mar 2020 17:01:49 +0100
Subject: [PATCH] Upload cut_barcode_from_reads.py

---
 scripts/cut_barcode_from_reads.py | 84 +++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 scripts/cut_barcode_from_reads.py

diff --git a/scripts/cut_barcode_from_reads.py b/scripts/cut_barcode_from_reads.py
new file mode 100644
index 00000000..bc123f89
--- /dev/null
+++ b/scripts/cut_barcode_from_reads.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+import gzip
+import argparse
+import datetime
+import subprocess
+
+#./cut_barcode_from_reads.py -i ../data/barcoded/enriched.fastq.gz -o ../data/barcoded/enriched_trimmed.fastq.gz
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument('-i', '--infile', required=True)
+#parser.add_argument('-p', '--threads', type=int, required=False)
+parser.add_argument('-o', '--outfile', required=True)
+args = parser.parse_args()
+
+def write_output(output_file, output_lines):
+    #compress_file = False
+    if output_file.endswith('.gz'):
+        compress_file = True
+        output_file = output_file[:-3]
+    with open(output_file, 'w+') as f:
+        for line in output_lines:
+            f.write(line)
+    print('Done writing')
+    if compress_file:
+        print('Starting compression')
+        cmd = 'pigz -p16 "'+output_file+'"'
+        subprocess.call(cmd, shell=True)
+        print('Done compression')
+
+new_fasta = []
+print(datetime.datetime.now(), 'started reading fastq.gz')
+with gzip.open(args.infile, 'rt') as f:
+    i = 0
+    n = 0
+    corrected_instances = 0
+    records = 0
+    for line in f:
+        n += 1
+        if n == 1:
+            start_bar = line.find('BX:Z:')+5
+            if start_bar == 4:
+                new_fasta.append(line)
+                barcode = None
+            else:
+                new_fasta.append(line)
+                barcode = line[start_bar:start_bar+16]
+                #print(line, barcode)
+        elif n == 2:
+            if barcode:
+                bar_in_seq = line.find(barcode)
+                if bar_in_seq != -1:
+                    cut_pos = bar_in_seq + 16
+                    #print(line, line[cut_pos:], barcode)
+                    new_fasta.append(line[cut_pos:])
+                    corrected_instances += 1
+                else:
+                    cut_pos = 0
+                    new_fasta.append(line)
+            else:
+                cut_pos = 0
+                new_fasta.append(line)
+        elif n == 3:
+            new_fasta.append(line)
+        elif n == 4:
+            new_fasta.append(line[cut_pos:])
+            records += 1
+            if records % 100000000 == 0:
+                print(datetime.datetime.now(), records, 'records parsed')
+            n = 0
+        else:
+            print('this state should never be reached, check code')
+            print(line)
+            exit()
+        i += 1
+        #if i == 100000:
+            #break
+
+print(datetime.datetime.now(), 'Trimmed', corrected_instances, 'out of', records, 'records')
+print(datetime.datetime.now(), 'Writing file')
+write_output(args.outfile, new_fasta)
+print(datetime.datetime.now(), 'Done writing')
+            
-- 
GitLab