From 7d7af0fdba138866c529709972acb5f3658838dd Mon Sep 17 00:00:00 2001
From: bart <bart.nijsse@wur.nl>
Date: Fri, 15 Oct 2021 12:22:28 +0200
Subject: [PATCH] move

---
 metagenomics/bin_assembly_stats.sh | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100755 metagenomics/bin_assembly_stats.sh

diff --git a/metagenomics/bin_assembly_stats.sh b/metagenomics/bin_assembly_stats.sh
new file mode 100755
index 0000000..f63f5b7
--- /dev/null
+++ b/metagenomics/bin_assembly_stats.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+# Loops through a folder of FASTA files and calculates #contigs, total length, N50, largest contig and GC%.
+# Also appends each bin and its contig names to binContigs.tsv in the current directory.
+#
+# The raw_n50 executable comes from the idba assembler
+# https://github.com/loneknightpy/idba
+
+: "${1:?usage: bin_assembly_stats.sh <fasta_dir>}"  # fail early instead of globbing "/*.fa*"
+printf 'bin\tcontigs\ttotal_length\tN50\tlargest\tGC%%\n'  # TSV header on stdout
+for filepath in "$1"/*.fa*; do  # glob directly; parsing `ls` breaks on spaces in paths
+  [[ -f "$filepath" ]] || continue  # an unmatched glob stays literal: skip it
+  bin=${filepath##*/}; bin=${bin%%.fa*}  # basename, cut at the first literal ".fa"
+  # Append "bin<TAB>contig" rows: first token of every header line, with '>' removed.
+  awk -v bin="$bin" '/^>/ { gsub(/>/, "", $1); print bin "\t" $1 }' "$filepath" >> binContigs.tsv
+  # Whitespace-split raw_n50 output; tokens 2, 11, 4, 6 (indices 1, 10, 3, 5) fill contigs..largest.
+  read -r -d '' -a n50 < <(/unlock/infrastructure/binaries/raw_n50 "$filepath")
+  # Count G/C in either case on sequence lines only (tr | wc instead of one sed line per base).
+  GC=$(grep -v '^>' "$filepath" | tr -cd 'GCgc' | wc -c)
+  # A missing or zero total length would make awk abort on division by zero; report NA instead.
+  GCcontent=$(awk -v gc="$GC" -v len="${n50[10]:-0}" 'BEGIN { if (len > 0) printf "%.2f", gc / len * 100; else printf "NA" }')
+  printf '%s\t%s\t%s\t%s\t%s\t%s\n' "$bin" "${n50[1]:-}" "${n50[10]:-}" "${n50[3]:-}" "${n50[5]:-}" "$GCcontent"
+done
-- 
GitLab