#!/bin/bash
#
# bin_assembly_stats.sh (repository path: metagenomics/bin_assembly_stats.sh)
#
# Usage: bin_assembly_stats.sh <folder_with_fasta_bins>
#
# Loops through a folder of fasta files (<folder>/*.fa*) and prints one
# tab-separated row per file to stdout with: bin name, #contigs, total length,
# N50, largest contig and GC%.
# Also appends "<bin>\t<contig name>" rows to binContigs.tsv in the current
# working directory. The file is appended to, never truncated, so re-running
# in the same directory accumulates rows.
#
# The raw_n50 executable comes from the idba assembler
# https://github.com/loneknightpy/idba
# Its location defaults to /unlock/infrastructure/binaries/raw_n50 and can be
# overridden with the RAW_N50 environment variable.

# No `set -e` on purpose: failures are handled per file so that one bad bin
# does not abort the whole table.
set -uo pipefail

readonly DEFAULT_RAW_N50='/unlock/infrastructure/binaries/raw_n50'
readonly CONTIG_MAP_FILE='binContigs.tsv'

# Writes its arguments as one diagnostic line to stderr.
warn() {
  printf '%s\n' "$*" >&2
}

#######################################
# Prints the bin name for a fasta path: the file name with everything from
# the first literal ".fa" onwards removed (bin1.fa, bin1.fasta -> bin1).
# Arguments: $1 - path to a fasta file
# Outputs:   bin name on stdout
#######################################
bin_name() {
  local file_name=${1##*/}
  # %% strips the longest suffix, i.e. from the FIRST ".fa". The dot is
  # literal here, unlike in the regex '.fa.*' where it matches any character.
  printf '%s\n' "${file_name%%.fa*}"
}

#######################################
# Appends one "<bin>\t<contig>" row per fasta header line to the contig map.
# The contig name is the first whitespace-delimited word of the header with
# the leading '>' removed.
# Globals:   CONTIG_MAP_FILE (read)
# Arguments: $1 - bin name, $2 - path to the fasta file
#######################################
append_contig_names() {
  local bin=$1
  local fasta=$2
  # Input is redirected rather than passed as an operand so that a path
  # containing '=' is never mistaken by awk for a variable assignment.
  awk -v bin="$bin" '/^>/ { print bin "\t" substr($1, 2) }' \
    <"$fasta" >>"$CONTIG_MAP_FILE"
}

#######################################
# Prints the number of G and C bases in the sequence lines of a fasta file.
# Both upper and lower case are counted so soft-masked sequence is included.
# Arguments: $1 - path to the fasta file
# Outputs:   base count on stdout (0 for a file without sequence)
#######################################
count_gc() {
  awk '!/^>/ { n += gsub(/[GCgc]/, "") } END { print n + 0 }' <"$1"
}

#######################################
# Prints the statistics table for all fasta files in a folder.
# Globals:   RAW_N50 (read, optional), DEFAULT_RAW_N50 (read)
# Arguments: $1 - folder containing the fasta files
# Outputs:   header plus one row per file on stdout; diagnostics on stderr
# Returns:   0 on success, 1 if the folder or raw_n50 is missing or any file
#            was skipped, 2 on usage error
#######################################
main() {
  if (( $# < 1 )) || [[ -z $1 ]]; then
    warn "Usage: ${0##*/} <folder_with_fasta_bins>"
    return 2
  fi
  if [[ ! -d $1 ]]; then
    warn "error: not a directory: $1"
    return 1
  fi

  local bins_dir=${1%/}
  local raw_n50_bin=${RAW_N50:-$DEFAULT_RAW_N50}
  if ! command -v "$raw_n50_bin" >/dev/null 2>&1; then
    warn "error: raw_n50 not found: $raw_n50_bin (set RAW_N50 to override)"
    return 1
  fi

  printf 'bin\tcontigs\ttotal_length\tN50\tlargest\tGC%%\n'

  local fasta bin raw_stats contigs n50 largest total_length
  local gc_bases gc_percent
  local status=0 seen=0
  local -a fields

  # Glob directly instead of parsing `ls`; an unmatched pattern stays literal
  # and is rejected by the -f test.
  for fasta in "$bins_dir"/*.fa*; do
    [[ -f $fasta ]] || continue
    seen=1
    bin=$(bin_name "$fasta")

    if ! append_contig_names "$bin" "$fasta"; then
      warn "warning: could not record contig names of $fasta"
      status=1
    fi

    # The exit status of raw_n50 is not relied upon; its output is validated
    # instead.
    raw_stats=$("$raw_n50_bin" "$fasta")

    # Split the whole output on any whitespace. `read -d ''` returns non-zero
    # at end of input even though the array is filled, hence `|| true`.
    fields=()
    read -r -d '' -a fields <<<"$raw_stats" || true

    # Word positions 2, 4, 6 and 11 (1-based) of the raw_n50 output.
    # NOTE(review): presumably the output has the form
    # "contigs: N n50: N max: N mean: N total length: N ..." - confirm
    # against the installed raw_n50.
    contigs=${fields[1]:-}
    n50=${fields[3]:-}
    largest=${fields[5]:-}
    total_length=${fields[10]:-}

    # Guards the division below against empty or zero lengths.
    if [[ ! $total_length =~ ^[1-9][0-9]*$ ]]; then
      warn "warning: no usable raw_n50 output for $fasta; skipping"
      status=1
      continue
    fi

    gc_bases=$(count_gc "$fasta")
    gc_percent=$(awk -v gc="$gc_bases" -v size="$total_length" \
      'BEGIN { printf "%.2f", gc / size * 100 }')

    printf '%s\t%s\t%s\t%s\t%s\t%s\n' \
      "$bin" "$contigs" "$total_length" "$n50" "$largest" "$gc_percent"
  done

  if (( ! seen )); then
    warn "warning: no files matching $bins_dir/*.fa*"
  fi
  return "$status"
}

main "$@"