Skip to content
Snippets Groups Projects
Commit c16434f0 authored by Bart's avatar Bart
Browse files

bin assembly stats script

parent 9f055c2d
Branches
No related tags found
No related merge requests found
#!/bin/bash
# Prints a tab-separated table of assembly statistics for every FASTA file
# (name matching *.fa*, optionally gzipped with a .gz suffix) directly inside
# a folder: #contigs, total length, N50, largest contig and GC%.
#
# Usage: <this script> <folder>
#
# The raw_n50 executable comes from the idba_ud assembler. Its location
# defaults to /unlock/infrastructure/binaries/raw_n50 and can be overridden
# with the RAW_N50 environment variable.
# NOTE(review): raw_n50 receives the file path as-is; whether it can read
# gzipped input on its own is not visible from this script -- confirm.
#
# Exit status: 0 on success, 1 if at least one file had to be skipped,
# 2 if no folder was given.

set -uo pipefail

# Writes a diagnostic message to stderr.
warn() {
  printf '%s\n' "$*" >&2
}

# Writes the content of FASTA file $1 to stdout, decompressing *.gz files.
fasta_stream() {
  case "$1" in
    *.gz) gzip -cd -- "$1" ;;
    *) cat -- "$1" ;;
  esac
}

# Prints the number of G/C bases in the sequence lines of FASTA file $1.
# Header lines (starting with '>') are ignored; lowercase (soft-masked)
# bases are counted as well.
count_gc() {
  fasta_stream "$1" \
    | awk '!/^>/ { n += gsub(/[GCgc]/, "") } END { print n + 0 }'
}

# Prints GC count $1 as a percentage of total length $2 with two decimals,
# or NA when the total length is not a positive number (avoids dividing by 0).
gc_percent() {
  awk -v gc="$1" -v total="$2" 'BEGIN {
    if (total + 0 > 0) printf "%.2f", gc / total * 100
    else printf "NA"
  }'
}

# Prints the header plus one statistics row per FASTA file found in folder $1.
main() {
  local dir=${1:-}
  # Resolved at call time so the environment override is always honoured.
  local raw_n50_bin=${RAW_N50:-/unlock/infrastructure/binaries/raw_n50}
  local path name bin raw gc status=0
  local -a fields

  if [[ -z "$dir" ]]; then
    warn "Usage: ${0##*/} <folder containing .fa/.fasta(.gz) files>"
    return 2
  fi

  printf 'bin\tcontigs\ttotal_length\tN50\tlargest\tGC%%\n'

  # Glob directly instead of parsing ls, so odd file names survive intact.
  for path in "$dir"/*.fa*; do
    [[ -f "$path" ]] || continue # also covers the "no match" literal pattern

    # Bin name = file name up to the first literal ".fa".
    name=${path##*/}
    bin=${name%%.fa*}

    if ! raw=$("$raw_n50_bin" "$path"); then
      warn "raw_n50 failed for ${path}; skipping"
      status=1
      continue
    fi

    # Split the raw_n50 report into whitespace-separated words (across lines).
    # The fields used below are word 2 (contigs), 4 (N50), 6 (largest) and
    # 11 (total length), i.e. zero-based indices 1, 3, 5 and 10.
    fields=()
    read -r -d '' -a fields <<<"$raw" || true # read hits EOF: expected
    if ((${#fields[@]} < 11)); then
      warn "unexpected raw_n50 output for ${path}; skipping"
      status=1
      continue
    fi

    if ! gc=$(count_gc "$path"); then
      warn "could not read sequences from ${path}; skipping"
      status=1
      continue
    fi

    printf '%s\t%s\t%s\t%s\t%s\t%s\n' \
      "$bin" "${fields[1]}" "${fields[10]}" "${fields[3]}" "${fields[5]}" \
      "$(gc_percent "$gc" "${fields[10]}")"
  done

  return "$status"
}

main "$@"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment