Skip to content
Snippets Groups Projects
Commit 207a8e95 authored by Jasper Koehorst
Browse files

Merge branch 'master' of git.wur.nl:unlock/scripts

parents 0e02a472 74d668d8
No related branches found
No related tags found
No related merge requests found
......@@ -35,9 +35,10 @@ binContigs = set()
bins = 0
prev_bin = ""
for line in open(binContigsFile).readlines():
if line.split()[0] != prev_bin: bins+=1
binContigs.add(line.strip().split()[1])
prev_bin = line.split()[0]
if not "unbinned" in line:
if line.split()[0] != prev_bin: bins+=1
binContigs.add(line.strip().split()[1])
prev_bin = line.split()[0]
total_assembly_size = 0
mapped_reads = 0
......
#!/usr/bin/python3
import sys
import argparse
import os
import pandas as pd
def parse_options():
    """Parse the command-line arguments of bins_summary.py.

    The arguments are read from ``sys.argv`` by ``argparse``. The bin folder,
    the depths file and the output file are mandatory; the BUSCO folder and
    the CheckM report are optional and are ``None`` when not supplied.

    Returns:
        argparse.Namespace: with the attributes ``bin_folder``,
        ``bin_depths``, ``busco_folder``, ``checkm_report`` and
        ``output_file``.
    """
    # The usage line is hand-written, so it has to mirror the flags declared
    # below: optional inputs are shown in brackets.
    usage = (
        "\nbins_summary.py -b bins_folder -d bin_depths_file "
        "[-B busco_summaries_folder] [-c checkm_report] -o output_summary.tsv"
    )
    description = "Creates a summary of bins from different tools"
    parser = argparse.ArgumentParser(usage=usage, description=description)

    required_group = parser.add_argument_group('Required arguments')
    required_group.add_argument("-b", "--bin_folder", dest="bin_folder", help="Folder with bins in fasta format", required=True, metavar="")
    required_group.add_argument("-d", "--bindepths", dest="bin_depths", help="MetaBAT2 aggregateDepths file", required=True, metavar="")
    required_group.add_argument("-o", "--output_file", dest="output_file", help="Output name", required=True, metavar="")

    # Kept apart from the required flags so that --help does not present
    # them as mandatory.
    optional_group = parser.add_argument_group('Optional inputs')
    optional_group.add_argument("-B", "--busco", dest="busco_folder", help="Folder with BUSCO reports", required=False, metavar="")
    optional_group.add_argument("-c", "--checkm", dest="checkm_report", help="Checkm report file", required=False, metavar="")

    return parser.parse_args()
def bin_stats(bin_folder):
    """Compute assembly statistics for every bin and store them in ``df``.

    Every entry of ``bin_folder`` is read as a FASTA file. For each one the
    number of contigs, the total sequence length, the longest contig, the N50
    and the GC percentage (one decimal) are written to the row of the
    module-level ``df`` that is labelled with the file name.

    Args:
        bin_folder: path of the folder that holds the bin FASTA files.
    """
    stat_columns = ['Contigs', 'Size', 'Largest_contig', 'N50', 'GC']

    for bin_name in os.listdir(bin_folder):
        bin_file_path = os.path.join(bin_folder, bin_name)

        contig_sizes = []
        contig_len = None  # None until the first record has been opened
        gc_count = 0
        with open(bin_file_path, "r") as fasta:
            for line in fasta:
                if line.startswith(">"):
                    # A new header closes the previous record (if any).
                    if contig_len is not None:
                        contig_sizes.append(contig_len)
                    contig_len = 0
                else:
                    sequence = line.strip()
                    # Count soft-masked (lower-case) bases as well.
                    upper_sequence = sequence.upper()
                    gc_count += upper_sequence.count("G") + upper_sequence.count("C")
                    # Sequence found before any header still counts as a contig.
                    contig_len = (contig_len or 0) + len(sequence)
        # The last record is not followed by a header, so close it here.
        if contig_len is not None:
            contig_sizes.append(contig_len)

        total_size = sum(contig_sizes)
        contigs = len(contig_sizes)
        largest_contig = max(contig_sizes, default=0)
        # Avoid a ZeroDivisionError on files without any sequence.
        gc_percent = round(gc_count / total_size * 100, 1) if total_size else 0.0

        # N50: length of the contig at which the cumulative length, going
        # from longest to shortest, reaches half of the total size.
        n50 = 0
        cumulative = 0
        half_size = total_size * 0.5
        for contig in sorted(contig_sizes, reverse=True):
            cumulative += contig
            if cumulative >= half_size:
                n50 = contig
                break

        df.loc[bin_name, stat_columns] = [contigs, total_size, largest_contig, n50, gc_percent]
def bin_depths(bin_depths):
    """Copy the per-bin depth value from a depths table into ``df``.

    The first line of the file is treated as a header and skipped. For every
    other non-blank line the first whitespace-separated field is reduced to
    its last ``/``-separated component and used as the row label, and the
    third field is stored unchanged (as text) in the ``avgDepth`` column of
    the module-level ``df``.

    Args:
        bin_depths: path of the depths file (the command-line help describes
            it as a MetaBAT2 aggregateDepths file).
    """
    with open(bin_depths, "r") as depth_file:
        next(depth_file, None)  # skip the header line
        for line in depth_file:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no data.
                continue
            bin_name = fields[0].split("/")[-1]
            df.at[bin_name, 'avgDepth'] = fields[2]
def read_checkm_report(checkm_report):
    """Copy three percentage columns of a CheckM report into ``df``.

    The report is parsed as a fixed-width table: the first three lines are
    skipped, the column boundaries are inferred from the first 16 remaining
    lines, and the last parsed row is discarded. For each row, column 0 plus
    the suffix ``.fa`` is the row label in the module-level ``df``; columns
    11, 12 and 13 get a ``%`` appended and are stored as
    ``CheckM_Completeness``, ``CheckM_Contamination`` and
    ``CheckM_Strain-heterogeneity``.

    Args:
        checkm_report: path of the CheckM report file.
    """
    target_columns = ['CheckM_Completeness', 'CheckM_Contamination', 'CheckM_Strain-heterogeneity']
    source_columns = (11, 12, 13)

    report = pd.read_fwf(checkm_report, colspecs="infer", skiprows=3, header=None, infer_nrows=16)
    # The final row of the table is a line of dashes, not a bin.
    report = report.iloc[:-1]

    for _, record in report.iterrows():
        bin_label = record[0] + ".fa"
        percentages = [record[column] + "%" for column in source_columns]
        df.loc[bin_label, target_columns] = percentages
def read_busco(busco_folder):
    """Copy the BUSCO taxonomy and score line of every bin into ``df``.

    Only files whose name contains ``specific`` are read. The file name is
    split on ``.``: the third component is stored as ``BUSCO_Taxonomy`` and
    the components from the fourth up to (not including) the last one,
    re-joined with ``.``, form the row label in the module-level ``df``.
    The last line of the file that starts with ``C`` (after stripping) is
    stored as ``BUSCO_score``; when a file has no such line the score is
    left empty.

    Args:
        busco_folder: path of the folder that holds the BUSCO summary files.
    """
    for busco_file in os.listdir(busco_folder):
        if "specific" not in busco_file:
            continue

        name_parts = busco_file.split(".")
        bin_name = ".".join(name_parts[3:-1])
        taxonomy = name_parts[2]

        # Reset per file so that a summary without a score line can neither
        # inherit the previous bin's score nor hit an undefined name.
        busco_score = None
        with open(os.path.join(busco_folder, busco_file)) as summary:
            for line in summary:
                stripped = line.strip()
                if stripped.startswith("C"):
                    busco_score = stripped

        df.loc[bin_name, ['BUSCO_Taxonomy', 'BUSCO_score']] = [taxonomy, busco_score]
def main(argv):
    """Build the bin summary table and write it as a tab-separated file.

    Creates the module-level ``df`` with one row per entry of the bin folder
    and one column per reported metric, fills it from the bin FASTA files and
    the depths file, adds the CheckM and BUSCO columns when those inputs were
    given, and writes the table to the requested output file.

    Args:
        argv: command-line arguments without the program name. Unused here:
            ``parse_options`` reads ``sys.argv`` itself.
    """
    global df

    inputs = parse_options()
    out_columns = ['Contigs','Size','Largest_contig','N50','GC','avgDepth','BUSCO_Taxonomy','BUSCO_score','CheckM_Completeness','CheckM_Contamination','CheckM_Strain-heterogeneity']
    bin_file_list = os.listdir(inputs.bin_folder)
    df = pd.DataFrame(columns=out_columns, index=bin_file_list)

    bin_stats(inputs.bin_folder)
    bin_depths(inputs.bin_depths)

    # --checkm and --busco are optional on the command line; their columns
    # simply stay empty when the corresponding input is not provided.
    if inputs.checkm_report:
        read_checkm_report(inputs.checkm_report)
    if inputs.busco_folder:
        read_busco(inputs.busco_folder)

    df.index.name = "Bin"
    df.to_csv(inputs.output_file, sep="\t")


if __name__ == "__main__":
    main(sys.argv[1:])
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment