diff --git a/ArrowerSVG.py b/ArrowerSVG.py
index d462bccf79807158ea9171a8384fceea9ca6e6b6..d36710fb984ee2bff457ee53a3d25adae1389fc1 100644
--- a/ArrowerSVG.py
+++ b/ArrowerSVG.py
@@ -8,6 +8,12 @@
 # heavily modified by Jorge Navarro 2016
 #
 ######################################################################
+# Makes sure the script can be used with Python 2 as well as Python 3.
+from __future__ import print_function, division
+from sys import version_info
+if version_info[0]==2:
+    range = xrange
+
 import os
 import sys
 import argparse
@@ -169,15 +175,10 @@ def draw_arrow(additional_tabs, X, Y, L, l, H, h, strand, color, color_contour,
     else:
         color_contour = [50, 50, 50]
 
-    arrow += additional_tabs + "\t\t<polygon "
-    arrow += "class=\"" + gid + "\" "
-    arrow += "points=\"" + " ".join(points_coords) + "\" "
-    arrow += "fill=\"rgb(" + ",".join(map(str,color)) +")\" "
-    arrow += "fill-opacity=\"1.0\" "
-    arrow += "stroke=\"rgb(" + ",".join(map(str,color_contour)) + ")\" "
-    arrow += "stroke-width=\"" + str(gene_contour_thickness) + "\" "
-    arrow += category + " />\n"
-
+    arrow += "{}\t\t<polygon class=\"{}\" ".format(additional_tabs, gid)
+    arrow += "points=\"{}\" fill=\"rgb({})\" ".format(" ".join(points_coords), ",".join([str(val) for val in color]))
+    arrow += "fill-opacity=\"1.0\" stroke=\"rgb({})\" ".format(",".join([str(val) for val in color_contour]))
+    arrow += "stroke-width=\"{}\" {} />\n".format(str(gene_contour_thickness), category)
 
     # paint domains. Domains on the tip of the arrow should not have corners sticking
     # out of them
@@ -193,8 +194,7 @@
         dccolour = domain[6]
 
         arrow += additional_tabs + "\t\t<g>\n"
-        arrow += additional_tabs + "\t\t\t<title>" + dname + " (" + dacc + ")\n\"" + ddesc + "\"</title>\n"
-
+        arrow += "{}\t\t\t<title>{} ({})\n\"{}\"</title>\n".format(additional_tabs, dname, dacc, ddesc)
 
         if strand == "+":
             # calculate how far from head_start we (the horizontal guide at y=Y+internal_domain_margin)
@@ -211,16 +211,11 @@
             x_margin_offset = internal_domain_margin/sin(pi - atan2(h+H/2.0,-head_length))
 
             if (dX + dL) < head_start + collision_x - x_margin_offset:
-                arrow += additional_tabs + "\t\t\t<rect class=\"" + dacc + "\" "
-                arrow += "x=\"" + str(X+dX) + "\" "
-                arrow += "y=\"" + str(Y + internal_domain_margin) + "\" "
-                arrow += "stroke-linejoin=\"round\" "
-                arrow += "width=\"" + str(dL) + "\" "
-                arrow += "height=\"" + str(dH) + "\" "
-                arrow += "fill=\"rgb(" + ",".join(map(str,dcolor)) + ")\" "
-                arrow += "stroke=\"rgb(" + ",".join(map(str,dccolour)) + ")\" "
-                arrow += "stroke-width=\"" + str(domain_contour_thickness) + "\" "
-                arrow += "opacity=\"0.75\" />\n"
+                arrow += "{}\t\t\t<rect class=\"{}\" x=\"{}\" ".format(additional_tabs, dacc, str(X+dX))
+                arrow += "y=\"{}\" stroke-linejoin=\"round\" ".format(str(Y + internal_domain_margin))
+                arrow += "width=\"{}\" height=\"{}\" ".format(str(dL), str(dH))
+                arrow += "fill=\"rgb({})\" stroke=\"rgb({})\" ".format(",".join([str(val) for val in dcolor]), ",".join([str(val) for val in dccolour]))
+                arrow += "stroke-width=\"{}\" opacity=\"0.75\" />\n".format(str(domain_contour_thickness))
             else:
                 del points[:]
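A note on the compatibility header this diff adds to each module: aliasing `range` to `xrange` on Python 2 makes iteration lazy under both interpreters. A standalone sketch of the same guard (illustration only, not project code), runnable on either version:

```python
from sys import version_info

if version_info[0] == 2:
    range = xrange  # noqa: F821  (xrange only exists on Python 2)

# range() is now lazy under both interpreters; no million-element list is built.
assert sum(range(10**6)) == 499999500000
```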
@@ -263,15 +258,12 @@
                 for point in points:
                     points_coords.append(str(int(point[0])) + "," + str(int(point[1])))
 
-                arrow += additional_tabs + "\t\t\t<polygon class=\"" + dacc + "\" "
-                arrow += "points=\"" + " ".join(points_coords) + "\" "
-                arrow += "stroke-linejoin=\"round\" "
-                arrow += "width=\"" + str(dL) + "\" "
-                arrow += "height=\"" + str(dH) + "\" "
-                arrow += "fill=\"rgb(" + ",".join(map(str,dcolor)) + ")\" "
-                arrow += "stroke=\"rgb(" + ",".join(map(str,dccolour)) + ")\" "
-                arrow += "stroke-width=\"" + str(domain_contour_thickness) + "\" "
-                arrow += "opacity=\"0.75\" />\n"
+                arrow += "{}\t\t\t<polygon class=\"{}\" ".format(additional_tabs, dacc)
+                arrow += "points=\"{}\" stroke-linejoin=\"round\" ".format(" ".join(points_coords))
+                arrow += "width=\"{}\" height=\"{}\" ".format(str(dL), str(dH))
+                arrow += "fill=\"rgb({})\" ".format(",".join([str(val) for val in dcolor]))
+                arrow += "stroke=\"rgb({})\" ".format(",".join([str(val) for val in dccolour]))
+                arrow += "stroke-width=\"{}\" opacity=\"0.75\" />\n".format(str(domain_contour_thickness))
 
         # now check other direction
         else:
@@ -286,16 +278,12 @@
             # nice, blocky domains
             if dX > collision_x + x_margin_offset:
-                arrow += additional_tabs + "\t\t\t<rect class=\"" + dacc + "\" "
-                arrow += "x=\"" + str(X+dX) + "\" "
-                arrow += "y=\"" + str(Y + internal_domain_margin) + "\" "
-                arrow += "stroke-linejoin=\"round\" "
-                arrow += "width=\"" + str(dL) + "\" "
-                arrow += "height=\"" + str(dH) + "\" "
-                arrow += "fill=\"rgb(" + ",".join(map(str,dcolor)) + ")\" "
-                arrow += "stroke=\"rgb(" + ",".join(map(str,dccolour)) + ")\" "
-                arrow += "stroke-width=\"" + str(domain_contour_thickness) + "\" "
-                arrow += "opacity=\"0.75\" />\n"
+                arrow += "{}\t\t\t<rect class=\"{}\" ".format(additional_tabs, dacc)
+                arrow += "x=\"{}\" y=\"{}\" ".format(str(X+dX), str(Y + internal_domain_margin))
+                arrow += "stroke-linejoin=\"round\" width=\"{}\" height=\"{}\" ".format(str(dL), str(dH))
+                arrow += "fill=\"rgb({})\" ".format(",".join([str(val) for val in dcolor]))
+                arrow += "stroke=\"rgb({})\" ".format(",".join([str(val) for val in dccolour]))
+                arrow += "stroke-width=\"{}\" opacity=\"0.75\" />\n".format(str(domain_contour_thickness))
             else:
                 del points[:]
@@ -329,15 +317,12 @@
                 for point in points:
                     points_coords.append(str(int(point[0])) + "," + str(int(point[1])))
 
-                arrow += additional_tabs + "\t\t\t<polygon class=\"" + dacc + "\" "
-                arrow += "points=\"" + " ".join(points_coords) + "\" "
-                arrow += "stroke-linejoin=\"round\" "
-                arrow += "width=\"" + str(dL) + "\" "
-                arrow += "height=\"" + str(dH) + "\" "
-                arrow += "fill=\"rgb(" + ",".join(map(str,dcolor)) + ")\" "
-                arrow += "stroke=\"rgb(" + ",".join(map(str,dccolour)) + ")\" "
-                arrow += "stroke-width=\"" + str(domain_contour_thickness) + "\" "
-                arrow += "opacity=\"0.75\" />\n"
+                arrow += "{}\t\t\t<polygon class=\"{}\" ".format(additional_tabs, dacc)
+                arrow += "points=\"{}\" stroke-linejoin=\"round\" ".format(" ".join(points_coords))
+                arrow += "width=\"{}\" height=\"{}\" ".format(str(dL), str(dH))
+                arrow += "fill=\"rgb({})\" ".format(",".join([str(val) for val in dcolor]))
+                arrow += "stroke=\"rgb({})\" ".format(",".join([str(val) for val in dccolour]))
+                arrow += "stroke-width=\"{}\" opacity=\"0.75\" />\n".format(str(domain_contour_thickness))
 
         arrow += additional_tabs + "\t\t</g>\n"
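The rewrites above are behavior-preserving: `str.format` renders the same SVG attribute text the old concatenation did. A minimal sketch with hypothetical values standing in for the module's globals:

```python
# Hypothetical values standing in for ArrowerSVG.py's module-level settings.
gene_contour_thickness = 2
color = [255, 0, 0]

old_style = "fill=\"rgb(" + ",".join(map(str, color)) + ")\" " \
            + "stroke-width=\"" + str(gene_contour_thickness) + "\" "
new_style = "fill=\"rgb({})\" stroke-width=\"{}\" ".format(
    ",".join([str(val) for val in color]), gene_contour_thickness)

assert old_style == new_style  # both yield: fill="rgb(255,0,0)" stroke-width="2"
```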
@@ -351,7 +336,7 @@ def draw_line(X,Y,L):
     """
     Draw a line below genes
     """
-    line = "<line x1=\"" + str(X) + "\" y1=\"" + str(Y) + "\" x2=\"" + str(X+L) + "\" y2=\"" + str(Y) + "\" style=\"stroke:rgb(50,50,50); stroke-width:" + str(stripe_thickness) + " \"/>\n"
+    line = "<line x1=\"{}\" y1=\"{}\" x2=\"{}\" y2=\"{}\" style=\"stroke:rgb(50,50,50); stroke-width:{} \"/>\n".format(str(X), str(Y), str(X+L), str(Y), str(stripe_thickness))
     return line
@@ -426,7 +411,7 @@ def SVG(write_html, outputfile, GenBankFile, pfdFile, use_pfd, color_genes, colo
         header = "\t\t<div title=\"" + GenBankFile[:-4] + "\">\n"
         additional_tabs = "\t\t\t"
-        header += additional_tabs + "<svg width=\"" + str(max_width + 2*(mX)) + "\" height=\"" + str(loci*(2*h + H + 2*mY)) + "\">\n"
+        header += "{}<svg width=\"{}\" height=\"{}\">\n".format(additional_tabs, str(max_width + 2*(mX)), str(loci*(2*h + H + 2*mY)))
         addY = loci*(2*h + H + 2*mY)
     else:
@@ -654,7 +639,7 @@ def SVG(write_html, outputfile, GenBankFile, pfdFile, use_pfd, color_genes, colo
 
         with open(gene_color_file, "a") as color_genes_handle:
             for new_names in new_color_genes:
-                color_genes_handle.write(new_names + "\t" + ",".join(map(str,new_color_genes[new_names])) + "\n")
+                color_genes_handle.write(new_names + "\t" + ",".join([str(ncg) for ncg in new_color_genes[new_names]]) + "\n")
 
     if len(new_color_domains) > 0:
         if len(new_color_domains) < 10:
@@ -664,7 +649,7 @@ def SVG(write_html, outputfile, GenBankFile, pfdFile, use_pfd, color_genes, colo
 
         with open(domains_color_file, "a") as color_domains_handle:
             for new_names in new_color_domains:
-                color_domains_handle.write(new_names + "\t" + ",".join(map(str,new_color_domains[new_names])) + "\n")
+                color_domains_handle.write(new_names + "\t" + ",".join([str(ncdom) for ncdom in new_color_domains[new_names]]) + "\n")
 
     mode = "a" if write_html == True else "w"
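Same preamble, second effect: the two `__future__` imports are what later let the `float()` casts be dropped from the distance math in bigscape.py. A small illustration, valid under Python 2 and 3:

```python
from __future__ import print_function, division

# With true division enabled, / behaves like Python 3 even on Python 2,
# and // is the explicit floor-division operator.
assert 2 / 3 > 0.66
assert 2 // 3 == 0

# print is a real function, so it accepts keyword arguments and can be
# passed around like any other object.
print("BiG-SCAPE", "Python 2 + 3", sep=" | ")
```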
diff --git a/bigscape.py b/bigscape.py
index 33f4d70fa19ccc2a028473bd7a3b4d1054746feb..da027e27bb5288ccf2954a2a0c2fa376b143fa7e 100644
--- a/bigscape.py
+++ b/bigscape.py
@@ -27,8 +27,16 @@ https://git.wageningenur.nl/medema-group/BiG-SCAPE
 # License: GNU Affero General Public License v3 or later
 # A copy of GNU AGPL v3 should have been included in this software package in LICENSE.txt.
 """
+# Makes sure the script can be used with Python 2 as well as Python 3.
+from __future__ import print_function
+from __future__ import division
+from sys import version_info
+if version_info[0]==2:
+    range = xrange
+    import cPickle as pickle # for storing and retrieving dictionaries
+elif version_info[0]==3:
+    import pickle # for storing and retrieving dictionaries
 
-import cPickle as pickle # for storing and retrieving dictionaries
 from math import exp, log
 import os
 import subprocess
@@ -47,12 +55,12 @@ from Bio import pairwise2
 from Bio.SubsMat.MatrixInfo import pam250 as scoring_matrix
 
 from functions import *
-from munkres import Munkres
 from ArrowerSVG import *
 
 import numpy as np
 from array import array
 from scipy.sparse import lil_matrix
+from scipy.optimize import linear_sum_assignment
 
 import pysapc
@@ -100,7 +108,7 @@ def get_gbk_files(inputdir, bgc_fasta_folder, min_bgc_size, exclude_gbk_str, bgc
                 print(" Skipping file " + fname)
                 continue
             if "_ORF" in fname:
-                print(" Skipping file " + fname + " (string '_ORF' is used internally)")
+                print(" Skipping file {} (string '_ORF' is used internally)".format(fname))
                 continue
 
             if " " in fname:
@@ -117,7 +125,7 @@
                     # basic file verification. Substitutes check_data_integrity
                     records = list(SeqIO.parse(os.path.join(dirpath,fname), "genbank"))
                 except ValueError as e:
-                    print(" Error with file " + os.path.join(dirpath, fname) + ": \n '" + str(e) + "'")
+                    print(" Error with file {}: \n '{}'".format(os.path.join(dirpath, fname), str(e)))
                     print(" (This file will be excluded from the analysis)")
                     continue
                 else:
@@ -169,8 +177,7 @@
                             strand = '+'
                         else:
                             strand = '-'
-
-                        fasta_header = clusterName + "_ORF" + str(cds_ctr)+ ":gid:" + str(gene_id) + ":pid:" + str(protein_id) + ":loc:" + str(gene_start) + ":" + str(gene_end) + ":strand:" + strand
+                        fasta_header = "{}_ORF{}:gid:{}:pid:{}:loc:{}:{}:strand:{}".format(clusterName, str(cds_ctr), str(gene_id), str(protein_id), str(gene_start), str(gene_end), strand)
                         fasta_header = fasta_header.replace(">","") #the coordinates might contain larger than signs, tools upstream don't like this
                         fasta_header = fasta_header.replace(" ", "") #the domtable output format (hmmscan) uses spaces as a delimiter, so these cannot be present in the fasta header
@@ -215,7 +222,7 @@
                         reminder = len(nt_seq)%3
                         if reminder > 0:
                             if fuzzy_start and fuzzy_end:
-                                print("Warning, CDS (" + clusterName + ", " + CDS.qualifiers.get('locus_tag',"")[0] + ") has fuzzy start and end positions, and a sequence length not multiple of three. Skipping")
+                                print("Warning, CDS ({}, {}) has fuzzy start and end positions, and a sequence length not multiple of three. Skipping".format(clusterName, CDS.qualifiers.get('locus_tag',"")[0]))
                                 break
 
                             if fuzzy_start:
@@ -289,15 +296,15 @@
                 else:
                     with open(outputfile,'w') as fastaHandle:
                         for header_sequence in fasta_data:
-                            fastaHandle.write('%s\n' % header_sequence[0])
-                            fastaHandle.write('%s\n' % header_sequence[1])
+                            fastaHandle.write("{}\n".format(str(header_sequence[0])))
+                            fastaHandle.write("{}\n".format(str(header_sequence[1])))
 
                     if verbose:
-                        print(" Adding " + fname + " (" + str(bgc_size) + " bps)")
+                        print(" Adding {} ({} bps)".format(fname, str(bgc_size)))
             else:
-                print(" Discarding " + clusterName + " (size less than " + str(min_bgc_size) + " bp, was " + str(bgc_size) + ")")
+                print(" Discarding {} (size less than {} bp, was {})".format(clusterName, str(min_bgc_size), str(bgc_size)))
 
             del fasta_data[:]
             biosynthetic_genes.clear()
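The ORF fasta headers written above are parsed back downstream (domtable_parser looks up the token following "gid"), which is why ">" and spaces are stripped. A sketch of the round trip with made-up values:

```python
# Hypothetical values; the real fields come from the GenBank CDS features.
fasta_header = "{}_ORF{}:gid:{}:pid:{}:loc:{}:{}:strand:{}".format(
    "clusterA", 1, "geneX", "WP_000001", 100, 400, "+")

header_list = fasta_header.split(":")
assert header_list[header_list.index("gid") + 1] == "geneX"
assert header_list[header_list.index("strand") + 1] == "+"
```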
@@ -308,29 +315,34 @@
     if file_counter == 1:
         sys.exit("\nError: Only one file found. Please input at least two files")
 
-    print("\n Starting with " + str(file_counter) + " files")
+    print("\n Starting with {} files".format(str(file_counter)))
     print(" Files that had its sequence extracted: " + str(file_counter - processed_sequences))
 
     return genbankDict
 
-def timeit(f):
-    def wrap(*args):
-        insignificant_runtime = 1 #prevents an overload
-        time1 = time.time()
-        ret = f(*args)
-        time2 = time.time()
-        runtime = time2-time1
-
-        runtime_string = '\t%s function took %0.3f s' % (f.func_name, runtime)
-
-        if runtime > insignificant_runtime:
+def timeit(funct):
+    """Writes the runtimes of functions to a file and prints them on screen.
+
+    Input:
+    - funct: a function
+    Output:
+    - The return value of that function.
+    - Runtime of that function, written to runtimes.txt and printed on screen.
+    """
+    def _wrap(*args):
+        start_time = time.time()
+        ret = funct(*args)
+        runtime = time.time()-start_time
+        runtime_string = '{} took {:.3f} seconds'.format(funct.__name__, runtime)
+        # To prevent insignificant runtimes from ending up in the file.
+        if runtime > 1:
             with open(os.path.join(output_folder, "runtimes.txt"), 'a') as timings_file:
                 timings_file.write(runtime_string + "\n")
-        print runtime_string
-
+        print(runtime_string)
         return ret
-    return wrap
+
+    return _wrap
 
 
 @timeit
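The rewritten decorator keeps the original contract: wrap, time, print, and log to runtimes.txt when the call took more than a second. A minimal usage sketch of the same pattern, with the file logging omitted since output_folder is a global of the real script:

```python
import time

def timeit(funct):
    def _wrap(*args):
        start_time = time.time()
        ret = funct(*args)
        print('{} took {:.3f} seconds'.format(funct.__name__, time.time() - start_time))
        return ret
    return _wrap

@timeit
def slow_sum(n):  # stand-in for generate_network and friends
    return sum(range(n))

slow_sum(10**6)  # prints something like: slow_sum took 0.028 seconds
```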
@@ -357,7 +369,7 @@ def generate_network(cluster_pairs, cores):
 
 def generate_dist_matrix(parms):
     """Unpack data to actually launch cluster_distance for one pair of BGCs"""
 
-    cluster1Idx,cluster2Idx,bgcClassIdx = map(int,parms)
+    cluster1Idx,cluster2Idx,bgcClassIdx = [int(parm) for parm in parms]
     cluster1 = clusterNames[cluster1Idx]
     cluster2 = clusterNames[cluster2Idx]
     bgc_class = bgcClassNames[bgcClassIdx]
@@ -366,7 +378,7 @@
         domain_list_A = DomainList[cluster1]
         domain_list_B = DomainList[cluster2]
     except KeyError:
-        print(" Warning: domain list for " + cluster1 + " or " + cluster2 + " was not found. Extracting from pfs files")
+        print(" Warning: domain list for {} or {} was not found. Extracting from pfs files".format(cluster1, cluster2))
 
         cluster_file1 = os.path.join(output_folder, cluster1 + ".pfs")
         cluster_file2 = os.path.join(output_folder, cluster2 + ".pfs")
@@ -376,13 +388,13 @@
 
     # this really shouldn't happen if we've filtered domain-less gene clusters already
     if len(domain_list_A) == 0 or len(domain_list_B) == 0:
-        print(" Warning: Regarding distance between clusters " + cluster1 + " and " + cluster2 + ":")
+        print(" Warning: Regarding distance between clusters {} and {}:".format(cluster1, cluster2))
         if len(domain_list_A) == 0 and len(domain_list_B) == 0:
             print(" None have identified domains. Distance cannot be calculated")
         elif (domain_list_A) == 0:
-            print(" Cluster " + cluster1 + " has no identified domains. Distance set to 1")
+            print(" Cluster {} has no identified domains. Distance set to 1".format(cluster1))
         else:
-            print(" Cluster " + cluster2 + " has no identified domains. Distance set to 1")
+            print(" Cluster {} has no identified domains. Distance set to 1".format(cluster2))
 
         # last two values (S, Sa) should really be zero but this could give rise to errors when parsing
         # the network file (unless we catched the case S = Sa = 0
@@ -414,7 +426,7 @@ def cluster_distance(a, b, a_domlist, b_domlist, bgc_class):
 
     setA = set(A_domlist)
     setB = set(B_domlist)
-    intersect = setA.intersection(setB)
+    intersect = setA & setB
 
     S = 0
     S_anchor = 0
@@ -505,10 +517,9 @@
             B_domain_sequence_slice_top[domain] = len(BGCs[B][domain])
 
-    # JACCARD INDEX
-    Jaccard = len(intersect)/ float( len(setA) + len(setB) - len(intersect))
-
+    Jaccard = len(intersect) / len(setA | setB)
+
     # DSS INDEX
     #domain_difference: Difference in sequence per domain. If one cluster does
@@ -541,9 +552,10 @@
     S = domain_difference # can be done because it's the first use of these
     S_anchor = domain_difference_anchor
-
+
     # Cases 2 and 3 (now merged)
     missing_aligned_domain_files = []
+
     for shared_domain in intersect:
         specific_domain_list_A = BGCs[A][shared_domain]
         specific_domain_list_B = BGCs[B][shared_domain]
@@ -556,8 +568,7 @@
         accumulated_distance = 0
 
         # Fill distance matrix between domain's A and B versions
-        DistanceMatrix = [[1 for col in range(num_copies_b)] for row in range(num_copies_a)]
-
+        DistanceMatrix = np.ones((num_copies_a,num_copies_b)) # start every cell at the maximum distance of 1
         for domsa in range(num_copies_a):
             for domsb in range(num_copies_b):
                 sequence_tag_a = specific_domain_list_A[domsa + A_domain_sequence_slice_bottom[shared_domain]]
@@ -577,7 +588,7 @@
                     if shared_domain not in missing_aligned_domain_files and verbose:
                         # this will print everytime an unfound <domain>.algn is not found for every
                         # distance calculation (but at least, not for every domain pair!)
-                        print(" Warning: " + shared_domain + ".algn not found. Trying pairwise alignment...")
+                        print(" Warning: {}.algn not found. Trying pairwise alignment...".format(shared_domain))
                         missing_aligned_domain_files.append(shared_domain)
 
                     try:
@@ -603,9 +614,9 @@
                     # Sequences *should* be of the same length unless something went
                     # wrong elsewhere
                     if len(aligned_seqA) != len(aligned_seqB):
-                        print("\tWARNING: mismatch in sequences' lengths while calculating sequence identity (" + shared_domain + ")")
-                        print("\t  Specific domain 1: " + sequence_tag_a + " len: " + str(len(aligned_seqA)))
-                        print("\t  Specific domain 2: " + sequence_tag_b + " len: " + str(len(aligned_seqB)))
+                        print("\tWARNING: mismatch in sequences' lengths while calculating sequence identity ({})".format(shared_domain))
+                        print("\t  Specific domain 1: {} len: {}".format(sequence_tag_a, str(len(aligned_seqA))))
+                        print("\t  Specific domain 2: {} len: {}".format(sequence_tag_b, str(len(aligned_seqB))))
                         seq_length = min(len(aligned_seqA), len(aligned_seqB))
                     else:
                         seq_length = len(aligned_seqA)
@@ -617,17 +628,16 @@
                     else:
                         gaps += 1
 
-                DistanceMatrix[domsa][domsb] = 1 - ( float(matches)/float(seq_length-gaps) )
+                DistanceMatrix[domsa][domsb] = 1 - ( matches/(seq_length-gaps) )
 
         #Only use the best scoring pairs
-        Hungarian = Munkres()
-        BestIndexes = Hungarian.compute(DistanceMatrix)
-        accumulated_distance = sum([DistanceMatrix[bi[0]][bi[1]] for bi in BestIndexes])
+        BestIndexes = linear_sum_assignment(DistanceMatrix)
+        accumulated_distance = DistanceMatrix[BestIndexes].sum()
 
         # the difference in number of domains accounts for the "lost" (or not duplicated) domains
        sum_seq_dist = (abs(num_copies_a-num_copies_b) + accumulated_distance)  #essentially 1-sim
         normalization_element = max(num_copies_a, num_copies_b)
-
+
         if shared_domain[:7] in anchor_domains:
             S_anchor += normalization_element
             domain_difference_anchor += sum_seq_dist
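scipy's `linear_sum_assignment` solves the same assignment problem Munkres did, but returns a `(row_indices, col_indices)` pair of arrays, which is why the hunk above can index the matrix with the result directly instead of looping over `(row, col)` tuples. A toy check:

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

# Toy distance matrix between 3 copies of a shared domain in BGCs A and B.
DistanceMatrix = np.array([[0.9, 0.1, 0.8],
                           [0.2, 0.9, 0.9],
                           [0.9, 0.8, 0.3]])

BestIndexes = linear_sum_assignment(DistanceMatrix)
accumulated_distance = DistanceMatrix[BestIndexes].sum()
assert abs(accumulated_distance - 0.6) < 1e-9  # picks 0.1 + 0.2 + 0.3
```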
@@ -636,12 +646,12 @@
                 domain_difference += sum_seq_dist
 
     if S_anchor != 0 and S != 0:
-        DSS_non_anchor = domain_difference / float(S)
-        DSS_anchor = domain_difference_anchor / float(S_anchor)
+        DSS_non_anchor = domain_difference / S
+        DSS_anchor = domain_difference_anchor / S_anchor
 
         # Calculate proper, proportional weight to each kind of domain
-        non_anchor_prct = S / float(S + S_anchor)
-        anchor_prct = S_anchor / float(S + S_anchor)
+        non_anchor_prct = S / (S + S_anchor)
+        anchor_prct = S_anchor / (S + S_anchor)
 
         # boost anchor subcomponent and re-normalize
         non_anchor_weight = non_anchor_prct / (anchor_prct*anchorboost + non_anchor_prct)
@@ -651,19 +661,19 @@
         DSS = (non_anchor_weight*DSS_non_anchor) + (anchor_weight*DSS_anchor)
 
     elif S_anchor == 0:
-        DSS_non_anchor = domain_difference / float(S)
+        DSS_non_anchor = domain_difference / S
         DSS_anchor = 0.0
-
+
         DSS = DSS_non_anchor
 
     else: #only anchor domains were found
         DSS_non_anchor = 0.0
-        DSS_anchor = domain_difference_anchor / float(S_anchor)
+        DSS_anchor = domain_difference_anchor / S_anchor
 
         DSS = DSS_anchor
 
     DSS = 1-DSS #transform into similarity
-
+
 
     # ADJACENCY INDEX
     # calculates the Tanimoto similarity of pairs of adjacent domains
@@ -680,7 +690,7 @@
             setB_pairs.add(tuple(sorted([B_domlist[l],B_domlist[l+1]])))
 
         # same treatment as in Jaccard
-        AI = float(len(setA_pairs.intersection(setB_pairs))) / float(len(setA_pairs.union(setB_pairs)))
+        AI = len(setA_pairs & setB_pairs) / len(setA_pairs | setB_pairs)
 
     Distance = 1 - (Jaccardw * Jaccard) - (DSSw * DSS) - (AIw * AI)
@@ -689,9 +699,9 @@
     if Distance < -0.000001: # this definitely is something else...
         print("Negative distance detected!")
         print(Distance)
-        print(A + " - " + B)
-        print("J: " + str(Jaccard) + "\tDSS: " + str(DSS) + "\tAI: " + str(AI))
-        print("Jw: " + str(Jaccardw) + "\tDSSw: " + str(DSSw) + "\tAIw: " + str(AIw))
+        print("{} - {}".format(A, B))
+        print("J: {}\tDSS: {}\tAI: {}".format(str(Jaccard), str(DSS), str(AI)))
+        print("Jw: {}\tDSSw: {}\tAIw: {}".format(str(Jaccardw), str(DSSw), str(AIw)))
         Distance = 0.0
 
     return Distance, Jaccard, DSS, AI, DSS_non_anchor, DSS_anchor, S, S_anchor
@@ -798,7 +808,7 @@
     name = ".".join(fastaPath.split(os.sep)[-1].split(".")[:-1])
     outputName = os.path.join(outputdir, name+".domtable")
 
-    hmmscan_cmd = "hmmscan --cpu 0 --domtblout %s --cut_tc %s %s" % (outputName,hmmFile,fastaPath)
+    hmmscan_cmd = "hmmscan --cpu 0 --domtblout {} --cut_tc {} {}".format(outputName, hmmFile, fastaPath)
     if verbose == True:
         print(" " + hmmscan_cmd)
     subprocess.check_output(hmmscan_cmd, shell=True)
@@ -825,17 +835,17 @@
         # Save list of domains per BGC
         pfsoutput = os.path.join(pfs_folder, outputbase + ".pfs")
-        with open(pfsoutput, 'wb') as pfs_handle:
+        with open(pfsoutput, 'w') as pfs_handle:
             pfs_handle.write(" ".join(domains))
 
         # Save more complete information of each domain per BGC
         pfdoutput = os.path.join(pfd_folder, outputbase + ".pfd")
-        with open(pfdoutput,'wb') as pfd_handle:
+        with open(pfdoutput,'w') as pfd_handle:
             write_pfd(pfd_handle, filtered_matrix)
     else: # there aren't any domains in this BGC
         # delete from all data structures
-        print(" No domains where found in " + outputbase + ".domtable. Removing it from further analysis")
+        print(" No domains were found in {}.domtable. Removing it from further analysis".format(outputbase))
         info = genbankDict.get(outputbase)
         clusters.remove(outputbase)
         baseNames.remove(outputbase)
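Both the earlier Jaccard rewrite (`len(intersect) / len(setA | setB)`) and the adjacency index above now use the set operators `&` and `|`, which are equivalent to `.intersection()` and `.union()`. A worked example with made-up Pfam accessions:

```python
from __future__ import division  # match the module's preamble on Python 2

setA = {"PF00001", "PF00005", "PF07690"}
setB = {"PF00005", "PF07690", "PF13304"}

jaccard = len(setA & setB) / len(setA | setB)
assert jaccard == 0.5  # 2 shared domains out of 4 distinct ones
```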
@@ -890,8 +900,7 @@ def clusterJsonBatch(outputFileBase,matrix,cutoffs=[1.0],damping=0.8,clusterClan
     if verbose:
         print(" ...done")
     numBGCs = len(bgcs)
-    bs_distances = [[float('%.3f' % simMatrix[row, col]) for col in xrange(row+1)] for row in
-                    xrange(numBGCs)]
+    bs_distances = [[float("{:.3f}".format(simMatrix[row, col])) for col in range(row+1)] for row in range(numBGCs)]
     #bs_data = [{"id": clusterNames[int(bgc)]} for bgc in bgcs]
     bs_data = []
     bgcJsonDict = {}
@@ -900,12 +909,12 @@
         members = familiesDict.setdefault(label, [])
         members.append(idx)
         familiesDict[label] = members
-
+
     ## guarantee order of families and indices for later assignment
     # Families should be indexed from 0 to # families - 1
     familiesDictReIdxd = {idx:members for idx,members in enumerate(familiesDict.values())}
     familyIdxs = range(len(familiesDictReIdxd))
-
+
     ### Use the 0.5 distance cutoff to cluster clans by default
     if clusterClans and cutoff == clanCutoff:
         famSimDict = dict()
@@ -938,7 +947,7 @@
                                  preference='min').fit_predict(famSimMatrix)
     else:
         clanLabels = []
-
+
     if len(clanLabels) > 0:
         clansDict = {}
         for idx,clanLabel in enumerate(clanLabels):
@@ -946,7 +955,7 @@
             clanMembers.append(idx)
             clansDict[clanLabel] = clanMembers
         fam2clan = dict(zip(familyIdxs,clanLabels))
-
+
     for bgc in bgcs:
         bgcName = clusterNames[int(bgc)]
         bgcJsonDict[bgcName] = {}
@@ -984,22 +993,22 @@
                     orfDict[orf]["domains"].append({'code': '{} : {}'.format(pfamID,pfamDescr),'start':int(entry[3]),'end':int(entry[4]),'bitscore': float(entry[1])})
                 else:
                     orfDict[orf]["domains"].append({'code': entry[5], 'start': int(entry[3]), 'end': int(entry[4]), 'bitscore': float(entry[1])})
-        bgcJsonDict[bgcName]['orfs'] = orfDict.values()
+        bgcJsonDict[bgcName]['orfs'] = list(orfDict.values())
     bs_data = [bgcJsonDict[clusterNames[int(bgc)]] for bgc in bgcs]
-
+
     if len(clanLabels) > 0:
-        bs_families = [{'id': 'FAM_%.3d' % family, 'members': members, 'clan': 'CLAN_%.3d' % fam2clan[family]}
+        bs_families = [{"id": "FAM_{:03d}".format(family), 'members': members, "clan": "CLAN_{:03d}".format(fam2clan[family])}
                        for family, members in familiesDictReIdxd.items()]
-        bs_clans = [{'id': 'CLAN_%.3d' % clan, 'members': members}
+        bs_clans = [{"id": "CLAN_{:03d}".format(clan), 'members': members}
                     for clan, members in enumerate(clansDict.values())]
     else:
-        bs_families = [{'id': 'FAM_%.3d' % family, 'members': members, }
+        bs_families = [{"id": "FAM_{:03d}".format(family), 'members': members, }
                        for family, members in familiesDictReIdxd.items()]
-
+
     # column1: BGC, column2: clustering pseudo family
     if verbose:
         print(" Writing clustering file")
-    with open(outputFileBase + "_clustering_c" + str(cutoff) + ".tsv", "w") as clustering_file:
+    with open("{}_clustering_c{:4.2f}.tsv".format(outputFileBase, cutoff), "w") as clustering_file:
         i = 0
         for label in familiesDictReIdxd:
             i += 1
@@ -1008,13 +1017,13 @@
 
     if verbose:
         print(" Writing JS file")
-    outputFile = "{}_cutoff{}.js".format(outputFileBase,cutoff)
+    outputFile = "{}_cutoff{:4.2f}.js".format(outputFileBase,cutoff)
     with open(outputFile, 'w') as outfile:
-        outfile.write('var bs_similarity=%s\n' % str(bs_distances))
-        outfile.write('var bs_data=%s\n' % str(bs_data))
-        outfile.write('var bs_families=%s\n' % str(bs_families))
+        outfile.write("var bs_similarity={}\n".format(str(bs_distances)))
+        outfile.write("var bs_data={}\n".format(str(bs_data)))
+        outfile.write("var bs_families={}".format(str(bs_families)))
         if len(clanLabels) > 0:
-            outfile.write('var bs_clans=%s' % str(bs_clans))
+            outfile.write("\nvar bs_clans={}\n".format(str(bs_clans)))
     return
 
 class FloatRange(object):
@@ -1024,7 +1033,7 @@
     def __eq__(self, other):
         return self.start <= other <= self.end
     def __repr__(self):
-        return '{0}-{1}'.format(self.start, self.end)
+        return '{}-{}'.format(self.start, self.end)
 
 def CMD_parser():
     parser = ArgumentParser()
@@ -1119,7 +1128,7 @@ if __name__=="__main__":
 
     if options.outputdir == "":
-        print "please provide a name for an output folder using parameter -o or --outputdir"
+        print("please provide a name for an output folder using parameter -o or --outputdir")
         sys.exit(0)
 
     global anchor_domains
@@ -1221,7 +1230,7 @@ if __name__=="__main__":
     sampleDict = {} # {sampleName:set(bgc1,bgc2,...)}
     gbk_files = [] # raw list of gbk file locations
-    for (cluster, (path, clusterSample)) in genbankDict.iteritems():
+    for (cluster, (path, clusterSample)) in genbankDict.items():
         gbk_files.append(path)
         for sample in clusterSample:
             clustersInSample = sampleDict.get(sample, set())
@@ -1243,7 +1252,7 @@ if __name__=="__main__":
     create_directory(pfd_folder, "pfd", False)
     create_directory(svg_folder, "SVG", False)
 
-    print("\nTrying threading on %i cores" % cores)
+    print("\nTrying threading on {} cores".format(str(cores)))
 
     """BGCs -- dictionary of this structure:
@@ -1326,27 +1335,23 @@ if __name__=="__main__":
             outputfile = os.path.join(domtable_folder,outputbase + '.domtable')
             if os.path.isfile(outputfile) and os.path.getsize(outputfile) > 0:
                 alreadyDone.add(fasta)
-
-        if len(fastaFiles - alreadyDone) == 0:
+        task_set = fastaFiles - alreadyDone
+        if len(task_set) == 0:
             print(" All fasta files had already been processed")
         elif len(alreadyDone) > 0:
-            if len(fastaFiles-alreadyDone) < 20:
-                print " Warning! The following NEW fasta file(s) will be processed: %s" % ", ".join(".".join(x.split(os.sep)[-1].split(".")[:-1]) for x in fastaFiles - alreadyDone)
+            if len(task_set) < 20:
+                print(" Warning! The following NEW fasta file(s) will be processed: {}".format(", ".join(".".join(x.split(os.sep)[-1].split(".")[:-1]) for x in task_set)))
             else:
-                print(" Warning: " + str(len(fastaFiles-alreadyDone)) + " NEW fasta files will be processed")
+                print(" Warning: {} NEW fasta files will be processed".format(str(len(task_set))))
         else:
-            print(" Predicting domains for " + str(len(fastaFiles)) + " fasta files")
-
-        task_set = fastaFiles - alreadyDone
+            print(" Predicting domains for {} fasta files".format(str(len(fastaFiles))))
 
         pool = Pool(cores,maxtasksperchild=1)
         for fastaFile in task_set:
             pool.apply_async(runHmmScan,args=(fastaFile, pfam_dir, domtable_folder, verbose))
         pool.close()
         pool.join()
-
-        print " Finished generating domtable files."
-
+        print(" Finished generating domtable files.")
 
     ### Step 3: Parse hmmscan domtable results and generate pfs and pfd files
     print("\nParsing hmmscan domtable files")
@@ -1374,16 +1379,16 @@ if __name__=="__main__":
             outputfile = os.path.join(pfd_folder, outputbase + '.pfd')
             if os.path.isfile(outputfile) and os.path.getsize(outputfile) > 0:
                 alreadyDone.add(domtable)
-
-        if len(domtableFiles - alreadyDone) == 0: # Re-run
+        domtableFilesUnprocessed = domtableFiles - alreadyDone
+        if len(domtableFilesUnprocessed) == 0: # Re-run
             print(" All domtable files had already been processed")
         elif len(alreadyDone) > 0: # Incomplete run
-            if len(domtableFiles-alreadyDone) < 20:
-                print " Warning! The following domtable files had not been processed: %s" % ", ".join(".".join(x.split(os.sep)[-1].split(".")[:-1]) for x in domtableFiles - alreadyDone)
+            if len(domtableFilesUnprocessed) < 20:
+                print(" Warning! The following domtable files had not been processed: {}".format(", ".join(".".join(x.split(os.sep)[-1].split(".")[:-1]) for x in domtableFilesUnprocessed)))
            else:
-                print(" Warning: " + str(len(domtableFiles-alreadyDone)) + " domtable files will be processed")
+                print(" Warning: {} domtable files will be processed".format(str(len(domtableFilesUnprocessed))))
         else: # First run
-            print(" Processing " + str(len(domtableFiles)) + " domtable files")
+            print(" Processing {} domtable files".format(str(len(domtableFiles))))
 
         # If using the multiprocessing version and outputbase doesn't have any
         # predicted domains, it's not as easy to remove if from the analysis
@@ -1411,7 +1416,7 @@ if __name__=="__main__":
             for thing in os.listdir(domains_folder):
                 os.remove(os.path.join(domains_folder,thing))
 
-        print " Finished generating generating pfs and pfd files."
+        print(" Finished generating pfs and pfd files.")
 
     ### Step 4: Parse the pfs, pfd files to generate BGC dictionary, clusters, and clusters per sample objects
@@ -1451,7 +1456,7 @@ if __name__=="__main__":
             print(" Processing: " + outputbase)
 
             pfdFile = os.path.join(pfd_folder, outputbase + ".pfd")
-            filtered_matrix = [map(lambda x: x.strip(), line.split('\t')) for line in open(pfdFile)]
+            filtered_matrix = [[part.strip() for part in line.split('\t')] for line in open(pfdFile)]
 
             # save each domain sequence from a single BGC in its corresponding file
             fasta_file = os.path.join(bgc_fasta_folder, outputbase + ".fasta")
@@ -1469,7 +1474,7 @@ if __name__=="__main__":
             del filtered_matrix[:]
 
         # store processed BGCs dictionary for future re-runs
-        with open(os.path.join(output_folder, "BGCs.dict"), "w") as BGC_file:
+        with open(os.path.join(output_folder, "BGCs.dict"), "wb") as BGC_file:
             pickle.dump(BGCs, BGC_file)
             BGC_file.close()
@@ -1580,7 +1585,7 @@ if __name__=="__main__":
                 # avoid multiple alignment if the domains all belong to the same BGC
                 fasta_domains.remove(domain_file)
                 if verbose:
-                    print(" Skipping Multiple Alignment for " + domain_name + " (appears only in one BGC)")
+                    print(" Skipping Multiple Alignment for {} (appears only in one BGC)".format(domain_name))
 
             sequence_tag_list.clear()
             del header_list[:]
@@ -1603,7 +1608,7 @@ if __name__=="__main__":
        # verify all tasks were completed by checking existance of alignment files
         for domain in fasta_domains:
             if not os.path.isfile(domain[:-6]+".algn"):
-                print(" WARNING, " + domain[:-6] + ".algn could not be found (possible issue with aligner).")
+                print(" WARNING, {}.algn could not be found (possible issue with aligner).".format(domain[:-6]))
 
     else:
         print(" No domain fasta files found to align")
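The binary mode on the BGCs.dict dump above (and on bgc_info.dict near the end of main) matters on Python 3, where a pickle stream is bytes and a text-mode handle raises TypeError; Python 2 tolerated either. A minimal round trip with a made-up dictionary:

```python
import pickle

data = {"BGC0000001": ["PF00109", "PF02801"]}  # hypothetical BGC -> domains

with open("BGCs.dict", "wb") as handle:  # binary mode, required on Python 3
    pickle.dump(data, handle)
with open("BGCs.dict", "rb") as handle:
    assert pickle.load(handle) == data
```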
@@ -1654,8 +1659,7 @@ if __name__=="__main__":
                 list_file.write("\t".join([bgc, bgc_info[bgc].accession_id, bgc_info[bgc].description, product, sort_bgc(product)]) + "\n")
 
         print(" Calculating all pairwise distances")
-
-        pairs = set(map(tuple, map(sorted, combinations(mix_set, 2))))
+        pairs = set([tuple(sorted(combo)) for combo in combinations(mix_set, 2)])
         del mix_set[:]
         cluster_pairs = [(x, y, -1) for (x, y) in pairs]
         pairs.clear()
@@ -1666,9 +1670,9 @@ if __name__=="__main__":
         pathBase = os.path.join(output_folder, networks_folder_all, "all_mix")
         filenames = []
         for cutoff in cutoff_list:
-            filenames.append(pathBase + "_c%.2f.network" % cutoff)
+            filenames.append("{}_c{:.2f}.network".format(pathBase, cutoff))
         clusterJsonBatch(pathBase, network_matrix_mix, cutoffs=cutoff_list,clusterClans=options.cluster_family,clanCutoff=options.clan_cutoff)
-        cutoffs_and_filenames = zip(cutoff_list, filenames)
+        cutoffs_and_filenames = list(zip(cutoff_list, filenames))
         del filenames[:]
         write_network_matrix(network_matrix_mix, cutoffs_and_filenames, include_singletons, clusterNames, bgc_info)
@@ -1714,7 +1718,7 @@ if __name__=="__main__":
             for bgc_class in BGC_classes:
                 folder_name = bgc_class
 
-                print("\n " + folder_name + " (" + str(len(BGC_classes[bgc_class])) + " BGCs)")
+                print("\n {} ({} BGCs)".format(folder_name, str(len(BGC_classes[bgc_class]))))
 
                 # create output directory
                 create_directory(os.path.join(output_folder, networks_folder_all, folder_name), " All - " + bgc_class, False)
@@ -1732,7 +1736,7 @@ if __name__=="__main__":
                 if len(BGC_classes[bgc_class]) > 1:
                     print(" Calculating all pairwise distances")
-                    pairs = set(map(tuple, map(sorted, combinations(BGC_classes[bgc_class], 2))))
+                    pairs = set([tuple(sorted(combo)) for combo in combinations(BGC_classes[bgc_class], 2)])
                     del BGC_classes[bgc_class][:]
                     cluster_pairs = [(x, y, bgcClassName2idx[bgc_class]) for (x, y) in pairs]
                     pairs.clear()
@@ -1743,11 +1747,11 @@ if __name__=="__main__":
                     pathBase = os.path.join(output_folder, networks_folder_all, folder_name, "all" + folder_name)
                     filenames = []
                     for cutoff in cutoff_list:
-                        filenames.append(pathBase + "_c%.2f.network" % cutoff)
-                    cutoffs_and_filenames = zip(cutoff_list, filenames)
+                        filenames.append("{}_c{:.2f}.network".format(pathBase, cutoff))
+                    cutoffs_and_filenames = list(zip(cutoff_list, filenames))
                     del filenames[:]
                     clusterJsonBatch(pathBase, network_matrix, cutoffs=cutoff_list,clusterClans=options.cluster_family,clanCutoff=options.clan_cutoff)
-                    write_network_matrix(network_matrix, cutoffs_and_filenames, include_singletons,clusterNames, bgc_info)
+                    write_network_matrix(network_matrix, cutoffs_and_filenames, include_singletons, clusterNames, bgc_info)
 
                     del network_matrix[:]
@@ -1764,10 +1768,10 @@ if __name__=="__main__":
         # create output directory for all samples
         create_directory(os.path.join(output_folder, networks_folder_samples), "Samples", False)
 
-        for sample, sampleClusters in sampleDict.iteritems():
+        for sample, sampleClusters in sampleDict.items():
             print("\n Sample: " + sample)
             if len(sampleClusters) == 1:
-                print(" Warning: Sample size = 1 detected. Not generating network for this sample (" + sample + ")")
+                print(" Warning: Sample size = 1 detected. Not generating network for this sample ({})".format(sample))
             else:
                 # create output directory for this sample
                 create_directory(os.path.join(output_folder, networks_folder_samples, sample), " Samples - " + sample, False)
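On the filename hunks above and the earlier clusterJsonBatch ones: `%.2f` and `{:.2f}` format identically, so the .network names are untouched, while replacing `str(cutoff)` with `{:4.2f}` pins the clustering and JS names to two decimals as well. A quick check:

```python
cutoff = 0.5

# The .network names are unchanged: %.2f and {:.2f} format identically.
assert "_c%.2f.network" % cutoff == "_c{:.2f}.network".format(cutoff)

# The clustering/JS names do change: str(0.5) gave "c0.5"; the format
# spec now pins two decimals, matching the .network naming.
assert "c" + str(cutoff) == "c0.5"
assert "c{:4.2f}".format(cutoff) == "c0.50"
```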
@@ -1793,7 +1797,7 @@ if __name__=="__main__":
                         product = bgc_info[bgc].product
                         list_file.write("\t".join([bgc, bgc_info[bgc].accession_id, bgc_info[bgc].description, product, sort_bgc(product)]) + "\n")
 
-                pairs = set(map(tuple, map(sorted, combinations(mix_set, 2))))
+                pairs = set([tuple(sorted(combo)) for combo in combinations(mix_set, 2)])
                 del mix_set[:]
                 cluster_pairs = [(x, y, -1) for (x, y) in pairs]
                 pairs.clear()
@@ -1804,10 +1808,10 @@ if __name__=="__main__":
                 pathBase = os.path.join(output_folder, networks_folder_samples, sample, "sample_" + sample + "_mix")
                 filenames = []
                 for cutoff in cutoff_list:
-                    filenames.append(pathBase + "_c%.2f.network" % cutoff)
-                cutoffs_and_filenames = zip(cutoff_list, filenames)
+                    filenames.append("{}_c{:.2f}.network".format(pathBase, cutoff))
+                cutoffs_and_filenames = list(zip(cutoff_list, filenames))
                 clusterJsonBatch(pathBase, network_matrix_sample, cutoffs=cutoff_list,clusterClans=options.cluster_family,clanCutoff=options.clan_cutoff)
-                write_network_matrix(network_matrix_sample, cutoffs_and_filenames, include_singletons, clusterNames,bgc_info)
+                write_network_matrix(network_matrix_sample, cutoffs_and_filenames, include_singletons, clusterNames, bgc_info)
 
                 del network_matrix_sample[:]
@@ -1866,7 +1870,7 @@ if __name__=="__main__":
                             list_file.write("\t".join([bgc, bgc_info[bgc].accession_id, bgc_info[bgc].description, product, sort_bgc(product)]) + "\n")
 
                         if len(BGC_classes[bgc_class]) > 1:
-                            pairs = set(map(tuple, map(sorted, combinations(BGC_classes[bgc_class], 2))))
+                            pairs = set([tuple(sorted(combo)) for combo in combinations(BGC_classes[bgc_class], 2)])
                             del BGC_classes[bgc_class][:]
                             cluster_pairs = [(x, y, bgcClassName2idx[bgc_class]) for (x, y) in pairs]
                             pairs.clear()
@@ -1877,18 +1881,18 @@ if __name__=="__main__":
                                                     "sample_" + sample + "_" + folder_name)
                             filenames = []
                             for cutoff in cutoff_list:
-                                filenames.append(pathBase + "_c%.2f.network" % cutoff)
-                            cutoffs_and_filenames = zip(cutoff_list, filenames)
+                                filenames.append("{}_c{:.2f}.network".format(pathBase, cutoff))
+                            cutoffs_and_filenames = list(zip(cutoff_list, filenames))
                             clusterJsonBatch(pathBase, network_matrix_sample, cutoffs=cutoff_list,clusterClans=options.cluster_family,clanCutoff=options.clan_cutoff)
                             write_network_matrix(network_matrix_sample, cutoffs_and_filenames, include_singletons, clusterNames,bgc_info)
 
                             del network_matrix_sample[:]
                             del BGC_classes[bgc_class][:]
 
-    pickle.dump(bgc_info,open(os.path.join(output_folder,'bgc_info.dict'),'w'))
+    pickle.dump(bgc_info,open(os.path.join(output_folder,'bgc_info.dict'),'wb'))
 
     runtime = time.time()-time1
-    runtime_string = '\n\n\tMain function took %0.3f s' % (runtime)
+    runtime_string = "\n\n\tMain function took {:.3f} s".format(runtime)
     with open(os.path.join(output_folder, "runtimes.txt"), 'a') as timings_file:
         timings_file.write(runtime_string + "\n")
-    print runtime_string
+    print(runtime_string)
diff --git a/functions.py b/functions.py
index 2610b0f22e39c9690b4671a80bbb7d28c68b32fa..f8f0eed83ec825dc102a1d611480769a646c956b 100644
--- a/functions.py
+++ b/functions.py
@@ -16,6 +16,11 @@ Functions used by bigscape.py
 # License: GNU Affero General Public License v3 or later
 # A copy of GNU AGPL v3 should have been included in this software package in LICENSE.txt.
 """
+# Makes sure the script can be used with Python 2 as well as Python 3.
+from __future__ import print_function
+from sys import version_info
+if version_info[0]==2:
+    range = xrange
 
 import os
 import subprocess
@@ -62,9 +67,9 @@
                 domains.add(line.strip().split("\t")[0].split(".")[0])
         return domains
     except IOError:
-        print "You have not provided the anchor_domains.txt file."
-        print "if you want to make use of the anchor domains in the DSS distance metric,\
- make a file that contains a Pfam domain on each line."
+        print("You have not provided the anchor_domains.txt file.")
+        print("If you want to make use of the anchor domains in the DSS distance metric, \
+make a file that contains a Pfam domain on each line.")
         return set()
@@ -157,15 +162,6 @@
 
     pfd_handle.close()
 
-def get_domains(filename):
-    handle = open(filename, 'r')
-    domains = []
-    for line in handle:
-        if line[0] != "#":
-            domains.append(filter(None, line.split(" "))[1])
-
-    return domains
-
 
 def no_overlap(locA1, locA2, locB1, locB2):
     """Return True if there is no overlap between two regions"""
@@ -229,11 +225,9 @@
         header = row[-1].strip()
         seq = fasta_dict[header] #access the sequence by using the header
-
         domain_file = open(os.path.join(domains_folder, domain + ".fasta"), 'a') #append to existing file
-        domain_file.write(">" + header + ":" + row[3] + ":" + row[4] \
-        + "\n" + seq[int(row[3]):int(row[4])] + "\n") #only use the range of the pfam domain within the sequence
-
+        domain_file.write(">{}:{}:{}\n{}\n".format(header, row[3], row[4],
+                          seq[int(row[3]):int(row[4])])) #only use the range of the pfam domain within the sequence
         domain_file.close()
@@ -430,7 +424,7 @@
     else:
         for line in dom_handle:
             if line[0] != "#":
-                splitline = filter(None, line.split(" "))
+                splitline = line.split()
                 pfd_row = []
                 pfd_row.append(gbk)         #add clustername or gbk filename
@@ -440,7 +434,7 @@
                 try:
                     pfd_row.append(header_list[header_list.index("gid")+1]) #add gene ID if known
                 except ValueError:
-                    print "No gene ID in ", gbk
+                    print("No gene ID in " + gbk)
                     pfd_row.append('')
 
                 pfd_row.append(splitline[19])#first coordinate, env coord from