diff --git a/knowncluster.py b/knowncluster.py new file mode 100644 index 0000000000000000000000000000000000000000..1d737b4d96fd3895a4763f4d24fc45830c68b3d5 --- /dev/null +++ b/knowncluster.py @@ -0,0 +1,395 @@ +#!/usr/bin/env python + + +""" +Developer: Emzo de los Santos +compare a set of clusters to MiBIG-Database using cluster distance method developed in BiG-SCAPE + +Make New Data Structure for MiBIG Clusters that algorithm can use +""" +import os +from glob import glob +from functions import parsePFD,fasta_parser +from scipy.optimize import linear_sum_assignment +from Bio.SubsMat.MatrixInfo import pam250 as scoring_matrix +from Bio import pairwise2 +import numpy as np +from bigscape import calculate_GK + +def buildMiBIGDct(path): + """ + Given a path to MiBIG pfs and pfd files will build a Data Structure that can be used for MiBIG comparisons, this + includes the different domains in the MiBIG cluster, the product and the domain sequences + :param path: + :return: {MiBIG Cluster ID: (product,domList,domDict)} + domDict - {pfamID:{domIdx:sequence}} + domDict is built from getting the fasta sequences after parsePFD + + parsePFD returns: + (clusterPfamDict,domList) + + clusterPfamDict is {Pfam ID: [(Domain ID (geneID,(start,end)) ,location)]} + """ + # Only Build a Dictionary for Entries that have fasta files and pfdFiles + + mibigDict = {} + pfdFiles = glob(path + '/*.pfd') + fastaFiles = glob(path + '/*.fasta') + + pfdSet = set(x.split('.pfd')[0] for x in pfdFiles) + fastaSet = set(x.split('.fasta')[0] for x in fastaFiles) + + clustersToProcess = pfdSet & fastaSet + for mibigCluster in clustersToProcess: + clusterProduct = mibigCluster.split('.')[-1] + clusterID = mibigCluster.split(os.sep)[-1] + mibig_pfd_handle = os.path.join(path, mibigCluster + '.pfd') + mibig_fasta_handle = os.path.join(path,mibigCluster + '.fasta') + clusterPfamDict,domList = parsePFD(mibig_pfd_handle) + clusterFastaDict = fasta_parser(open(mibig_fasta_handle)) + domDict = {} + for pfamDomain in clusterPfamDict.keys(): + domainPfamDict = {} + for domainInstance in clusterPfamDict[pfamDomain]: + geneID, (domStart, domEnd) = domainInstance + domainPfamDict[domainInstance] = clusterFastaDict['>'+geneID][domStart:domEnd] + domDict[pfamDomain] = domainPfamDict + mibigDict[clusterID] = (clusterProduct,domList,domDict) + + return mibigDict + +def cluster_mibig_distance(clusterHandle,mibigDct,anchor_domains): + """ + calculate the distance between a specified cluster (with pfd and fasta files) to the mibig database + :param cluster: path to cluster + :param mibigDct: dictionary for miBIG with the correct structure (can use buildMiBIGDct to generate) + :param anchor_domains: text file specifying anchor domains + :return: distance_dictionary for cluster + {mibigID:(Distance, Jaccard, DDS, GK, DDS_non_anchor, DDS_anchor, S, S_anchor)} + """ + + ## Parameters for Distance Calculation ####### + + Jaccardw = 0.2 + DDSw = 0.75 + GKw = 0.05 + + domain_difference_anchor, S_anchor = 0, 0 + domain_difference, S = 0, 0 + gap_open = -15 + gap_extend = -6.67 + anchorweight = 2 + nbhood = 4 + ########################### + + if os.path.isfile(os.path.join(clusterHandle + '.pfd')): + cluster_pfd_handle = os.path.join(clusterHandle + '.pfd') + else: + print "No PFD file for %s" % clusterHandle + raise Exception + if os.path.isfile(os.path.join(clusterHandle + '.fasta')): + cluster_fasta_handle = os.path.join(clusterHandle + '.fasta') + else: + print "No Fasta file for %s" % clusterHandle + raise Exception + + cluster_fasta_dict = fasta_parser(open(cluster_fasta_handle)) + cluster, clusterDomList = parsePFD(cluster_pfd_handle) + + clusterDomSet = set(clusterDomList) + + cluster_dist_dict = {} + + for (mibigCluster,(product,mibigDomList,mibigDomDict)) in mibigDct.iteritems(): + print mibigCluster + mibigDomSet = set(mibigDomDict.keys()) + + intersect = clusterDomSet.intersection(mibigDomSet) + not_intersect = clusterDomSet.symmetric_difference(mibigDomSet) + + for unshared_domain in not_intersect: # no need to look at seq identity, since these domains are unshared + # for each occurence of an unshared domain do domain_difference += count of domain and S += count of domain + unshared_occurrences = [] + if unshared_domain in clusterDomSet: + unshared_occurrences = cluster[unshared_domain] + else: + unshared_occurrences = mibigDomDict[unshared_domain] + + # don't look at domain version, hence the split + if unshared_domain.split(".")[0] in anchor_domains: + domain_difference_anchor += len(unshared_occurrences) + else: + domain_difference += len(unshared_occurrences) + S = domain_difference # can be done because it's the first use of these + S_anchor = domain_difference_anchor + + #These are the cases for all the shared domains + for shared_domain in intersect: + # First we need to get the sequences and generate the domain dictionary for the cluster + clusterDomIdx = cluster[shared_domain] + clusterPfamDict = dict() + for domain in clusterDomIdx: + geneID,(domStart,domEnd) = domain + clusterPfamDict[domain] = cluster_fasta_dict['>'+geneID][domStart:domEnd] + + mibigPfamDict = mibigDomDict[shared_domain] + + clusSize = len(clusterPfamDict) + mibigSize = len(mibigPfamDict) + + scoreMatrix = np.ndarray((clusSize,mibigSize)) + # Get all of the instances of the domain from cluster A and cluster B and populate a similarity matrix + for i,domA in enumerate(clusterPfamDict.keys()): + for j,domB in enumerate(mibigPfamDict.keys()): + + seqA = clusterPfamDict[domA] + seqB = mibigPfamDict[domB] + + # this will guarantee that youll always get symmetry with comparisons by ordering it alphabetically + # but we'll lose accuracy and won't be able to recover the alignments + + if seqA > seqB: + seqB,seqA = seqA,seqB + # Using BioPython + alignScore = pairwise2.align.globalds(seqA, seqB, scoring_matrix, gap_open, gap_extend) + # calculate percent similarity from best scoring alignment + bestAlignment = alignScore[0] + alignA = bestAlignment[0] + alignB = bestAlignment[1] + posCtr = 0. + for (a,b) in zip(alignA,alignB): + if a == '-' or b == '-': + pass + else: + if a == b: + posCtr += 1 + # score is 1 - % identity since we want to define a distance to minimize + scoreMatrix[i,j] = 1 - posCtr/len(alignA) + # used scipy's linear_sum_assigment to do the hungarian algorithm, haven't tested Munkres for behaviour + + pairings = [(x,y) for x,y in zip(*linear_sum_assignment(scoreMatrix)) if (x<clusSize) and (y<mibigSize)] + # total distance from all of the paired domains + accumulated_distance = sum(scoreMatrix[a] for a in pairings) + # to get the total sequence distance you need to add the unpaired domains + sum_seq_dist = accumulated_distance + abs(clusSize - mibigSize) + # update the DDS or DDS_anchor and total domain counts depending on whether or not the domain is an anchor domain + if shared_domain.split(".")[0] in anchor_domains: + S_anchor += max(clusSize,mibigSize) + domain_difference_anchor += sum_seq_dist + else: + S += max(clusSize,mibigSize) + domain_difference += sum_seq_dist + + if S_anchor != 0 and S != 0: + DDS_non_anchor = domain_difference / float(S) + DDS_anchor = domain_difference_anchor / float(S_anchor) + + # Calculate proper, proportional weight to each kind of domain + non_anchor_prct = S / float(S + S_anchor) + anchor_prct = S_anchor / float(S + S_anchor) + + # boost anchor subcomponent and re-normalize + non_anchor_weight = non_anchor_prct / (anchor_prct * anchorweight + non_anchor_prct) + anchor_weight = anchor_prct * anchorweight / (anchor_prct * anchorweight + non_anchor_prct) + + # Use anchorweight parameter to boost percieved rDDS_anchor + DDS = (non_anchor_weight * DDS_non_anchor) + (anchor_weight * DDS_anchor) + + elif S_anchor == 0: + DDS_non_anchor = domain_difference / float(S) + DDS_anchor = 0.0 + DDS = DDS_non_anchor + + else: # only anchor domains were found + DDS_non_anchor = 0.0 + DDS_anchor = domain_difference_anchor / float(S_anchor) + DDS = DDS_anchor + + DDS = 1 - DDS # transform into similarity + # GK INDEX + # calculate the Goodman-Kruskal gamma index + + clusterDomListr = [item for item in clusterDomList] + clusterDomListr.reverse() + + GK = max([calculate_GK(clusterDomList, mibigDomList, nbhood), calculate_GK(clusterDomListr, mibigDomList, nbhood)]) + Jaccard = len(intersect) / float(len(set(clusterDomList)) + len(set(mibigDomList)) - len(intersect)) + + Distance = 1 - (Jaccardw * Jaccard) - (DDSw * DDS) - (GKw * GK) + + cluster_dist_dict[mibigCluster] = (Distance, Jaccard, DDS, GK, DDS_non_anchor, DDS_anchor, S, S_anchor) + return cluster_dist_dict + +def cluster_distance_pairwise_align(A, B, outputdir, anchor_domains): + try: + """Compare two clusters using information on their domains, and the sequences of the domains this + version of the script does not need the BGCs or DMS dictionary but requires the pfs and pfd files + """ + ## Check and load required files for the cluster + if os.path.isfile(os.path.join(outputdir, A + '.pfd')): + clusterA_pfd_handle = os.path.join(outputdir, A + '.pfd') + else: + print "No PFD file for %s" % A + raise Exception + + if os.path.isfile(os.path.join(outputdir, B + '.pfd')): + clusterB_pfd_handle = os.path.join(outputdir, B + '.pfd') + else: + print "No PFD file for %s" % B + raise Exception + + if os.path.isfile(os.path.join(outputdir, A + '.fasta')): + clusterA_fasta_handle = os.path.join(outputdir, A + '.fasta') + clusterA_fasta_dict = fasta_parser(open(clusterA_fasta_handle)) + else: + print "No Fasta file for %s" % A + raise Exception + if os.path.isfile(os.path.join(outputdir, B + '.fasta')): + clusterB_fasta_handle = os.path.join(outputdir, B + '.fasta') + clusterB_fasta_dict = fasta_parser(open(clusterB_fasta_handle)) + else: + print "No Fasta file for %s" % B + raise Exception + + clusterA,A_list = parsePFD(clusterA_pfd_handle) + clusterB,B_list = parsePFD(clusterB_pfd_handle) + + A_domlist = set(A_list) + B_domlist = set(B_list) + + intersect = set(A_domlist).intersection(B_domlist) + not_intersect = set(A_domlist).symmetric_difference(set(B_domlist)) + + # JACCARD INDEX + Jaccard = len(intersect) / float(len(set(A_domlist)) + len(set(B_domlist)) - len(intersect)) + + # DDS INDEX + # domain_difference: Difference in sequence per domain. If one cluster doesn't have a domain at all, but the other does, + # this is a sequence difference of 1. If both clusters contain the domain once, and the sequence is the same, there is a seq diff of 0. + # S: Max occurence of each domain + domain_difference_anchor, S_anchor = 0, 0 + domain_difference, S = 0, 0 + gap_open = -15 + gap_extend = -6.67 + + pair = "" # pair of clusters to access their sequence identity + + # Case 1 + for unshared_domain in not_intersect: # no need to look at seq identity, since these domains are unshared + # for each occurence of an unshared domain do domain_difference += count of domain and S += count of domain + unshared_occurrences = [] + if unshared_domain in A_domlist: + unshared_occurrences = clusterA[unshared_domain] + else: + unshared_occurrences = clusterB[unshared_domain] + + # don't look at domain version, hence the split + if unshared_domain.split(".")[0] in anchor_domains: + domain_difference_anchor += len(unshared_occurrences) + else: + domain_difference += len(unshared_occurrences) + S = domain_difference # can be done because it's the first use of these + S_anchor = domain_difference_anchor + + #These are the cases for all the shared domains + for shared_domain in intersect: + # First we want to index all the sequences of the domains in a dictionary + domIdxA = clusterA[shared_domain] + domIdxB = clusterB[shared_domain] + domDictA = dict() + for domain in domIdxA: + geneID,(domStart,domEnd) = domain + domDictA[domain] = clusterA_fasta_dict['>'+geneID][domStart:domEnd] + + domDictB = dict() + for domain in domIdxB: + geneID,(domStart,domEnd) = domain + domDictB[domain] = clusterB_fasta_dict['>'+geneID][domStart:domEnd] + + clusAsize = len(domDictA) + clusBsize = len(domDictB) + + scoreMatrix = np.ndarray((clusAsize,clusBsize)) + # Get all of the instances of the domain from cluster A and cluster B and populate a similarity matrix + for i,domA in enumerate(domIdxA): + for j,domB in enumerate(domIdxB): + + seqA = domDictA[domA] + seqB = domDictB[domB] + + # this will guarantee that youll always get symmetry with comparisons by ordering it alphabetically + # but we'll lose accuracy and won't be able to recover the alignments + + if seqA > seqB: + seqB,seqA = seqA,seqB + # Using BioPython + alignScore = pairwise2.align.globalds(seqA, seqB, scoring_matrix, gap_open, gap_extend) + # calculate percent similarity from best scoring alignment + bestAlignment = alignScore[0] + alignA = bestAlignment[0] + alignB = bestAlignment[1] + posCtr = 0. + for (a,b) in zip(alignA,alignB): + if a == '-' or b == '-': + pass + else: + if a == b: + posCtr += 1 + # score is 1 - % identity since we want to define a distance to minimize + scoreMatrix[i,j] = 1 - posCtr/len(alignA) + # used scipy's linear_sum_assigment to do the hungarian algorithm, haven't tested Munkres for behaviour + + pairings = [(x,y) for x,y in zip(*linear_sum_assignment(scoreMatrix)) if (x<clusAsize) and (y<clusBsize)] + # total distance from all of the paired domains + accumulated_distance = sum(scoreMatrix[a] for a in pairings) + # to get the total sequence distance you need to add the unpaired domains + sum_seq_dist = accumulated_distance + abs(len(domIdxA) - len(domIdxB)) + # update the DDS or DDS_anchor and total domain counts depending on whether or not the domain is an anchor domain + if shared_domain.split(".")[0] in anchor_domains: + S_anchor += max(len(domIdxA),len(domIdxB)) + domain_difference_anchor += sum_seq_dist + else: + S += max(len(domIdxA),len(domIdxB)) + domain_difference += sum_seq_dist + + if S_anchor != 0 and S != 0: + DDS_non_anchor = domain_difference / float(S) + DDS_anchor = domain_difference_anchor / float(S_anchor) + + # Calculate proper, proportional weight to each kind of domain + non_anchor_prct = S / float(S + S_anchor) + anchor_prct = S_anchor / float(S + S_anchor) + + # boost anchor subcomponent and re-normalize + non_anchor_weight = non_anchor_prct / (anchor_prct * anchorweight + non_anchor_prct) + anchor_weight = anchor_prct * anchorweight / (anchor_prct * anchorweight + non_anchor_prct) + + # Use anchorweight parameter to boost percieved rDDS_anchor + DDS = (non_anchor_weight * DDS_non_anchor) + (anchor_weight * DDS_anchor) + + elif S_anchor == 0: + DDS_non_anchor = domain_difference / float(S) + DDS_anchor = 0.0 + DDS = DDS_non_anchor + + else: # only anchor domains were found + DDS_non_anchor = 0.0 + DDS_anchor = domain_difference_anchor / float(S_anchor) + DDS = DDS_anchor + + DDS = 1 - DDS # transform into similarity + # GK INDEX + # calculate the Goodman-Kruskal gamma index + Ar = [item for item in A_list] + Ar.reverse() + GK = max([calculate_GK(A_list, B_list, nbhood), calculate_GK(Ar, B_list, nbhood)]) + + Distance = 1 - (Jaccardw * Jaccard) - (DDSw * DDS) - (GKw * GK) + if Distance < 0: + print("Negative distance detected!") + print("J: " + str(Jaccard) + "\tDDS: " + str(DDS) + "\tGK: " + str(GK)) + print("Jw: " + str(Jaccardw) + "\tDDSw: " + str(DDSw) + "\tGKw: " + str(GKw)) + sys.exit() + except KeyError: + print A, B,shared_domain + return Distance, Jaccard, DDS, GK, DDS_non_anchor, DDS_anchor, S, S_anchor \ No newline at end of file