#!/usr/bin/env python


"""
BiG-SCAPE

PI: Marnix Medema               marnix.medema@wur.nl

9
Maintainers/developers:
10
Jorge Navarro                   j.navarro@westerdijkinstitute.nl
Jorge Navarro Muñoz's avatar
Jorge Navarro Muñoz committed
11
Satria Kautsar                  satria.kautsar@wur.nl
12

13
14
15
Developer:
Emmanuel (Emzo) de los Santos   E.De-Los-Santos@warwick.ac.uk

16
17
18
19
20

Usage:   Please see `python bigscape.py -h`

Example: python bigscape.py -c 8 --pfam_dir ./ -i ./inputfiles -o ./results

21
Status: beta
22

23
Official repository:
24
25
26
27
28
29
https://git.wageningenur.nl/medema-group/BiG-SCAPE


# License: GNU Affero General Public License v3 or later
# A copy of GNU AGPL v3 should have been included in this software package in LICENSE.txt.
"""

# Makes sure the script can be used with Python 2 as well as Python 3.
from __future__ import print_function
from __future__ import division

from sys import version_info
if version_info[0]==2:
    range = xrange
    import cPickle as pickle # for storing and retrieving dictionaries
elif version_info[0]==3:
    import pickle # for storing and retrieving dictionaries

from math import exp, log
import os
import subprocess
import sys
import time
from glob import glob
from itertools import combinations
from itertools import product as combinations_product
from collections import defaultdict
from multiprocessing import Pool, cpu_count
from argparse import ArgumentParser
from difflib import SequenceMatcher
from operator import itemgetter
import zipfile

from Bio import SeqIO
from Bio.SeqFeature import BeforePosition, AfterPosition
from Bio import AlignIO
from Bio import pairwise2
from Bio.SubsMat.MatrixInfo import pam250 as scoring_matrix
from Bio import Phylo

from functions import *
from ArrowerSVG import *

import numpy as np
from array import array
from scipy.sparse import lil_matrix
from scipy.optimize import linear_sum_assignment
import json
import shutil
from distutils import dir_util
from sklearn.cluster import AffinityPropagation
import networkx as nx


global use_relevant_mibig
global mibig_set
global genbankDict
global valid_classes


def process_gbk_files(gbk, min_bgc_size, bgc_info, files_no_proteins, files_no_biosynthetic_genes):
    """ Given a file path to a GenBank file, reads information about the BGC"""

    biosynthetic_genes = set()
    product_list_per_record = []
    fasta_data = []
    save_fasta = True
    adding_sequence = False
    contig_edge = False
    total_seq_length = 0
    record_end = 0
    offset_record_position = 0
    bgc_locus_tags = []
    locus_sequences = {}
    locus_coordinates = {}
    
    file_folder, fname = os.path.split(gbk)
    clusterName = fname[:-4]

    # See if we need to keep the sequence
    # (Currently) we have to open the file anyway to read all its
    # properties for bgc_info...
    outputfile = os.path.join(bgc_fasta_folder, clusterName + '.fasta')
    if os.path.isfile(outputfile) and os.path.getsize(outputfile) > 0 and not force_hmmscan:
        if verbose:
            print(" File {} already processed".format(outputfile))
        save_fasta = False
    else:
        save_fasta = True
    
    try:
        # basic file verification. Substitutes check_data_integrity
        records = list(SeqIO.parse(gbk, "genbank"))
    except ValueError as e:
        print("   Error with file {}: \n    '{}'".format(gbk, str(e)))
        print("    (This file will be excluded from the analysis)")
        return
    else:
        total_seq_length = 0
        bgc_size = 0
        cds_ctr = 0
        product = "no type"
        offset_record_position = 0
        
        max_width = 0 # This will be used for the SVG figure
        record_count = 0
        
        for record in records:
            record_count += 1
            bgc_size += len(record.seq)
            if len(record.seq) > max_width:
                max_width = len(record.seq)
            
            for feature in record.features:
                # antiSMASH <= 4
                if feature.type == "cluster":
                    if "product" in feature.qualifiers:
                        # in antiSMASH 4 there should only be 1 product qualifier
                        for product in feature.qualifiers["product"]:
                            for p in product.replace(" ","").split("-"):
                                product_list_per_record.append(p)
                                
                    if "contig_edge" in feature.qualifiers:
                        # there might be mixed contig_edge annotations
                        # in multi-record files. Turn on contig_edge when
                        # there's at least one annotation
                        if feature.qualifiers["contig_edge"][0] == "True":
                            if verbose:
                                print(" Contig edge detected in {}".format(fname))
                            contig_edge = True
                        
                # antiSMASH == 5
                if "region" in feature.type:
                    if "product" in feature.qualifiers:
                        for product in feature.qualifiers["product"]:
                            product_list_per_record.append(product)
                            
                    if "contig_edge" in feature.qualifiers:
                        # there might be mixed contig_edge annotations
                        # in multi-record files. Turn on contig_edge when
                        # there's at least one annotation
                        if feature.qualifiers["contig_edge"][0] == "True":
                            if verbose:
                                print(" Contig edge detected in {}".format(fname))
                            contig_edge = True
                            
                        
                # Get biosynthetic genes + sequences
                if feature.type == "CDS":
                    cds_ctr += 1
                    CDS = feature
                    
                    gene_id = ""
                    if "gene" in CDS.qualifiers:
                        gene_id = CDS.qualifiers.get('gene',"")[0]
                        
                    
                    protein_id = ""
                    if "protein_id" in CDS.qualifiers:
                        protein_id = CDS.qualifiers.get('protein_id',"")[0]
                    
                    # nofuzzy_start/nofuzzy_end are obsolete
                    # http://biopython.org/DIST/docs/api/Bio.SeqFeature.FeatureLocation-class.html#nofuzzy_start
                    gene_start = offset_record_position + max(0, int(CDS.location.start))
                    gene_end = offset_record_position + max(0, int(CDS.location.end))
                    record_end = gene_end
                    
                    direction = CDS.location.strand
                    if direction == 1:
                        strand = '+'
                    else:
                        strand = '-'
                        
                    fasta_header = "{}_ORF{}:gid:{}:pid:{}:loc:{}:{}:strand:{}".format(clusterName, str(cds_ctr), str(gene_id).replace(":","_"), str(protein_id).replace(":","_"), str(gene_start), str(gene_end), strand)
                    fasta_header = fasta_header.replace(">","") #the coordinates might contain larger than signs, tools upstream don't like this
                    fasta_header = fasta_header.replace(" ", "") #the domtable output format (hmmscan) uses spaces as a delimiter, so these cannot be present in the fasta header

                    # antiSMASH <=4
                    if "sec_met" in feature.qualifiers:
                        if "Kind: biosynthetic" in feature.qualifiers["sec_met"]:
                            biosynthetic_genes.add(fasta_header)

                    # antiSMASH == 5
                    if "gene_kind" in feature.qualifiers:
                        if "biosynthetic" in feature.qualifiers["gene_kind"]:
                            biosynthetic_genes.add(fasta_header)
                    
                    fasta_header = ">"+fasta_header
                    

                    if 'translation' in CDS.qualifiers.keys():
                        prot_seq = CDS.qualifiers['translation'][0]
                    # If a translation isn't available, translate manually; this will take longer
                    else:
                        nt_seq = CDS.location.extract(record.seq)
                        
                        # If we know sequence is an ORF (like all CDSs), codon table can be
                        #  used to correctly translate alternative start codons.
                        #  see http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc25
                        # If the sequence has a fuzzy start/end, it might not be complete
                        # (and therefore the first codon might not be the true start codon).
                        # In that case, when 'translation' is not available, assume
                        #  this is just a plain sequence and not a complete CDS
                        complete_cds = False 
                        
                        # More about fuzzy positions
                        # http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc39
                        fuzzy_start = False 
                        if str(CDS.location.start)[0] in "<>":
                            complete_cds = False
                            fuzzy_start = True
                            
                        fuzzy_end = False
                        if str(CDS.location.end)[0] in "<>":
                            fuzzy_end = True
                        
                        # For the protein sequence: if the CDS is at the start of the
                        # entry, assume the end of the sequence is in frame and trim
                        # from the beginning; if it is at the end of the GenBank entry,
                        # assume the start of the sequence is in frame
                        remainder = len(nt_seq)%3
                        if remainder > 0:
                            if fuzzy_start and fuzzy_end:
                                print("Warning, CDS ({}, {}) has fuzzy\
                                    start and end positions, and a \
                                    sequence length not multiple of \
                                    three. Skipping".format(clusterName, 
                                    CDS.qualifiers.get('locus_tag',"")[0]))
                                break
                            
                            if fuzzy_start:
                                if remainder == 1:
                                    nt_seq = nt_seq[1:]
                                else:
                                    nt_seq = nt_seq[2:]
                            # fuzzy end
                            else:
                                #same logic, reverse direction
                                if remainder == 1:
                                    nt_seq = nt_seq[:-1]
                                else:
                                    nt_seq = nt_seq[:-2]
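                        # Illustrative example (hypothetical numbers): a CDS with a
                        # fuzzy start and len(nt_seq) == 1000 has remainder 1, so one
                        # nucleotide is trimmed from the 5' end; the in-frame 3' end
                        # is preserved and translation proceeds on the remaining 999 nt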
                        
                        # The Genetic Codes: www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
                        if "transl_table" in CDS.qualifiers.keys():
                            CDStable = CDS.qualifiers.get("transl_table", "")[0]
                            prot_seq = str(nt_seq.translate(table=CDStable, to_stop=True, cds=complete_cds))
                        else:
                            prot_seq = str(nt_seq.translate(to_stop=True, cds=complete_cds))
                            
                    total_seq_length += len(prot_seq)
                
                
                    bgc_locus_tags.append(fasta_header)
                    locus_sequences[fasta_header] = prot_seq
                    locus_coordinates[fasta_header] = (gene_start, gene_end, len(prot_seq))
                    

            # TODO: if len(biosynthetic_genes) == 0, traverse record again
            # and add CDS with genes that contain domains labeled sec_met
            # we'll probably have to have a list of domains if we allow
            # fasta files as input
            
            # make absolute positions for ORFs in next records
            offset_record_position += record_end + 100
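            # e.g. (hypothetical): if the last CDS of record 1 ends at 5,000 bp,
            # every CDS of record 2 is shifted by 5,100 bp, so ORFs from different
            # records can never overlap on the combined coordinate axis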
        
        if bgc_size > min_bgc_size:  # exclude the bgc if it's too small
            # check what we have product-wise
            # In particular, handle different products for multi-record files
            product_set = set(product_list_per_record)
            if len(product_set) == 1: # only one type of product
                product = product_list_per_record[0]
            elif "other" in product_set: # more than one, and it contains "other"
                if len(product_set) == 2:
                    product = list(product_set - {'other'})[0] # product = not "other"
                else:
                    product = ".".join(product_set - {'other'}) # likely a hybrid
            else:
                product = ".".join(product_set) # likely a hybrid
                
            # Don't keep this bgc if its type is not in the valid classes specified by the user
            # This will avoid redundant tasks like domain detection
            subproduct = set()
            for p in product.split("."):
                subproduct.add(sort_bgc(p).lower())
            if "nrps" in subproduct and ("pksi" in subproduct or "pksother" in subproduct):
                subproduct.add("pks-nrp_hybrids")
                
            if len(valid_classes & subproduct) == 0:
                if verbose:
                    print(" Skipping {} (type: {})".format(clusterName, product))
                return False
            
            # assuming that the definition field is the same in all records
            # product: antiSMASH predicted class of metabolite
            # gbk definition
            # number of records (for Arrower figures)
            # max_width: width of the largest record (for Arrower figures)
            # id: the GenBank's accession
            #bgc_info[clusterName] = (product, records[0].description, len(records), max_width, records[0].id, biosynthetic_genes.copy())
            # TODO contig_edge annotation is not present for antiSMASH v < 4
            # Perhaps we can try to infer if it's in a contig edge: if
            # - first biosynthetic gene start < 10kb or
            # - max_width - last biosynthetic gene end < 10kb (but this will work only for the largest record)
            bgc_info[clusterName] = bgc_data(records[0].id, records[0].description, product, len(records), max_width, bgc_size + (record_count-1)*1000, records[0].annotations["organism"], ",".join(records[0].annotations["taxonomy"]), biosynthetic_genes.copy(), contig_edge)

            if len(bgc_info[clusterName].biosynthetic_genes) == 0:
                files_no_biosynthetic_genes.append(clusterName+".gbk")

            # TODO why re-process everything if it was already in the list?
            # if name already in genbankDict.keys -> add file_folder
            # else: extract all info
            if clusterName in genbankDict.keys():
                # Name was already in use. Use file_folder as the new sample's name
                genbankDict[clusterName][1].add(file_folder) 
            else:
                # See if we need to write down the sequence
                if total_seq_length > 0:
                    # location of first instance of the file is genbankDict[clustername][0]
                    genbankDict.setdefault(clusterName, [gbk, set([file_folder])])

                    if save_fasta:
                        # Find overlaps in CDS regions and delete the shortest ones.
                        # This is intended as a solution for selecting genes with
                        # alternative splicing events
                        # Food for thought: imagine CDS A overlapping CDS B overlapping
                        # CDS C. If len(A) > len(B) > len(C) and we first compare A vs B
                        # and delete A, then B vs C and delete B: would that be a better
                        # solution than removing B? Could this actually happen?
                        # TODO What if the overlapping CDS is in the reverse strand?
                        #  maybe it should be kept as it is
                        # TODO what are the characterized differences in prokaryote
                        #  vs eukaryote CDS overlap?
                        del_list = set()
                        for a, b in combinations(bgc_locus_tags, 2):
                            a_start, a_end, a_len = locus_coordinates[a]
                            b_start, b_end, b_len = locus_coordinates[b]
                            
                            if b_end <= a_start or b_start >= a_end:
                                pass
                            else:
                                # calculate overlap
                                if a_start > b_start:
                                    ov_start = a_start
                                else:
                                    ov_start = b_start

                                if a_end < b_end:
                                    ov_end = a_end
                                else:
                                    ov_end = b_end

                                overlap_length = ov_end - ov_start
                                
                                # allow the overlap to be as large as 10% of the
                                # shortest CDS. Overlap length is in nucleotides
                                # here, whereas a_len, b_len are protein 
                                # sequence lengths
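                                # Worked example (hypothetical numbers): CDSs of
                                # 100 aa and 150 aa overlapping by 45 nt give
                                # 45/3 = 15 aa > 0.1*100 = 10 aa, so the shorter
                                # CDS is dropped; a 21 nt overlap (7 aa) would
                                # keep both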
                                if overlap_length/3 > 0.1*min(a_len,b_len):
                                    if a_len > b_len:
                                        del_list.add(b)
                                    else:
                                        del_list.add(a)
                        
                        for locus in del_list:
                            if verbose:
                                print("   Removing {} because it overlaps with other ORF".format(locus))
                            bgc_locus_tags.remove(locus)
                        
                        with open(outputfile,'w') as fastaHandle:
                            for locus in bgc_locus_tags:
                                fastaHandle.write("{}\n".format(locus))
                                fastaHandle.write("{}\n".format(locus_sequences[locus]))
                            adding_sequence = True
                else:
                    files_no_proteins.append(fname)

            if verbose:
                print("  Adding {} ({} bps)".format(fname, str(bgc_size)))
                                
        else:
            print(" Discarding {} (size less than {} bp, was {})".format(clusterName, str(min_bgc_size), str(bgc_size)))
    
    return adding_sequence


def get_gbk_files(inputpath, outputdir, bgc_fasta_folder, min_bgc_size, include_gbk_str, exclude_gbk_str, bgc_info):
    """Searches given directory for genbank files recursively, will assume that
    the genbank files that have the same name are the same genbank file. 
    Returns a dictionary that contains the names of the clusters found as keys
    and a list that contains [0] a path to the genbank file and [1] the 
    samples that the genbank file is a part of.
    Extract and write the sequences as fasta files if not already in the Fasta 
    folder.
    return: {cluster_name:[genbank_path,[s_a,s_b...]]}
    """
    file_counter = 0
    processed_sequences = 0
    files_no_proteins = []
    files_no_biosynthetic_genes = []


    if os.path.isfile(inputpath):
        files = [inputpath]
    else:
        # Unfortunately, this does not work in Python 2:
        #files = glob(os.path.join(inputpath,"**/*.gbk"), recursive=True) 
        files = [os.path.join(dirpath, f) for dirpath, dirnames, files in os.walk(inputpath)
                 for f in files if f.endswith(".gbk")]
        
    for filepath in files:
        file_folder, fname = os.path.split(filepath)
        
        if len(include_gbk_str) == 1 and include_gbk_str[0] == "*":
            pass
        else:
            if not any([word in fname for word in include_gbk_str]):
                continue
            
            if exclude_gbk_str != [] and any([word in fname for word in exclude_gbk_str]):
                print(" Skipping file " + fname)
                continue
        
        if "_ORF" in fname:
            print(" Skipping file {} (string '_ORF' is used internally)".format(fname))
            continue
        
        if " " in filepath:
            sys.exit("\nError: Input GenBank files should not have spaces in their path as hmmscan cannot process them properly ('too many arguments').")
        
        file_counter += 1
        if process_gbk_files(filepath, min_bgc_size, bgc_info, files_no_proteins, files_no_biosynthetic_genes):
            processed_sequences += 1
    
    if len(files_no_proteins) > 0:
        print("  Warning: Input set has files without protein sequences. They will be discarded")
        print("   (See no_sequences_list.txt)")
        with open(os.path.join(outputdir, "no_sequences_list.txt"), "w") as noseqs:
            for f in sorted(files_no_proteins):
                noseqs.write("{}\n".format(f))
        
    if len(files_no_biosynthetic_genes) > 0 and (mode == "glocal" or mode == "auto"):
        print("  Warning: Input set has files with no Biosynthetic Genes (affects alignment mode)")
        print("   See no_biosynthetic_genes_list.txt")
        with open(os.path.join(outputdir, "logs", "no_biosynthetic_genes_list.txt"), "w") as nobiogenes:
            for f in sorted(files_no_biosynthetic_genes):
                nobiogenes.write("{}\n".format(f))
    
    print("\n Starting with {:d} files".format(file_counter))
    print(" Files that had its sequence extracted: {:d}".format(processed_sequences))

    return


def timeit(funct):
    """Writes the runtimes of functions to a file and prints them on screen.

    Input:
    - funct: a function
    Output:
    - That function's output.
    - Runtime of that function, written to a file called runtimes.txt and printed on screen.
    """
    def _wrap(*args):
        start_time = time.time()
        ret = funct(*args)
        runtime = time.time()-start_time
        runtime_string = '{} took {:.3f} seconds'.format(funct.__name__, runtime)
        with open(os.path.join(log_folder, "runtimes.txt"), 'a') as timings_file:
            timings_file.write(runtime_string + "\n")
        print(runtime_string)
        return ret
    
    return _wrap
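
# Example usage of @timeit (illustrative; my_analysis_step is a hypothetical name):
#
#   @timeit
#   def my_analysis_step(data):
#       ...
#
# Each call then prints, and appends to runtimes.txt in the log folder, a line
# such as "my_analysis_step took 1.234 seconds".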


@timeit
def generate_network(cluster_pairs, cores):
    """Distributes the distance calculation part
    cluster_pairs is a list of triads (cluster1_index, cluster2_index, BGC class)
    """
    
    pool = Pool(cores, maxtasksperchild=100)
    
    #Assigns the data to the different workers and pools the results back into
    # the network_matrix variable
    network_matrix = pool.map(generate_dist_matrix, cluster_pairs)

    # --- Serialized version of distance calculation ---
    # For the time being, use this if you have memory issues
    #network_matrix = []
    #for pair in cluster_pairs:
      #network_matrix.append(generate_dist_matrix(pair))

    return network_matrix


def generate_dist_matrix(parms):
    """Unpack data to actually launch cluster_distance for one pair of BGCs"""
    
    cluster1Idx,cluster2Idx,bgcClassIdx = [int(parm) for parm in parms]
    cluster1 = clusterNames[cluster1Idx]
    cluster2 = clusterNames[cluster2Idx]
    bgc_class = bgcClassNames[bgcClassIdx]

    try:
        domain_list_A = DomainList[cluster1]
        domain_list_B = DomainList[cluster2]
    except KeyError:
        print(" Warning: domain list for {} or {} was not found. Extracting from pfs files".format(cluster1, cluster2))
        
        cluster_file1 = os.path.join(output_folder, cluster1 + ".pfs")
        cluster_file2 = os.path.join(output_folder, cluster2 + ".pfs")
        
        domain_list_A = get_domain_list(cluster_file1)
        domain_list_B = get_domain_list(cluster_file2)
    
    # this really shouldn't happen if we've filtered domain-less gene clusters already
    if len(domain_list_A) == 0 or len(domain_list_B) == 0:
        print("   Warning: Regarding distance between clusters {} and {}:".format(cluster1, cluster2))
        if len(domain_list_A) == 0 and len(domain_list_B) == 0:
            print("   None have identified domains. Distance cannot be calculated")
        elif len(domain_list_A) == 0:
            print("   Cluster {} has no identified domains. Distance set to 1".format(cluster1))
        else:
            print("   Cluster {} has no identified domains. Distance set to 1".format(cluster2))
            
        # last two values (S, Sa) should really be zero, but this could give rise to errors when parsing 
        # the network file (unless we catch the case S = Sa = 0)

        # cluster1Idx, cluster2Idx, distance, jaccard, DSS, AI, rDSSNa, rDSSa, 
        #   S, Sa, lcsStartA, lcsStartB
        return array('f',[cluster1Idx,cluster2Idx,1,0,0,0,0,0,1,1,0,0])
    
    # "Domain Count per Gene". List of simple labels (integers) indicating number
    # of domains belonging to each gene
    dcg_a = DomainCountGene[cluster1]
    dcg_b = DomainCountGene[cluster2]
    
    # Position of the anchor genes (i.e. genes with domains in the anchor
    # domain list). Should probably be the Core Biosynthetic genes marked by
    # antiSMASH
    core_pos_a = corebiosynthetic_position[cluster1]
    core_pos_b = corebiosynthetic_position[cluster2]
    
    # go = "gene orientation"
    go_a = BGCGeneOrientation[cluster1]
    go_b = BGCGeneOrientation[cluster2]
    
    dist, jaccard, dss, ai, rDSSna, rDSS, S, Sa, lcsStartA, lcsStartB, seedLength, reverse = cluster_distance_lcs(cluster1, cluster2, domain_list_A,
        domain_list_B, dcg_a, dcg_b, core_pos_a, core_pos_b, go_a, go_b, bgc_class)
        
    network_row = array('f',[cluster1Idx, cluster2Idx, dist, (1-dist)**2, jaccard, 
                             dss, ai, rDSSna, rDSS, S, Sa, lcsStartA, lcsStartB, seedLength, reverse])
    return network_row
    

def score_expansion(x_string_, y_string_, downstream):
    """
    Input:
    x_string: list of strings. This one tries to expand based on max score
    y_string: reference list. This was already expanded.
    downstream: If true, expansion goes from left to right.
    
    Output:
    max_score, a
    where a is the length of the expansion.
    """
    match = 5
    mismatch = -3
    gap = -2
        
    if downstream:
        x_string = x_string_
        y_string = y_string_
    else:
        # Expansion goes upstream. It's easier to flip both slices and proceed
        # as if going downstream.
        x_string = list(reversed(x_string_))
        y_string = list(reversed(y_string_))
    

    # how many gaps can be opened before a mismatch becomes more convenient?
    max_gaps_before_mismatch = abs(int(match/gap))
    max_score = 0
    score = 0

    pos_y = 0
    a = 0
    b = 0
    for pos_x in range(len(x_string)):
        g = x_string[pos_x]
        
        # try to find g within the rest of the slice
        # This has the obvious problem of what to do if a gene is found _before_
        # the current y_slice (translocation). Duplications could also complicate
        # things
        try:
            match_pos = y_string[pos_y:].index(g)
        except ValueError:
            score += mismatch
        else:
            score += match + match_pos*gap
            pos_y += match_pos + 1 # move pointer one position after match
            
            # Greater or equals. 'equals' because even if max_score isn't 
            # larger, it's an expansion
            if score >= max_score:
                max_score = score
                # keep track of expansion. Account for zero-based numbering
                a = pos_x + 1
                            
        #print(pos_x, score, status, g)
        #print("")
            
    return max_score, a
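
# Illustrative trace (hypothetical gene labels) with match=5, mismatch=-3, gap=-2,
# expanding x = [g1, g2, g3] against y = [g1, gX, g3] downstream:
#   g1 found at offset 0 -> score 5         (max_score=5, a=1)
#   g2 not found         -> score 5-3 = 2
#   g3 found at offset 1 -> score 2+5-2 = 5 (max_score=5, a=3)
# so score_expansion returns (5, 3): the expansion covers all three genes.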


def cluster_distance_lcs(A, B, A_domlist, B_domlist, dcg_A, dcg_b, core_pos_A, core_pos_b, go_A, go_b, bgc_class):
    """Compare two clusters using information on their domains, and the 
    sequences of the domains. 
    This version first tries to search for the largest common slices of both BGCs by 
    finding the Longest Common Substring of genes (based on domain content), also
    trying the reverse on one of the BGCs (capital "B" will be used when the 'true'
    orientation of the BGC is found)
    Then the algorithm will try to expand the slices to either side. Once each 
    slice is found, it has to pass one more validation: a core biosynthetic 
    gene (marked in antiSMASH) must be present.
    Capital letters indicate the "true" orientation of the BGC (first BGC, 'A'
    is kept fixed)
    
    Output:
    raw distance, jaccard index, domain sequence similarity, adjacency index, 
    raw DSS non-anchor, raw DSS anchor, number of non-anchor domains in the pair
    (S), number of anchor domains in the pair
    
    """
    
    Jaccardw, DSSw, AIw, anchorboost = bgc_class_weight[bgc_class]

    temp_domain_fastas = {}
    
    # Number of genes in each BGC
    lenG_A = len(dcg_A)
    lenG_B = len(dcg_b)
    
    setA = set(A_domlist)
    setB = set(B_domlist)
    intersect = setA & setB
    
    S = 0
    S_anchor = 0
    
    # define the subset of domain sequence tags to include in
    # the DSS calculation. This is done for every domain.
    A_domain_sequence_slice_bottom = defaultdict(int)
    A_domain_sequence_slice_top = defaultdict(int)
    B_domain_sequence_slice_bottom = defaultdict(int)
    B_domain_sequence_slice_top = defaultdict(int)
    
    # Detect totally unrelated pairs from the beginning
    if len(intersect) == 0:
        # Count total number of anchor and non-anchor domain to report in the 
        # network file. Apart from that, these BGCs are totally unrelated.
        for domain in setA:
            # This is a bit of a hack. If pfam domain ids ever change in size
            # we'd be in trouble. The previous approach was to .split(".")[0]
            # but it's more costly
            if domain[:7] in anchor_domains:
                S_anchor += len(BGCs[A][domain])
            else:
                S += len(BGCs[A][domain])
                
        for domain in setB:
            if domain[:7] in anchor_domains:
                S_anchor += len(BGCs[B][domain])
            else:
                S += len(BGCs[B][domain])

        return 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, S, S_anchor, 0, 0, 0, 0


    # initialize domain sequence slices
    # They might change if we manage to find a valid overlap
    for domain in setA:
        A_domain_sequence_slice_bottom[domain] = 0
        A_domain_sequence_slice_top[domain] = len(BGCs[A][domain])
        
    for domain in setB:
        B_domain_sequence_slice_bottom[domain] = 0
        B_domain_sequence_slice_top[domain] = len(BGCs[B][domain])
        
    #initialize domlist borders for AI
    domA_start = 0
    domA_end = len(A_domlist)
    domB_start = 0
    domB_end = len(B_domlist)

    # always find lcs seed to use for offset alignment in visualization
    
    # Compress the list of domains according to gene information. For example:
    # A_domlist = a b c d e f g
    # dcg_a =   1  3  1  2 Number of domains per each gene in the BGC
    # go_a =    1 -1 -1  1 Orientation of each gene
    # A_string = a dcb e fg List of concatenated domains
    # Takes into account gene orientation. This works effectively as putting all
    # genes in the same direction in order to be able to compare their domain content
    A_string = []
    start = 0
    for g in range(lenG_A):
        domain_count = dcg_A[g]
        if go_A[g] == 1:
            # x[2:] <- small optimization, drop the "PF" from the pfam ids
            A_string.append("".join(x[2:] for x in A_domlist[start:start+domain_count]))
        else:
            A_string.append("".join(A_domlist[x][2:] for x in range(start+domain_count-1, start-1 ,-1)))
        start += domain_count
        
    b_string = []
    start = 0
    for g in range(lenG_B):
        domain_count = dcg_b[g]
        if go_b[g] == 1:
            b_string.append("".join(x[2:] for x in B_domlist[start:start+domain_count]))
        else:
            b_string.append("".join(B_domlist[x][2:] for x in range(start+domain_count-1, start-1 ,-1)))
        start += domain_count
        
    b_string_reverse = list(reversed(b_string))
        
    seqmatch = SequenceMatcher(None, A_string, b_string)
    # a: start position in A
    # b: start position in B
    # s: length of the match
    a, b, s = seqmatch.find_longest_match(0, lenG_A, 0, lenG_B)
    #print(A, B)
    #print(a, b, s)
    
    seqmatch = SequenceMatcher(None, A_string, b_string_reverse)
    ar, br, sr = seqmatch.find_longest_match(0, lenG_A, 0, lenG_B)
    #print(ar, br, sr)
    
    # We need to keep working with the correct orientation
    if s > sr or (s == sr and go_A[a] == go_b[b]):
        dcg_B = dcg_b
        B_string = b_string
        # note: these slices are in terms of genes, not domains (which are 
        # ultimately what is used for distance)
        # Currently, the following values represent the Core Overlap
        sliceStartA = a
        sliceStartB = b
        sliceLengthA = s
        sliceLengthB = s
        
        reverse = False
        b_name = B
        
    elif s < sr:
        dcg_B = list(reversed(dcg_b))
        B_string = b_string_reverse
        
        sliceStartA = ar
        sliceStartB = br
        sliceLengthA = sr
        sliceLengthB = sr
        
        # We'll need to know if we're working in reverse so that the start 
        # position of the final slice can be transformed to the original orientation
        reverse = True
        b_name = B + "*"
        
    # special cases: s == sr
    
    # if only one gene matches, choose the one with the most domains
    elif s == 1:
        seqmatch = SequenceMatcher(None, A_string, b_string)
        max_domains = 0
        x = 0   # index in A with the gene with most domains
        y = 0   # index in B with the gene with most domains
        for a, b, z in seqmatch.get_matching_blocks():
            if z != 0:
                if DomainCountGene[A][a] > max_domains:
                    x = a
                    y = b
                    max_domains = DomainCountGene[A][a]
        
        # note: these slices are in terms of genes, not domains (which are 
        # ultimately what is used for distance)
        # Currently, the following values represent the Core Overlap
        sliceStartA = x
        sliceLengthA = 1
        sliceLengthB = 1
        
        if BGCGeneOrientation[A][x] == BGCGeneOrientation[B][y]:
            sliceStartB = y
            dcg_B = dcg_b
            B_string = b_string
            reverse = False
            b_name = B
        else:
            sliceStartB = len(BGCGeneOrientation[B]) - y - 1
            dcg_B = list(reversed(dcg_b))
            B_string = b_string_reverse
            reverse = True
            b_name = B + "*"
               
    # s == sr and (s == 0 or s > 1)
    else:
        dcg_B = dcg_b
        B_string = b_string
        # note: these slices are in terms of genes, not domains (which are 
        # ultimately what is used for distance)
        # Currently, the following values represent the Core Overlap
        sliceStartA = a
        sliceStartB = b
        sliceLengthA = s
        sliceLengthB = s
        
        reverse = False
        b_name = B
        
        
    lcsStartA = sliceStartA
    lcsStartB = sliceStartB
    seedLength = sliceLengthA
    
    
    # paint stuff on screen
    #if sliceStartB > sliceStartA:
        #offset_A = sliceStartB - sliceStartA
        #offset_B = 0
    #else:
        #offset_A = 0
        #offset_B = sliceStartA - sliceStartB
    #print(A, B)
    #print("  "*offset_A + " ".join(map(str,dcg_A[:sliceStartA])) + "[" + " ".join(map(str,dcg_A[sliceStartA:sliceStartA+sliceLengthA])) + "]" + " ".join(map(str,dcg_A[sliceStartA+sliceLengthA:])) + "\t" + A )
    #print("  "*offset_B + " ".join(map(str,dcg_B[:sliceStartB])) + "[" + " ".join(map(str,dcg_B[sliceStartB:sliceStartB+sliceLengthB])) + "]" + " ".join(map(str,dcg_B[sliceStartB+sliceLengthB:])) + "\t" + b_name)
    ##print(sliceStartA, sliceStartB, sliceLengthA)
    #print("")

    if mode=="glocal" or (mode=="auto" and (bgc_info[A].contig_edge or bgc_info[B].contig_edge)):
        #X: bgc that drives expansion
        #Y: the other bgc
        # forward: True if expansion is to the right
        # returns max_score, final positions for X and Y
        #score_expansion(X_string, dcg_X, Y_string, dcg_Y, downstream=True/False)
            
        # Expansion is relatively costly. We ask for a minimum of 3 genes
        # for the core overlap before proceeding with expansion.
        biosynthetic_hit_A = False
        for biosynthetic_position in core_pos_A:
            if biosynthetic_position >= sliceStartA and biosynthetic_position <= (sliceStartA+sliceLengthA):
                biosynthetic_hit_A = True
                break
        if sliceLengthA >= 3 or biosynthetic_hit_A:
            # LEFT SIDE
            # Find which bgc has the least genes to the left. If both have the same 
            # number, find the one that drives the expansion with highest possible score
            if sliceStartA == sliceStartB:
                # assume complete expansion of A, try to expand B
                score_B, sbB = score_expansion(B_string[:sliceStartB], A_string[:sliceStartA], False)
                # assume complete expansion of B, try to expand A
                score_A, saA = score_expansion(A_string[:sliceStartA], B_string[:sliceStartB], False)
                
                if score_A > score_B or (score_A == score_B and saA > sbB):
                    sliceLengthA += saA
                    sliceLengthB += len(B_string[:sliceStartB])
                    sliceStartA -= saA
                    sliceStartB = 0
                else:
                    sliceLengthA += len(A_string[:sliceStartA])
                    sliceLengthB += sbB
                    sliceStartA = 0
                    sliceStartB -= sbB

            else:
                # A is shorter upstream. Assume complete extension. Find B's extension
                if sliceStartA < sliceStartB:
                    score_B, sb = score_expansion(B_string[:sliceStartB], A_string[:sliceStartA], False)
                    
                    sliceLengthA += len(A_string[:sliceStartA])
                    sliceLengthB += sb
                    sliceStartA = 0
                    sliceStartB -= sb
                else:
                    score_A, sa = score_expansion(A_string[:sliceStartA], B_string[:sliceStartB], False)
                    
                    sliceLengthA += sa
                    sliceLengthB += len(B_string[:sliceStartB])
                    sliceStartA -= sa
                    sliceStartB = 0

            # RIGHT SIDE
            # check which side is the shortest downstream. If both BGCs have the same
            # length left, choose the one with the best expansion
            downstream_A = lenG_A - sliceStartA - sliceLengthA
            downstream_B = lenG_B - sliceStartB - sliceLengthB
            if downstream_A == downstream_B:
                # assume complete extension of A, try to expand B
                score_B, xb = score_expansion(B_string[sliceStartB+sliceLengthB:], A_string[sliceStartA+sliceLengthA:], True)
                # assume complete extension of B, try to expand A
                score_A, xa = score_expansion(A_string[sliceStartA+sliceLengthA:], B_string[sliceStartB+sliceLengthB:], True)
                
            
                if (score_A == score_B and xa > xb) or score_A > score_B:
                    sliceLengthA += xa
                    sliceLengthB += len(B_string[sliceStartB+sliceLengthB:])
                else:
                    sliceLengthA += len(A_string[sliceStartA+sliceLengthA:])
                    sliceLengthB += xb
                    
            else:
                if downstream_A < downstream_B:
                    # extend all of remaining A
                    score_B, xb = score_expansion(B_string[sliceStartB+sliceLengthB:], A_string[sliceStartA+sliceLengthA:], True)
                    
                    sliceLengthA += len(A_string[sliceStartA+sliceLengthA:])
                    sliceLengthB += xb
                    
                else:
                    score_A, xa = score_expansion(A_string[sliceStartA+sliceLengthA:], B_string[sliceStartB+sliceLengthB:], True)
                    sliceLengthA += xa
                    sliceLengthB += len(B_string[sliceStartB+sliceLengthB:])
        
        #print("  "*offset_A + " ".join(map(str,dcg_A[:sliceStartA])) + "[" + " ".join(map(str,dcg_A[sliceStartA:sliceStartA+sliceLengthA])) + "]" + " ".join(map(str,dcg_A[sliceStartA+sliceLengthA:])) + "\t" + A )
        #print("  "*offset_B + " ".join(map(str,dcg_B[:sliceStartB])) + "[" + " ".join(map(str,dcg_B[sliceStartB:sliceStartB+sliceLengthB])) + "]" + " ".join(map(str,dcg_B[sliceStartB+sliceLengthB:])) + "\t" + b_name)

        #print()
        #print(A_string)
        #print(B_string)
        #print()
        if min(sliceLengthA, sliceLengthB) >= 5:
            # First test passed. Find if there is a biosynthetic gene in both slices
            # (note that even if they are, currently we don't check whether it's 
            # actually the _same_ gene)
            biosynthetic_hit_A = False
            biosynthetic_hit_B = False
            
            for biosynthetic_position in core_pos_A:
                if biosynthetic_position >= sliceStartA and biosynthetic_position <= (sliceStartA+sliceLengthA):
                    biosynthetic_hit_A = True
                    break
            
            # return to original orientation if needed
            if reverse:
                sliceStartB = lenG_B - sliceStartB - sliceLengthB
                
            # using original core_pos_b
            for biosynthetic_position in core_pos_b:
                if biosynthetic_position >= sliceStartB and biosynthetic_position <= (sliceStartB + sliceLengthB):
                    biosynthetic_hit_B = True
                    break
                
            # finally...
            if biosynthetic_hit_A and biosynthetic_hit_B:
                domA_start = sum(dcg_A[:sliceStartA])
                domA_end = domA_start + sum(dcg_A[sliceStartA:sliceStartA+sliceLengthA])
                setA = set(A_domlist[domA_start:domA_end])
                
                domB_start = sum(dcg_b[:sliceStartB])
                domB_end = domB_start + sum(dcg_b[sliceStartB:sliceStartB+sliceLengthB])
                setB = set(B_domlist[domB_start:domB_end])
                
                intersect = setA & setB
                
                # re-adjust the indices for each domain so we get only the sequence
                # tags in the selected slice. First step: find out which is the 
                # first copy of each domain we're using
                for domain in A_domlist[:domA_start]:
                    A_domain_sequence_slice_bottom[domain] += 1
                for domain in B_domlist[:domB_start]:
                    B_domain_sequence_slice_bottom[domain] += 1
                    
                # Step 2: work with the last copy of each domain. 
                # Step 2a: make top = bottom
                for domain in setA:
                    A_domain_sequence_slice_top[domain] = A_domain_sequence_slice_bottom[domain]
                for domain in setB:
                    B_domain_sequence_slice_top[domain] = B_domain_sequence_slice_bottom[domain]
                
                # Step 2b: increase top with the domains in the slice
                for domain in A_domlist[domA_start:domA_end]:
                    A_domain_sequence_slice_top[domain] += 1
                for domain in B_domlist[domB_start:domB_end]:
                    B_domain_sequence_slice_top[domain] += 1
                    
            #else:
                #print(" - - Not a valid overlap found - - (no biosynthetic genes)\n")
                
        #else:
                #print(" - - Not a valid overlap found - - (shortest slice not large enough)\n")
                    
    # JACCARD INDEX
    Jaccard = len(intersect) / len(setA | setB)
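    # e.g. (hypothetical): setA = {P1, P2, P3} and setB = {P2, P3, P4}
    # -> Jaccard = |{P2, P3}| / |{P1, P2, P3, P4}| = 2/4 = 0.5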


    # DSS INDEX
    #domain_difference: Difference in sequence per domain. If one cluster does
    # not have a domain at all, but the other does, this is a (complete) 
    # difference in sequence 1. If both clusters contain the domain once, and 
    # the sequence is the same, there is a seq diff of 0.
    #S: Max occurrence of each domain
    domain_difference_anchor,S_anchor = 0,0
    domain_difference,S = 0,0
        
    not_intersect = setA.symmetric_difference(setB)
        
    # Case 1
    #no need to look at seq identity, since these domains are unshared
    for unshared_domain in not_intersect:
        #for each occurrence of an unshared domain do domain_difference += count 
        # of domain and S += count of domain
        unshared_occurrences = []

        try:
            num_unshared = A_domain_sequence_slice_top[unshared_domain] - A_domain_sequence_slice_bottom[unshared_domain]
        except KeyError:
            num_unshared = B_domain_sequence_slice_top[unshared_domain] - B_domain_sequence_slice_bottom[unshared_domain]
            
        # don't look at domain version, hence the split
        if unshared_domain[:7] in anchor_domains:
            domain_difference_anchor += num_unshared
        else:
            domain_difference += num_unshared
                    
    S = domain_difference # can be done because it's the first use of these
    S_anchor = domain_difference_anchor
        
    # Cases 2 and 3 (now merged)
    missing_aligned_domain_files = []
    for shared_domain in intersect:
        specific_domain_list_A = BGCs[A][shared_domain]
        specific_domain_list_B = BGCs[B][shared_domain]
        
        num_copies_a = A_domain_sequence_slice_top[shared_domain] - A_domain_sequence_slice_bottom[shared_domain]
        num_copies_b = B_domain_sequence_slice_top[shared_domain] - B_domain_sequence_slice_bottom[shared_domain]
        
        temp_domain_fastas.clear()
        
        accumulated_distance = 0
            
        # Fill distance matrix between domain's A and B versions
        DistanceMatrix = np.ndarray((num_copies_a,num_copies_b))
        for domsa in range(num_copies_a):
            for domsb in range(num_copies_b):
                sequence_tag_a = specific_domain_list_A[domsa + A_domain_sequence_slice_bottom[shared_domain]]
                sequence_tag_b = specific_domain_list_B[domsb + B_domain_sequence_slice_bottom[shared_domain]]
                
                seq_length = 0
                matches = 0
                gaps = 0
                
                try:
                    aligned_seqA = AlignedDomainSequences[sequence_tag_a]
                    aligned_seqB = AlignedDomainSequences[sequence_tag_b]
                    
                except KeyError:
                    # For some reason we don't have the multiple alignment files. 
                    # Try manual alignment
                    if shared_domain not in missing_aligned_domain_files and verbose:
                        # without this check, the warning would print for every
                        # distance calculation that hits a missing <domain>.algn
                        # (but at least, not for every domain pair!)
                        print("  Warning: {}.algn not found. Trying pairwise alignment...".format(shared_domain))
                        missing_aligned_domain_files.append(shared_domain)
                    
                    try:
                        unaligned_seqA = temp_domain_fastas[sequence_tag_a]
                        unaligned_seqB = temp_domain_fastas[sequence_tag_b]
                    except KeyError:
                        # parse the file for the first time and load all the sequences
                        with open(os.path.join(domains_folder, shared_domain + ".fasta"),"r") as domain_fasta_handle:
                            temp_domain_fastas = fasta_parser(domain_fasta_handle)
                        
                        unaligned_seqA = temp_domain_fastas[sequence_tag_a]
                        unaligned_seqB = temp_domain_fastas[sequence_tag_b]
                        
                    # gap_open = -15
                    # gap_extend = -6.67. These parameters were set up by Emzo
                    alignScore = pairwise2.align.globalds(unaligned_seqA, unaligned_seqB, scoring_matrix, -15, -6.67, one_alignment_only=True)
                    bestAlignment = alignScore[0]
                    aligned_seqA = bestAlignment[0]
                    aligned_seqB = bestAlignment[1]
                    
                    
                # - Calculate aligned domain sequences similarity -
                # Sequences *should* be of the same length unless something went
                # wrong elsewhere
                if len(aligned_seqA) != len(aligned_seqB):
                    print("\tWARNING: mismatch in sequences' lengths while calculating sequence identity ({})".format(shared_domain))
                    print("\t  Specific domain 1: {} len: {}".format(sequence_tag_a, str(len(aligned_seqA))))
                    print("\t  Specific domain 2: {} len: {}".format(sequence_tag_b, str(len(aligned_seqB))))
                    seq_length = min(len(aligned_seqA), len(aligned_seqB))
                else:
                    seq_length = len(aligned_seqA)
                    
                for position in range(seq_length):
                    if aligned_seqA[position] == aligned_seqB[position]:
                        if aligned_seqA[position] != "-":
                            matches += 1
                        else:
                            gaps += 1
                            
                DistanceMatrix[domsa][domsb] = 1 - ( matches/(seq_length-gaps) )
                
        #Only use the best scoring pairs
        BestIndexes = linear_sum_assignment(DistanceMatrix)
        accumulated_distance = DistanceMatrix[BestIndexes].sum()
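        # e.g. (hypothetical): DistanceMatrix = [[0.1, 0.9], [0.8, 0.2]]
        # linear_sum_assignment returns (array([0, 1]), array([0, 1])), the
        # pairing with the minimal total distance, so accumulated_distance = 0.3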
        
        # the difference in number of domains accounts for the "lost" (or not duplicated) domains
        sum_seq_dist = (abs(num_copies_a-num_copies_b) + accumulated_distance)  #essentially 1-sim
        normalization_element = max(num_copies_a, num_copies_b)
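        # e.g. (hypothetical): a domain with 2 copies in A, 1 copy in B and a
        # best-pair distance of 0.2 contributes sum_seq_dist = 1 + 0.2 = 1.2
        # and normalization_element = 2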
            
        if shared_domain[:7] in anchor_domains:
            S_anchor += normalization_element
            domain_difference_anchor += sum_seq_dist
        else:
            S += normalization_element
            domain_difference += sum_seq_dist
            
    if S_anchor != 0 and S != 0:
        DSS_non_anchor = domain_difference / S
        DSS_anchor = domain_difference_anchor / S_anchor
        
        # Calculate proper, proportional weight to each kind of domain
        non_anchor_prct = S / (S + S_anchor)
        anchor_prct = S_anchor / (S + S_anchor)
        
        # boost anchor subcomponent and re-normalize
        non_anchor_weight = non_anchor_prct / (anchor_prct*anchorboost + non_anchor_prct)
        anchor_weight = anchor_prct*anchorboost / (anchor_prct*anchorboost + non_anchor_prct)

        # Use anchorboost parameter to boost perceived rDSS_anchor
        DSS = (non_anchor_weight*DSS_non_anchor) + (anchor_weight*DSS_anchor)
        
    elif S_anchor == 0:
        DSS_non_anchor = domain_difference / S
        DSS_anchor = 0.0
        
        DSS = DSS_non_anchor
        
    else: #only anchor domains were found
        DSS_non_anchor = 0.0
        DSS_anchor = domain_difference_anchor / S_anchor
        
        DSS = DSS_anchor
 
    DSS = 1 - DSS  # transform into similarity
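    # Worked example with hypothetical values: S = 8, S_anchor = 2,
    # domain_difference = 2.0, domain_difference_anchor = 0.2, anchorboost = 2.0:
    # DSS_non_anchor = 0.25, DSS_anchor = 0.1, non_anchor_prct = 0.8 and
    # anchor_prct = 0.2, so non_anchor_weight = 0.8/1.2, anchor_weight = 0.4/1.2
    # and DSS = (0.8*0.25 + 0.4*0.1)/1.2 = 0.2, i.e. a final similarity of 0.8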
 

    # ADJACENCY INDEX
    # calculates the Tanimoto similarity of pairs of adjacent domains
    
    if len(A_domlist[domA_start:domA_end]) < 2 or len(B_domlist[domB_start:domB_end]) < 2:
        AI = 0.0
    else:
        setA_pairs = set()
        for l in range(domA_start, domA_end-1):
            setA_pairs.add(tuple(sorted([A_domlist[l],A_domlist[l+1]])))
        
        setB_pairs = set()
        for l in range(domB_start, domB_end-1):
            setB_pairs.add(tuple(sorted([B_domlist[l],B_domlist[l+1]])))

        # same treatment as in Jaccard
        AI = len(setA_pairs & setB_pairs) / len(setA_pairs | setB_pairs)
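        # Example with hypothetical domain lists: A = [P1, P2, P3] gives pairs
        # {(P1,P2), (P2,P3)}; B = [P2, P3, P4] gives {(P2,P3), (P3,P4)};
        # intersection = 1 and union = 3, so AI = 1/3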

    Distance = 1 - (Jaccardw * Jaccard) - (DSSw * DSS) - (AIw * AI)
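    # e.g. with Jaccard = 0.5, DSS = 0.8, AI = 0.33 and hypothetical weights
    # Jaccardw = 0.2, DSSw = 0.75, AIw = 0.05:
    # Distance = 1 - 0.1 - 0.6 - 0.0165 = 0.2835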
    
    # This could happen due to numerical inaccuracies
    if Distance < 0.0:
        if Distance < -0.000001: # this definitely is something else...
            print("Negative distance detected!")
            print(Distance)
            print("{} - {}".format(A, B))
            print("J: {}\tDSS: {}\tAI: {}".format(str(Jaccard), str(DSS), str(AI)))
            print("Jw: {}\tDSSw: {}\tAIw: {}".format(str(Jaccardw), str(DSSw), str(AIw)))
        Distance = 0.0
        
    rev = 0.0
    if reverse:
        rev = 1.0
        
    return Distance, Jaccard, DSS, AI, DSS_non_anchor, DSS_anchor, S, S_anchor, lcsStartA, lcsStartB, seedLength, rev

@timeit
def launch_hmmalign(cores, domain_sequence_list):
    """
    Launches instances of hmmalign with multiprocessing.
    Note that the items in domain_sequence_list contain the .fasta extension
    """
    pool = Pool(cores, maxtasksperchild=32)
    pool.map(run_hmmalign, domain_sequence_list)
    pool.close()
    pool.join()

def run_hmmalign(domain_file):
    #domain_file already contains the full path, with the file extension
    domain_base = domain_file.split(os.sep)[-1][:-6]
    hmmfetch_pars = ["hmmfetch", os.path.join(pfam_dir,"Pfam-A.hmm.h3m"), domain_base]
    proc_hmmfetch = subprocess.Popen(hmmfetch_pars, stdout=subprocess.PIPE, shell=False)
    
    domain_file_stk = domain_file[:-6]+".stk"
    hmmalign_pars = ["hmmalign", "-o", domain_file_stk, "-", domain_file]
    proc_hmmalign = subprocess.Popen(hmmalign_pars, stdin=proc_hmmfetch.stdout, stdout=subprocess.PIPE, shell=False)
    
    proc_hmmfetch.stdout.close()
    proc_hmmalign.communicate()[0]
    proc_hmmfetch.wait()
    
    if verbose:
        print(" ".join(hmmfetch_pars) + " | " + " ".join(hmmalign_pars))
    
    stockholm_parser(domain_file_stk)
    #SeqIO.convert(domain_file_stk, "stockholm", domain_file[:-6]+".algn", "fasta")
    
    
def stockholm_parser(stkFile):
    reference = ""
    algnDict = defaultdict(str)
    algnFile = stkFile[:-3] + 'algn'
    if not os.path.isfile(algnFile): # prevents overwriting algn files
        with open(stkFile, 'r') as infile:
            for l in infile:
                line = l.strip()
                if line.startswith("#=GC RF"):
                    reference += line[7:].strip()
                elif line == "":
                    continue
                elif line[0] == "/" or line[0] == "#":
                    continue
                else:
                    a = line.split(" ")
                    header = a[0]
                    algn = a[-1]
                    algnDict[header] += algn
                    
        # get start-end coordinates of every "x" island (original consensus)
        # in the reference
        state_reference = False
        slicing_tuples = []
        for pos in range(len(reference)):
            if reference[pos] == "x" and not state_reference:
                state_reference = True
                start = pos
            if reference[pos] == "." and state_reference:
                state_reference = False
                slicing_tuples.append((start,pos))
        if state_reference:
            slicing_tuples.append((start, len(reference)))
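        # Example: a hypothetical reference "..xxx..xx" yields
        # slicing_tuples = [(2, 5), (7, 9)]; only these consensus ("x")
        # columns are kept when writing the .algn file below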
    
    if len(algnDict) > 0:
        with open(algnFile, "w") as outfile:
            for header in algnDict:
                sequence = ""
                for a,b in slicing_tuples:
                    sequence += algnDict[header][a:b]
                outfile.write(">{}\n".format(header))
                outfile.write(sequence + "\n")
    return


def runHmmScan(fastaPath, hmmPath, outputdir, verbose):
    """ Runs hmmscan command on a fasta file with a single core to generate a
    domtable file"""
    hmmFile = os.path.join(hmmPath,"Pfam-A.hmm")
    if os.path.isfile(fastaPath):
        name = ".".join(fastaPath.split(os.sep)[-1].split(".")[:-1])
        outputName = os.path.join(outputdir, name+".domtable")
        
        hmmscan_cmd = "hmmscan --cpu 0 --domtblout {} --cut_tc {} {}".format(outputName, hmmFile, fastaPath)
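        # e.g. for a hypothetical input BGC0000001.fasta this expands to
        # (paths shortened for readability):
        #   hmmscan --cpu 0 --domtblout BGC0000001.domtable --cut_tc Pfam-A.hmm BGC0000001.fasta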
        if verbose:
            print("   " + hmmscan_cmd)
        subprocess.check_output(hmmscan_cmd, shell=True)

    else:
        sys.exit("Error running hmmscan: Fasta file " + fastaPath + " doesn't exist")

def parseHmmScan(hmmscanResults, pfd_folder, pfs_folder, overlapCutoff):
    outputbase = ".".join(hmmscanResults.split(os.sep)[-1].split(".")[:-1])
    # try to read the domtable file to find out if this gbk has domains. Domains
    # need to be parsed into fastas anyway.
    if os.path.isfile(hmmscanResults):
        pfd_matrix = domtable_parser(outputbase, hmmscanResults)
        
        # get number of domains to decide if this BGC should be removed
        num_domains = len(pfd_matrix)

        if num_domains > 0:
            if verbose:
                print("  Processing domtable file: " + outputbase)

            # check_overlap also sorts the filtered_matrix results and removes
            # overlapping domains, keeping the highest scoring one
            filtered_matrix, domains = check_overlap(pfd_matrix,overlapCutoff)
            
            # Save list of domains per BGC
            pfsoutput = os.path.join(pfs_folder, outputbase + ".pfs")
            with open(pfsoutput, 'w') as pfs_handle:
                pfs_handle.write(" ".join(domains))
            
            # Save more complete information of each domain per BGC
            pfdoutput = os.path.join(pfd_folder, outputbase + ".pfd")
            with open(pfdoutput,'w') as pfd_handle:
                write_pfd(pfd_handle, filtered_matrix)
        else:
            # there aren't any domains in this BGC
            # delete from all data structures
            print("  No domains where found in {}.domtable. Removing it from further analysis".format(outputbase))
            info = genbankDict.get(outputbase)
            clusters.remove(outputbase)
            baseNames.remove(outputbase)
            gbk_files.remove(info[0])
            for sample in info[1]:
                sampleDict[sample].remove(outputbase)
            del genbankDict[outputbase]
            if outputbase in mibig_set:
                mibig_set.remove(outputbase)
            
    else:
        sys.exit("Error: hmmscan file " + outputbase + " was not found! (parseHmmScan)")

    return("")


def clusterJsonBatch(bgcs, pathBase, className, matrix, pos_alignments, cutoffs=[1.0], damping=0.9, clusterClans=False, clanCutoff=(0.5,0.8), htmlFolder=None):
    """BGC Family calling
    Uses csr sparse matrices to call Gene Cluster Families (GCFs) using Affinity
    Propagation.
    
    Cutoff values are in distance (i.e. if two clusters are further than cutoff 
    value, similarity is 0)
    Larger cutoff values are more permissive
    
    bgcs: ordered list of integers (ascending, unique, but not necessarily 
        consecutive) representing the index in the main clusterNames list. Every
        number here is an "external" index
    matrix: list of lists of idx0, idx1, d where the first two elements correspond
        to indices from `bgcs`
    pathBase: folder where GCF files will be deposited
    """
    numBGCs = len(bgcs)
    
    simDict = {} # dictionary of dictionaries
    # Doing this so it only has to go through the matrix once
    for row in matrix:
        gc1, gc2, distance = row
        
        if distance < 1.0:
            similarity = 1 - distance
        else:
            similarity = 0
        gcSimilarities = simDict.setdefault(gc1, {})
        gcSimilarities[gc2] = similarity
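    # e.g. a hypothetical matrix row [12, 45, 0.35] is stored as
    # simDict[12][45] = 0.65; distances >= 1.0 are clamped to similarity 0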

    clanClassificationCutoff, clanDistanceCutoff = clanCutoff
    if clusterClans and verbose:
        print('Clustering Clans Enabled with parameters clanClassificationCutoff: {}, clanDistanceCutoff: {}'.format(clanClassificationCutoff,clanDistanceCutoff))
    
    # External index number to Internal, consecutive, index
    # If needed, bgcInt2Ext[i] would simply be bgcs[i]
    bgcExt2Int = dict(zip(bgcs, range(numBGCs)))
    
    # Get info on all BGCs to export to .js for visualization
    bs_data = []
    bgcJsonDict = {}
    for bgc in bgcs:
        bgcName = clusterNames[bgc]
        bgcJsonDict[bgcName] = {}
        bgcJsonDict[bgcName]["id"] = bgcName
        bgcJsonDict[bgcName]["desc"] = bgc_info[bgcName].description
        bgcJsonDict[bgcName]["start"] = 1
        bgcJsonDict[bgcName]["end"] = bgc_info[bgcName].bgc_size
        bgcJsonDict[bgcName]["mibig"] = bgcName in mibig_set
        
        pfdFile = os.path.join(pfd_folder, bgcName + ".pfd")
        fastaFile = os.path.join(bgc_fasta_folder, bgcName + ".fasta")
        
        orfDict = defaultdict(dict)
        
        ## read fasta file first to get orfs
        # We cannot get all the info exclusively from the pfd because that only
        # contains ORFs with predicted domains (and we need to draw empty genes
        # as well)
        for line in open(fastaFile):
            if line[0] == ">":
                header = line.strip()[1:].split(':')
                orf = header[0]
                if header[2]:
                    orfDict[orf]["id"] = header[2]
                elif header[4]:
                    orfDict[orf]["id"] = header[4]
                else:
                    orfDict[orf]["id"] = orf
                    
                ## broken gene goes into cluster, need this so js doesn't throw an error
                if int(header[6]) <= 1:
                    orfDict[orf]["start"] = 1
                else:
                    orfDict[orf]["start"] = int(header[6])
                    
                orfDict[orf]["end"] = int(header[7])
                
                if header[-1] == '+':
                    orfDict[orf]["strand"] = 1
                else:
                    orfDict[orf]["strand"] = -1
                    
                orfDict[orf]["domains"] = []
    
        ## now read pfd file to add the domains to each of the orfs
        for line in open(pfdFile):
            entry = line.split('\t')
            orf = entry[-1].strip().split(':')[0]
            pfamID = entry[5].split('.')[0]
            orfDict[orf]["domains"].append({'code': pfamID, 'start': int(entry[3]), 'end': int(entry[4]), 'bitscore': float(entry[1])})
        # order of ORFs is important here because I use it to get a translation
        # between the "list of ORFs with domains" to "list of all ORFs" later on
        bgcJsonDict[bgcName]['orfs'] = sorted(orfDict.values(), key=itemgetter("start"))
    bs_data = [bgcJsonDict[clusterNames[bgc]] for bgc in bgcs]
    # Create network
    g = nx.Graph()
    
    results = {}
    for cutoff in cutoffs:    
        family_data = { # will be returned, for use in overview.js
            "label": className,
            "families": [],
            "families_similarity": []
        }
        
        print("  Cutoff: {}".format(cutoff))
        # first task is to find labels (exemplars) for each node.
        # Assign disconnected (singleton) BGCs to themselves
        labels = [0 for i in range(numBGCs)]
        
        # clear all edges from network
        g.clear()
        
        # add all nodes
        g.add_nodes_from(bgcs)
        
        # add all (allowed) edges
        for bgc1 in bgcs:
            for bgc2 in simDict.get(bgc1,{}).keys():
                if simDict[bgc1][bgc2] > 1 - cutoff:
                    g.add_edge(bgc1, bgc2)
                    
        for subgraph in nx.connected_components(g):
            numBGCs_subgraph = len(subgraph)
            # smaller subgraphs don't need to be clustered
            if numBGCs_subgraph < 3:
                temp = list(subgraph)
                for bgc in temp:
                    labels[bgcExt2Int[bgc]] = bgcExt2Int[temp[0]]
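                # e.g. a two-node component {7, 9} (hypothetical external
                # indices) becomes its own family: both members get the
                # internal index of the first element of temp as their label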
            else:
                bgcExt2Sub_ = dict(zip([c for c in subgraph], range(numBGCs_subgraph)))
                bgcSub2Ext_ = dict(zip(range(numBGCs_subgraph), [c for c in subgraph]))
                                   
                simMatrix = np.zeros((numBGCs_subgraph, numBGCs_subgraph), dtype=np.float32)
                for bgc1, bgc2 in combinations(subgraph,2):
                    try:
                        simMatrix[bgcExt2Sub_[bgc1], bgcExt2Sub_[bgc2]] = simDict[bgc1][bgc2]
                    except KeyError:
                        simMatrix[bgcExt2Sub_[bgc1], bgcExt2Sub_[bgc2]] = simDict[bgc2][bgc1]
                        
                af = AffinityPropagation(damping=damping, max_iter=1000, convergence_iter=200, affinity="precomputed").fit(simMatrix)
                labelsSub = af.labels_
                exemplarsSub = af.cluster_centers_indices_
                
                # TODO go straight to familiesDict
                for i in range(numBGCs_subgraph):
                    labels[bgcExt2Int[bgcSub2Ext_[i]]] = bgcExt2Int[bgcSub2Ext_[exemplarsSub[labelsSub[i]]]]
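                # e.g. if a hypothetical subgraph {10, 11, 12} yields exemplar
                # 11, all three entries of labels point at bgcExt2Int[11]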
        
        # Recalculate the full similarity matrix; we'll need it for clan calling
        #simMatrix = lil_matrix((numBGCs, numBGCs), dtype=np.float32)
        simMatrix = np.zeros((numBGCs, numBGCs), dtype=np.float32)
        for bgc1 in bgcs:
            # first make sure it is similar to itself
            simMatrix[bgcExt2Int[bgc1],bgcExt2Int[bgc1]] = 1
            for bgc2 in simDict.get(bgc1,{}).keys():
                # you might get 0 values if there were matrix entries under the
                # cutoff. No need to input these in the sparse matrix
                if simDict[bgc1][bgc2] > 1-cutoff:
                    # Ensure symmetry
                    simMatrix[bgcExt2Int[bgc1], bgcExt2Int[bgc2]] = simDict[bgc1][bgc2]
                    simMatrix[bgcExt2Int[bgc2], bgcExt2Int[bgc1]] = simDict[bgc1][bgc2]
        
        if verbose:
            print("   ...done")

        bs_distances = [[float("{:.3f}".format(simMatrix[row, col])) for col in 
                         range(row+1)] for row in range(numBGCs)]
        
        
        familiesDict = defaultdict(list)
        for i in range(numBGCs):
            familiesDict[bgcs[labels[i]]].append(bgcs[i])
        familyIdx = sorted(familiesDict.keys()) # identifiers for each family
        
        ##
        ## Get conserved domain core information to build phylogenetic tree
        ## 
        gcf_trees_path = os.path.join(pathBase, "GCF_trees")
        if not os.path.exists(gcf_trees_path):
            os.makedirs(gcf_trees_path)
            
        newick_trees = {} # key:original bgc index
        for exemplar_idx in familiesDict:
            exemplar = clusterNames[exemplar_idx]
            gcf = familiesDict[exemplar_idx][:]
            if len(gcf) < 3:
                newick_trees[exemplar_idx] = "({}):0.01;".format(",".join([str(bgcExt2Int[x])+":0.0" for x in gcf]))
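                # e.g. a two-member GCF whose internal indices are 3 and 7
                # (hypothetical) yields the star tree "(3:0.0,7:0.0):0.01;"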
                
                #print(newick_trees[exemplar_idx])
                #TODO make some default alignment data to send to the json file
                continue
            
            domain_sets = {}
            
            # make a frequency table (not counting copies):
            frequency_table = defaultdict(int)
            for bgc in gcf:
                domain_sets[bgc] = set(DomainList[clusterNames[bgc]])
                for domain in domain_sets[bgc]:
                    frequency_table[domain] += 1
            
            # Remove all PKS/NRPS domains
            # TODO but this was intended to be done when we were considering
            # directly using Corason. Should these domains still be removed?
            # We are now able to include them accurately with the Hungarian
            # matching
            # If implemented, fill nrps_pks_domains first!
            #domains_in_gcf = set(frequency_table.keys())
            #for erase_domain in (nrps_pks_domains & domains_in_gcf):
                #del frequency_table[erase_domain]
                
            # Find the set of [(tree domains)]. They should 1) be in the exemplar
            # and 2) appear with the most frequency. Iterate over the different
            # frequencies (descending) until set is not empty
            tree_domains = set()
            frequencies = sorted(set(frequency_table.values()), reverse=True)
            
            # first try with domain(s) with max frequency, even if it's just one
            f = 0
            while len(tree_domains) == 0 and f < len(frequencies):
                for domain in frequency_table:
                    if frequency_table[domain] == frequencies[f] and domain in domain_sets[exemplar_idx]:
                        tree_domains.add(domain)
                f += 1
            
            
            if len(tree_domains) == 1:
                if verbose:
                    print("  Warning: core shared domains for GCF {} consist of a single domain ({})".format(exemplar_idx, [x for x in tree_domains][0]))
            
            # Get the alignments of the core domains
            alignments = {}
            # initialize every sequence alignment entry. Don't do defaultdict!
            alignments[exemplar_idx] = ""
            
            out_of_tree_bgcs = [] # bgcs that don't share a common domain core
            delete_list = set() # remove these BGCs from the alignment
            gcf.remove(exemplar_idx) # separate exemplar from the rest of the bgcs
            for bgc in gcf:
                alignments[bgc] = ""

            match_dict = {}
            for domain in tree_domains:
                specific_domain_list_A = BGCs[exemplar][domain]
                num_copies_a = len(specific_domain_list_A)
                for exemplar_domain_copy in specific_domain_list_A:
                    alignments[exemplar_idx] += AlignedDomainSequences[exemplar_domain_copy]
                
                seq_length = len(AlignedDomainSequences[specific_domain_list_A[0]])
                
                for bgc in alignments:
                    match_dict.clear()
                    if bgc == exemplar_idx:
                        pass
                    elif domain not in domain_sets[bgc]:
                        if verbose:
                            print("   BGC {} ({}) does not share a common domain core (GCF: {}, domain: {})".format(clusterNames[bgc], bgc, exemplar_idx, domain))
                        out_of_tree_bgcs.append(bgc)
                        delete_list.add(bgc)
                    elif bgc in delete_list:
                        pass
                    else:
                        specific_domain_list_B = BGCs[clusterNames[bgc]][domain]
                        
                        num_copies_b = len(specific_domain_list_B)
                        
                        DistanceMatrix = np.ndarray((num_copies_a,num_copies_b))
                        for domsa in range(num_copies_a):
                            for domsb in range(num_copies_b):
                                # TODO NOT taking into consideration any LCS slicing
                                # i.e. we're comparing ALL copies of this domain
                                sequence_tag_a = specific_domain_list_A[domsa]
                                sequence_tag_b = specific_domain_list_B[domsb]
                                
                                aligned_seqA = AlignedDomainSequences[sequence_tag_a]
                                aligned_seqB = AlignedDomainSequences[sequence_tag_b]
                                
                                matches = 0
                                gaps = 0
                                for position in range(seq_length):
                                    if aligned_seqA[position] == aligned_seqB[position]:
                                        if aligned_seqA[position] != "-":
                                            matches += 1
                                        else:
                                            gaps += 1
                                            
                                DistanceMatrix[domsa][domsb] = 1 - ( matches/(seq_length-gaps) )

                        BestIndexes = linear_sum_assignment(DistanceMatrix)
                        # at this point is not ensured that we have the same order
                        # for the exemplar's copies (rows in BestIndexes)
                        # ideally they should go from 0-numcopies. Better make sure
                        for x in range(len(BestIndexes[0])):
                            match_dict[BestIndexes[0][x]] = BestIndexes[1][x]
                            
                        for copy in range(num_copies_a):
                            try:
                                alignments[bgc] += AlignedDomainSequences[specific_domain_list_B[match_dict[copy]]]
                            except KeyError:
                                # This means that this copy of exemplar did not
                                # have a match in bgc (i.e. bgc has less copies
                                # of this domain than exemplar)
                                alignments[bgc] += "-"*seq_length
                    
                for bgc in list(delete_list):
                    del alignments[bgc]
                delete_list.clear()
                
            # need this to change the labels in the trees that are read from files
            bgc_name_to_idx = {}
            
            # save compiled alignments of the GCF domain core as fastas
            alignment_file_path = os.path.join(gcf_trees_path,"GCF_c{:4.2f}_{:05d}_alignment.fasta".format(cutoff,exemplar_idx))
            with open(alignment_file_path, "w") as gcf_alignment_file:
                gcf_alignment_file.write(">{}\n{}\n".format(exemplar, alignments[exemplar_idx]))
                bgc_name_to_idx[exemplar] = exemplar_idx
                
                for bgc in alignments:
                    if bgc != exemplar_idx:
                        gcf_alignment_file.write(">{}\n{}\n".format(clusterNames[bgc], alignments[bgc]))
                        bgc_name_to_idx[clusterNames[bgc]] = bgc
            
            # launch fasttree to make tree
            if verbose:
                print("  Working GCF {}, cutoff {}".format(exemplar_idx, cutoff))
                
            # make tree
            newick_file_path = os.path.join(gcf_trees_path, "GCF_c{:4.2f}_{:05d}.newick".format(cutoff,exemplar_idx))
            with open(newick_file_path, "w") as newick_file:
                command = ["fasttree", "-nopr", "-quiet", alignment_file_path]
                p = subprocess.Popen(command, stdout=newick_file, shell=False)
                p.wait() # only when the process has terminated will the file be ready

            # read tree, post-process it and save it
            if not os.path.isfile(newick_file_path) or os.path.getsize(newick_file_path) == 0:
                print(newick_file_path)
                sys.exit(" ERROR: newick file not created or empty (GCF_c{:4.2f}_{:05d})".format(cutoff,exemplar_idx))
            else:
                with open(newick_file_path,"r") as newick_file:
                    try:
                        tree = Phylo.read(newick_file, 'newick')
                    except ValueError as e:
                        print(" Warning! There was an error while reading tree file {}".format(newick_file))
                        print(str(e))
                        newick_trees[exemplar_idx] = ""
                    else:
                        try:
                            tree.root_at_midpoint()
                        except UnboundLocalError:
                            # Noticed this could happen if the sequences are exactly
                            # the same and all distances == 0
                            if verbose:
                                print(" Warning: Unable to root at midpoint file {}".format(newick_file_path))
                            pass
                        newick = tree.format("newick")
                        
                        # convert branches' names to indices for visualization
                        for name in bgc_name_to_idx:
                            newick = newick.replace(name, str(bgcExt2Int[bgc_name_to_idx[name]]))

                        newick_trees[exemplar_idx] = newick