Commit 04e9b58c authored by Jorge Navarro Muñoz's avatar Jorge Navarro Muñoz

Compatibility with antiSMASH 5 files

- Added compatibility with antiSMASH 5 annotations
- Added a filter to select only individual cluster (antiSMASH 4) or
region (antiSMASH 5) files ('--include_gbk_str'). Previously, we had
been using 'final' as a string to filter out complete genome files, but
this string no longer appears in antiSMASH 5 files.
- domain_whitelist.txt: commented example domain
- domains_color_file.tsv: added random colors for more domains
parent 2356cb16
......@@ -6,11 +6,13 @@ BiG-SCAPE
PI: Marnix Medema marnix.medema@wur.nl
Main developers:
Maintainers/developers:
Jorge Navarro j.navarro@westerdijkinstitute.nl
Emmanuel (Emzo) de los Santos E.De-Los-Santos@warwick.ac.uk
Satria Kautsar satria.kautsar@wur.nl
Developer:
Emmanuel (Emzo) de los Santos E.De-Los-Santos@warwick.ac.uk
Usage: Please see `python bigscape.py -h`
......@@ -78,6 +80,7 @@ global mibig_set
global genbankDict
global valid_classes
def process_gbk_files(gbk, min_bgc_size, bgc_info, files_no_proteins, files_no_biosynthetic_genes):
""" Given a file path to a GenBank file, reads information about the BGC"""
......@@ -120,7 +123,6 @@ def process_gbk_files(gbk, min_bgc_size, bgc_info, files_no_proteins, files_no_b
bgc_size = 0
cds_ctr = 0
product = "no type"
del product_list_per_record[:]
offset_record_position = 0
max_width = 0 # This will be used for the SVG figure
......@@ -133,13 +135,14 @@ def process_gbk_files(gbk, min_bgc_size, bgc_info, files_no_proteins, files_no_b
max_width = len(record.seq)
for feature in record.features:
if "cluster" in feature.type:
# antiSMASH <= 4
if feature.type == "cluster":
if "product" in feature.qualifiers:
if len(feature.qualifiers["product"]) > 1:
print(" WARNING: more than product annotated in record " + str(record_count) + ", " + fname)
break
else:
product_list_per_record.append(feature.qualifiers["product"][0].replace(" ",""))
# in antiSMASH 4 there should only be 1 product qualifiers
for product in feature.qualifiers["product"]:
for p in product.replace(" ","").split("-"):
product_list_per_record.append(p)
if "contig_edge" in feature.qualifiers:
# there might be mixed contig_edge annotations
# in multi-record files. Turn on contig_edge when
......@@ -149,6 +152,22 @@ def process_gbk_files(gbk, min_bgc_size, bgc_info, files_no_proteins, files_no_b
print(" Contig edge detected in {}".format(fname))
contig_edge = True
# antiSMASH = 5
if "region" in feature.type:
if "product" in feature.qualifiers:
for product in feature.qualifiers["product"]:
product_list_per_record.append(product)
if "contig_edge" in feature.qualifiers:
# there might be mixed contig_edge annotations
# in multi-record files. Turn on contig_edge when
# there's at least one annotation
if feature.qualifiers["contig_edge"][0] == "True":
if verbose:
print(" Contig edge detected in {}".format(fname))
contig_edge = True
# Get biosynthetic genes + sequences
if feature.type == "CDS":
cds_ctr += 1
......@@ -179,10 +198,16 @@ def process_gbk_files(gbk, min_bgc_size, bgc_info, files_no_proteins, files_no_b
fasta_header = fasta_header.replace(">","") #the coordinates might contain larger than signs, tools upstream don't like this
fasta_header = fasta_header.replace(" ", "") #the domtable output format (hmmscan) uses spaces as a delimiter, so these cannot be present in the fasta header
# antiSMASH <=4
if "sec_met" in feature.qualifiers:
if "Kind: biosynthetic" in feature.qualifiers["sec_met"]:
biosynthetic_genes.add(fasta_header)
# antiSMASH == 5
if "gene_kind" in feature.qualifiers:
if "biosynthetic" in feature.qualifiers["gene_kind"]:
biosynthetic_genes.add(fasta_header)
fasta_header = ">"+fasta_header
......@@ -270,16 +295,16 @@ def process_gbk_files(gbk, min_bgc_size, bgc_info, files_no_proteins, files_no_b
product = product_list_per_record[0]
elif "other" in product_set: # more than one, and it contains "other"
if len(product_set) == 2:
product = list(product_set - set(['other']))[0] # product = not "other"
product = list(product_set - {'other'})[0] # product = not "other"
else:
product = "-".join(product_set - set(['other'])) # likely a hybrid
product = ".".join(product_set - {'other'}) # likely a hybrid
else:
product = "-".join(product_set) # likely a hybrid
product = ".".join(product_set) # likely a hybrid
# Don't keep this bgc if its type not in valid classes specified by user
# This will avoid redundant tasks like domain detection
subproduct = set()
for p in product.split("-"):
for p in product.split("."):
subproduct.add(sort_bgc(p).lower())
if "nrps" in subproduct and ("pksi" in subproduct or "pksother" in subproduct):
subproduct.add("pks-nrp_hybrids")
......@@ -382,7 +407,7 @@ def process_gbk_files(gbk, min_bgc_size, bgc_info, files_no_proteins, files_no_b
return adding_sequence
def get_gbk_files(inputpath, outputdir, bgc_fasta_folder, min_bgc_size, exclude_gbk_str, bgc_info):
def get_gbk_files(inputpath, outputdir, bgc_fasta_folder, min_bgc_size, include_gbk_str, exclude_gbk_str, bgc_info):
"""Searches given directory for genbank files recursively, will assume that
the genbank files that have the same name are the same genbank file.
Returns a dictionary that contains the names of the clusters found as keys
......@@ -409,15 +434,16 @@ def get_gbk_files(inputpath, outputdir, bgc_fasta_folder, min_bgc_size, exclude_
for filepath in files:
file_folder, fname = os.path.split(filepath)
if type(exclude_gbk_str) == str and exclude_gbk_str != "" and \
exclude_gbk_str in fname:
if verbose:
if len(include_gbk_str) == 1 and include_gbk_str[0] == "*":
pass
else:
if not any([word in fname for word in include_gbk_str]):
continue
if exclude_gbk_str != [] and any([word in fname for word in exclude_gbk_str]):
print(" Skipping file " + fname)
continue
elif type(exclude_gbk_str) == list and exclude_gbk_str != [] and \
any([word in fname for word in exclude_gbk_str]):
print(" Skipping file " + fname)
continue
continue
if "_ORF" in fname:
print(" Skipping file {} (string '_ORF' is used internally)".format(fname))
continue
......@@ -463,11 +489,9 @@ def timeit(funct):
ret = funct(*args)
runtime = time.time()-start_time
runtime_string = '{} took {:.3f} seconds'.format(funct.__name__, runtime)
# To prevent insignificant runtimes from ending up in the file.
if runtime > 1:
with open(os.path.join(log_folder, "runtimes.txt"), 'a') as timings_file:
timings_file.write(runtime_string + "\n")
print(runtime_string)
with open(os.path.join(log_folder, "runtimes.txt"), 'a') as timings_file:
timings_file.write(runtime_string + "\n")
print(runtime_string)
return ret
return _wrap
......@@ -1181,7 +1205,7 @@ def cluster_distance_lcs(A, B, A_domlist, B_domlist, dcg_A, dcg_b, core_pos_A, c
return Distance, Jaccard, DSS, AI, DSS_non_anchor, DSS_anchor, S, S_anchor, lcsStartA, lcsStartB, seedLength, rev
@timeit
def launch_hmmalign(cores, domain_sequence_list):
"""
Launches instances of hmmalign with multiprocessing.
......@@ -1939,12 +1963,6 @@ def CMD_parser():
help="Input directory of gbk files, if left empty, all \
gbk files in current and lower directories will be used.")
parser.add_argument("--exclude_gbk_str", dest="exclude_gbk_str",
default="final", nargs="+",
help="If any string in this list occurs in the gbk \
filename, this file will not be used for the analysis.\
(default: final)")
parser.add_argument("-o", "--outputdir", dest="outputdir", default="",
required=True, help="Output directory, this will contain \
all output data files.")
......@@ -1957,7 +1975,20 @@ def CMD_parser():
parser.add_argument("-c", "--cores", dest="cores", default=cpu_count(),
help="Set the number of cores the script may use (default:\
use all available cores)")
parser.add_argument("--include_gbk_str", dest="include_gbk_str",
default=['cluster', 'region'], nargs="+",
help="Only gbk files with this string(s) will be used \
for the analysis (default: 'cluster', 'region'). Use an \
asterisk to accept every file (overrides \
'--exclude_gbk_str')")
parser.add_argument("--exclude_gbk_str", dest="exclude_gbk_str",
default=['final'], nargs="+",
help="If any string in this list occurs in the gbk \
filename, this file will not be used for the analysis\
(default: final).")
parser.add_argument("-v", "--verbose", dest="verbose", action="store_true",
default=False, help="Prints more detailed information. \
Toggle to activate.")
......@@ -2061,16 +2092,16 @@ def CMD_parser():
help="Include BGCs from the previous version of MIBiG (1.3)")
parser.add_argument("--query_bgc", help="Instead of making an all-VS-all \
comparison of all the input BGCs, choose one BGC to \
compare with the rest of the set (one-VS-all). The \
query BGC does not have to be within inputdir")
comparison of all the input BGCs, choose one BGC to \
compare with the rest of the set (one-VS-all). The \
query BGC does not have to be within inputdir")
parser.add_argument("--domain_whitelist", help="Only analyze include those\
BGCs that include domains with the pfam accessions \
found in the domain_whitelist.txt file", default=False,
parser.add_argument("--domain_whitelist", help="Only analyze BGCs that \
include domains with the pfam accessions found in the \
domain_whitelist.txt file", default=False,
action="store_true")
parser.add_argument("--version", action="version", version="%(prog)s 20181005")
parser.add_argument("--version", action="version", version="%(prog)s 20190603")
return parser.parse_args()
......@@ -2238,7 +2269,7 @@ if __name__=="__main__":
continue
domain_whitelist.add(line.split("\t")[0].strip())
if len(domain_whitelist) == 0:
print("Error: --domain_whitelist used, but no domains found in the file")
print("Warning: --domain_whitelist used, but no domains found in the file")
else:
has_whitelist = True
else:
......@@ -2303,13 +2334,17 @@ if __name__=="__main__":
genbankDict = {}
# Exclude single string
include_gbk_str = options.include_gbk_str
if len(include_gbk_str) == 1 and include_gbk_str[0] == "*":
print(" Including all files")
elif len(include_gbk_str) == 1 and include_gbk_str[0] == "":
sys.exit(" Stop: no strings specified for '--include_gbk_str'")
else:
print(" Including files with one or more of the following strings in their filename: '{}'".format("', '".join(include_gbk_str)))
exclude_gbk_str = options.exclude_gbk_str
if type(exclude_gbk_str) == str and exclude_gbk_str != "":
print(" Skipping files with '{}' in their filename".format(exclude_gbk_str))
# Exclude from list of strings
elif type(exclude_gbk_str) == list and exclude_gbk_str != []:
print(" Skipping files with one or more of the following strings in \
their filename: {}".format(", ".join(exclude_gbk_str)))
if exclude_gbk_str != []:
print(" Skipping files with one or more of the following strings in their filename: '{}'".format("', '".join(exclude_gbk_str)))
# Read included MIBiG
# Change this for every officially curated MIBiG bundle
......@@ -2346,14 +2381,16 @@ if __name__=="__main__":
sys.exit("Did not find the correct number of MIBiG BGCs ({}). Please clean the 'Annotated MIBiG reference' folder from any .gbk files first".format(mibig_zipfile_numbgcs[2]))
print("\nImporting MIBiG files")
get_gbk_files(bgcs_path, output_folder, bgc_fasta_folder, int(options.min_bgc_size), exclude_gbk_str, bgc_info)
get_gbk_files(bgcs_path, output_folder, bgc_fasta_folder, int(options.min_bgc_size),
include_gbk_str, exclude_gbk_str, bgc_info)
for i in genbankDict.keys():
mibig_set.add(i)
print("\nImporting GenBank files")
get_gbk_files(options.inputdir, output_folder, bgc_fasta_folder, int(options.min_bgc_size), exclude_gbk_str, bgc_info)
get_gbk_files(options.inputdir, output_folder, bgc_fasta_folder, int(options.min_bgc_size),
include_gbk_str, exclude_gbk_str, bgc_info)
if has_query_bgc:
query_bgc = ".".join(options.query_bgc.split(os.sep)[-1].split(".")[:-1])
......@@ -2362,7 +2399,9 @@ if __name__=="__main__":
pass
else:
print("\nImporting query BGC file")
get_gbk_files(options.query_bgc, output_folder, bgc_fasta_folder, int(options.min_bgc_size), exclude_gbk_str, bgc_info)
get_gbk_files(options.query_bgc, output_folder, bgc_fasta_folder,
int(options.min_bgc_size), include_gbk_str,
exclude_gbk_str, bgc_info)
if query_bgc not in genbankDict:
sys.exit("Error: not able to include Query BGC (check valid classes, BGC size, etc. Run again with --verbose)")
......@@ -2820,7 +2859,6 @@ if __name__=="__main__":
shutil.copy(os.path.join(os.path.realpath(os.path.dirname(__file__)), "html_template", "overview_html"), os.path.join(network_html_folder_cutoff, "overview.html"))
rundata_networks_per_run[network_html_folder_cutoff] = []
html_subs_per_run[network_html_folder_cutoff] = []
print(rundata_networks_per_run)
# create pfams.js
pfams_js_file = os.path.join(output_folder, "html_content", "js", "pfams.js")
......@@ -3059,9 +3097,9 @@ if __name__=="__main__":
if "t1pks" not in product and "pksother" in valid_classes:
BGC_classes["PKSother"].append(clusterIdx)
if predicted_class == "Others" and "-" in product:
if predicted_class == "Others" and "." in product:
subclasses = set()
for subproduct in product.split("-"):
for subproduct in product.split("."):
subclass = sort_bgc(subproduct)
if subclass.lower() in valid_classes:
subclasses.add(subclass)
......
# Enter a list of pfam domains to be used for filtering BGCs during analysis
# One item per line (but comments are allowed after a tab). e.g.:
PF00067 Cytochrome P450
# PF00067 Cytochrome P450
......@@ -6444,3 +6444,513 @@ PF03985 202,68,92
PF09733 183,84,101
PF16944 205,55,208
PF07156 170,89,201
PF18314 185,90,146
PF18325 186,70,103
PF17951 75,195,64
PF17828 105,228,171
PF18558 104,215,138
PF06703 72,127,198
PF17829 192,81,111
PF10198 209,201,60
PF18098 124,106,218
PF17871 169,81,187
PF17784 86,191,90
PF17767 82,185,201
PF18029 90,208,104
PF17777 89,218,162
PF17667 201,185,51
PF17801 220,166,99
PF18132 65,193,98
PF17800 178,66,91
PF18129 208,165,93
PF18334 71,181,116
PF18332 157,185,86
PF17846 85,179,74
PF18317 56,215,75
PF18631 66,72,227
PF17786 63,209,201
PF17753 204,84,92
PF18018 79,116,186
PF14773 52,74,197
PF05238 130,60,196
PF02179 95,169,202
PF16035 98,78,180
PF10348 209,63,131
PF09135 158,86,188
PF05346 63,207,104
PF01331 77,89,198
PF03919 56,140,193
PF17678 89,123,201
PF17748 207,84,146
PF17747 71,154,203
PF08553 146,180,76
PF18369 192,179,67
PF18605 162,179,56
PF17754 222,71,138
PF18844 169,97,196
PF18288 52,189,209
PF09134 58,68,188
PF17969 142,204,85
PF18427 76,90,188
PF17843 83,185,170
PF17914 77,79,213
PF17755 175,198,76
PF17844 79,187,68
PF17837 198,68,92
PF17853 202,193,91
PF18412 204,137,88
PF17929 194,179,90
PF17931 130,202,54
PF18218 176,180,58
PF18563 95,217,77
PF18328 143,220,82
PF18291 76,186,201
PF18604 148,214,94
PF18105 71,184,96
PF18238 108,225,168
PF18456 81,192,95
PF17770 205,78,145
PF18267 130,82,207
PF13698 97,192,214
PF17645 72,105,187
PF17863 58,87,190
PF18047 91,195,69
PF18065 93,82,184
PF18679 96,205,78
PF17874 63,189,171
PF18158 179,52,62
PF17765 190,109,59
PF18603 90,215,56
PF16332 198,84,61
PF17836 164,52,206
PF17762 190,83,224
PF01396 90,49,178
PF17720 206,64,131
PF18152 80,206,228
PF17925 58,191,127
PF17652 86,216,70
PF17940 59,82,188
PF17848 69,198,91
PF17920 194,84,83
PF18678 91,161,189
PF17668 185,82,101
PF17967 92,177,219
PF18130 80,117,197
PF17866 196,89,162
PF02395 219,77,223
PF04986 123,58,184
PF02387 93,137,191
PF10723 200,50,138
PF01848 58,194,211
PF04352 109,196,84
PF12602 223,100,74
PF07057 216,79,67
PF06586 93,196,194
PF05309 211,89,184
PF07178 192,155,93
PF05513 56,221,193
PF05509 221,191,59
PF05261 133,107,225
PF17496 55,185,116
PF06952 223,100,159
PF06290 102,224,124
PF06006 192,228,102
PF07339 220,106,212
PF18090 135,90,221
PF07395 72,93,186
PF01278 78,219,163
PF18863 70,77,187
PF06291 75,103,197
PF00407 56,195,215
PF18335 92,193,209
PF17932 165,217,107
PF18370 182,49,109
PF17886 215,176,93
PF18073 82,197,196
PF17805 152,81,182
PF18042 156,229,83
PF18058 83,217,199
PF13737 102,219,229
PF17827 125,183,79
PF18381 125,91,193
PF17918 107,98,204
PF18480 193,159,69
PF18374 193,193,79
PF18376 104,190,211
PF17657 220,76,163
PF14398 73,222,228
PF02334 170,189,64
PF07737 89,139,178
PF17476 203,83,103
PF17475 70,220,141
PF03495 181,146,54
PF17938 120,104,228
PF04195 141,183,87
PF17759 209,87,97
PF18089 199,101,213
PF17676 72,81,192
PF17912 107,58,183
PF17900 63,189,168
PF04392 56,122,208
PF00771 226,88,61
PF18164 93,113,215
PF18082 96,202,155
PF16962 116,205,90
PF18741 50,58,197
PF18234 61,82,187
PF17788 139,60,217
PF17885 60,168,185
PF18019 222,221,100
PF18395 227,68,146
PF08852 79,83,181
PF18305 72,105,216
PF17763 163,191,57
PF17830 71,209,67
PF12917 196,97,191
PF18306 94,191,184
PF04269 58,79,203
PF18143 186,160,63
PF18565 72,209,137
PF10162 124,220,72
PF17782 80,199,164
PF17764 190,47,144
PF17802 184,154,56
PF17855 96,185,78
PF18086 75,212,112
PF06990 189,228,60
PF18014 218,55,73
PF09610 198,100,97
PF17615 216,201,60
PF17933 107,218,109
PF09150 51,195,140
PF17954 224,140,102
PF07338 203,158,62
PF17677 117,82,211
PF01371 178,180,63
PF18299 68,218,71
PF09764 104,225,124
PF02077 227,217,96
PF15469 225,79,183
PF18759 78,207,202
PF18388 70,209,113
PF09494 147,222,110
PF09507 123,183,53
PF18553 191,56,212
PF18262 227,130,104
PF06728 70,219,73
PF15404 53,88,212
PF14633 182,95,209
PF14635 191,84,117
PF14639 201,124,61
PF14641 203,76,131
PF14632 185,74,170
PF18785 140,183,88
PF08784 77,204,97
PF13869 162,181,90
PF17674 208,54,195
PF04161 166,80,195
PF04190 84,193,145
PF06825 172,72,219
PF18307 62,208,136
PF03849 171,79,210
PF08696 203,140,64
PF17120 223,88,95
PF08265 94,111,212
PF08149 209,55,158
PF18773 115,86,197
PF12656 90,196,186
PF04699 221,183,94
PF08637 194,148,53
PF06817 69,67,201
PF18031 192,72,179
PF00382 152,190,64
PF08271 69,183,85
PF01781 215,63,196
PF01576 152,215,98
PF02736 78,213,177
PF17215 90,73,186
PF17849 181,164,70
PF08767 46,185,109
PF10444 219,136,100
PF03896 203,91,91
PF01778 59,223,63
PF04135 102,215,111
PF07061 77,145,211
PF02893 62,186,162
PF18803 170,211,82
PF18758 176,55,192
PF17862 219,69,225
PF03215 107,110,219
PF09320 150,53,203
PF07808 222,154,100
PF05982 203,84,133
PF12484 152,183,76
PF01747 77,108,187
PF16525 181,119,75
PF04650 74,138,184
PF11151 74,200,204
PF13038 88,64,191
PF10814 98,217,147
PF03872 125,65,209
PF17188 69,220,133
PF09899 161,204,61
PF11142 94,202,102
PF04333 81,122,214
PF05494 147,194,90
PF05951 98,212,113
PF09140 104,207,95
PF03755 77,185,85
PF08340 131,69,198
PF00684 80,212,124
PF11457 193,225,90
PF13670 92,205,200
PF13651 197,223,102
PF17964 183,74,83
PF17757 222,94,166
PF17775 90,215,84
PF18367 83,192,90
PF17939 133,60,210
PF17689 207,104,89
PF17994 64,180,66
PF18183 70,221,207
PF17778 187,64,137
PF17990 195,117,49
PF18417 198,69,77
PF18556 51,96,184
PF18431 225,118,112
PF18593 191,76,191
PF18614 144,184,78
PF17831 207,124,101
PF18277 74,218,84
PF17957 186,161,76
PF17758 95,198,229
PF17991 189,89,139
PF17760 59,119,208
PF18859 179,73,157
PF18564 111,50,185
PF18845 64,211,155
PF17769 152,85,204
PF18726 131,105,212
PF18364 88,212,196
PF18841 179,190,74
PF18783 80,206,110
PF17926 220,76,64
PF18448 89,76,184
PF18693 78,198,192
PF17854 127,207,62
PF18075 122,191,80
PF17648 103,215,57
PF18085 181,197,85
PF17928 80,198,61
PF17935 74,126,218
PF18011 128,72,194
PF17996 172,179,88
PF18860 166,179,67
PF18313 184,192,93
PF17820 65,63,208
PF17851 110,225,110
PF18725 60,212,220
PF15591 222,81,90
PF18407 201,193,54
PF18413 200,200,97
PF18276 87,187,108
PF18096 170,181,61
PF18765 225,120,66
PF18013 106,215,163
PF17766 92,186,71
PF18766 64,57,212
PF18135 194,210,79
PF18092 205,197,77
PF17806 142,192,60
PF18495 62,184,54
PF18559 198,218,59
PF18280 185,177,79
PF17864 80,71,202