diff --git a/bigscape.py b/bigscape.py
index 9deb125ab14ca6c88402f4fcd6c2537d3e3dad0d..ee6fbc7750f714f1f0a597216464bdf60936b7c8 100644
--- a/bigscape.py
+++ b/bigscape.py
@@ -71,10 +71,11 @@ def get_gbk_files(inputdir, bgc_fasta_folder, min_bgc_size, exclude_gbk_str, bgc
 
     file_counter = 0
     processed_sequences = 0
-    #biosynthetic_genes = set()
+    biosynthetic_genes = set()
     product_list_per_record = []
     fasta_data = []
     save_fasta = True
+    contig_edge = False
     
     print("\nImporting GenBank files")
     if exclude_gbk_str != "":
@@ -122,7 +123,7 @@ def get_gbk_files(inputdir, bgc_fasta_folder, min_bgc_size, exclude_gbk_str, bgc
             else:
                 bgc_size = 0
                 cds_ctr = 0
-                group = "no type"
+                product = "no type"
                 del product_list_per_record[:]
                 
                 max_width = 0 # This will be used for the SVG figure
@@ -135,7 +136,6 @@ def get_gbk_files(inputdir, bgc_fasta_folder, min_bgc_size, exclude_gbk_str, bgc
                         max_width = len(record.seq)
                     
                     for feature in record.features:
-                        #is_biosynthetic = False
                         if "cluster" in feature.type and "product" in feature.qualifiers:
                             if len(feature.qualifiers["product"]) > 1:
                                 print("  WARNING: more than product annotated in record " + str(record_count) + ", " + fname)
@@ -144,12 +144,8 @@ def get_gbk_files(inputdir, bgc_fasta_folder, min_bgc_size, exclude_gbk_str, bgc
                                 product_list_per_record.append(feature.qualifiers["product"][0].replace(" ",""))
                 
                         # Get biosynthetic genes + sequences
-                        if feature.type == "CDS" and save_fasta:
+                        if feature.type == "CDS":
                             cds_ctr += 1
-                            
-                            #if "sec_met" in feature.qualifiers:
-                                #if "Kind: biosynthetic" in feature.qualifiers["sec_met"]:
-                                    #is_biosynthetic = True
                                     
                             CDS = feature
                             gene_id = ""
@@ -173,70 +169,76 @@ def get_gbk_files(inputdir, bgc_fasta_folder, min_bgc_size, exclude_gbk_str, bgc
                                 strand = '+'
                             else:
                                 strand = '-'
-
-                            if 'translation' in CDS.qualifiers.keys():
-                                prot_seq = CDS.qualifiers['translation'][0]
-                            # If translation isn't available translate manually, this will take longer
-                            else:
-                                nt_seq = CDS.location.extract(record.seq)
-                                
-                                # If we know sequence is an ORF (like all CDSs), codon table can be
-                                #  used to correctly translate alternative start codons.
-                                #  see http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc25
-                                # If the sequence has a fuzzy start/end, it might not be complete,
-                                # (therefore it might not be the true start codon)
-                                # However, in this case, if 'translation' not available, assume 
-                                #  this is just a random sequence 
-                                complete_cds = False 
                                 
-                                # More about fuzzy positions
-                                # http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc39
-                                fuzzy_start = False 
-                                if str(CDS.location.start)[0] in "<>":
-                                    complete_cds = False
-                                    fuzzy_start = True
+                            fasta_header = clusterName + "_ORF" + str(cds_ctr)+ ":gid:" + str(gene_id) + ":pid:" + str(protein_id) + ":loc:" + str(gene_start) + ":" + str(gene_end) + ":strand:" + strand
+                            fasta_header = fasta_header.replace(">","") #the coordinates might contain larger than signs, tools upstream don't like this
+                            fasta_header = fasta_header.replace(" ", "") #the domtable output format (hmmscan) uses spaces as a delimiter, so these cannot be present in the fasta header
+
+                            if "sec_met" in feature.qualifiers:
+                                if "Kind: biosynthetic" in feature.qualifiers["sec_met"]:
+                                    biosynthetic_genes.add(fasta_header)
+
+                            fasta_header = ">"+fasta_header
+                            
+
+                            if save_fasta:
+                                if 'translation' in CDS.qualifiers.keys():
+                                    prot_seq = CDS.qualifiers['translation'][0]
+                                # If translation isn't available translate manually, this will take longer
+                                else:
+                                    nt_seq = CDS.location.extract(record.seq)
                                     
-                                fuzzy_end = False
-                                if str(CDS.location.end)[0] in "<>":
-                                    fuzzy_end = True
-                                
-                                #for protein sequence if it is at the start of the entry assume 
-                                # that end of sequence is in frame and trim from the beginning
-                                #if it is at the end of the genbank entry assume that the start 
-                                # of the sequence is in frame
-                                reminder = len(nt_seq)%3
-                                if reminder > 0:
-                                    if fuzzy_start and fuzzy_end:
-                                        print("Warning, CDS (" + clusterName + ", " + CDS.qualifiers.get('locus_tag',"")[0] + ") has fuzzy start and end positions, and a sequence length not multiple of three. Skipping")
-                                        break
+                                    # If we know sequence is an ORF (like all CDSs), codon table can be
+                                    #  used to correctly translate alternative start codons.
+                                    #  see http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc25
+                                    # If the sequence has a fuzzy start/end, it might not be complete,
+                                    # (therefore it might not be the true start codon)
+                                    # However, in this case, if 'translation' not available, assume 
+                                    #  this is just a random sequence 
+                                    complete_cds = False 
                                     
-                                    if fuzzy_start:
-                                        if reminder == 1:
-                                            nt_seq = nt_seq[1:]
-                                        else:
-                                            nt_seq = nt_seq[2:]
-                                    # fuzzy end
-                                    else:
-                                        #same logic reverse direction
-                                        if reminder == 1:
-                                            nt_seq = nt_seq[:-1]
+                                    # More about fuzzy positions
+                                    # http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc39
+                                    fuzzy_start = False 
+                                    if str(CDS.location.start)[0] in "<>":
+                                        complete_cds = False
+                                        fuzzy_start = True
+                                        
+                                    fuzzy_end = False
+                                    if str(CDS.location.end)[0] in "<>":
+                                        fuzzy_end = True
+                                    
+                                    #for protein sequence if it is at the start of the entry assume 
+                                    # that end of sequence is in frame and trim from the beginning
+                                    #if it is at the end of the genbank entry assume that the start 
+                                    # of the sequence is in frame
+                                    reminder = len(nt_seq)%3
+                                    if reminder > 0:
+                                        if fuzzy_start and fuzzy_end:
+                                            print("Warning, CDS (" + clusterName + ", " + CDS.qualifiers.get('locus_tag',"")[0] + ") has fuzzy start and end positions, and a sequence length not multiple of three. Skipping")
+                                            break
+                                        
+                                        if fuzzy_start:
+                                            if reminder == 1:
+                                                nt_seq = nt_seq[1:]
+                                            else:
+                                                nt_seq = nt_seq[2:]
+                                        # fuzzy end
                                         else:
-                                            nt_seq = nt_seq[:-2]
-                                
-                                # The Genetic Codes: www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
-                                if "transl_table" in CDS.qualifiers.keys():
-                                    CDStable = CDS.qualifiers.get("transl_table", "")[0]
-                                    prot_seq = str(nt_seq.translate(table=CDStable, to_stop=True, cds=complete_cds))
-                                else:
-                                    prot_seq = str(nt_seq.translate(to_stop=True, cds=complete_cds))
+                                            #same logic reverse direction
+                                            if reminder == 1:
+                                                nt_seq = nt_seq[:-1]
+                                            else:
+                                                nt_seq = nt_seq[:-2]
                                     
-                            fasta_header = clusterName + "_ORF" + str(cds_ctr)+ ":gid:" + str(gene_id) + ":pid:" + str(protein_id) + ":loc:" + str(gene_start) + ":" + str(gene_end) + ":strand:" + strand
-                            fasta_header = fasta_header.replace(">","") #the coordinates might contain larger than signs, tools upstream don't like this
-                            fasta_header = ">"+(fasta_header.replace(" ", "")) #the domtable output format (hmmscan) uses spaces as a delimiter, so these cannot be present in the fasta header
-                            fasta_data.append((fasta_header, prot_seq))
-                            
-                            #if is_biosynthetic:
-                                #biosynthetic_genes.add(fasta_header[1:])
+                                    # The Genetic Codes: www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
+                                    if "transl_table" in CDS.qualifiers.keys():
+                                        CDStable = CDS.qualifiers.get("transl_table", "")[0]
+                                        prot_seq = str(nt_seq.translate(table=CDStable, to_stop=True, cds=complete_cds))
+                                    else:
+                                        prot_seq = str(nt_seq.translate(to_stop=True, cds=complete_cds))
+                                        
+                                fasta_data.append((fasta_header, prot_seq))
                 
                 if bgc_size > min_bgc_size:  # exclude the bgc if it's too small
                     file_counter += 1
@@ -244,26 +246,35 @@ def get_gbk_files(inputdir, bgc_fasta_folder, min_bgc_size, exclude_gbk_str, bgc
                     # In particular, handle different products for multi-record files
                     product_set = set(product_list_per_record)
                     if len(product_set) == 1: # only one type of product
-                        group = product_list_per_record[0]
+                        product = product_list_per_record[0]
                     elif "other" in product_set: # more than one, and it contains "other"
                         if len(product_set) == 2:
-                            group = list(product_set - set(['other']))[0] # group = not "other"
+                            product = list(product_set - set(['other']))[0] # product = not "other"
                         else:
-                            group = "-".join(product_set - set(['other'])) # likely a hybrid
+                            product = "-".join(product_set - set(['other'])) # likely a hybrid
                     else:
-                        group = "-".join(product_set) # likely a hybrid
+                        product = "-".join(product_set) # likely a hybrid
                      
                     
                     # assuming that the definition field is the same in all records
-                    # group: antiSMASH predicted class of metabolite
+                    # product: antiSMASH predicted class of metabolite
                     # gbk definition
                     # number of records (for Arrower figures)
                     # max_width: width of the largest record (for Arrower figures)
                     # id: the GenBank's accession
-                    # Change this tuple into a nice object. TODO
-                    bgc_info[clusterName] = (group, records[0].description, len(records), max_width, records[0].id)
-                    #bgc_info[clusterName] = (group, records[0].description, len(records), max_width, records[0].id, biosynthetic_genes.copy())
-
+                    #bgc_info[clusterName] = (product, records[0].description, len(records), max_width, records[0].id, biosynthetic_genes.copy())
+                    # TODO contig_edge annotation is not present for antiSMASH v < 4
+                    # Perhaps we can try to infer if it's in a contig edge: if
+                    # - first biosynthetic gene start < 10kb or
+                    # - max_width - last biosynthetic gene end < 10kb (but this will work only for the largest record)
+                    bgc_info[clusterName] = bgc_data(records[0].id, records[0].description, product, len(records), max_width, biosynthetic_genes.copy(), contig_edge)
+
+                    if len(bgc_info[clusterName].biosynthetic_genes) == 0:
+                        print("  Warning: No Biosynthetic Genes found in " + clusterName)
+
+                    # TODO why re-process everything if it was already in the list?
+                    # if name already in genbankDict.keys -> add current_dir
+                    # else: extract all info
                     if clusterName in genbankDict.keys():
                         # Name was already in use. Use current_dir as the new sample's name
                         genbankDict[clusterName][1].add(current_dir) 
@@ -272,13 +283,14 @@ def get_gbk_files(inputdir, bgc_fasta_folder, min_bgc_size, exclude_gbk_str, bgc
                         genbankDict.setdefault(clusterName, [os.path.join(dirpath, fname), set([current_dir])])
                         
                         # See if we need to write down the sequence
-                        if save_fasta:
+                        outputfile = os.path.join(bgc_fasta_folder, clusterName + '.fasta')
+                        if os.path.isfile(outputfile) and os.path.getsize(outputfile) > 0:
+                            processed_sequences += 1
+                        else:
                             with open(outputfile,'w') as fastaHandle:
                                 for header_sequence in fasta_data:
                                     fastaHandle.write('%s\n' % header_sequence[0])
                                     fastaHandle.write('%s\n' % header_sequence[1])
-                        else:
-                            processed_sequences += 1
                             
                             
                     if verbose:
@@ -288,7 +300,7 @@ def get_gbk_files(inputdir, bgc_fasta_folder, min_bgc_size, exclude_gbk_str, bgc
                     print(" Discarding " + clusterName +  " (size less than " + str(min_bgc_size) + " bp, was " + str(bgc_size) + ")")
                 
                 del fasta_data[:]
-                #biosynthetic_genes.clear()
+                biosynthetic_genes.clear()
     
     if file_counter == 0:
         sys.exit("\nError: There are no files to process")
@@ -887,9 +899,9 @@ def clusterJsonBatch(outputFileBase,matrix,cutoffs=[1.0],damping=0.8):
             bgcName = clusterNames[int(bgc)]
             bgcJsonDict[bgcName] = {}
             bgcJsonDict[bgcName]["id"] = clusterNames[int(bgc)]
-            bgcJsonDict[bgcName]["desc"] = bgc_info[bgcName][1]
-            bgcJsonDict[bgcName]["start"] = int(bgc_info[bgcName][2])
-            bgcJsonDict[bgcName]["end"] = int(bgc_info[bgcName][3])
+            bgcJsonDict[bgcName]["desc"] = bgc_info[bgcName].description
+            bgcJsonDict[bgcName]["start"] = 1
+            bgcJsonDict[bgcName]["end"] = bgc_info[bgcName].max_width
             pfdFile = os.path.join(pfd_folder, bgcName + ".pfd")
             fastaFile = os.path.join(bgc_fasta_folder, bgcName + ".fasta")
             orfDict = defaultdict(dict)
@@ -1026,6 +1038,25 @@ def CMD_parser():
 if __name__=="__main__":
     options = CMD_parser()
     
+    class bgc_data:
+        def __init__(self, accession_id, description, product, records, max_width, biosynthetic_genes, contig_edge):
+            # These two properties come from the genbank file:
+            self.accession_id = accession_id
+            self.description = description
+            # AntiSMASH predicted class of compound:
+            self.product = product
+            # number of records in the genbank file (think of multi-locus BGCs):
+            self.records = records
+            # length of largest record (it will be used for ArrowerSVG):
+            self.max_width = int(max_width)
+            # Internal set of tags corresponding to genes that AntiSMASH marked 
+            # as "Kind: Biosynthetic". It is formed as
+            # clusterName + "_ORF" + cds_number + ":gid:" + gene_id + ":pid:" + protein_id + ":loc:" + gene_start + ":" + gene_end + ":strand:" + {+,-}
+            self.biosynthetic_genes = biosynthetic_genes
+            # AntiSMASH 4+ marks BGCs that sit on the edge of a contig
+            self.contig_edge = contig_edge
+    
+    
     if options.outputdir == "":
         print "please provide a name for an output folder using parameter -o or --outputdir"
         sys.exit(0)
@@ -1433,7 +1464,7 @@ if __name__=="__main__":
         # is not found, the text files with colors need to be updated
         print("  Reading BGC information and writing SVG")
         for bgc in working_set:
-            SVG(False, os.path.join(svg_folder,bgc+".svg"), genbankDict[bgc][0], os.path.join(pfd_folder,bgc+".pfd"), True, color_genes, color_domains, pfam_domain_categories, pfam_info, bgc_info[bgc][2], bgc_info[bgc][3])
+            SVG(False, os.path.join(svg_folder,bgc+".svg"), genbankDict[bgc][0], os.path.join(pfd_folder,bgc+".pfd"), True, color_genes, color_domains, pfam_domain_categories, pfam_info, bgc_info[bgc].records, bgc_info[bgc].max_width)
             
         color_genes.clear()
         color_domains.clear()
@@ -1546,7 +1577,7 @@ if __name__=="__main__":
             # only choose from valid classes
             mix_set = []
             for clusterIdx,cluster in enumerate(clusterNames):
-                product = bgc_info[cluster][0]
+                product = bgc_info[cluster].product
                 predicted_class = sort_bgc(product)
                 if predicted_class.lower() in valid_classes:
                     mix_set.append(clusterIdx)
@@ -1558,8 +1589,8 @@ if __name__=="__main__":
                 list_file.write("BGC\tAccesion ID\tDescription\tProduct Prediction\tBiG-SCAPE class\n")
                 for idx in mix_set:
                     bgc = clusterNames[idx]
-                    product = bgc_info[bgc][0]
-                    list_file.write("\t".join([bgc, bgc_info[bgc][4], bgc_info[bgc][1], product, sort_bgc(product)]) + "\n")
+                    product = bgc_info[bgc].product
+                    list_file.write("\t".join([bgc, bgc_info[bgc].accession_id, bgc_info[bgc].description, product, sort_bgc(product)]) + "\n")
             
             print("  Calculating all pairwise distances")
             
@@ -1592,7 +1623,7 @@ if __name__=="__main__":
             # Preparing gene cluster classes
             print("  Sorting the input BGCs\n")
             for clusterIdx,cluster in enumerate(clusterNames):
-                product = bgc_info[cluster][0]
+                product = bgc_info[cluster].product
                 predicted_class = sort_bgc(product)
                 if predicted_class.lower() in valid_classes:
                     BGC_classes[predicted_class].append(clusterIdx)
@@ -1634,8 +1665,8 @@ if __name__=="__main__":
                     list_file.write("BGC\tAccesion ID\tDescription\tProduct Prediction\tBiG-SCAPE class\n")
                     for idx in BGC_classes[bgc_class]:
                         bgc = clusterNames[idx]
-                        product = bgc_info[bgc][0]
-                        list_file.write("\t".join([bgc, bgc_info[bgc][4], bgc_info[bgc][1], product, sort_bgc(product)]) + "\n")
+                        product = bgc_info[bgc].product
+                        list_file.write("\t".join([bgc, bgc_info[bgc].accession_id, bgc_info[bgc].description, product, sort_bgc(product)]) + "\n")
             
                     
                 if len(BGC_classes[bgc_class]) > 1:
@@ -1686,7 +1717,7 @@ if __name__=="__main__":
                         
                         mix_set = []
                         for clusterIdx, sampleCluster in enumerate(sampleClusters):
-                            product = bgc_info[sampleCluster][0]
+                            product = bgc_info[sampleCluster].product
                             predicted_class = sort_bgc(product)
                             if predicted_class.lower() in valid_classes:
                                 mix_set.append(clusterIdx)
@@ -1698,8 +1729,8 @@ if __name__=="__main__":
                             list_file.write("BGC\tAccesion ID\tDescription\tProduct Prediction\tBiG-SCAPE class\n")
                             for idx in mix_set:
                                 bgc = clusterNames[idx]
-                                product = bgc_info[bgc][0]
-                                list_file.write("\t".join([bgc, bgc_info[bgc][4], bgc_info[bgc][1], product, sort_bgc(product)]) + "\n")
+                                product = bgc_info[bgc].product
+                                list_file.write("\t".join([bgc, bgc_info[bgc].accession_id, bgc_info[bgc].description, product, sort_bgc(product)]) + "\n")
             
                         pairs = set(map(tuple, map(sorted, combinations(mix_set, 2))))
                         del mix_set[:]
@@ -1729,7 +1760,7 @@ if __name__=="__main__":
                         # Preparing gene cluster classes
                         print("   Sorting the input BGCs\n")
                         for cluster in sampleClusters:
-                            product = bgc_info[cluster][0]
+                            product = bgc_info[cluster].product
                             predicted_class = sort_bgc(product)
                             if predicted_class.lower() in valid_classes:
                                 BGC_classes[predicted_class].append(clusterNames2idx[cluster])
@@ -1770,8 +1801,8 @@ if __name__=="__main__":
                                 list_file.write("BGC\tAccesion ID\tDescription\tProduct Prediction\tBiG-SCAPE class\n")
                                 for idx in BGC_classes[bgc_class]:
                                     bgc = clusterNames[idx]
-                                    product = bgc_info[bgc][0]
-                                    list_file.write("\t".join([bgc, bgc_info[bgc][4], bgc_info[bgc][1], product, sort_bgc(product)]) + "\n")
+                                    product = bgc_info[bgc].product
+                                    list_file.write("\t".join([bgc, bgc_info[bgc].accession_id, bgc_info[bgc].description, product, sort_bgc(product)]) + "\n")
 
                             if len(BGC_classes[bgc_class]) > 1:
                                 pairs = set(map(tuple, map(sorted, combinations(BGC_classes[bgc_class], 2))))
diff --git a/functions.py b/functions.py
index ea13534a404148c53ea97474615396e7a5707397..2610b0f22e39c9690b4671a80bbb7d28c68b32fa 100644
--- a/functions.py
+++ b/functions.py
@@ -301,7 +301,7 @@ def network_parser(network_file, Jaccardw, DSSw, GKw, anchorboost):
     return network
 
 
-def write_network_matrix(matrix, cutoffs_and_filenames, include_singletons, clusterNames, group_dict):
+def write_network_matrix(matrix, cutoffs_and_filenames, include_singletons, clusterNames, bgc_info):
     """
     An entry in the distance matrix is currently (all floats):
       0         1           2      3      4       5    6    7    8      9     10   11
@@ -332,8 +332,8 @@ def write_network_matrix(matrix, cutoffs_and_filenames, include_singletons, clus
         row = [gc1, gc2]
         
         # get AntiSMASH annotations
-        clus1group = group_dict[gc1][0]
-        clus2group = group_dict[gc2][0]
+        clus1group = bgc_info[gc1].product
+        clus2group = bgc_info[gc2].product
         
         # add all the other floats
         row.extend(matrix_entry[3:-2])