diff --git a/bigscape.py b/bigscape.py
index fd71d88173ba142b9c1036dd06241c2d11b22160..c649469a31727172de327261fe8cd28f8bbffa80 100644
--- a/bigscape.py
+++ b/bigscape.py
@@ -70,9 +70,11 @@ FFTNS2 method will be used.
 
 
 def get_gbk_files(inputdir, min_bgc_size, exclude_gbk_str):
-    """Searches given directory for genbank files recursively, will assume that the genbank files that have the same name
-    are the same genbank file. Returns a dictionary that contains the names of the clusters found as keys and a list
-    that contains a path to the genbank file and the samples that the genbank file is a part of:
+    """Searches given directory for genbank files recursively, will assume that
+    the genbank files that have the same name are the same genbank file. 
+    Returns a dictionary that contains the names of the clusters found as keys
+    and a list that contains [0] a path to the genbank file and [1] the 
+    samples that the genbank file is a part of.
     return: {cluster_name:[genbank_path,[s_a,s_b...]]}
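+
+    Illustrative example (hypothetical names): a file cluster_A.gbk found in
+    two sample directories, sample_1 and sample_2, would be stored as
+    {"cluster_A": ["inputdir/sample_1/cluster_A.gbk", set(["sample_1", "sample_2"])]}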
     """
 
@@ -80,7 +82,9 @@ def get_gbk_files(inputdir, min_bgc_size, exclude_gbk_str):
 
     file_counter = 0
 
-    print("Importing the gbk files, while skipping those with '" + exclude_gbk_str + "' in their filename")
+    print("\nImporting GenBank files")
+    if exclude_gbk_str != "":
+        print(" Skipping files with '" + exclude_gbk_str + "' in their filename")
 
     # this doesn't seem to make a difference. Confirm
     # if inputdir != "" and inputdir[-1] != "/":
@@ -98,12 +102,29 @@ def get_gbk_files(inputdir, min_bgc_size, exclude_gbk_str):
 
         for fname in filenames:
             fname_parse = fname.split('.')
-            if fname_parse[-1] == "gbk" and exclude_gbk_str not in fname:
-                if " " in fname:
-                    print "Your GenBank files should not have spaces in their filenames as HMMscan cannot process them properly ('too many arguments')."
-                    sys.exit(0)
-
-                gbk_header = open(os.path.join(dirpath_, fname), "r").readline().split(" ")
+            
+            if fname_parse[-1] != "gbk":
+                continue
+            
+            if exclude_gbk_str != "" and exclude_gbk_str in fname:
+                print(" Skipping file " + fname)
+                continue
+            
+            if " " in fname:
+                sys.exit("\nError: Input GenBank files should not have spaces in their filenames as HMMscan cannot process them properly ('too many arguments').")
+
+            with open(os.path.join(dirpath_, fname), "r") as f:
+                try:
+                    # basic file verification. Substitutes check_data_integrity
+                    SeqIO.read(f, "genbank")
+                except ValueError as e:
+                    print("   Error with file " + os.path.join(dirpath_, fname) + ": \n    '" + str(e) + "'")
+                    print("    (This file will be excluded from the analysis)")
+                    continue
+            
+            with open(os.path.join(dirpath_, fname), "r") as f:
+                gbk_header = f.readline().split(" ")
                 gbk_header = filter(None, gbk_header)  # remove the empty items from the list
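+                # Illustrative note: a GenBank LOCUS header typically looks like
+                #   "LOCUS  cluster_A  45820 bp  DNA  linear  ..." (hypothetical
+                #   name), so the token right before "bp" is taken as the BGC size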
                 try:
                     bgc_size = int(gbk_header[gbk_header.index("bp") - 1])
@@ -116,17 +137,28 @@ def get_gbk_files(inputdir, min_bgc_size, exclude_gbk_str):
                     file_counter += 1
                     
                     if clusterName in genbankDict.keys():
-                        genbankDict[clusterName][1].add(current_dir)
+                        # current_dir gets to be the name of the sample
+                        genbankDict[clusterName][1].add(current_dir) 
                     else:
-                        genbankDict.setdefault(clusterName, [os.path.join(dirpath_, fname), set([current_dir])]) # first instance of the file is genbankDict[clustername][0]
+                        # location of first instance of the file is genbankDict[clustername][0]
+                        genbankDict.setdefault(clusterName, [os.path.join(dirpath_, fname), set([current_dir])])
+                        
+                    if verbose == True:
+                        print("  Adding " + fname + " (" + str(bgc_size) + " bps)")
                 else:
-                    print(" Discarding " + clusterName +  " (size < " + str(min_bgc_size) + " bp)")
-                if verbose == True:
-                    print("  Adding " + fname + " (" + str(bgc_size) + " bps)")
+                    print(" Discarding " + clusterName +  " (size less than " + str(min_bgc_size) + " bp, was " + str(bgc_size) + ")")
+                    
+            # else: The file does not have the gbk extension. Skip it
+    
+    if file_counter == 0:
+        sys.exit("\nError: There are no files to process")
+        
+    if file_counter == 1:
+        sys.exit("\nError: Only one file found. Please input at least two files")
     
     if verbose:
         print("\n Starting with " + str(file_counter) + " files")
-    print("")
+
     return genbankDict
 
 def timeit(f):
@@ -532,7 +564,7 @@ def generateFasta(gbkfilePath,outputdir):
                     fastaHandle.write('%s\n' % prot_seq)
     return outputfile
 
-def runHmmScan(fastaPath,hmmPath,outputdir):
+def runHmmScan(fastaPath, hmmPath, outputdir, verbose):
     ## will run hmmscan command on a fasta file with a single core and generate a domtable file
     hmmFile = os.path.join(hmmPath,"Pfam-A.hmm")
     if os.path.isfile(fastaPath):
@@ -541,7 +573,8 @@ def runHmmScan(fastaPath,hmmPath,outputdir):
         
         hmmscan_cmd = "hmmscan --cpu 1 --domtblout %s --cut_tc %s %s" % (outputName,hmmFile,fastaPath)
         if verbose == True:
-            print("\t"+hmmscan_cmd)
+            print("  Processing " + name)
+            #print("\t"+hmmscan_cmd)
         subprocess.check_output(hmmscan_cmd, shell=True)
 
     else:
@@ -549,25 +582,41 @@ def runHmmScan(fastaPath,hmmPath,outputdir):
 
 def parseHmmScan(hmmscanResults,outputdir,overlapCutoff):
     outputbase = hmmscanResults.split(os.sep)[-1].replace(".domtable", "")
-    # try to read the domtable file to find out if this gbk has domains. Needed to parse domains into fastas anyway.
+    # try to read the domtable file to find out if this gbk has domains. Domains need to be parsed into fastas anyway.
     if os.path.isfile(hmmscanResults):
         pfd_matrix = domtable_parser(outputbase, hmmscanResults)
-        num_domains = len(pfd_matrix) # these might still be overlapped, but we only need a minimum number
+        num_domains = len(pfd_matrix) # domains may still overlap here, but we only need at least one
 
         if num_domains > 0:
             print(" Processing domtable file: " + outputbase)
 
             filtered_matrix, domains = check_overlap(pfd_matrix,overlapCutoff)  #removes overlapping domains, and keeps the highest scoring domain
+            
             # Save list of domains per BGC
             pfsoutput = os.path.join(outputdir, outputbase + ".pfs")
             with open(pfsoutput, 'wb') as pfs_handle:
                 write_pfs(pfs_handle, domains)
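+            # (a .pfs file stores the BGC's domain list as a single
+            # space-separated line; get_domain_list in functions.py reads it
+            # back with that assumption)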
+            
             # Save more complete information of each domain per BGC
             pfdoutput = os.path.join(outputdir, outputbase + ".pfd")
             with open(pfdoutput,'wb') as pfd_handle:
                 write_pfd(pfd_handle, filtered_matrix)
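+            # (a .pfd file stores one tab-separated row per filtered domain;
+            # Step 4 reads it back with line.split('\t'))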
+        else:
+            # there aren't any domains in this BGC
+            # delete from all data structures
+            print("  No domains where found in " + outputbase + ".domtable. Removing it from further analysis")
+            info = genbankDict.get(outputbase)
+            clusters.remove(outputbase)
+            baseNames.remove(outputbase)
+            gbk_files.remove(info[0])
+            for sample in info[1]:
+                sampleDict[sample].remove(outputbase)
+            del genbankDict[outputbase]
+            
     else:
-        print "Error hmmscan file doesn't exist"
+        sys.exit("Error: hmmscan file " + outputbase + " was not found! (parseHmmScan)")
+
+    return("")
 
 def genbank_grp_dict(gb_files,gbk_group):
     for gb_file in gb_files:
@@ -700,11 +749,11 @@ def CMD_parser():
     parser.add_option("-i", "--inputdir", dest="inputdir", default="",
                       help="Input directory of gbk files, if left empty, all gbk files in current and lower directories will be used.")
     parser.add_option("-c", "--cores", dest="cores", default=cpu_count(),
-                      help="Set the amount of cores the script and hmmscan may use")
+                      help="Set the amount of cores the script and hmmscan may use (default: use _all_ available cores)")
     parser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False,
-                      help="Toggle to true, will print and save some variables.")
+                      help="Prints more detailed information. Toggle to true.")
     parser.add_option("--include_disc_nodes", dest="include_disc_nodes", action="store_true", default=False,
-                      help="Toggle to true. Include nodes that have no edges to other nodes from the network, default is false.")
+                      help="Include nodes that have no edges to other nodes from the network, default is false. Toggle to true.")
     parser.add_option("-d", "--domain_overlap_cutoff", dest="domain_overlap_cutoff", default=0.1,
                       help="Specify at which overlap percentage domains are considered to overlap")
     parser.add_option("-m", "--min_bgc_size", dest="min_bgc_size", default=0,
@@ -734,8 +783,8 @@ def CMD_parser():
                       help="Location of hmmpress-processed Pfam files. Default is same location of BiG-SCAPE")
     parser.add_option("--anchorfile", dest="anchorfile", default="anchor_domains.txt",
                       help="Provide a custom name for the anchor domains file, default is anchor_domains.txt.")
-    parser.add_option("--exclude_gbk_str", dest="exclude_gbk_str", default="final",
-                      help="If this string occurs in the gbk filename, this will not be used for the analysis. Best to just leave out these samples to begin with.")
+    parser.add_option("--exclude_gbk_str", dest="exclude_gbk_str", default="",
+                      help="If this string occurs in the gbk filename, this file will not be used for the analysis.")
     
     parser.add_option("--mafft_pars", dest="mafft_pars", default="",
                       help="Add single/multiple parameters for mafft specific enclosed by quotation marks e.g. \"--nofft --parttree\"")
@@ -749,6 +798,8 @@ def CMD_parser():
                       help="Let the script calculate the percent identity between sequences? \
                       Or use the distout scores from the mafft output? As default it calculates the percent identity from the MSAs.")
     
+    parser.add_option("--force_hmmscan", dest="force_hmmscan", action="store_true", default=False, 
+                      help="Force domain prediction using hmmscan even if BiG-SCAPE finds processed domtable files (e.g. to use a new version of PFAM).")
     parser.add_option("--skip_hmmscan", dest="skip_hmmscan", action="store_true", default=False,
                       help="When skipping hmmscan, the GBK files should be available, and the domain tables need to be in the output folder.")
     parser.add_option("--skip_mafft", dest="skip_mafft", action="store_true", default=False, 
@@ -819,12 +870,6 @@ if __name__=="__main__":
         sys.exit()
                     
     verbose = options.verbose
-    args = sys.argv[1:]
-    if "-o" in args:
-        args.remove("-o")
-    else:
-        args.remove("--outputdir")
-    args.remove(output_folder)
     networks_folder = "networks"
     domainsout = "domains"
     seqdist_networks = options.seqdist_networks.split(",")
@@ -838,11 +883,13 @@ if __name__=="__main__":
             options.skip_hmmscan = False
             options.skip_mafft = False
     
+    time1 = time.time()
+    print("\n   - - Obtaining input files - -")
     
-    # Obtain gbk files
-    global gbk_files
+    # Make the following available for possibly deleting entries within parseHmmScan
+    global genbankDict, gbk_files, sampleDict, clusters, baseNames
     
-    # genbankDickt: {cluster_name:[genbank_path_to_1st_instance,[sample_1,sample_2,...]]}
+    # genbankDict: {cluster_name:[genbank_path_to_1st_instance,[sample_1,sample_2,...]]}
     genbankDict = get_gbk_files(options.inputdir, int(options.min_bgc_size), options.exclude_gbk_str)
 
     # clusters and sampleDict contain the necessary structure for all-vs-all and sample analysis
@@ -857,7 +904,13 @@ if __name__=="__main__":
             clustersInSample.add(cluster)
             sampleDict[sample] = clustersInSample
 
-    check_data_integrity([gbk_files]) # turned into a list-within-a-list to retain backwards compatibility
+    # This gets very messy if there are files with errors, as the original
+    # structures (clusters, sampleDict) are not changed. We'd have to extract
+    # only gbk_files, then process it and possibly delete entries, then obtain
+    # clusters and sampleDict. genbankDict.keys would still be holding non-valid
+    # entries. I'll pass the verifying to get_gbk_files for now
+    #print("\nVerifying input files")
+    #check_data_integrity([gbk_files]) # turned into a list-within-a-list to retain backwards compatibility
     
     print("\nCreating output directories")
     try:
@@ -890,7 +943,7 @@ if __name__=="__main__":
         # 183 (Windows) "[Error 183] Cannot create a file when that file already exists"
         if "Errno 17" in str(e) or "Error 183" in str(e):
             if not (options.skip_all or options.skip_hmmscan or options.skip_mafft):
-                print(" Emptying domains directory first")
+                print(" Emptying domains directory")
                 for thing in os.listdir(os.path.join(output_folder, domainsout)):
                     os.remove(os.path.join(output_folder, domainsout, thing))
             else:
@@ -898,153 +951,177 @@ if __name__=="__main__":
         else:
             print("Fatal error when trying to create domains' directory")
             sys.exit(str(e))
-    print("")
 
 
-    timings_file = open(os.path.join(output_folder, "runtimes.txt"), 'w') #open the file that will contain the timed functions
+    if verbose:
+        print(" Trying threading on %i cores" % cores)
     
+    #open the file that will contain the timed functions
+    timings_file = open(os.path.join(output_folder, "runtimes.txt"), 'w') 
     
-    #===========================================================================
-    # gbk_handle = open("gbk.txt", "w")
-    # for i in gbk_files:
-    #     gbk_handle.write(str(i)+"\n")
-    # gbk_handle.close()
-    #===========================================================================
-
-    
-    """BGCs -- dictionary of this structure:  BGCs = {'cluster_name_x': { 'general_domain_name_x' : ['specific_domain_name_1',
+    """BGCs -- 
+    dictionary of this structure:
+    BGCs = {'cluster_name_x': { 'general_domain_name_x' : ['specific_domain_name_1',
      'specific_domain_name_2'] } }
     - cluster_name_x: cluster name (can be anything)
     - general_domain_name_x: PFAM ID, for example 'PF00550'
     - specific_domain_name_x: ID of a specific domain that will allow to you to map it to names in DMS unequivocally
-     (for example, 'PF00550_start_end', where start and end are genomic positions)."""
-     
-     
+     (for example, 'PF00550_start_end', where start and end are genomic positions)."""
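+    # A concrete instance with hypothetical values:
+    #   BGCs = {"cluster_A": {"PF00550": ["PF00550_1200_1450", "PF00550_3100_3350"]}}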
     BGCs = {} #will contain the BGCs
+    
+    # contains the class of each gene cluster (predicted by antiSMASH and
+    # annotated in the GenBank files); used in the final network files
     global group_dct
     group_dct = {}
-    sequences_per_domain = {} # to avoid calling MAFFT if only 1 seq. in a particular domain
+    
+    # to avoid calling MAFFT if there's only 1 seq. representing a particular domain
+    sequences_per_domain = {} 
     
     #Loop over the samples
-    if options.skip_hmmscan or options.skip_mafft or options.skip_all:
-        print("Skipping processing of hmmscan output files")
-    else:
-        print("Predicting domains and parsing the hmmscan output files...")
-
+    #if options.skip_hmmscan or options.skip_mafft or options.skip_all:
+        #print("\nSkipping processing of hmmscan output files")
+    #else:
+        #print("\nPredicting domains and parsing the hmmscan output files...\n")
 
+    print("\n\n   - - Processing input files - -")
+    
     if options.splitProcess:
-        if verbose:
-            print("Trying Threading on %i cores" % cores)
-        # since the structure of gbk_files is nested lists of file paths need to unpack this
-
-        # This will go into clusters later for checking if we dropped files in processing
-        genbankFileNames = set(gbk_files)
-        baseNames = set(x.split(os.sep)[-1].replace(".gbk","") for x in genbankFileNames)
+        # These will be used to track if we drop files in processing
+        genbankFileLocations = set(gbk_files)
+        baseNames = set(clusters)
 
         ### Step 1: Generate Fasta Files
-        print "Parsing genbank Files to generate fasta files for hmmsearch"
-
-        # filter through task list to avoid unecessary computation, if output file is there and nonempty exclude it from list
+        print "\nParsing genbank files to generate fasta files for hmmscan"
 
+        # filter the task list to avoid unnecessary computation: if an output file is already there and non-empty, exclude its input from the list
         alreadyDone = set()
-
-        for genbank in genbankFileNames:
-            outputbase  = genbank.split(os.sep)[-1].replace(".gbk","")
+        for genbank in genbankFileLocations:
+            outputbase = genbank.split(os.sep)[-1].replace(".gbk","")
             outputfile = os.path.join(output_folder,outputbase + '.fasta')
             if os.path.isfile(outputfile) and os.path.getsize(outputfile) > 0:
                 alreadyDone.add(genbank)
 
-        if len(alreadyDone) > 0:
-            print "Found output files for %s, skipping these" % list(x.split(os.sep)[-1].replace(".gbk","") for x in alreadyDone)
-
+        if len(genbankFileLocations - alreadyDone) == 0:
+            print(" All GenBank files had already been processed")
+        elif len(alreadyDone) > 0:
+            print " Warning! The following NEW input file(s) will be processed: %s" % ", ".join(x.split(os.sep)[-1].replace(".gbk","") for x in genbankFileLocations - alreadyDone)
+        else:
+            print(" Processing " + str(len(genbankFileLocations)) + " files")
 
         # Generate Pool of workers
         pool = Pool(cores,maxtasksperchild=32)
-        for genbankFile in set(genbankFileNames) - alreadyDone:
+        for genbankFile in (genbankFileLocations - alreadyDone):
             pool.apply_async(generateFasta,args =(genbankFile,output_folder))
         pool.close()
         pool.join()
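+        # note: apply_async return values are discarded, so any exception
+        # raised inside generateFasta would pass silently; the fasta output
+        # is instead verified at the start of Step 2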
+        print " Finished generating fasta files."
 
-        print "Done generating fasta files checking output...."
 
         ### Step 2: Run hmmscan
-        fastaFiles = glob(os.path.join(output_folder,"*.fasta"))
+        print("\nPredicting domains using hmmscan")
+        
+        # grab fasta files locations
+        fastaFiles = set(glob(os.path.join(output_folder,"*.fasta")))
+        
+        # verify previous step
         fastaBases = set(x.split(os.sep)[-1].replace(".fasta","") for x in fastaFiles)
-
         if len(baseNames - fastaBases) > 0:
-            print "WARNING no fasta files for:", baseNames - fastaBases
-
-        if fastaFiles:
+            sys.exit("Error! The following files did NOT have their fasta sequences extracted: " + ", ".join(baseNames - fastaBases))
+        
+        if options.force_hmmscan:
+            # process all files, regardless of whether they already existed
+            task_set = fastaFiles
+            print(" Forcing domain prediction on ALL fasta files (--force_hmmscan)")
+        else:
+            # find already processed files
             alreadyDone = set()
             for fasta in fastaFiles:
                 outputbase  = fasta.split(os.sep)[-1].replace(".fasta","")
                 outputfile = os.path.join(output_folder,outputbase + '.domtable')
                 if os.path.isfile(outputfile) and os.path.getsize(outputfile) > 0:
                     alreadyDone.add(fasta)
+
+            if len(fastaFiles - alreadyDone) == 0:
+                print(" All fasta files had already been processed")
+            elif len(alreadyDone) > 0:
+                print " Warning! The following NEW fasta file(s) will be processed: %s" % ", ".join(x.split(os.sep)[-1].replace(".fasta","") for x in fastaFiles - alreadyDone)
+            else:
+                print(" Predicting domains for " + str(len(fastaFiles)) + " fasta files")
 
-            if len(alreadyDone) > 0:
-                print "Found output files for %s, skipping these" % list(x.split(os.sep)[-1].replace(".fasta","") for x in alreadyDone)
+            task_set = fastaFiles - alreadyDone
+            
+        pool = Pool(cores,maxtasksperchild=1)
+        for fastaFile in task_set:
+            pool.apply_async(runHmmScan,args=(fastaFile,pfam_dir,output_folder, verbose))
+        pool.close()
+        pool.join()
 
-            pool = Pool(cores,maxtasksperchild=1)
-            for fastaFile in set(fastaFiles) - alreadyDone:
-                pool.apply_async(runHmmScan,args=(fastaFile,pfam_dir,output_folder))
-            pool.close()
-            pool.join()
+        print " Finished generating domtable files."
 
-            print "Done generating domtable files checking output...."
-        else:
-            "ERROR: Tasklist empty for running hmmscan"
 
         ### Step 3: Parse hmmscan domtable results and generate pfs and pfd files
-
-        domtblFiles = glob(os.path.join(output_folder,"*.domtable"))
+        print("\nParsing hmmscan domtable files")
+        
+        # grab all domtable file locations
+        domtblFiles = set(glob(os.path.join(output_folder,"*.domtable")))
+        
+        # verify previous step
         domtblBases = set(x.split(os.sep)[-1].replace(".domtable","") for x in domtblFiles)
         if len(baseNames - domtblBases) > 0:
-            print "WARNING no domtbl files for:", baseNames - domtblBases
-
-        if domtblFiles:
-
-            alreadyDone = set()
-
-            for domtable in domtblFiles:
-                outputbase  = fasta.split(os.sep)[-1].replace(".domtable","")
-                outputfile = os.path.join(output_folder,outputbase + '.pfd')
-                if os.path.isfile(outputfile) and os.path.getsize(outputfile) > 0:
-                    alreadyDone.add(domtable)
+            sys.exit("Error! The following files did NOT have their domains predicted by  sequences extracted: " + ", ".join(baseNames - domtblBases))
+        
+        # find already processed files
+        alreadyDone = set()
+        for domtable in domtblFiles:
+            outputbase  = domtable.split(os.sep)[-1].replace(".domtable","")
+            outputfile = os.path.join(output_folder,outputbase + '.pfd')
+            if os.path.isfile(outputfile) and os.path.getsize(outputfile) > 0:
+                alreadyDone.add(domtable)
 
-            if len(alreadyDone) > 0:
-                print "Found output files for %s, skipping these" % list(x.split(os.sep)[-1].replace(".fasta","") for x in alreadyDone)
+        if len(domtblFiles - alreadyDone) == 0:
+            print(" All domtable files had already been processed")
+        elif len(alreadyDone) > 0:
+            print " Warning! The following domtable files had not been processed: %s" % ", ".join(x.split(os.sep)[-1].replace(".domtable","") for x in domtblFiles - alreadyDone)
+        else:
+            print(" Processing " + str(len(fastaFiles)) + " domtable files")
 
-            pool = Pool(cores,maxtasksperchild=32)
+        # If using the multiprocessing version and outputbase doesn't have any
+        #  predicted domains, it's not as easy to remove it from the analysis
+        #  (probably because parseHmmScan only has a copy of clusters et al.?)
+        # Using the serialized version for now; the impact is probably minor
+        #pool = Pool(cores,maxtasksperchild=32)
+        for domtableFile in domtblFiles - alreadyDone:
+            parseHmmScan(domtableFile,output_folder,options.domain_overlap_cutoff)
+            #pool.apply_async(parseHmmScan, args=(domtableFile,output_folder,options.domain_overlap_cutoff))
+        #pool.close()
+        #pool.join()
 
-            for domtableFile in set(domtblFiles) - alreadyDone:
-                pool.apply_async(parseHmmScan, args=(domtableFile,output_folder,options.domain_overlap_cutoff))
-            pool.close()
-            pool.join()
+        print " Finished generating generating pfs and pfd files."
 
-            print "Done generating generating pfs and pfd files checking output...."
-        else:
-            "ERROR: Tasklist empty for parsing hmmscan"
 
         ### Step 4: Parse the pfs, pfd files to generate BGC dictionary, clusters, and clusters per sample objects
-            ## Note: this code currently doesn't modify the fasta files to include the domain fasta IDs
-
+        print("\nProcessing domains sequence files")
+        
+        # grab all pfd files
         pfdFiles = glob(os.path.join(output_folder,"*.pfd"))
+        
+        # verify previous step. All BGCs without predicted domains should be gone
         pfdBases = set(x.split(os.sep)[-1].replace(".pfd","") for x in pfdFiles)
-
         if len(baseNames - pfdBases) > 0:
-            print "WARNING no domtbl files for:", baseNames - pfdBases
-
+            sys.exit("Error! The following files did NOT have their domains predicted by  sequences extracted: " + ", ".join(baseNames - pfdBases))
+    
+        if verbose:
+            print(" Adding sequences to corresponding domains file")
         for outputbase in pfdBases:
-            print "Processing: %s" % outputbase
+            if verbose:
+                print("   Processing: " + outputbase)
 
             pfdFile = os.path.join(output_folder, outputbase + ".pfd")
-            pfsFile = os.path.join(output_folder, outputbase + ".pfs")
             fasta_file = os.path.join(output_folder, outputbase + ".fasta")
 
-            fasta_dict = fasta_parser(open(fasta_file))
+            fasta_dict = fasta_parser(open(fasta_file)) # all fasta info from a BGC
             filtered_matrix = [map(lambda x: x.strip(), line.split('\t')) for line in open(pfdFile)]
 
+            # save each domain sequence from a single BGC in its corresponding file
             save_domain_seqs(filtered_matrix, fasta_dict, domainsout, output_folder, outputbase)
 
             BGCs[outputbase] = BGC_dic_gen(filtered_matrix)
@@ -1120,7 +1197,6 @@ if __name__=="__main__":
             pairs = set(map(tuple, map(sorted, combinations(clusters, 2))))
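+            # e.g. clusters = ["B", "A", "C"] yields the unique sorted pairs
+            # {("A", "B"), ("A", "C"), ("B", "C")}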
             cluster_pairs = [(x, y, "domain_dist", anchor_domains) for (x, y) in pairs]
             network_matrix = generate_network(cluster_pairs, cores)
-            allCalcFlag = True
             for cutoff in cutoff_list:
                 write_network_matrix(network_matrix, cutoff, os.path.join(output_folder, networks_folder, "networkfile_domain_dist_all_vs_all_c" + cutoff + ".network"), include_disc_nodes)
             if 'S' in domaindist_networks:
@@ -1168,10 +1244,10 @@ if __name__=="__main__":
 
     if seqdist_networks:
         print("")
-        if options.skip_all:
+        if options.skip_all and os.path.isfile(os.path.join(output_folder, networks_folder, "networkfile_seqdist_all_vs_all_c1.network")):
             network_matrix = network_parser(os.path.join(output_folder, networks_folder, "networkfile_seqdist_all_vs_all_c1.network"), Jaccardw, DDSw, GKw)
             
-        elif options.skip_mafft:
+        elif options.skip_mafft and os.path.isfile(os.path.join(output_folder, "DMS.dict")):
             DMS = {}
             print("Trying to read domain alignments (DMS.dict file)")
             with open(os.path.join(output_folder, "DMS.dict"), "r") as DMS_file:
@@ -1269,7 +1345,7 @@ if __name__=="__main__":
                 for sample, sampleClusters in sampleDict.iteritems():
                     print(" Sample: " + sample)
                     if len(sampleClusters) == 1:
-                        print(" Warning: Sample size = 1 detected. Not generating network for this sample (" + sample_name[sn] + ")")
+                        print(" Warning: Sample size = 1 detected. Not generating network for this sample (" + sample + ")")
                     else:
                         pairs = set(map(tuple, map(sorted, combinations(sampleClusters, 2))))
                         cluster_pairs = [(x, y, "seqdist", anchor_domains) for (x, y) in pairs]
@@ -1285,4 +1361,10 @@ if __name__=="__main__":
                                                  os.path.join(output_folder, networks_folder,
                                                               "networkfile_seqdist_" + sample + "_c" + cutoff + ".network"),
                                                  include_disc_nodes)
+
+    runtime = time.time()-time1
+    runtime_string = '\tMain function took %0.3f s' % (runtime)
+    timings_file.write(runtime_string + "\n")
+    print runtime_string
+    
     timings_file.close()
diff --git a/functions.py b/functions.py
index e28a6d70998e1e4c7dd20a8ca3f1c140657dacd2..e966dc45d0f46b69c972fb075be057034a87c8ba 100644
--- a/functions.py
+++ b/functions.py
@@ -60,6 +60,8 @@ def get_domain_list(filename):
     handle = open(filename, 'r')
     domains_string = handle.readline().strip()
     domains = domains_string.split(" ")
+    handle.close()
+    
     return domains
 
 
@@ -192,13 +194,12 @@ def dct_writeout(handle, dct):
 def check_data_integrity(gbk_files):
     """Perform some integrity checks on the input gbk files."""
 
-    # check for existance of files
-    if gbk_files == []:
-        print "No .gbk files were found"
-        sys.exit()
-    
+    discarded_set = set()
+
     # check for potential errors while reading the gbk files
     # Structure must remain intact after removing offending files
+    # -> The following used to be necessary before storing sample information
+    # in genbankDict. It could eventually be removed/simplified
     gbk_files_check = copy.deepcopy(gbk_files)
     samples_for_deletion = []
     for sample in range(len(gbk_files_check)):
@@ -207,8 +208,9 @@ def check_data_integrity(gbk_files):
             try:
                 SeqIO.read(handle, "genbank")
             except ValueError as e:
-                print("   Error with file " + f + ": \n   " + str(e))
+                print("   Error with file " + f + ": \n    '" + str(e) + "'")
                 print("    (This file will be excluded from the analysis)")
+                discarded_set.add(f)
                 if len(gbk_files[sample]) > 1:
                     gbk_files[sample].remove(f)
                 else:
@@ -236,12 +238,23 @@ def check_data_integrity(gbk_files):
                     duplication = True
                     print "duplicated file at:", cfile 
     
+    # Without proper checking downstream, allowing duplicate files results in
+    # problems: domain sequences could be duplicated, and the resulting length
+    # mismatch raises an exception during sequence similarity scoring.
+    # This probably is not an issue anymore with the new genbankDict structure
     if duplication == True:
         print "There was duplication in the input files, if this is not intended remove them."
         cont = raw_input("Continue anyway? Y/N ")
         if cont.lower() == "n":
             sys.exit()
             
+    # The possibility of not having files to analyze was checked for in get_gbk_files
+    # so if the list is empty now, it's due to issues in the gbk files
+    if len(gbk_files_check) < 2:
+        sys.exit("Due to errors in the input files, there are no files left for the analysis (" + str(len(gbk_files_check)) + ")")
+       
+    return discarded_set
+            
 
 def hmmscan(pfam_dir, fastafile, outputdir, name, cores):
     """Runs hmmscan"""
@@ -543,66 +556,6 @@ def write_network_matrix(matrix, cutoff, filename, include_disc_nodes):
     networkfile.close()
                     
 
-def get_gbk_files(inputdir, samples, min_bgc_size, exclude_gbk_str):
-    """Find .gbk files, and store the .gbk files in lists, separated by sample."""
-    genbankfiles = [] #Will contain lists of gbk files
-    sample_name = []
-    dirpath_ = ""
-    file_counter = 0
-    
-    print("Importing the gbk files, while skipping gbk files with '" + exclude_gbk_str + "' in their filename")
-    
-    #this doesn't seem to make a difference. Confirm
-    #if inputdir != "" and inputdir[-1] != "/":
-        #inputdir += "/"
-    current_dir = ""
-    for dirpath, dirnames, filenames in os.walk(inputdir):
-        head, tail = os.path.split(dirpath)
-        if current_dir != tail:
-            current_dir = tail
-        
-        genbankfilelist=[]
-        
-        #avoid double slashes
-        dirpath_ = dirpath[:-1] if dirpath[-1] == os.sep else dirpath
-        
-        for fname in filenames:
-            if fname.split(".")[-1] == "gbk" and exclude_gbk_str not in fname:
-                if " " in fname:
-                    print "Your GenBank files should not have spaces in their filenames. Please remove the spaces from their names, HMMscan doesn't like spaces (too many arguments)."
-                    sys.exit(0)
-                
-                gbk_header = open( os.path.join(dirpath_,fname) , "r").readline().split(" ")
-                gbk_header = filter(None, gbk_header) #remove the empty items from the list
-                try:
-                    bgc_size = int(gbk_header[gbk_header.index("bp")-1])
-                except ValueError as e:
-                    print("  " + str(e) + ": " + os.path.join(dirpath_,fname))
-                    sys.exit()
-                    
-                file_counter += 1
-                if bgc_size > min_bgc_size: #exclude the bgc if it's too small
-                    genbankfilelist.append(os.path.join(dirpath_, fname))
-                    
-                    if samples == False: #Then each gbk file represents a different sample
-                        genbankfiles.append(genbankfilelist)
-                        genbankfilelist = []
-                    
-                if verbose == True:
-                    print fname
-                    print "bgc size in bp", bgc_size
-                    
-        if genbankfilelist != []:
-            if current_dir not in sample_name:
-                sample_name.append(current_dir)
-            else:
-                sys.exit("Error! Found two sample directories with the same name! This would cause trouble when writing the network files...")
-            genbankfiles.append(genbankfilelist)
-    
-    print("")
-    return genbankfiles, sample_name
-
-
 def domtable_parser(gbk, dom_file):
     """Parses the domain table output files from hmmscan"""
     
@@ -653,7 +606,6 @@ def domtable_parser(gbk, dom_file):
     return pfd_matrix
 
 
-
 def hmm_table_parser(gbk, hmm_table):
 ##AB050629.gbk    score   yxjC    1    1167    +    PF03600    CitMHS