diff --git a/src/main/java/nl/wur/bif/pantools/pangenome/Phylogeny.java b/src/main/java/nl/wur/bif/pantools/pangenome/Phylogeny.java index 19cb69582b3c270bb6529aa7686c5f34473b6466..b698844bde9533dd87beebbc875e4316a0207f8d 100644 --- a/src/main/java/nl/wur/bif/pantools/pangenome/Phylogeny.java +++ b/src/main/java/nl/wur/bif/pantools/pangenome/Phylogeny.java @@ -132,9 +132,10 @@ public class Phylogeny { ResourceIterator<Node> all_gene_nodes = graphDb.findNodes(gene_label); int gene_nodes = (int) count_nodes(gene_label); int gene_node_counter = 0; + System.out.print("\rSearching for gene names in gene nodes: "); while (all_gene_nodes.hasNext()) { - gene_node_counter ++; - if (gene_node_counter % 100 == 0 || gene_nodes == gene_node_counter) { + gene_node_counter++; + if (gene_node_counter % 10000 == 0 || gene_nodes == gene_node_counter ) { System.out.print("\rSearching for gene names in gene nodes: " + gene_node_counter + "/" + gene_nodes + " "); } Node gene_node = all_gene_nodes.next(); @@ -557,7 +558,7 @@ public class Phylogeny { continue; } if (line.contains(">")) { - seq_count ++; + seq_count++; line = line.replace(">",""); String[] line_array = line.split("_gn_"); String[] line_array2 = line_array[1].split("_"); @@ -604,7 +605,7 @@ public class Phylogeny { continue; } if (line.contains(">")) { - line_count ++; + line_count++; String[] line_list = line.split("gn_"); String[] line_list2 = line_list[1].split("_"); current_genome = line_list2[0]; @@ -645,7 +646,7 @@ public class Phylogeny { log_builder.append(genome_key).append(", ").append(seq_str.length()).append("bp\n"); all_sizes.add(seq_str.length()); output.append(">").append(genome_key).append("\n").append(seq_str).append("\n"); - counter ++; + counter++; } String info = counter + " genomes\n" @@ -679,7 +680,7 @@ public class Phylogeny { ArrayList<Integer> protein_length_list = new ArrayList<>(); HashMap<Integer, Integer> gene_genome_map = new HashMap<>(); int [] presence_array = new int[total_genomes]; - for(int i = 1 ; i < total_genomes + 1; ++i) { + for(int i = 1 ; i <= total_genomes; i++) { gene_genome_map.put(i, 0); } HashMap<String, String> prot_seqs_per_genome = new HashMap<>(); @@ -744,20 +745,19 @@ public class Phylogeny { .append(", Protein sequence is missing! Something wrong with GFF?\n"); continue; } - presence_array[genome_nr-1] ++; + presence_array[genome_nr-1]++; log_builder.append(gene_name).append("; ").append(genome_nr).append("; ").append(gene_node.getId()).append("; ").append(mrna_node_str).append("; ") .append(hm_node_str).append("; ").append(gene_length).append("; ").append(protein_length).append("; ").append(address_str).append("\n"); int gene_count = gene_genome_map.get(genome_nr); - gene_count ++; + gene_count++; gene_genome_map.put(genome_nr, gene_count); prot_seqs_per_genome.put(genome_nr + "_" + gene_count + "_header", seq_header); prot_seqs_per_genome.put(genome_nr + "_" + gene_count, prot_sequence); nuc_seqs_per_genome.put(genome_nr + "_" + gene_count, nuc_sequence); } - boolean multi_groups = false; + boolean multi_groups = false; // when true, the genes are found in multiple homology groups if (prot_seqs_per_hmgroup.size() > 1) { - // the genes are found in multiple homology groups multi_groups = true; if (mlsa_function) { create_directory_in_DB("/mlsa/input/protein/" + SELECTED_NAME); @@ -842,7 +842,7 @@ public class Phylogeny { // do nothing } else { log_builder.append("\nWarning! Gene was not found!\n\n"); - warning_counter ++; + warning_counter++; return false; } String[] present_array = present_genomes.split(","); @@ -852,7 +852,7 @@ public class Phylogeny { if (absent_genomes.length() > 27) { log_builder.append(present_genomes).append("\n") .append(absent_genomes).append("\n"); - warning_counter ++; + warning_counter++; } else { log_builder.append("Gene was found in every genome!\n"); absent = false; @@ -860,7 +860,7 @@ public class Phylogeny { if (multiple_copies.length() > 29) { log_builder.append("Warning! ").append(multiple_copies).append("\n"); - warning_counter ++; + warning_counter++; } if (all_hms.size() == 1) { @@ -869,16 +869,16 @@ public class Phylogeny { log_builder.append("Warning! You must remove duplicate copies from the input files\n ") .append(WORKING_DIRECTORY).append("/mlsa/input/nucleotide/").append(SELECTED_NAME).append(".fasta\n ") .append(WORKING_DIRECTORY).append("/mlsa/input/protein/").append(SELECTED_NAME).append(".fasta\n"); - warning_counter ++; + warning_counter++; } } else if (all_hms.size() > 1) { log_builder.append("Warning! Genes were found in ").append(all_hms.size()).append(" different homology groups! Sequences for are written to:\n") .append(" ").append(WORKING_DIRECTORY).append("mlsa/input/protein/").append(SELECTED_NAME).append("/\n") .append(" ").append(WORKING_DIRECTORY).append("mlsa/input/nucleotide/").append(SELECTED_NAME).append("/\n\n"); - warning_counter ++; + warning_counter++; } else { log_builder.append("Warning! No (active) homology grouping is present!\n"); - warning_counter ++; + warning_counter++; } StringBuilder hm_builder = new StringBuilder(); @@ -900,9 +900,9 @@ public class Phylogeny { .append(".fasta as the group size matches the number of genomes.\n"); prot_seq_builder = prot_seqs_per_hmgroup.get(hm_node); nuc_seq_builder = nuc_seqs_per_hmgroup.get(hm_node); - warning_counter ++; - perfect_group_counter ++; - } else { + warning_counter++; + perfect_group_counter++; + } else if (multi_groups && num_members == total_genomes) { log_builder.append("Warning! Group ").append(hm_node.getId()).append(" is also suitable for the analysis for ").append(SELECTED_NAME) .append(". It was not selected because a suitable candidate was already found. \n"); } @@ -1034,7 +1034,7 @@ public class Phylogeny { } StringBuilder seq = new StringBuilder(); try (BufferedWriter out = new BufferedWriter(new FileWriter(WORKING_DIRECTORY + "/databases/genome.db/Genome_" + genome_nr + ".fasta"))) { - for (int sequence = 1; sequence <= genomeDb.num_sequences[genome_nr]; ++sequence) { + for (int sequence = 1; sequence <= genomeDb.num_sequences[genome_nr]; sequence++) { out.write(">" + genomeDb.sequence_titles[genome_nr][sequence] + "\n"); int begin = 0; int end = (int) genomeDb.sequence_length[genome_nr][sequence] - 1; @@ -1763,7 +1763,7 @@ public class Phylogeny { } else { phylogeny = phylogeny.replace("(" + i + ":", "(" + i + phenotype + ":"); } - changes_counter ++; + changes_counter++; } else if (phylogeny.contains("(" + i + "_")) { String[] phylogeny_array = phylogeny.split("\\(" + i + "_"); if (phylogeny_array.length != 2) { @@ -1777,30 +1777,30 @@ public class Phylogeny { } else { phylogeny = phylogeny_array[0] + "(" + i + phenotype + str.substring(index1, str.length()); } - changes_counter ++; + changes_counter++; } else if (phylogeny.contains("(" + i + ",")) { if (exclude_numbers) { phylogeny = phylogeny.replace("(" + i + ",", "(" + phenotype + ","); } else { phylogeny = phylogeny.replace("(" + i + ",", "(" + i + phenotype + ","); } - changes_counter ++; + changes_counter++; } else if (phylogeny.contains("," + i + ")")) { - changes_counter ++; + changes_counter++; if (exclude_numbers) { phylogeny = phylogeny.replace("," + i + ")", "," + phenotype + ")"); } else { phylogeny = phylogeny.replace("," + i + ")", "," + i + phenotype + ")"); } } else if (phylogeny.contains("," + i + ":")) { - changes_counter ++; + changes_counter++; if (exclude_numbers) { phylogeny = phylogeny.replace("," + i + ":", "," + phenotype + ":"); } else { phylogeny = phylogeny.replace("," + i + ":", "," + i + phenotype + ":"); } } else if (phylogeny.contains("," + i + "_")) { - changes_counter ++; + changes_counter++; String[] phylogeny_array = phylogeny.split("," + i + "_"); if (phylogeny_array.length != 2) { correct_tree = false; @@ -1821,7 +1821,7 @@ public class Phylogeny { } else if (changes_counter == 0) { // genome nr is not present in the phylogeny // do nothing } else { // 1 change - total_changes ++; + total_changes++; } } @@ -1953,7 +1953,7 @@ public class Phylogeny { int line_counter = 0; for (int c = 0; in.ready();) { phylogeny = in.readLine().trim(); - line_counter ++; + line_counter++; } if (line_counter > 1) { System.out.println("\n" + INPUT_FILE + " is not a (correctly formatted) Newick file\n"); @@ -2157,7 +2157,7 @@ public class Phylogeny { for (Node hm_node : sco_node_list) { int[] copy_number_array = (int[]) hm_node.getProperty("copy_number"); boolean sco_group = true; - for (int i = 1; i < copy_number_array.length; ++i) { // walk over the cnv array in the homology node + for (int i = 1; i < copy_number_array.length; i++) { // walk over the cnv array in the homology node if (skip_array[i - 1]) { continue; } @@ -2265,7 +2265,7 @@ public class Phylogeny { try_incr_hashmap(pos_count_map, nuc, 1); } if (pos_count_map.size() == 1) { - conserved_positions ++; + conserved_positions++; continue; } boolean informative = check_if_informative_position(pos_count_map); @@ -2310,7 +2310,7 @@ public class Phylogeny { if (skip_array[genome_nr-1]) { continue; } - sequence_counter ++; + sequence_counter++; all_seq_position_array[genome_nr-1] = sequence.split(""); // -1 because data in array } return all_seq_position_array; @@ -2337,7 +2337,7 @@ public class Phylogeny { boolean first = true; for (String group_id : sco_string_list) { - group_counter ++; + group_counter++; System.out.print("\r Reading alignments: " + group_counter + "/" + sco_string_list.size()); int[] var_inf_sites = get_snps_from_msa_and_create_alignments( WORKING_DIRECTORY + "alignments/msa_" + msaMethod + "/grouping_v" + grouping_version + "/" + group_id + "/output/" + nuc_or_prot + "_trimmed.afa", @@ -2472,7 +2472,7 @@ public class Phylogeny { System.out.println("Reading in all alignments belonging to user-specified homology groups\n"); int group_counter = 0; for (Node hm_node : hmNodeList) { - group_counter ++; + group_counter++; if (group_counter % 1000 == 0) { System.out.print("\rReading gene tree: " + group_counter + "/" + hmNodeList.size()); } @@ -2843,10 +2843,12 @@ public class Phylogeny { } /** - * @param single_copy_input_file + * @param single_copy_input_file file location to single-copy homology group identifiers + * @param sco_node_list list with 'homology_group' node identifiers * @return */ - public static HashMap<String, String> gather_info_for_core_snp_tree_log(String single_copy_input_file, ArrayList<Node> sco_node_list) { + public static HashMap<String, String> gather_info_for_core_snp_tree_log(String single_copy_input_file, + ArrayList<Node> sco_node_list) { check_if_graphDb_is_available(); SELECTED_HMGROUPS = single_copy_input_file; // SCOs need to be read again because the database was closed HashMap<String, String> info_map = new HashMap<>(); @@ -2854,7 +2856,7 @@ public class Phylogeny { int counter = 0; for (Node hm_node : sco_node_list) { String group_id = hm_node.getId() +""; - counter ++; + counter++; if (counter % 100 == 0 || counter == sco_node_list.size()) { System.out.print("\r Retrieving gene information: " + counter + "/" + sco_node_list.size() + " groups"); }