diff --git a/src/main/java/nl/wur/bif/pantools/pangenome/AnnotationLayer.java b/src/main/java/nl/wur/bif/pantools/pangenome/AnnotationLayer.java index a85bf6da677bdc637c948d465ea51551ad1ec280..96a4e176e02f1527fd9c97736106a4ba3148502b 100755 --- a/src/main/java/nl/wur/bif/pantools/pangenome/AnnotationLayer.java +++ b/src/main/java/nl/wur/bif/pantools/pangenome/AnnotationLayer.java @@ -337,7 +337,9 @@ public class AnnotationLayer { * @param annotation_file Path to the GFF3 file * @param proteins_directory The directory to write protein FASTA files in */ - private void parse_gff(int genome, String annotation_id, BufferedWriter log_file, String annotation_file, String proteins_directory, Node annotation_node) { + private void parse_gff(int genome, String annotation_id, BufferedWriter log_file, String annotation_file, + String proteins_directory, Node annotation_node) { + int i, trsc, num_genes, num_mRNAs, num_tRNAs, num_rRNAs, feature_len, offset, num_cds_without_genes; MAX_TRANSACTION_SIZE = 50000; // commit changes every 50k lines long seq_len = -1, line_nr = 0; @@ -563,10 +565,16 @@ public class AnnotationLayer { } // while lines in.close(); - System.out.println("\r" + address[0] + "\t" + (num_genes + num_cds_without_genes) + "\t" + num_mRNAs + "\t" + num_tRNAs + "\t" + num_rRNAs + "\t"); + System.out.print("\r" + address[0] + "\t" + (num_genes + num_cds_without_genes) + "\t" + num_mRNAs + "\t" + + num_tRNAs + "\t" + num_rRNAs + "\t"); if (num_genes == 0 && num_cds_without_genes == 0) { - System.out.println("\nNo genes could be added for genome " + address[0] +"!\n"); + System.out.println("-> No gene nodes could be created!\n"); + undo_failed_annotation(annotation_node); + System.out.println("\rNone of the final GFF's annotations were included in the pangenome\n" + + "Please verify the correctness of: " + annotation_file + "\n"); System.exit(1); + } else { + System.out.println(""); } try (Transaction tx = graphDb.beginTx()) { @@ -586,7 +594,54 @@ public class 
AnnotationLayer { System.out.println("Could not open " + annotation_file + "!"); } } - + + /** + * Use in case something went wrong during the annotation and you want to roll back the latest annotation + * @param annotation_node an 'annotation' node + */ + private void undo_failed_annotation(Node annotation_node) { + int trsc = 0; + long node_counter = 0; + Transaction tx = graphDb.beginTx(); // start database transaction + try { + String id = (String) annotation_node.getProperty("identifier"); + Iterable<Relationship> anno_relations = annotation_node.getRelationships(); + for (Relationship rel : anno_relations) { + rel.delete(); + } + annotation_node.delete(); + ResourceIterator<Node> annotated_nodes = graphDb.findNodes(feature_label, "annotation_id", id); + while (annotated_nodes.hasNext()) { + node_counter ++; + Node new_node = annotated_nodes.next(); + Iterable<Relationship> new_relations = new_node.getRelationships(); + for (Relationship rel : new_relations) { + rel.delete(); + trsc ++; + if (trsc >= 100000) { // 100k actions + tx.success(); + tx.close(); + trsc = 0; + tx = graphDb.beginTx(); // start a new database transaction + } + } + new_node.delete(); trsc ++; // also delete the feature node itself, otherwise orphan nodes remain after the rollback + if (trsc >= 100000) { // 100k actions + tx.success(); + tx.close(); + trsc = 0; + tx = graphDb.beginTx(); // start a new database transaction + } + if (node_counter % 5000 == 0){ + System.out.print("\rRolling back annotated features: " + node_counter); + } + } tx.success(); // commit the final partial batch; close() without success() would roll it back + } finally { + tx.close(); + } + } + + /** + * Parses a GenBank file and annotates the genomes at the same time. 
* @@ -1161,7 +1216,8 @@ public class AnnotationLayer { create_skip_arrays(false, true); // create skip arrays for sequences and genomes if -skip/-ref is provided by user String genome_or_identifier = "genome"; - if ((target_genome != null || skip_genomes != null) && PATH_TO_THE_ANNOTATIONS_FILE == null) { // option 1, remove all from selected genomes + + if ((target_genome != null || skip_genomes != null) && PATH_TO_THE_ANNOTATIONS_FILE == null) { // option 1, remove all from selected genomes try (Transaction tx = graphDb.beginTx()) { ResourceIterator<Node> annotation_nodes = graphDb.findNodes(annotation_label); while (annotation_nodes.hasNext()) { @@ -1225,8 +1281,8 @@ public class AnnotationLayer { /** * - * @param target_label - * @param identifier + * @param target_label a node label + * @param identifier an annotation identifier * @param genome_or_identifier */ public void remove_node_rels_matching_genome_or_identifier(Label target_label, String identifier, String genome_or_identifier) { @@ -1252,8 +1308,9 @@ public class AnnotationLayer { rel.delete(); trsc1 ++; rel_counter ++; - if (rel_counter % 100 == 0 || rel_counter == rel_set.size()) { - System.out.print("\rRemoving " + target_label + " relationships of annotation " + identifier + ": " + rel_counter + "/" + rel_set.size() + " "); + if (rel_counter % 10000 == 0 || rel_counter == rel_set.size()) { + System.out.print("\rRemoving " + target_label + " relationships of annotation " + identifier + ": " + + rel_counter + "/" + rel_set.size() + " "); } if (trsc1 >= MAX_TRANSACTION_SIZE) { tx.success(); @@ -1275,8 +1332,9 @@ public class AnnotationLayer { trsc1 ++; node1.delete(); node_counter ++; - if (node_counter % 1000 == 0 || node_counter == node_set.size()) { - System.out.print("\rRemoving " + target_label + " nodes of annotation " + identifier + ": " + node_counter + "/" + node_set.size() + " "); + if (node_counter % 10000 == 0 || node_counter == node_set.size()) { + System.out.print("\rRemoving " + 
target_label + " nodes of annotation " + identifier + ": " + + node_counter + "/" + node_set.size() + " "); } if (trsc1 >= MAX_TRANSACTION_SIZE) { tx.success(); diff --git a/src/main/java/nl/wur/bif/pantools/pangenome/Classification.java b/src/main/java/nl/wur/bif/pantools/pangenome/Classification.java index f33513e59fdcf10edc9a5a614a163131b12372c4..56df71d500d732103315c4d8a4acbfe7c7476311 100644 --- a/src/main/java/nl/wur/bif/pantools/pangenome/Classification.java +++ b/src/main/java/nl/wur/bif/pantools/pangenome/Classification.java @@ -232,7 +232,7 @@ public class Classification { ProteomeLayer proLayer = new ProteomeLayer(); StringBuilder output_builder = new StringBuilder(); - System.out.print("\rGathering pangenome statistics"); + System.out.print("\rGathering pangenome statistics"); ArrayList<String> phenotype_properties = retrieve_phenotypes_properties(); HashMap<String, String> all_phenotypes_map = retrieve_phenotypes_for_metrics(); StringBuilder genome_csv_builder = create_header_for_genome_metrics_csv_file(phenotype_properties); @@ -4117,7 +4117,8 @@ public class Classification { } /** - * Key is annotation identifers and also stores the genome number as key. The genome number matches the annotation id from annotation_identifiers + * Key is annotation identifiers and also stores the genome number as key. 
+ * The genome number matches the annotation id from annotation_identifiers * On a panproteome the returned hashmap is empty as no annotation nodes are present * @return */ @@ -4210,7 +4211,7 @@ public class Classification { } /** - * + * Get all the annotation identifiers of annotation nodes * @return */ public static HashMap<Integer, ArrayList<String>> get_all_annotation_identifiers() { diff --git a/src/main/java/nl/wur/bif/pantools/pangenome/ProteomeLayer.java b/src/main/java/nl/wur/bif/pantools/pangenome/ProteomeLayer.java index fee24cf2fdb45b5d659c60cc7b72f9e919d1bb83..3b1e31aaffbcb4b7b6ea57ee2960391f2f0537a6 100755 --- a/src/main/java/nl/wur/bif/pantools/pangenome/ProteomeLayer.java +++ b/src/main/java/nl/wur/bif/pantools/pangenome/ProteomeLayer.java @@ -2730,8 +2730,8 @@ public class ProteomeLayer { /** * Sets 'longest_transcript' property to mRNA, exon, intron and CDS nodes. * - * @param count_nodes - * @param original_number_of_proteins + * @param count_nodes count the total number of longest transcripts + * @param original_number_of_proteins number of proteins + * @return number of proteins, only counting the longest transcripts */ public int find_longest_transcript_per_gene(boolean count_nodes, int original_number_of_proteins) { @@ -2752,17 +2752,17 @@ public class ProteomeLayer { } boolean file_exists = check_if_file_exists(WORKING_DIRECTORY + "proteins/longest_transcripts/proteins_" + anno_id + ".fasta"); if (file_exists && !count_nodes) { - continue; + continue; // longest transcripts are already set and counting is not required } try (Transaction tx = graphDb.beginTx()) { // start database transaction, commit changes after every genome String[] anno_id_array = anno_id.split("_"); duplicate_id_builder.append("#Genome ").append(anno_id_array[0]).append("\n"); ResourceIterator<Node> gene_nodes = graphDb.findNodes(gene_label, "annotation_id", anno_id); - int gene_counter = 0, prot_per_genome = 0, prot_per_genome2 = 0; + int gene_counter = 0; // 
prot_per_genome = 0, prot_per_genome2 = 0; StringBuilder longest_proteins = new StringBuilder(); while (gene_nodes.hasNext()) { gene_counter ++; - if (gene_counter % 100 == 0 || gene_counter == 1) { + if (gene_counter % 1000 == 0 || gene_counter == 1) { System.out.print("\rFinding longest transcript per gene: Genome " + anno_id_array[0] + ", " + gene_counter + " "); } Node gene_node = gene_nodes.next(); @@ -2777,7 +2777,7 @@ public class ProteomeLayer { } if (first) { first = false; - prot_per_genome ++; + //prot_per_genome ++; } String protein = (String) mrna_node.getProperty("protein_sequence"); if (protein.length() > longest_transcript_length) { @@ -2825,7 +2825,7 @@ public class ProteomeLayer { } protein_counter ++; - prot_per_genome2 ++; + //prot_per_genome2 ++; String protein_id = (String) longest_transcript_node.getProperty("protein_ID"); String protein_sequence = get_protein_sequence(longest_transcript_node); protein_sequence = split_seq_in_parts_of_80bp(protein_sequence);