Commit 067c8993 authored by Jonkheer, Eef
Browse files

Merge branch 'master' into pantools_v3.2

parents 956e1e86 56b71abd
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
No preview for this file type
......@@ -13,7 +13,8 @@
<open-files xmlns="http://www.netbeans.org/ns/projectui-open-files/2">
<group>
<file>file:/local/jonkh004/git_branch/pantools/src/pangenome/Classification.java</file>
<file>file:/local/jonkh004/git_branch/pantools/src/pangenome/FunctionalAnnotations.java</file>
<file>file:/local/jonkh004/git_branch/pantools/src/pangenome/Phylogeny.java</file>
<file>file:/local/jonkh004/git_branch/pantools/src/pangenome/AnnotationLayer.java</file>
<file>file:/local/jonkh004/git_branch/pantools/src/pantools/Pantools.java</file>
</group>
</open-files>
......
......@@ -35,7 +35,6 @@ import org.neo4j.graphdb.NotFoundException;
import org.neo4j.graphdb.Relationship;
import org.neo4j.graphdb.ResourceIterator;
import org.neo4j.graphdb.Transaction;
import static pangenome.Classification.annotation_overview;
import static pangenome.Classification.try_incr_hashmap;
import static pangenome.GenomeLayer.locate;
......@@ -81,6 +80,7 @@ import static pantools.Pantools.stop_if_panproteome;
import static pantools.Pantools.pangenome_label;
import static pangenome.create_skip_arrays.create_skip_arrays;
import static pantools.Pantools.skip_array;
import static pantools.Pantools.write_string_to_file_in_DB;
/**
* Implements all the functionalities related to the annotation layer of the pangenome
......@@ -181,9 +181,17 @@ public class AnnotationLayer {
genomeSc = new SequenceScanner(genomeDb, 1, 1, K_SIZE, indexDb.get_pre_len());
num_proteins = 0;
try{
try (Transaction tx = graphDb.beginTx()) {
Node pangenome_node = graphDb.findNodes(pangenome_label).next();
stop_if_panproteome(pangenome_node, "add_annotations"); // sets PROTEOME boolean that controls functions, retrieves K_size & total_genomes
tx.success();
}
try {
BufferedReader paths = new BufferedReader(new FileReader(PATH_TO_THE_ANNOTATIONS_FILE));
log_file = new BufferedWriter(new FileWriter(OUTPUT_PATH + "/log/annotation.log"));
log_file = new BufferedWriter(new FileWriter(OUTPUT_PATH + "/log/annotation.log", true)); // true allows to append the original instead of overwriting
String date = new SimpleDateFormat("dd-MM-yyyy HH:mm:ss").format(new Date());
log_file.write("## LOG of run on " + date + "\n");
if (! new File(OUTPUT_PATH + "/proteins").exists()) {
Files.createDirectory(Paths.get(OUTPUT_PATH + "/proteins"));
}
......@@ -204,9 +212,9 @@ public class AnnotationLayer {
continue;
}
genome = Integer.parseInt(fields[0]);
log_file.write("#Genome " + genome + "\n");
log_file.write("#Genome " + genome +".");
if (! new File(annotation_file).exists()) {
log_file.write("Genome " + genome + "'s annotation file not found.");
log_file.write(" annotation file not found: " + annotation_file + "\n");
System.out.println("Genome " + genome + "'s annotation file not found.");
continue;
}
......@@ -223,7 +231,9 @@ public class AnnotationLayer {
annotation_node.setProperty("genome", genome);
annotation_node.setProperty("number", degree);
annotation_node.setProperty("identifier", genome + "_" + degree);
log_file.write("Used annotation file " + annotation_file + "\n");
log_file.write(" Annotation identifier: " + genome + "_" + degree + "\n"
+ "Used annotation file " + annotation_file + "\n");
if (annotation_file.endsWith(".gff") || annotation_file.endsWith(".gff3")) {
//parse_gff(genome, genome + "_" + degree, log_file, annotation_file, OUTPUT_PATH + "/proteins", annotation_node);
annotation_node.setProperty("type", "GFF");
......@@ -250,9 +260,9 @@ public class AnnotationLayer {
} else if (annotation_file.endsWith(".gbk") || annotation_file.endsWith(".gbff")) {
parse_gbk(genome, genome + "_" + degree, log_file, annotation_file, OUTPUT_PATH + "/proteins", annotation_node);
}
} // for genomes
paths.close();
log_file.write("\n");
log_file.close();
} catch (IOException ioe) {
System.out.println("Failed to open " + PATH_TO_THE_ANNOTATIONS_FILE);
......@@ -263,17 +273,59 @@ public class AnnotationLayer {
Node pangenome_node = graphDb.findNodes(pangenome_label).next();
pangenome_node.removeProperty("num_proteins");
pangenome_node.setProperty("num_proteins", num_proteins);
annotation_overview();
create_annotation_overview();
tx.success();
}
disconnect_pangenome();
System.out.println("\nAnnotated proteins available in:\n "
+ WORKING_DIRECTORY + "proteins/\n\n"
+ "Log output written to:\n "
+ WORKING_DIRECTORY + "log/annotation.log\n "
+ WORKING_DIRECTORY + "databases/genome.db/annotations.txt\n");
+ WORKING_DIRECTORY + "annotation_overview.txt\n "
+ WORKING_DIRECTORY + "log/annotation.log\n ");
}
/**
 * Creates annotation_overview.txt, an overview of the .gff and .gbk files incorporated in the pangenome.
 *
 * For every genome number from 1 up to the "num_genomes" property of the pangenome node, the annotation
 * nodes with a matching "genome" property are looked up. For each of them the "identifier", "path", "type"
 * and "date" properties and the node id are listed. The report is prefixed with the total number of
 * annotation nodes found and handed to write_string_to_file_in_DB() under the name "annotation_overview.txt".
 *
 * Both node iterators are closed explicitly so their resources are released even when the iterator is not
 * exhausted (pangenome lookup) or when reading a property fails (annotation lookup).
 *
 * NOTE(review): this method only reads from graphDb and does not open a transaction itself; callers
 * presumably have to invoke it inside an open transaction - confirm.
 */
public static void create_annotation_overview() {
    int total_genomes;
    // Only the first pangenome node is needed, so the iterator is never exhausted and has to be closed by hand.
    try (ResourceIterator<Node> pangenome_nodes = graphDb.findNodes(pangenome_label)) {
        total_genomes = (int) pangenome_nodes.next().getProperty("num_genomes");
    }
    StringBuilder output_builder = new StringBuilder();
    int annotation_count = 0;
    for (int genome_nr = 1; genome_nr <= total_genomes; genome_nr++) {
        StringBuilder genome_builder = new StringBuilder();
        HashSet<String> id_set = new HashSet<>();
        try (ResourceIterator<Node> annotation_nodes = graphDb.findNodes(annotation_label, "genome", genome_nr)) {
            while (annotation_nodes.hasNext()) {
                Node annotation_node = annotation_nodes.next();
                String identifier = (String) annotation_node.getProperty("identifier");
                id_set.add(identifier);
                genome_builder.append("\nAnnotation id: ").append(identifier)
                        .append("\nAnnotation file: ").append((String) annotation_node.getProperty("path"))
                        .append("\nFiletype: ").append((String) annotation_node.getProperty("type"))
                        .append("\nStored in node: ").append(annotation_node.getId())
                        .append("\nCreation date ").append((String) annotation_node.getProperty("date")).append("\n");
                annotation_count++;
            }
        }
        // Header line of the genome section: number of distinct identifiers followed by the identifiers themselves.
        String total = "No annotations";
        if (id_set.size() == 1) {
            total = "1 annotation: ";
        } else if (id_set.size() > 1) {
            total = id_set.size() + " annotations: ";
        }
        output_builder.append("\n#Genome ").append(genome_nr).append("\n")
                .append(total).append(String.join(",", id_set)).append("\n")
                .append(genome_builder.toString());
    }
    String anno_str = "Total annotations included in the pangenome: " + annotation_count + "\n";
    write_string_to_file_in_DB(anno_str + output_builder.toString(), "annotation_overview.txt");
}
/**
* Parses a GFF3 file and annotates the genomes at the same time.
*
......@@ -818,7 +870,7 @@ public class AnnotationLayer {
continue;
}
if (first) {
log_file.write("\nThe following mRNAs have the same identifier. This is caused by incorrect naming in your GFF file."
log_file.write("\nThe following mRNAs have the same identifier. This is caused by incorrect naming in your GFF file. "
+ "The 'protein_ID' property is incremented by an underscore and a number for the following mrna's/proteins: \n");
first = false;
}
......
This diff is collapsed.
This diff is collapsed.
......@@ -2290,6 +2290,7 @@ public class GenomeLayer {
/**
* Retrieves some genomic regions from the genome database.
*
*/
public void retrieve_regions() {
String[] fields;
......@@ -2331,10 +2332,16 @@ public class GenomeLayer {
continue;
}
fields = line.trim().split("\\s");
genome = Integer.parseInt(fields[0]);
sequence = Integer.parseInt(fields[1]);
begin = Integer.parseInt(fields[2]);
end = Integer.parseInt(fields[3]);
try {
genome = Integer.parseInt(fields[0]);
sequence = Integer.parseInt(fields[1]);
begin = Integer.parseInt(fields[2]);
end = Integer.parseInt(fields[3]);
} catch(NumberFormatException e) {
System.out.println("Unable to correctly retrieve four numbers in: " + line);
out.close();
continue;
}
if (genome >= 1 && genome <= genomeDb.num_genomes && sequence <= genomeDb.num_sequences[genome] &&
begin >= 1 && end <= genomeDb.sequence_length[genome][sequence]) {
......@@ -2344,7 +2351,7 @@ public class GenomeLayer {
end -= 1;
seq.setLength(0);
genomeSc.get_sub_sequence(seq, genome, sequence, begin, end - begin + 1, true);
if (line.endsWith("-") || line.endsWith("rv")) {
if (line.endsWith("-") || line.endsWith("rv")) { // manual only mentions '-'
String rv_sequence = get_rv_complement(seq.toString());
write_fasta(out, rv_sequence, 70);
} else {
......@@ -2401,7 +2408,7 @@ public class GenomeLayer {
continue;
try{
genome = Integer.parseInt(genome_number);
}catch(NumberFormatException e) {
} catch(NumberFormatException e) {
System.out.println(genome_number + "is not a valid genome number.");
continue;
}
......@@ -2414,8 +2421,6 @@ public class GenomeLayer {
out = new BufferedWriter(new FileWriter(OUTPUT_PATH + "/Genome_" + genome_number + ".fasta"));
for (sequence = 1; sequence <= genomeDb.num_sequences[genome]; ++sequence) {
System.out.println("Sequence " + sequence + " length = " + genomeDb.sequence_length[genome][sequence]);
//begin = 1;
//end = (int)genomeDb.sequence_length[genome][sequence];
out.write(">" + genomeDb.sequence_titles[genome][sequence] + "\n");
begin = 0;
end = (int)genomeDb.sequence_length[genome][sequence] - 1;
......
This diff is collapsed.
......@@ -1212,7 +1212,7 @@ public class ProteomeLayer {
print_info = true; // compare_busco_to_grouping() now prints some info to user
compare_busco_to_grouping(skip_groupings);
String dir = WORKING_DIRECTORY + "optimal_grouping/";
if (grouping_version == -1){
if (grouping_version == -1) {
System.out.println("\rNo grouping is currently active. Use 'change_active_grouping'");
} else {
System.out.println("\rGrouping " + grouping_version + " is currently active.");
......@@ -2695,7 +2695,7 @@ public class ProteomeLayer {
if (annotation_identifiers.contains(identifier)) { // the number of proteins for genomes that are skipped are not present
int prot_count = (int) annotation_node.getProperty("num_proteins");
updated_num_proteins += prot_count;
if (prot_count == 0){ // even though the genome is annotated, there are no protein coding genes
if (prot_count == 0) { // even though the genome is annotated, there are no protein coding genes
int genome_nr = Integer.parseInt(id_array[0]);
skip_array[genome_nr -1] = true;
skip_list.add(genome_nr);
......@@ -2802,14 +2802,14 @@ public class ProteomeLayer {
}
if (longest_transcript_length > 0) {
if (!longest_transcript_node.hasProperty("longest_transcript")){
if (!longest_transcript_node.hasProperty("longest_transcript")) {
longest_transcript_node.setProperty("longest_transcript", "yes");
}
Iterable<Relationship> relations = longest_transcript_node.getRelationships(RelTypes.is_parent_of);
for (Relationship rel : relations) {
Node cds_intron_exon_node = rel.getEndNode(); // CDS, intron or exon
if (cds_intron_exon_node.hasLabel(exon_label) || cds_intron_exon_node.hasLabel(CDS_label) || cds_intron_exon_node.hasLabel(intron_label)) {
if (!cds_intron_exon_node.hasProperty("longest_transcript")){
if (!cds_intron_exon_node.hasProperty("longest_transcript")) {
cds_intron_exon_node.setProperty("longest_transcript", "yes");
}
}
......@@ -2819,7 +2819,7 @@ public class ProteomeLayer {
for (Relationship rel : relations) {
Node cds_node = rel.getStartNode(); // CDS
if (cds_node.hasLabel(CDS_label)) {
if (!cds_node.hasProperty("longest_transcript")){
if (!cds_node.hasProperty("longest_transcript")) {
cds_node.setProperty("longest_transcript", "yes");
}
}
......@@ -2850,7 +2850,7 @@ public class ProteomeLayer {
When a genome does not have an annotation (or is skipped) the id will end with '_0'.
*/
public void update_skip_array_based_on_anno_ids() {
if (annotation_identifiers == null){
if (annotation_identifiers == null) {
annotation_identifiers = get_annotation_identifiers(true, true);
}
for (String anno_id : annotation_identifiers) {
......@@ -2876,7 +2876,6 @@ public class ProteomeLayer {
check_if_graphDb_is_available(); // starts up the graph database if needed
report_number_of_threads(true, true); // prints how many threads were selected by user
delete_files_from_previous_group_run();
//connect_panproteome();
create_directory_in_DB("group");
proteins = new LinkedBlockingQueue<>();
......@@ -3012,7 +3011,6 @@ public class ProteomeLayer {
tx.success();
}
copy_file(WORKING_DIRECTORY + "pantools_homology_groups.txt", WORKING_DIRECTORY + "/group/" + MIN_NORMALIZED_SIMILARITY);
//disconnect_panproteome();
System.out.println("\nComponents = " + num_components);
System.out.println("Groups = " + num_groups + "\n");
System.out.println("Database size = " + getFolderSize(new File(WORKING_DIRECTORY + GRAPH_DATABASE_PATH)) + " MB");
......
This diff is collapsed.
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment