Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found
Select Git revision
  • add-logging
  • add-phenotype-to-consensus-tree
  • add-progress-bars
  • add-scoring-matrices-to-resources
  • add_RG_option
  • add_gfa_build_pangenome
  • add_gfa_export
  • add_loading_bars
  • add_panva_queries
  • add_promoter_sequence_analysis
  • add_wgs_roi
  • automatic_skip_core_phylogeny_fix
  • develop
  • docker
  • eggnog_fix
  • fasta-validation
  • feature/fix-msa-with-threshold
  • feature/map_kmers_to_coords
  • feature/update-msa-with-aa-presence-threshold
  • fix-core-phylogeny-mode
  • fix-indexing-error-add_annotations
  • fix_blast_for_panproteomes
  • fix_genomelayer_large_pangenomes
  • fix_inconsistent_homology_grouping
  • fix_integer_overflow_kmer_structure
  • fix_kmer_pangenome_structure
  • get_hm_groups_from_genes_of_interest
  • hotfix/kmer_classification_with_phenotypes_crashes
  • java-11
  • optimized-build-pangenome
  • optimized-map
  • pantools_v1
  • pantools_v2
  • pantools_v3
  • pantools_v3.1
  • pantools_v3.2
  • pantools_v4
  • phased_pangenomics_NW_blosum
  • phased_pangenomics_restructure
  • playing_around_with_cicd
  • refactor-check-if-nucleotide-sequence
  • refactor-retrieve-features
  • release
  • release_v4.0.0
  • remove-url-validation
  • remove_genomes
  • remove_skiparrays
  • remove_unused_phylogeny_functions
  • separate_connect_pangenome
  • single-seqalign-class
  • split_classification_busco
  • test_mcl_files
  • try_create_export_json_entire_pangenome
  • update_dependencies_pantools
  • update_retrieve_function
  • validate_feature_strands
  • v1.3.0
  • v2.0.0
  • v3.0.0
  • v3.1.0
  • v3.2.0
  • v3.4.0
  • v4.0.0
  • v4.1.0
  • v4.1.1
  • v4.2.0
  • v4.2.1
  • v4.2.2
  • v4.2.3
  • v4.3.0
  • v4.3.1
  • v4.3.2
  • v4.3.3
73 results

Target

Select target project
  • tan0531/pantools
  • he063/pantools
  • sakel001/pantools
  • bioinformatics/pantools
4 results
Select Git revision
  • add-logging
  • add-phenotype-to-consensus-tree
  • add-progress-bars
  • add-scoring-matrices-to-resources
  • add_RG_option
  • add_gfa_build_pangenome
  • add_gfa_export
  • add_loading_bars
  • add_panva_queries
  • add_promoter_sequence_analysis
  • add_wgs_roi
  • automatic_skip_core_phylogeny_fix
  • develop
  • docker
  • eggnog_fix
  • fasta-validation
  • feature/fix-msa-with-threshold
  • feature/map_kmers_to_coords
  • feature/update-msa-with-aa-presence-threshold
  • fix-core-phylogeny-mode
  • fix-indexing-error-add_annotations
  • fix_blast_for_panproteomes
  • fix_genomelayer_large_pangenomes
  • fix_inconsistent_homology_grouping
  • fix_integer_overflow_kmer_structure
  • fix_kmer_pangenome_structure
  • get_hm_groups_from_genes_of_interest
  • hotfix/kmer_classification_with_phenotypes_crashes
  • java-11
  • optimized-build-pangenome
  • optimized-map
  • pantools_v1
  • pantools_v2
  • pantools_v3
  • pantools_v3.1
  • pantools_v3.2
  • pantools_v4
  • phased_pangenomics_NW_blosum
  • phased_pangenomics_restructure
  • playing_around_with_cicd
  • refactor-check-if-nucleotide-sequence
  • refactor-retrieve-features
  • release
  • release_v4.0.0
  • remove-url-validation
  • remove_genomes
  • remove_skiparrays
  • remove_unused_phylogeny_functions
  • separate_connect_pangenome
  • single-seqalign-class
  • split_classification_busco
  • test_mcl_files
  • try_create_export_json_entire_pangenome
  • update_dependencies_pantools
  • update_retrieve_function
  • validate_feature_strands
  • v1.3.0
  • v2.0.0
  • v3.0.0
  • v3.1.0
  • v3.2.0
  • v3.4.0
  • v4.0.0
  • v4.1.0
  • v4.1.1
  • v4.2.0
  • v4.2.1
  • v4.2.2
  • v4.2.3
  • v4.3.0
  • v4.3.1
  • v4.3.2
  • v4.3.3
73 results
Show changes
Commits on Source (2)
Showing with 215 additions and 72 deletions
......@@ -36,7 +36,7 @@ public class BuildPangenomeCLI implements Callable<Integer> {
Path genomesFile;
private static int kSize;
private static Path graphFile;
private static boolean graphFileProvided;
@ArgGroup private GraphOptions graphOptions;
private static class GraphOptions {
@Option(names = "--kmer-size")
......@@ -46,7 +46,7 @@ public class BuildPangenomeCLI implements Callable<Integer> {
@Option(names = "--graph")
@InputFile(message = "{file.graph}")
void setGraphFile(Path value) {graphFile = value;}
void setGraphFileProvided(boolean value) {graphFileProvided = value;}
}
@Option(names = "--scratch-directory")
......@@ -77,11 +77,13 @@ public class BuildPangenomeCLI implements Callable<Integer> {
pantools.createLogger(spec);
BeanUtils.argValidation(spec, this, threadNumber);
crossValidate();
setGlobalParameters(); //TODO: use local parameters instead
seqLayer.initialize_pangenome(
pantools.getDatabaseDirectory(),
graphFile,
graphFileProvided,
scratchDirectory,
numBuckets,
transactionSize,
......@@ -93,6 +95,12 @@ public class BuildPangenomeCLI implements Callable<Integer> {
return 0;
}
private void crossValidate() {
if (graphFileProvided && !genomesFile.toString().endsWith(".gfa")) {
throw new ParameterException(spec.commandLine(), "The provided graph file is not a GFA file.");
}
}
private void setGlobalParameters() {
PATH_TO_THE_GENOMES_FILE = genomesFile.toString();
K_SIZE = (kSize == 0) ? -1 : kSize;
......
......@@ -25,6 +25,7 @@ import org.neo4j.graphdb.*;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.zip.GZIPInputStream;
......@@ -217,7 +218,7 @@ public class GenomeLayer {
* Constructs a pangenome (gDBG) for a set of genomes.
* build_pangenome()
*/
public void initialize_pangenome(Path databaseDirectory, Path graphFile, Path scratchDirectory, int numBuckets, int transactionSize,
public void initialize_pangenome(Path databaseDirectory, boolean graphFileProvided, Path scratchDirectory, int numBuckets, int transactionSize,
int numDbWriterThreads, int nodePropertiesCacheSize, boolean keepIntermediateFiles)
throws IOException {
Pantools.logger.info("Constructing the pangenome graph database.");
......@@ -227,9 +228,20 @@ public class GenomeLayer {
}
check_kmc_version(); // check if program is set to $PATH and if appropriate version
Node pangenome_node;
verify_if_all_genome_files_exist();
scratchDirectory = FileUtils.createScratchDirectory(scratchDirectory);
GraphUtils.createPangenomeDatabase(databaseDirectory);
GfaReader gfaReader = null; //only used when graphFileProvided is true
if (graphFileProvided) { //construct from GFA backbone
try {
gfaReader = new GfaReader(Paths.get(PATH_TO_THE_GENOMES_FILE)); //PATH_TO_THE_GENOMES_FILE is a GFA file now
} catch (IOException ioe) {
Pantools.logger.error("Error reading GFA file: {}.", PATH_TO_THE_GENOMES_FILE);
throw new RuntimeException("Error reading GFA file", ioe);
}
GraphUtils.createPangenomeDatabaseFromGfa(databaseDirectory, gfaReader);
} else { //normal cDBG construction
verify_if_all_genome_files_exist();
GraphUtils.createPangenomeDatabase(databaseDirectory);
}
INDEX_SC = new IndexScanner(INDEX_DB);
K_SIZE = INDEX_SC.get_K();
Pantools.logger.info("k-size = {}", K_SIZE);
......@@ -248,11 +260,11 @@ public class GenomeLayer {
num_bases = 0;
num_degenerates = 0;
if (graphFile == null) {
if (graphFileProvided) {
load_pangenome(pangenome_node, gfaReader);
} else {
construct_pangenome(pangenome_node);
add_sequence_properties(scratchDirectory);
} else {
load_pangenome(pangenome_node, graphFile);
}
try {
......@@ -748,10 +760,11 @@ public class GenomeLayer {
* @param src The source node
* @param des The destination node
* @param edge_type One of the four possible edge types: FF, FR, RF, RR
* @return The created relationship
*/
private void connect(Node src, Node des, RelationshipType edge_type) {
private Relationship connect(Node src, Node des, RelationshipType edge_type) {
Pantools.logger.trace("connect {} {} {}.", src.getId(), edge_type.name(), des.getId());
src.createRelationshipTo(des, edge_type);
return src.createRelationshipTo(des, edge_type);
}
/**
......@@ -1214,42 +1227,13 @@ public class GenomeLayer {
/**
* Loads a pangenome from a GFA file starting from the pangenome node.
* @param pangenomeNode The pangenome node
* @param gfaFile The GFA file
* @param gfaReader The GFA reader
*/
private void load_pangenome(Node pangenomeNode, Path gfaFile) {
private void load_pangenome(Node pangenomeNode, GfaReader gfaReader) {
int trsc = 0;
Node genomeNode, sequenceNode;
phaseTime = System.currentTimeMillis();
// parse the GFA file
GfaReader gfaReader;
try {
gfaReader = new GfaReader(gfaFile);
} catch (IOException ioe) {
Pantools.logger.error("Error reading GFA file: {}.", gfaFile);
throw new RuntimeException("Error reading GFA file", ioe);
}
// check if the size of the paths correspond with its sequence length in the genome database
try {
for (GraphPath path : gfaReader.getGraphPaths()) {
final String pathOrigin = path.getOrigin();
final long pathLength = path.getLength();
final int genomeNr = Utils.getGenomeNrFromOrigin(pathOrigin);
final int sequenceNr = Utils.getSequenceNrFromOrigin(pathOrigin);
final long sequenceLength = GENOME_DB.sequence_length[genomeNr][sequenceNr];
if (pathLength != sequenceLength) {
Pantools.logger.error("Path {} has length {} but sequence length is {}", pathOrigin, pathLength, sequenceLength);
throw new RuntimeException("Invalid path length in GFA file");
} else {
Pantools.logger.debug("Path {} has correct length {}", pathOrigin, pathLength);
}
}
} catch (NumberFormatException nfe) { //TODO: this should not happen, we need to be able to parse any path name
Pantools.logger.error("Error parsing path origin in GFA file; currently only supports GFA files with paths named GxSy.");
throw new RuntimeException("Error parsing path origin in GFA file", nfe);
}
Transaction tx = GRAPH_DB.beginTx();
try {
pangenomeNode.setProperty("type", "imported");
......@@ -1298,7 +1282,9 @@ public class GenomeLayer {
}
/**
* Add a new sequence to the pangenome from GFA
* Add a new sequence to the pangenome from GFA. This is done by walking the path of the sequence in the GFA file
* and creating nodes and relationships in the graph database. Importantly, in the case of non-zero overlap
* between two nodes, the overlap from the viewpoint of the target is taken based on the CIGAR of the overlap.
*
* @param gfaReader The GFA reader
* @param sequenceNode The sequence node that is the start and end of the path
......@@ -1326,12 +1312,12 @@ public class GenomeLayer {
// connect the sequence node to the first node with F + orientation of first node
final RelationshipType firstRelType = edge.getSourceOrientation() == '+' ? RelTypes.FF : RelTypes.FR;
createNewRelationshipGfa(sequenceNode, firstRelType, sourceNode, trsc);
createNewRelationshipGfa(sequenceNode, firstRelType, 0, sourceNode, trsc);
// deal with the first edge
pathPosition += edge.getTargetStartOnSource();
Node targetNode = createNewNodeGfa(edge.getTarget(), new int[]{genomeNr, sequenceNr, pathPosition}, index);
createNewRelationshipGfa(sourceNode, edge.getType(), targetNode, trsc);
createNewRelationshipGfa(sourceNode, edge.getType(), edge.getSourceEndOnTarget(), targetNode, trsc);
sourceNode = targetNode;
// now we can start walking the path and deal with only target nodes
......@@ -1339,12 +1325,12 @@ public class GenomeLayer {
edge = walker.next();
pathPosition += edge.getTargetStartOnSource();
targetNode = createNewNodeGfa(edge.getTarget(), new int[]{genomeNr, sequenceNr, pathPosition}, index);
createNewRelationshipGfa(sourceNode, edge.getType(), targetNode, trsc);
createNewRelationshipGfa(sourceNode, edge.getType(), edge.getSourceEndOnTarget(), targetNode, trsc);
sourceNode = targetNode;
}
// connect the last node to the sequence node with FF
createNewRelationshipGfa(sourceNode, RelTypes.FF, sequenceNode, trsc);
createNewRelationshipGfa(sourceNode, RelTypes.FF, -1, sequenceNode, trsc);
}
/**
......@@ -1417,6 +1403,13 @@ public class GenomeLayer {
INDEX_SC.put_pointer(newNode.getId(), offset, newKmer.get_canonical(), -1L, newIndex);
}
// mark a node as degenerate if its length was smaller than K (thus firstIndex = -1L)
if (firstIndex == -1L) {
newNode.addLabel(DEGENERATE_LABEL);
num_degenerates++;
Pantools.logger.trace("Node {} is degenerate because its length is smaller than K", newNode.getId());
}
// set the first and last k-mer properties on the node if non-degenerate
if (!newNode.hasLabel(DEGENERATE_LABEL)) {
newNode.setProperty("first_kmer", firstIndex);
......@@ -1434,10 +1427,11 @@ public class GenomeLayer {
* Create a new releationship for nodes from GFA if it doesn't already exist
* @param sourceNode The source node (neo4j object)
* @param relType The relationship type
* @param overlap The overlap between the source and target node
* @param targetNode The target node (neo4j object)
* @param trsc The transaction counter
*/
private void createNewRelationshipGfa(final Node sourceNode, final RelationshipType relType, final Node targetNode,
private void createNewRelationshipGfa(final Node sourceNode, final RelationshipType relType, int overlap, final Node targetNode,
int trsc) {
Pantools.logger.trace("Creating new relationship from {} to {} with type {}", sourceNode, targetNode, relType);
......@@ -1450,7 +1444,10 @@ public class GenomeLayer {
}
}
if (!relExists) {
connect(sourceNode, targetNode, relType);
Relationship rel = connect(sourceNode, targetNode, relType);
if (overlap >= 0) {
rel.setProperty("overlap", overlap);
}
++num_edges;
}
......
......@@ -54,13 +54,16 @@ public class NodeLocalizationTask implements Callable<Path> {
public Path call() {
// Adapted from GenomeLayer::localize_nodes()
final int defaultOverlap = K_SIZE - 1; // Default overlap for cDBGs
// log.trace("defaultOverlap = {}", defaultOverlap);
final Kryo kryo = KryoUtils.getKryo();
int neighbor_length = 0;
int neighbor_length = 0, overlap;
char node_side, neighbor_side;
long length, distance;
Node node, neighbor;
String rel_name, origin;
int[] address = new int[3], addr = null;
int[] address = new int[3], addr1 = new int[3], addr2 = null; //address keeps track of the current position; addr1 is the position of the potential neighbor node; addr2 is the actual position of the neighbor node
boolean found = true;
final Anchors anchors = new Anchors();
......@@ -71,8 +74,8 @@ public class NodeLocalizationTask implements Callable<Path> {
final Path path = outputDirectory.resolve("localizations-" + origin + ".kryo.xz");
try (Transaction tx = GRAPH_DB.beginTx(); Output output = KryoUtils.createOutput(path)) {
address[0] = (int) sequenceNode.getProperty("genome");
address[1] = (int) sequenceNode.getProperty("number");
address[0] = addr1[0] = (int) sequenceNode.getProperty("genome");
address[1] = addr1[1] = (int) sequenceNode.getProperty("number");
length = (long) sequenceNode.getProperty("length");
log.debug(String.format("Localizing node %s of length %,d (ID: %d)", origin, length, sequenceNode.getId()));
......@@ -87,12 +90,20 @@ public class NodeLocalizationTask implements Callable<Path> {
for (address[2] = 0; address[2] + K_SIZE - 1 < length && found; ) { // K-1 bases of the last node not added
found = false;
// log.trace("Localizing position {} at node with ID {}", address[2], node.getId());
for (Relationship r : node.getRelationships(Direction.OUTGOING)) {
rel_name = r.getType().name();
if (rel_name.charAt(0) != node_side)
continue;
// Get overlap: for a cDBG there is no such property, so we use the default value
overlap = r.hasProperty("overlap") ? (int) r.getProperty("overlap") : defaultOverlap; //TODO: this could slow down the code a lot
// log.trace("\t{} (addr1[2]) = Math.max({} (address[2]) - {} (overlap), 0)", Math.max(address[2] - overlap, 0), address[2], overlap);
addr1[2] = Math.max(address[2] - overlap, 0); //max is needed at the beginning of the sequence
neighbor = r.getEndNode();
neighbor_side = rel_name.charAt(1);
......@@ -111,14 +122,18 @@ public class NodeLocalizationTask implements Callable<Path> {
);
assert nodeProperties != null;
addr = nodeProperties.getAddress();
addr2 = nodeProperties.getAddress();
neighbor_length = nodeProperties.getLength();
}
if ((is_node && genomeSc.compare(address, addr, K_SIZE - 1,
neighbor_side == 'F' ? K_SIZE - 1 : neighbor_length - K_SIZE, 1, neighbor_side == 'F'))
|| (is_degenerate && Arrays.equals(addr, address))) {
// log.trace("Comparing address 1 ({}) with address 2 ({}) and offset 1 ({}) with offset 2 ({}) for direction {}",
// Arrays.toString(addr1), Arrays.toString(addr2), overlap,
// neighbor_side == 'F' ? overlap : neighbor_length - overlap - 1, neighbor_side == 'F');
if ((is_node && genomeSc.compare(addr1, addr2, overlap,
neighbor_side == 'F' ? overlap : neighbor_length - overlap - 1, 1, neighbor_side == 'F'))
|| (is_degenerate && Arrays.equals(addr2, addr1))) {
found = true;
address[2] = addr1[2];
// Write localization
......@@ -147,7 +162,8 @@ public class NodeLocalizationTask implements Callable<Path> {
previousTimestamp = currentTimestamp;
}
address[2] = address[2] + neighbor_length - K_SIZE + 1;
// log.trace("\t\t{} (address[2]) = {} (address[2]) + {} (neighbor_length)", address[2] + neighbor_length - overlap, address[2], neighbor_length);
address[2] = address[2] + neighbor_length;
node = neighbor;
node_side = neighbor_side;
break;
......
package nl.wur.bif.pantools.utils;
import nl.wur.bif.pantools.Pantools;
import nl.wur.bif.pantools.utils.graph.GfaReader;
import nl.wur.bif.pantools.utils.graph.graph_objects.GraphPath;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
......@@ -20,6 +22,9 @@ import java.util.Scanner;
import java.util.stream.Stream;
import java.util.zip.GZIPInputStream;
import static nl.wur.bif.pantools.utils.Globals.GENOME_DB;
import static nl.wur.bif.pantools.utils.Utils.getGenomeNrFromOrigin;
import static nl.wur.bif.pantools.utils.Utils.getSequenceNrFromOrigin;
import static org.apache.logging.log4j.core.util.Loader.getClassLoader;
public final class FileUtils {
......@@ -269,4 +274,44 @@ public final class FileUtils {
return size / 1048576 + 1;
}
/**
* Create a FASTA file for each path in the GFA file.
* @param gfaReader GFA reader object
* @param temporaryGenomesPath Path to the directory where the temporary genomes will be stored
* @return Path to the file listing all the temporary genomes
*/
public static Path createFastaFilesFromGfa(GfaReader gfaReader, Path temporaryGenomesPath) {
Path temporaryGenomesList = temporaryGenomesPath.resolve("temporary_genomes.list");
// create the fasta files
int counter = 1; //counting genome numbers
String pathOrigin; //genome name as well as the sequence name
String pathSequence; //genome sequence
for (GraphPath path : gfaReader.getGraphPaths()) {
// do the bookkeeping
final String fastaFilename = String.format("genome_%d.fasta", counter++);
final Path fastaFile = temporaryGenomesPath.resolve(fastaFilename);
Pantools.logger.debug("Creating temporary genome: {}", fastaFilename);
// start writing the fasta file
pathOrigin = path.getOrigin();
pathSequence = path.getSequence();
try (BufferedWriter writer = Files.newBufferedWriter(fastaFile)) {
writer.write(String.format(">%s\n%s\n", pathOrigin, pathSequence));
} catch (IOException e) {
Pantools.logger.error("Unable to write temporary genome.", e);
}
}
// write the list of full paths to the temporary genomes
try (BufferedWriter writer = Files.newBufferedWriter(temporaryGenomesList)) {
for (int i = 1; i < counter; i++) {
writer.write(String.format("%s/genome_%d.fasta\n", temporaryGenomesPath, i));
}
} catch (IOException e) {
Pantools.logger.error("Unable to write temporary genomes list.", e);
}
return temporaryGenomesList;
}
}
......@@ -6,6 +6,7 @@ import nl.wur.bif.pantools.construction.index.IndexPointer;
import nl.wur.bif.pantools.construction.index.kmer;
import nl.wur.bif.pantools.construction.sequence.SequenceDatabase;
import nl.wur.bif.pantools.utils.graph.GenomeRange;
import nl.wur.bif.pantools.utils.graph.GfaReader;
import org.codehaus.plexus.util.StringUtils;
import org.neo4j.graphdb.*;
import org.neo4j.graphdb.factory.GraphDatabaseFactory;
......@@ -80,6 +81,7 @@ public final class GraphUtils {
* @throws IOException when directories cannot be created in the database directory
*/
public static void createPangenomeDatabase(Path databaseDirectory) throws IOException {
Pantools.logger.info("Creating pangenome database in {}", databaseDirectory);
createGraphDatabaseService(databaseDirectory);
GENOME_DB = new SequenceDatabase(WORKING_DIRECTORY + GENOME_DATABASE_PATH, PATH_TO_THE_GENOMES_FILE);
INDEX_DB = new IndexDatabase(WORKING_DIRECTORY + INDEX_DATABASE_PATH, PATH_TO_THE_GENOMES_FILE, GENOME_DB, K_SIZE);
......@@ -87,6 +89,30 @@ public final class GraphUtils {
Files.createDirectory(databaseDirectory.resolve("log")); //TODO: remove
}
/**
* Creates and connects to genome, index and graph databases of the pangenome built from GFA.
*
* @param databaseDirectory path to the pangenome database directory
* @param gfaReader GFA reader object
* @throws IOException when directories cannot be created in the database directory
*/
public static void createPangenomeDatabaseFromGfa(Path databaseDirectory, GfaReader gfaReader) throws IOException {
Pantools.logger.info("Creating pangenome database from GFA in {}", databaseDirectory);
createGraphDatabaseService(databaseDirectory);
Path temporaryGenomesPath = databaseDirectory.resolve("temp_genomes"); //for unfolding the fasta files from the GFA into
Utils.create_directory_full_path(temporaryGenomesPath);
// create the temporary genome fasta files
Pantools.logger.debug("Creating temporary genome fasta files from GFA.");
Path temporaryPathToGenomesFile = FileUtils.createFastaFilesFromGfa(gfaReader, temporaryGenomesPath);
// continue with normal database construction
GENOME_DB = new SequenceDatabase(WORKING_DIRECTORY + GENOME_DATABASE_PATH, temporaryPathToGenomesFile.toString());
INDEX_DB = new IndexDatabase(WORKING_DIRECTORY + INDEX_DATABASE_PATH, temporaryPathToGenomesFile.toString(), GENOME_DB, K_SIZE);
Files.createDirectories(databaseDirectory.resolve("databases").resolve("genome.db").resolve("genomes"));
Files.createDirectory(databaseDirectory.resolve("log")); //TODO: remove
}
/**
* Creates and connects to graph databases of the panproteome.
*
......
......@@ -16,7 +16,7 @@ public class GfaReader {
private final Path gfaFile;
private final Map<Long, GraphNode> vertices = new HashMap<>();
private final Map<Long, GraphEdge> edges = new HashMap<>();
private final Set<GraphPath> paths = new HashSet<>();
private final List<GraphPath> paths = new ArrayList<>();
public GfaReader(Path gfaFile) throws IOException {
this.gfaFile = gfaFile;
......@@ -72,7 +72,7 @@ public class GfaReader {
private void getGraphPaths(Set<String> pathLines) {
for (String pathLine : pathLines) {
final String[] fields = pathLine.split("\t");
final String id = fields[1];
final String id = fields[1]; //TODO: parse genome name, phase information and sequence number
final String[] steps = fields[2].split(","); // e.g.: 1+,3-,5+,7-
final String[] cigars = fields[3].split(","); // e.g.: 10M,10M,10M,10M //TODO: parse CIGAR strings
......@@ -172,7 +172,7 @@ public class GfaReader {
if (field.startsWith("VN:")) {
final String[] versionFields = field.split(":");
final String version = versionFields[2];
if (version.startsWith("1.")) {
if (version.equals("1.0")) { //version 1.1 adds W-lines which we do not support yet
return;
} else {
Pantools.logger.error("GFA version {} is not supported.", version);
......@@ -192,22 +192,19 @@ public class GfaReader {
return this.edges;
}
public Set<GraphPath> getGraphPaths() {
public List<GraphPath> getGraphPaths() {
return this.paths;
}
public GraphPath getGraphPath(int genomeNr, int sequenceNr) {
String origin = String.format("G%dS%d", genomeNr, sequenceNr);
for (GraphPath path : this.paths) {
if (path.getOrigin().equals(origin)) {
return path;
}
if (sequenceNr != 1) {
Pantools.logger.error("GFA currently only supports one sequence per genome.");
throw new RuntimeException("Unsupported option");
}
Pantools.logger.error("GraphPath with origin {} does not exist.", origin);
throw new RuntimeException("GraphPath with origin " + origin + " does not exist");
return this.paths.get(genomeNr - 1);
}
public Graph getGraph() {
return new Graph(this.vertices, this.edges, this.paths);
return new Graph(this.vertices, this.edges, new HashSet<>(this.paths));
}
}
......@@ -115,7 +115,7 @@ public class GraphEdge {
}
public String toString() {
return String.format("Edge %d: %s -> %s", id, source, target);
return String.format("Edge %d: %s -[%s]> %s", id, source, cigar, target);
}
/**
......
......@@ -189,6 +189,60 @@ public class GraphPath {
return graphEdges;
}
/**
* Get the sequence of the path by walking it
* @return sequence of the path
*/
public String getSequence() {
Pantools.logger.trace("Getting sequence of path {}.", origin);
StringBuilder sb = new StringBuilder();
StringBuilder tempReverseComplement = new StringBuilder(); //temporary stringbuilder for reverse complement
Iterator<GraphEdge> walker = walk(); //create iterator
// handle first edge's source
GraphEdge edge = walker.next();
Pantools.logger.trace("\tEdge: {}", edge);
String sourceSequence = edge.getSource().getSequence();
Pantools.logger.trace("\tSequence of source node: {}", sourceSequence);
if (edge.getSourceOrientation() == '+') {
sb.append(sourceSequence);
} else {
Utils.reverse_complement(tempReverseComplement, sourceSequence);
sb.append(tempReverseComplement);
}
Pantools.logger.trace("Sequence after first edge: {} (node {}).", sb.toString(), edge.getSource());
// handle first edge's target
String targetSequence = edge.getTarget().getSequence();
Pantools.logger.trace("\tSequence of target node: {}", targetSequence);
Pantools.logger.trace("\tStart on source: {}", edge.getTargetStartOnSource());
if (edge.getTargetOrientation() == '+') {
sb.append(targetSequence.substring(edge.getSource().getLength() - edge.getTargetStartOnSource()));
} else {
Utils.reverse_complement(tempReverseComplement, targetSequence);
sb.append(tempReverseComplement.substring(edge.getSource().getLength() - edge.getTargetStartOnSource()));
}
Pantools.logger.trace("Sequence after first target: {} (node {}).", sb.toString(), edge.getTarget());
// handle all other edges
while (walker.hasNext()) {
edge = walker.next();
Pantools.logger.trace("\tEdge: {}", edge);
targetSequence = edge.getTarget().getSequence();
Pantools.logger.trace("\tSequence of target node: {}", targetSequence);
Pantools.logger.trace("\tStart on source: {}", edge.getTargetStartOnSource());
if (edge.getTargetOrientation() == '+') {
sb.append(targetSequence.substring(edge.getSource().getLength() - edge.getTargetStartOnSource()));
} else {
Utils.reverse_complement(tempReverseComplement, targetSequence);
sb.append(tempReverseComplement.substring(edge.getSource().getLength() - edge.getTargetStartOnSource()));
}
Pantools.logger.trace("Sequence after edge: {} (node {}).", sb.toString(), edge.getTarget());
}
return sb.toString();
}
public boolean equals(GraphPath graphPath) {
return this.origin.equals(graphPath.getOrigin());
}
......
......@@ -291,7 +291,7 @@ pantools.busco_protein.skip-busco = A list of questionable BUSCOs. The completen
genes.
pantools.busco_protein.busco-version = The BUSCO version (default: ${DEFAULT-VALUE}).
# BuildPangenome
pantools.build_pangenome.graph = GFA graph to use instead of building a DBG.
pantools.build_pangenome.graph = The provided genomes file is a GFA file for use as backbone instead of constructing cDBG.
pantools.build_pangenome.scratch-directory = Temporary directory for storing localization update files.
pantools.build_pangenome.num-buckets = Number of buckets for sorting (default: ${DEFAULT-VALUE}).
pantools.build_pangenome.transaction-size = Number of localization updates to pack into a single Neo4j transaction (default: ${DEFAULT-VALUE}).
......