diff --git a/src/main/java/nl/wur/bif/pantools/pangenome/GenomeLayer.java b/src/main/java/nl/wur/bif/pantools/pangenome/GenomeLayer.java index 8f026a957f837de859100a9250d722fd12f3e011..cb09b09fb65bdf3f2dbdcbf0c30d39d52f2af785 100755 --- a/src/main/java/nl/wur/bif/pantools/pangenome/GenomeLayer.java +++ b/src/main/java/nl/wur/bif/pantools/pangenome/GenomeLayer.java @@ -741,84 +741,66 @@ public class GenomeLayer { public void explore_node(Node node, int mate, int position, int read_len) { final boolean is_canonical = current_kmer.get_canonical(); - //noinspection ConstantConditions - final long frequency = caches - .getNodeFrequencyCache() - .get(node, n -> (long) n.getProperty("frequency")); - - if (isHighlyFrequent(frequency)) { - // for each incoming edge to the node of the anchor - final List<Relationship> incomingRelationships = caches - .getIncomingRelationshipsCache() - .get(node, n -> { - final List<Relationship> relationships = new ArrayList<>(); - node - .getRelationships(Direction.INCOMING, RelTypes.FF, RelTypes.FR, RelTypes.RF, RelTypes.RR) - .forEach(relationships::add); - return relationships; - }); + if (isHighlyFrequent(getNodeFrequency(node))) + return; - // TODO: fix dereference warning - for (Relationship r: incomingRelationships) { - final char side = r.getType().name().charAt(1); - // for all sequences passing that node - for (String seq_id: r.getPropertyKeys()) { - final Address address = getAddress(seq_id); - final int genome = address.getGenomeIndex(); + for (Relationship r: getIncomingRelationships(node)) { + final char side = r.getType().name().charAt(1); + // for all sequences passing that node + for (String seq_id: r.getPropertyKeys()) { + final Address address = getAddress(seq_id); + final int genome = address.getGenomeIndex(); - if (locations[mate][genome] != null) {// should map against this genome - final int sequence = address.getSequenceIndex(); - // calculate the locations based on the offsets in the node - final int[] location_array = (int[])r.getProperty(seq_id); - final long seq_len = sequence_length[genome][sequence]; - if (side == 'F') { - for (int j : location_array) { - if (pointer.canonical ^ is_canonical) { - final int loc = j + pointer.offset - read_len + position + K; - if (loc >= 0 && loc <= seq_len - read_len) { - node_results.add(new int[]{genome, sequence, -(1 + loc), 1}); - } - } else { - final int loc = j + pointer.offset - position; - if (loc >= 0 && loc <= seq_len - read_len) { - node_results.add(new int[]{genome, sequence, loc, 1}); - } + if (locations[mate][genome] != null) {// should map against this genome + final int sequence = address.getSequenceIndex(); + // calculate the locations based on the offsets in the node + final int[] location_array = (int[])r.getProperty(seq_id); + final long seq_len = sequence_length[genome][sequence]; + if (side == 'F') { + for (int j : location_array) { + if (pointer.canonical ^ is_canonical) { + final int loc = j + pointer.offset - read_len + position + K; + if (loc >= 0 && loc <= seq_len - read_len) { + node_results.add(new int[]{genome, sequence, -(1 + loc), 1}); + } + } else { + final int loc = j + pointer.offset - position; + if (loc >= 0 && loc <= seq_len - read_len) { + node_results.add(new int[]{genome, sequence, loc, 1}); } } - }else{ - //noinspection ConstantConditions - final int node_len = caches - .getNodeLengthCache() - .get(node, n -> (int) n.getProperty("length")); + } + }else{ + //noinspection ConstantConditions + final int node_len = getNodeLength(node); - for (int j : location_array) { - if (pointer.canonical ^ is_canonical) { - final int loc = j + node_len - K - pointer.offset - position; - if (loc >= 0 && loc <= seq_len - read_len) { - node_results.add(new int[]{genome, sequence, loc, -1}); - } - } else { - final int loc = j + node_len - pointer.offset - read_len + position; - if (loc >= 0 && loc <= seq_len - read_len) { - node_results.add(new int[]{genome, sequence, -(1 + loc), -1}); - } + for (int j : location_array) { + if (pointer.canonical ^ is_canonical) { + final int loc = j + node_len - K - pointer.offset - position; + if (loc >= 0 && loc <= seq_len - read_len) { + node_results.add(new int[]{genome, sequence, loc, -1}); + } + } else { + final int loc = j + node_len - pointer.offset - read_len + position; + if (loc >= 0 && loc <= seq_len - read_len) { + node_results.add(new int[]{genome, sequence, -(1 + loc), -1}); } } } } } - } + } } } /** - * Test whether a node with a given frequency is highly-frequent. + * Test whether a node with a given frequency is not highly-frequent. * @param frequency frequency of the node. * @return true if the node is considered highly-frequent, false if not. */ private boolean isHighlyFrequent(long frequency) { // TODO: cast to int should probably be a cast to long - return frequency <= (int)(total_genomes_size / 10000000.0 + num_genomes * 5 * Math.log(total_genomes_size)); + return frequency > (int)(total_genomes_size / 10000000.0 + num_genomes * 5 * Math.log(total_genomes_size)); } /** @@ -830,6 +812,51 @@ public class GenomeLayer { return Address.fromRelationshipPropertyName(propertyName); } + /** + * Return frequency of a nucleotide node. Will attempt to retrieve the frequency from cache first and, if + * missing, retrieve it from Neo4j instead and storing it in the cache for later use. + * @param node node to get frequency of. + * @return node frequency. + */ + public long getNodeFrequency(Node node) { + //noinspection ConstantConditions + return caches + .getNodeFrequencyCache() + .get(node, n -> (long) n.getProperty("frequency")); + } + + /** + * Return length of a nucleotide node (i.e. its sequence length). Will attempt to retrieve the length from cache + * first and, if missing, retrieve it from Neo4j instead and storing it in the cache for later use. + * @param node node to get length of. + * @return node length. + */ + public int getNodeLength(Node node) { + //noinspection ConstantConditions + return caches + .getNodeLengthCache() + .get(node, n -> (int) n.getProperty("length")); + } + + /** + * Return all incoming relationships of type FF, FR, RF and RR of a nucleotide node. Will attempt to retrieve + * relationships from a cache first and, if missing, retrieve them from Neo4j instead and storing them in the + * cache for later use. + * @param node node to get incoming relationships for. + * @return all incoming relationships of type FF, FR, RF and RR. + */ + public List<Relationship> getIncomingRelationships(Node node) { + return caches + .getIncomingRelationshipsCache() + .get(node, n -> { + final List<Relationship> relationships = new ArrayList<>(); + node + .getRelationships(Direction.INCOMING, RelTypes.FF, RelTypes.FR, RelTypes.RF, RelTypes.RR) + .forEach(relationships::add); + return relationships; + }); + } + /** * Clusters all the candidate genomic locations based on their proximity and align the read to the candidate locations *