More refactoring on explore_node().

42bb0c7b · Moed, Matthijs · a960b920 · 42bb0c7b
Commit 42bb0c7b authored 3 years ago by Moed, Matthijs
--- a/src/main/java/nl/wur/bif/pantools/pangenome/GenomeLayer.java
+++ b/src/main/java/nl/wur/bif/pantools/pangenome/GenomeLayer.java
@@ -741,84 +741,66 @@ public class GenomeLayer {
        public void explore_node(Node node, int mate, int position, int read_len) {
            final boolean is_canonical = current_kmer.get_canonical();
-            //noinspection ConstantConditions
+            if (isHighlyFrequent(getNodeFrequency(node)))
-            final long frequency = caches
+                return;
-                .getNodeFrequencyCache()
-                .get(node, n -> (long) n.getProperty("frequency"));
-            if (isHighlyFrequent(frequency)) {
-                // for each incoming edge to the node of the anchor
-                final List<Relationship> incomingRelationships = caches
-                    .getIncomingRelationshipsCache()
-                    .get(node, n -> {
-                        final List<Relationship> relationships = new ArrayList<>();
-                        node
-                            .getRelationships(Direction.INCOMING, RelTypes.FF, RelTypes.FR, RelTypes.RF, RelTypes.RR)
-                            .forEach(relationships::add);
-                        return relationships;
-                    });
-                // TODO: fix dereference warning
+            for (Relationship r: getIncomingRelationships(node)) {
-                for (Relationship r: incomingRelationships) {
+                final char side = r.getType().name().charAt(1);
-                    final char side = r.getType().name().charAt(1);
+                // for all sequences passing that node
-                    // for all sequences passing that node
+                for (String seq_id: r.getPropertyKeys()) {
-                    for (String seq_id: r.getPropertyKeys()) {
+                    final Address address = getAddress(seq_id);
-                        final Address address = getAddress(seq_id);
+                    final int genome = address.getGenomeIndex();
-                        final int genome = address.getGenomeIndex();
-                        if (locations[mate][genome] != null) {// should map against this genome
+                    if (locations[mate][genome] != null) {// should map against this genome
-                            final int sequence = address.getSequenceIndex();
+                        final int sequence = address.getSequenceIndex();
-                            // calculate the locations based on the offsets in the node
+                        // calculate the locations based on the offsets in the node
-                            final int[] location_array = (int[])r.getProperty(seq_id);
+                        final int[] location_array = (int[])r.getProperty(seq_id);
-                            final long seq_len = sequence_length[genome][sequence];
+                        final long seq_len = sequence_length[genome][sequence];
-                            if (side == 'F') {
+                        if (side == 'F') {
-                                for (int j : location_array) {
+                            for (int j : location_array) {
-                                    if (pointer.canonical ^ is_canonical) {
+                                if (pointer.canonical ^ is_canonical) {
-                                        final int loc = j + pointer.offset - read_len + position + K;
+                                    final int loc = j + pointer.offset - read_len + position + K;
-                                        if (loc >= 0 && loc <= seq_len - read_len) {
+                                    if (loc >= 0 && loc <= seq_len - read_len) {
-                                            node_results.add(new int[]{genome, sequence, -(1 + loc), 1});
+                                        node_results.add(new int[]{genome, sequence, -(1 + loc), 1});
-                                        }
+                                    }
-                                    } else {
+                                } else {
-                                        final int loc = j + pointer.offset - position;
+                                    final int loc = j + pointer.offset - position;
-                                        if (loc >= 0 && loc <= seq_len - read_len) {
+                                    if (loc >= 0 && loc <= seq_len - read_len) {
-                                            node_results.add(new int[]{genome, sequence, loc, 1});
+                                        node_results.add(new int[]{genome, sequence, loc, 1});
-                                        }
                                    }
                                }
-                            }else{
+                            }
-                                //noinspection ConstantConditions
+                        }else{
-                                final int node_len = caches
+                            //noinspection ConstantConditions
-                                    .getNodeLengthCache()
+                            final int node_len = getNodeLength(node);
-                                    .get(node, n -> (int) n.getProperty("length"));
-                                for (int j : location_array) {
+                            for (int j : location_array) {
-                                    if (pointer.canonical ^ is_canonical) {
+                                if (pointer.canonical ^ is_canonical) {
-                                        final int loc = j + node_len - K - pointer.offset - position;
+                                    final int loc = j + node_len - K - pointer.offset - position;
-                                        if (loc >= 0 && loc <= seq_len - read_len) {
+                                    if (loc >= 0 && loc <= seq_len - read_len) {
-                                            node_results.add(new int[]{genome, sequence, loc, -1});
+                                        node_results.add(new int[]{genome, sequence, loc, -1});
-                                        }
+                                    }
-                                    } else {
+                                } else {
-                                        final int loc = j + node_len - pointer.offset - read_len + position;
+                                    final int loc = j + node_len - pointer.offset - read_len + position;
-                                        if (loc >= 0 && loc <= seq_len - read_len) {
+                                    if (loc >= 0 && loc <= seq_len - read_len) {
-                                            node_results.add(new int[]{genome, sequence, -(1 + loc), -1});
+                                        node_results.add(new int[]{genome, sequence, -(1 + loc), -1});
-                                        }
                                    }
                                }
                            }
                        }
                    }
-                } 
+                }
            }
        }
        /**
-         * Test whether a node with a given frequency is highly-frequent.
+         * Test whether a node with a given frequency is not highly-frequent.
         * @param frequency frequency of the node.
         * @return true if the node is considered highly-frequent, false if not.
         */
        private boolean isHighlyFrequent(long frequency) {
            // TODO: cast to int should probably be a cast to long
-            return frequency <= (int)(total_genomes_size / 10000000.0 + num_genomes * 5 * Math.log(total_genomes_size));
+            return frequency > (int)(total_genomes_size / 10000000.0 + num_genomes * 5 * Math.log(total_genomes_size));
        }
        /**
@@ -830,6 +812,51 @@ public class GenomeLayer {
            return Address.fromRelationshipPropertyName(propertyName);
        }
+        /**
+         * Return frequency of a nucleotide node. Will attempt to retrieve the frequency from cache first and, if
+         * missing, retrieve it from Neo4j instead and storing it in the cache for later use.
+         * @param node node to get frequency of.
+         * @return node frequency.
+         */
+        public long getNodeFrequency(Node node) {
+            //noinspection ConstantConditions
+            return caches
+                .getNodeFrequencyCache()
+                .get(node, n -> (long) n.getProperty("frequency"));
+        }
+        /**
+         * Return length of a nucleotide node (i.e. its sequence length). Will attempt to retrieve the length from cache
+         * first and, if missing, retrieve it from Neo4j instead and storing it in the cache for later use.
+         * @param node node to get length of.
+         * @return node length.
+         */
+        public int getNodeLength(Node node) {
+            //noinspection ConstantConditions
+            return caches
+                .getNodeLengthCache()
+                .get(node, n -> (int) n.getProperty("length"));
+        }
+        /**
+         * Return all incoming relationships of type FF, FR, RF and RR of a nucleotide node. Will attempt to retrieve
+         * relationships from a cache first and, if missing, retrieve them from Neo4j instead and storing them in the
+         * cache for later use.
+         * @param node node to get incoming relationships for.
+         * @return all incoming relationships of type FF, FR, RF and RR.
+         */
+        public List<Relationship> getIncomingRelationships(Node node) {
+            return caches
+                .getIncomingRelationshipsCache()
+                .get(node, n -> {
+                    final List<Relationship> relationships = new ArrayList<>();
+                    node
+                        .getRelationships(Direction.INCOMING, RelTypes.FF, RelTypes.FR, RelTypes.RF, RelTypes.RR)
+                        .forEach(relationships::add);
+                    return relationships;
+                });
+        }
        /**
         * Clusters all the candidate genomic locations based on their proximity and align the read to the candidate locations
         *