Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
P
PanTools
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
bioinformatics
PanTools
Commits
42bb0c7b
Commit
42bb0c7b
authored
3 years ago
by
Moed, Matthijs
Browse files
Options
Downloads
Patches
Plain Diff
More refactoring on explore_node().
parent
a960b920
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/main/java/nl/wur/bif/pantools/pangenome/GenomeLayer.java
+86
-59
86 additions, 59 deletions
src/main/java/nl/wur/bif/pantools/pangenome/GenomeLayer.java
with
86 additions
and
59 deletions
src/main/java/nl/wur/bif/pantools/pangenome/GenomeLayer.java
+
86
−
59
View file @
42bb0c7b
...
@@ -741,84 +741,66 @@ public class GenomeLayer {
...
@@ -741,84 +741,66 @@ public class GenomeLayer {
public
void
explore_node
(
Node
node
,
int
mate
,
int
position
,
int
read_len
)
{
public
void
explore_node
(
Node
node
,
int
mate
,
int
position
,
int
read_len
)
{
final
boolean
is_canonical
=
current_kmer
.
get_canonical
();
final
boolean
is_canonical
=
current_kmer
.
get_canonical
();
//noinspection ConstantConditions
if
(
isHighlyFrequent
(
getNodeFrequency
(
node
)))
final
long
frequency
=
caches
return
;
.
getNodeFrequencyCache
()
.
get
(
node
,
n
->
(
long
)
n
.
getProperty
(
"frequency"
));
if
(
isHighlyFrequent
(
frequency
))
{
// for each incoming edge to the node of the anchor
final
List
<
Relationship
>
incomingRelationships
=
caches
.
getIncomingRelationshipsCache
()
.
get
(
node
,
n
->
{
final
List
<
Relationship
>
relationships
=
new
ArrayList
<>();
node
.
getRelationships
(
Direction
.
INCOMING
,
RelTypes
.
FF
,
RelTypes
.
FR
,
RelTypes
.
RF
,
RelTypes
.
RR
)
.
forEach
(
relationships:
:
add
);
return
relationships
;
});
// TODO: fix dereference warning
for
(
Relationship
r:
getIncomingRelationships
(
node
))
{
for
(
Relationship
r:
incomingRelationships
)
{
final
char
side
=
r
.
getType
().
name
().
charAt
(
1
);
final
char
side
=
r
.
getType
().
name
().
charAt
(
1
);
// for all sequences passing that node
// for all sequences passing that node
for
(
String
seq_id:
r
.
getPropertyKeys
())
{
for
(
String
seq_id:
r
.
getPropertyKeys
())
{
final
Address
address
=
getAddress
(
seq_id
);
final
Address
address
=
getAddress
(
seq_id
);
final
int
genome
=
address
.
getGenomeIndex
();
final
int
genome
=
address
.
getGenomeIndex
();
if
(
locations
[
mate
][
genome
]
!=
null
)
{
// should map against this genome
if
(
locations
[
mate
][
genome
]
!=
null
)
{
// should map against this genome
final
int
sequence
=
address
.
getSequenceIndex
();
final
int
sequence
=
address
.
getSequenceIndex
();
// calculate the locations based on the offsets in the node
// calculate the locations based on the offsets in the node
final
int
[]
location_array
=
(
int
[])
r
.
getProperty
(
seq_id
);
final
int
[]
location_array
=
(
int
[])
r
.
getProperty
(
seq_id
);
final
long
seq_len
=
sequence_length
[
genome
][
sequence
];
final
long
seq_len
=
sequence_length
[
genome
][
sequence
];
if
(
side
==
'F'
)
{
if
(
side
==
'F'
)
{
for
(
int
j
:
location_array
)
{
for
(
int
j
:
location_array
)
{
if
(
pointer
.
canonical
^
is_canonical
)
{
if
(
pointer
.
canonical
^
is_canonical
)
{
final
int
loc
=
j
+
pointer
.
offset
-
read_len
+
position
+
K
;
final
int
loc
=
j
+
pointer
.
offset
-
read_len
+
position
+
K
;
if
(
loc
>=
0
&&
loc
<=
seq_len
-
read_len
)
{
if
(
loc
>=
0
&&
loc
<=
seq_len
-
read_len
)
{
node_results
.
add
(
new
int
[]{
genome
,
sequence
,
-(
1
+
loc
),
1
});
node_results
.
add
(
new
int
[]{
genome
,
sequence
,
-(
1
+
loc
),
1
});
}
}
}
else
{
}
else
{
final
int
loc
=
j
+
pointer
.
offset
-
position
;
final
int
loc
=
j
+
pointer
.
offset
-
position
;
if
(
loc
>=
0
&&
loc
<=
seq_len
-
read_len
)
{
if
(
loc
>=
0
&&
loc
<=
seq_len
-
read_len
)
{
node_results
.
add
(
new
int
[]{
genome
,
sequence
,
loc
,
1
});
node_results
.
add
(
new
int
[]{
genome
,
sequence
,
loc
,
1
});
}
}
}
}
}
}
else
{
}
//noinspection ConstantConditions
}
else
{
final
int
node_len
=
caches
//noinspection ConstantConditions
.
getNodeLengthCache
()
final
int
node_len
=
getNodeLength
(
node
);
.
get
(
node
,
n
->
(
int
)
n
.
getProperty
(
"length"
));
for
(
int
j
:
location_array
)
{
for
(
int
j
:
location_array
)
{
if
(
pointer
.
canonical
^
is_canonical
)
{
if
(
pointer
.
canonical
^
is_canonical
)
{
final
int
loc
=
j
+
node_len
-
K
-
pointer
.
offset
-
position
;
final
int
loc
=
j
+
node_len
-
K
-
pointer
.
offset
-
position
;
if
(
loc
>=
0
&&
loc
<=
seq_len
-
read_len
)
{
if
(
loc
>=
0
&&
loc
<=
seq_len
-
read_len
)
{
node_results
.
add
(
new
int
[]{
genome
,
sequence
,
loc
,
-
1
});
node_results
.
add
(
new
int
[]{
genome
,
sequence
,
loc
,
-
1
});
}
}
}
else
{
}
else
{
final
int
loc
=
j
+
node_len
-
pointer
.
offset
-
read_len
+
position
;
final
int
loc
=
j
+
node_len
-
pointer
.
offset
-
read_len
+
position
;
if
(
loc
>=
0
&&
loc
<=
seq_len
-
read_len
)
{
if
(
loc
>=
0
&&
loc
<=
seq_len
-
read_len
)
{
node_results
.
add
(
new
int
[]{
genome
,
sequence
,
-(
1
+
loc
),
-
1
});
node_results
.
add
(
new
int
[]{
genome
,
sequence
,
-(
1
+
loc
),
-
1
});
}
}
}
}
}
}
}
}
}
}
}
}
}
}
}
}
}
/**
/**
* Test whether a node with a given frequency is highly-frequent.
* Test whether a node with a given frequency is
not
highly-frequent.
* @param frequency frequency of the node.
* @param frequency frequency of the node.
* @return true if the node is considered highly-frequent, false if not.
* @return true if the node is considered highly-frequent, false if not.
*/
*/
private
boolean
isHighlyFrequent
(
long
frequency
)
{
private
boolean
isHighlyFrequent
(
long
frequency
)
{
// TODO: cast to int should probably be a cast to long
// TODO: cast to int should probably be a cast to long
return
frequency
<=
(
int
)(
total_genomes_size
/
10000000.0
+
num_genomes
*
5
*
Math
.
log
(
total_genomes_size
));
return
frequency
>
(
int
)(
total_genomes_size
/
10000000.0
+
num_genomes
*
5
*
Math
.
log
(
total_genomes_size
));
}
}
/**
/**
...
@@ -830,6 +812,51 @@ public class GenomeLayer {
...
@@ -830,6 +812,51 @@ public class GenomeLayer {
return
Address
.
fromRelationshipPropertyName
(
propertyName
);
return
Address
.
fromRelationshipPropertyName
(
propertyName
);
}
}
/**
 * Look up the frequency of a nucleotide node through the node frequency cache. When the cache has no entry
 * for the node, the value is read from the node's {@code "frequency"} property (i.e. from Neo4j) and stored
 * in the cache for later use.
 *
 * @param node node to get the frequency of.
 * @return frequency of the given node.
 */
public long getNodeFrequency(Node node) {
    //noinspection ConstantConditions
    return caches.getNodeFrequencyCache().get(
        node,
        uncached -> {
            // Loader used on a cache miss: read the raw property and unbox it to a long.
            final Object storedFrequency = uncached.getProperty("frequency");
            return (long) storedFrequency;
        }
    );
}
/**
 * Look up the length of a nucleotide node (i.e. its sequence length) through the node length cache. When the
 * cache has no entry for the node, the value is read from the node's {@code "length"} property (i.e. from
 * Neo4j) and stored in the cache for later use.
 *
 * @param node node to get the length of.
 * @return length of the given node.
 */
public int getNodeLength(Node node) {
    //noinspection ConstantConditions
    return caches.getNodeLengthCache().get(
        node,
        uncached -> {
            // Loader used on a cache miss: read the raw property and unbox it to an int.
            final Object storedLength = uncached.getProperty("length");
            return (int) storedLength;
        }
    );
}
/**
* Return all incoming relationships of type FF, FR, RF and RR of a nucleotide node. Will attempt to retrieve
* relationships from a cache first and, if missing, retrieve them from Neo4j instead and storing them in the
* cache for later use.
* @param node node to get incoming relationships for.
* @return all incoming relationships of type FF, FR, RF and RR.
*/
public
List
<
Relationship
>
getIncomingRelationships
(
Node
node
)
{
return
caches
.
getIncomingRelationshipsCache
()
.
get
(
node
,
n
->
{
final
List
<
Relationship
>
relationships
=
new
ArrayList
<>();
node
.
getRelationships
(
Direction
.
INCOMING
,
RelTypes
.
FF
,
RelTypes
.
FR
,
RelTypes
.
RF
,
RelTypes
.
RR
)
.
forEach
(
relationships:
:
add
);
return
relationships
;
});
}
/**
/**
* Clusters all the candidate genomic locations based on their proximity and align the read to the candidate locations
* Clusters all the candidate genomic locations based on their proximity and align the read to the candidate locations
*
*
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment