Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
bioinformatics
PanTools
Commits
fa46e477
Commit
fa46e477
authored
Apr 19, 2022
by
Moed, Matthijs
Browse files
Add mapping validation to end-to-end test.
parent
02b4ddfe
Pipeline
#43281
failed with stage
Changes
2
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
tests/build-reference-dataset.bash
View file @
fa46e477
...
...
@@ -5,7 +5,6 @@
# validate-pangenome.bash.
set
-e
# exit on any error
set
-x
# This is the PanTools version with which we're building a pangenome graph (the reference version)
...
...
@@ -38,7 +37,7 @@ mkdir "${DATA_DIRECTORY}"
# TODO: do not hardcode link
echo
"Downloading data..."
curl
--silent
"https://surfdrive.surf.nl/files/index.php/s/
mxeyy6DKKoj2G13
/download"
|
tar
x
-C
"
${
DATA_DIRECTORY
}
"
curl
--silent
"https://surfdrive.surf.nl/files/index.php/s/
lQkyhBlLr4ad9An
/download"
|
tar
x
-C
"
${
DATA_DIRECTORY
}
"
find
"
${
DATA_DIRECTORY
}
"
-name
"*.fna.gz"
|
sort
>
"
${
GENOMES_FILE
}
"
echo
"Building pangenome..."
...
...
@@ -50,6 +49,21 @@ java \
-dp
"
${
DATABASE_DIRECTORY
}
"
\
-gf
"
${
GENOMES_FILE
}
"
export
FASTQ1
=
"
${
DATA_DIRECTORY
}
/R64-3-1_20210421.1.100k.fastq.gz"
export
FASTQ2
=
"
${
DATA_DIRECTORY
}
/R64-3-1_20210421.2.100k.fastq.gz"
echo
"Mapping against pangenome..."
java
\
-Xmx4g
\
-jar
\
target/pantools-
*
.jar
\
map
\
--reference
1-9
\
-1
"
${
FASTQ1
}
"
\
-2
"
${
FASTQ2
}
"
\
-dp
"
${
DATABASE_DIRECTORY
}
"
cd
"
${
OLD_PWD
}
"
echo
"Creating tar file with input data and output database..."
pwd
echo
"Creating tar file with input data, output database and mapped reads..."
tar
cvzf
"yeast-pangenome-
${
PANTOOLS_REFERENCE_VERSION_TAG
}
.tar.gz"
-C
"
${
SCRATCH_DIRECTORY
}
"
"databases"
"data"
tests/validate
-pangenome
.bash
→
tests/validate.bash
View file @
fa46e477
#!/usr/bin/env bash
# This script validates the current version (as locally available in the current directory) against a reference
# data set built with a particular PanTools version (see build-reference-dataset.bash). Two validations are
# performed:
#
# 1. Equality of the pangenome built by build_pangenome
# 2. Equality of the mapping output
#
# The first validation (pangenome equality) is run with `build_pangenome_parallel` on the local version,
# as opposed to `build_pangenome`. This needs to be changed later when `build_pangenome_parallel` replaces
# `build_pangenome`.
set
-e
# exit on any error
LOG_PREFIX
=
"==="
...
...
@@ -17,17 +28,19 @@ mvn clean package
echo
"
${
LOG_PREFIX
}
Downloading data..."
NUM_REFERENCE_GENOMES
=
9
SCRATCH_DIRECTORY
=
$(
mktemp
-d
)
DATA_DIRECTORY
=
"
${
SCRATCH_DIRECTORY
}
/data"
mkdir
"
${
DATA_DIRECTORY
}
"
GENOMES_FILE
=
"
${
DATA_DIRECTORY
}
/genomes.txt"
DATABASES_DIRECTORY
=
"
${
DATA_DIRECTORY
}
/databases"
REFERENCE_DATABASE_DIRECTORY
=
"
${
DATABASES_DIRECTORY
}
/reference"
CURRENT_DATABASE_DIRECTORY
=
"
${
DATABASES_DIRECTORY
}
/current"
DUMPS_DIRECTORY
=
"
${
SCRATCH_DIRECTORY
}
/dumps"
mkdir
"
${
DUMPS_DIRECTORY
}
"
# TODO: do not hardcode link
curl
--silent
"https://surfdrive.surf.nl/files/index.php/s/
Cv4p83cgmTsu0ZU
/download"
|
tar
x
-C
"
${
DATA_DIRECTORY
}
"
curl
--silent
"https://surfdrive.surf.nl/files/index.php/s/
d8N2e13RyAF7qWh
/download"
|
tar
x
-C
"
${
DATA_DIRECTORY
}
"
find
"
${
DATA_DIRECTORY
}
"
-name
"*.fna.gz"
|
sort
>
"
${
GENOMES_FILE
}
"
# Build pangenomes with build_pangenome_parallel
...
...
@@ -79,4 +92,41 @@ do
done
echo
"
${
LOG_PREFIX
}
Pangenomes are identical"
exit
0
# Map against built pangenome
FASTQ1
=
"
${
DATA_DIRECTORY
}
/data/R64-3-1_20210421.1.100k.fastq.gz"
FASTQ2
=
"
${
DATA_DIRECTORY
}
/data/R64-3-1_20210421.2.100k.fastq.gz"
java
\
-Xmx2g
\
-jar
\
target/pantools-
*
.jar
\
map
\
--reference
1-
"
${
NUM_REFERENCE_GENOMES
}
"
\
-1
"
${
FASTQ1
}
"
\
-2
"
${
FASTQ2
}
"
\
-dp
"
${
CURRENT_DATABASE_DIRECTORY
}
"
# Get Picard tools
PICARD_JAR
=
"
${
SCRATCH_DIRECTORY
}
/picard.jar"
curl
--silent
--location
--output
"
${
PICARD_JAR
}
"
"https://github.com/broadinstitute/picard/releases/download/2.27.1/picard.jar"
# TODO: number of reference genomes to constant
for
i
in
$(
seq
1
${
NUM_REFERENCE_GENOMES
}
)
do
REFERENCE_SAM
=
"
${
REFERENCE_DATABASE_DIRECTORY
}
/read_mapping/pantools_
${
i
}
.sam"
CURRENT_SAM
=
"
${
CURRENT_DATABASE_DIRECTORY
}
/read_mapping/pantools_
${
i
}
.sam"
echo
"
${
LOG_PREFIX
}
== Comparing reference sam
${
REFERENCE_SAM
}
against current SAM
${
CURRENT_SAM
}
..."
if
!
java
-jar
"
${
PICARD_JAR
}
"
CompareSAMs
--VALIDATION_STRINGENCY
STRICT
-O
output.tsv
"
${
REFERENCE_SAM
}
"
"
${
CURRENT_SAM
}
"
;
then
echo
"
${
LOG_PREFIX
}
SAM files
${
REFERENCE_SAM
}
and
${
CURRENT_SAM
}
differ!"
exit
1
fi
done
echo
"
${
LOG_PREFIX
}
Validation finished successfully"
Moed, Matthijs
@moed_m
mentioned in issue
#31
·
Jul 12, 2022
mentioned in issue
#31
mentioned in issue #31
Toggle commit list
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment