Commit fa46e477 authored by Moed, Matthijs
Browse files

Add mapping validation to end-to-end test.

parent 02b4ddfe
Pipeline #43281 failed with stage
......@@ -5,7 +5,6 @@
# validate-pangenome.bash.
set -e # exit on any error
set -x # trace: print each command (after expansion) before it is executed
# This is the PanTools version with which we're building a pangenome graph (the reference version)
......@@ -38,7 +37,7 @@ mkdir "${DATA_DIRECTORY}"
# Fetch the input data and write the list of genome files that is passed to
# the pangenome build.
# TODO: do not hardcode link
echo "Downloading data..."
# Each download is streamed straight into tar and unpacked under
# DATA_DIRECTORY; no intermediate archive file is kept on disk.
# NOTE(review): two different share links are fetched here -- confirm that
# both are still required.
curl --silent "https://surfdrive.surf.nl/files/index.php/s/mxeyy6DKKoj2G13/download" | tar x -C "${DATA_DIRECTORY}"
curl --silent "https://surfdrive.surf.nl/files/index.php/s/lQkyhBlLr4ad9An/download" | tar x -C "${DATA_DIRECTORY}"
# Collect the paths of all *.fna.gz files below DATA_DIRECTORY, sorted, one
# per line, into GENOMES_FILE (presumably sorted so the genome order is
# reproducible -- confirm).
find "${DATA_DIRECTORY}" -name "*.fna.gz" | sort > "${GENOMES_FILE}"
echo "Building pangenome..."
......@@ -50,6 +49,21 @@ java \
-dp "${DATABASE_DIRECTORY}" \
-gf "${GENOMES_FILE}"
# The two FASTQ read files handed to `pantools map` as -1 and -2. Both are
# exported, so they are also visible to child processes.
export FASTQ1="${DATA_DIRECTORY}/R64-3-1_20210421.1.100k.fastq.gz"
export FASTQ2="${DATA_DIRECTORY}/R64-3-1_20210421.2.100k.fastq.gz"

# Run the locally built PanTools jar (matched by glob in target/) with a 4 GB
# heap and map the reads against the database in DATABASE_DIRECTORY.
# --reference 1-9 presumably selects genomes 1 through 9 -- confirm against
# the PanTools `map` documentation.
echo "Mapping against pangenome..."
java -Xmx4g -jar target/pantools-*.jar \
  map --reference 1-9 \
  -1 "${FASTQ1}" -2 "${FASTQ2}" \
  -dp "${DATABASE_DIRECTORY}"
# Package the results. OLD_PWD is not assigned in this section; the archive
# below is written relative to whatever directory it points at.
cd "${OLD_PWD}"
# NOTE(review): two near-identical progress messages are printed here --
# confirm whether both are intended.
echo "Creating tar file with input data and output database..."
# Print the current working directory (the directory the archive is written to).
pwd
echo "Creating tar file with input data, output database and mapped reads..."
# Create a gzip-compressed archive named after PANTOOLS_REFERENCE_VERSION_TAG.
# -C makes tar pick up the "databases" and "data" entries relative to
# SCRATCH_DIRECTORY, so the archive contains exactly those two top-level names.
tar cvzf "yeast-pangenome-${PANTOOLS_REFERENCE_VERSION_TAG}.tar.gz" -C "${SCRATCH_DIRECTORY}" "databases" "data"
#!/usr/bin/env bash
# This script validates the current version (as locally available in the current directory) against a reference
# data set built with a particular PanTools version (see build-reference-dataset.bash). Two validations are
# performed:
#
# 1. Equality of the pangenome built by build_pangenome
# 2. Equality of the mapping output
#
# The first validation (pangenome equality) is run with `build_pangenome_parallel` on the local version,
# as opposed to `build_pangenome`. This needs to be changed later when `build_pangenome_parallel` replaces
# `build_pangenome`.
set -e # exit on any error
LOG_PREFIX="==="
......@@ -17,17 +28,19 @@ mvn clean package
# Set up a scratch area, define the paths used by the validation steps and
# download the reference data set into it.
echo "${LOG_PREFIX} Downloading data..."
# Number of genomes; used further down as the upper bound of the `map`
# --reference range and of the SAM comparison loop.
NUM_REFERENCE_GENOMES=9
# Fresh temporary directory that holds everything this script creates.
SCRATCH_DIRECTORY=$(mktemp -d)
DATA_DIRECTORY="${SCRATCH_DIRECTORY}/data"
mkdir "${DATA_DIRECTORY}"
GENOMES_FILE="${DATA_DIRECTORY}/genomes.txt"
# The database directories live below DATA_DIRECTORY; they are not created
# here (presumably "databases/reference" comes from the archive extracted
# below -- confirm).
DATABASES_DIRECTORY="${DATA_DIRECTORY}/databases"
REFERENCE_DATABASE_DIRECTORY="${DATABASES_DIRECTORY}/reference"
CURRENT_DATABASE_DIRECTORY="${DATABASES_DIRECTORY}/current"
DUMPS_DIRECTORY="${SCRATCH_DIRECTORY}/dumps"
mkdir "${DUMPS_DIRECTORY}"
# TODO: do not hardcode link
# Each download is streamed straight into tar and unpacked under
# DATA_DIRECTORY; no intermediate archive file is kept on disk.
# NOTE(review): two different share links are fetched here -- confirm that
# both are still required.
curl --silent "https://surfdrive.surf.nl/files/index.php/s/Cv4p83cgmTsu0ZU/download" | tar x -C "${DATA_DIRECTORY}"
curl --silent "https://surfdrive.surf.nl/files/index.php/s/d8N2e13RyAF7qWh/download" | tar x -C "${DATA_DIRECTORY}"
# Collect the paths of all *.fna.gz files below DATA_DIRECTORY, sorted, one
# per line, into GENOMES_FILE.
find "${DATA_DIRECTORY}" -name "*.fna.gz" | sort > "${GENOMES_FILE}"
# Build pangenomes with build_pangenome_parallel
......@@ -79,4 +92,41 @@ do
done
echo "${LOG_PREFIX} Pangenomes are identical"

# Map against built pangenome
#
# The script must not exit after the pangenome comparison: the mapping
# validation below is the second of the two validations this script performs.
# (An `exit 0` at this point previously ended the script with success before
# any mapping was done.)
#
# The read files are looked up in a "data" directory inside DATA_DIRECTORY
# (presumably the layout of the extracted reference archive -- confirm).
FASTQ1="${DATA_DIRECTORY}/data/R64-3-1_20210421.1.100k.fastq.gz"
FASTQ2="${DATA_DIRECTORY}/data/R64-3-1_20210421.2.100k.fastq.gz"

# Run the locally built PanTools jar (matched by glob in target/) with a 2 GB
# heap and map the reads against the current database, using genomes
# 1..NUM_REFERENCE_GENOMES as the --reference range.
java \
  -Xmx2g \
  -jar \
  target/pantools-*.jar \
  map \
  --reference 1-"${NUM_REFERENCE_GENOMES}" \
  -1 "${FASTQ1}" \
  -2 "${FASTQ2}" \
  -dp "${CURRENT_DATABASE_DIRECTORY}"
# Get Picard tools
# --fail makes curl exit non-zero on an HTTP error instead of saving the
# server's error page as picard.jar; combined with `set -e` a failed download
# aborts the script here rather than surfacing later as a confusing java
# error. --show-error keeps the failure reason visible despite --silent.
PICARD_JAR="${SCRATCH_DIRECTORY}/picard.jar"
curl --silent --show-error --fail --location --output "${PICARD_JAR}" "https://github.com/broadinstitute/picard/releases/download/2.27.1/picard.jar"

# Compare, for every genome 1..NUM_REFERENCE_GENOMES, the SAM file in the
# reference database with the SAM file produced by the current version.
# The script exits with status 1 on the first pair that is missing or differs.
for ((i = 1; i <= NUM_REFERENCE_GENOMES; i++))
do
  REFERENCE_SAM="${REFERENCE_DATABASE_DIRECTORY}/read_mapping/pantools_${i}.sam"
  CURRENT_SAM="${CURRENT_DATABASE_DIRECTORY}/read_mapping/pantools_${i}.sam"
  echo "${LOG_PREFIX} == Comparing reference sam ${REFERENCE_SAM} against current SAM ${CURRENT_SAM}..."

  # Report a missing file as such; otherwise Picard's failure to open it
  # would be reported below as the files "differing".
  for sam_file in "${REFERENCE_SAM}" "${CURRENT_SAM}"
  do
    if [[ ! -f "${sam_file}" ]]
    then
      echo "${LOG_PREFIX} SAM file ${sam_file} does not exist!"
      exit 1
    fi
  done

  # Relies on CompareSAMs exiting non-zero when the two files are not equal.
  # output.tsv is written to the current directory and is overwritten on
  # every iteration, so it only holds the most recent comparison.
  if ! java -jar "${PICARD_JAR}" CompareSAMs --VALIDATION_STRINGENCY STRICT -O output.tsv "${REFERENCE_SAM}" "${CURRENT_SAM}";
  then
    echo "${LOG_PREFIX} SAM files ${REFERENCE_SAM} and ${CURRENT_SAM} differ!"
    exit 1
  fi
done
echo "${LOG_PREFIX} Validation finished successfully"
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment