diff --git a/CHANGELOG.md b/CHANGELOG.md index 0887f2fdb012ee0e88f7e12f7f8d276a339c71cd..420d0482754b68c1f0ebad95321414fa6339882a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ All notable changes to Pantools will be documented in this file. ### Added - The possibility to analyse a specific region of interest across the pangenome (!147). +- Reconstruction of a region of interest from sequencing reads (!252). - A GFA export function (not speed optimised, so not recommended for large pangenomes) (!147). ### Changed diff --git a/src/main/java/nl/wur/bif/pantools/analysis/region_analysis/ExtractFunctionsCLI.java b/src/main/java/nl/wur/bif/pantools/analysis/region_analysis/ExtractFunctionsCLI.java index 80e6cd125cdd81bdd5bd568619e4156cc27738d5..f2e713292a8f451c57779110f2a7bd8d298d497f 100644 --- a/src/main/java/nl/wur/bif/pantools/analysis/region_analysis/ExtractFunctionsCLI.java +++ b/src/main/java/nl/wur/bif/pantools/analysis/region_analysis/ExtractFunctionsCLI.java @@ -2,14 +2,18 @@ package nl.wur.bif.pantools.analysis.region_analysis; import nl.wur.bif.pantools.Pantools; import nl.wur.bif.pantools.utils.BeanUtils; +import nl.wur.bif.pantools.utils.FileUtils; +import nl.wur.bif.pantools.utils.Globals; import nl.wur.bif.pantools.utils.GraphUtils; import nl.wur.bif.pantools.utils.cli.mixins.SelectGenomes; +import nl.wur.bif.pantools.utils.cli.mixins.ThreadNumber; import nl.wur.bif.pantools.utils.cli.validation.Constraints.OutputDirectory; import picocli.CommandLine.*; import java.nio.file.Path; import java.util.concurrent.Callable; +import static nl.wur.bif.pantools.utils.Globals.THREADS; import static nl.wur.bif.pantools.utils.Globals.setGenomeSelectionOptions; /** @@ -20,6 +24,7 @@ import static nl.wur.bif.pantools.utils.Globals.setGenomeSelectionOptions; @OutputDirectory(directory = "outputPath") public class ExtractFunctionsCLI implements Callable<Integer> { @Spec Model.CommandSpec spec; + @Mixin private ThreadNumber threadNumber; @ArgGroup private SelectGenomes selectGenomes; @ParentCommand @@ -75,6 +80,19 @@ public class ExtractFunctionsCLI implements Callable<Integer> { ) private boolean writeRepeats; + @Option(names = {"--reads-file"}, + description = "Tab-delimited file with library names and corresponding read files. Default: none" + ) + private Path readsFile; + + @Option(names = {"--filter-high-frequency"}, + defaultValue = "true", + negatable = true, + fallbackValue = "true", + description = "Filter out high-frequency k-mers for read reconstruction. Default: true" + ) + private boolean filterHighFrequency; + //TODO: add --write-blast option to visualize BLASTP and BLASTN results private String[] functions; @@ -103,20 +121,21 @@ public class ExtractFunctionsCLI implements Callable<Integer> { GraphUtils.validateRepeats(); // check that repeats are present } - // determine whether to use the fast option - boolean fast = !writeGfa && !writeUnitigs; + // always use fast option, except when writing GFA or unitigs, or using read files + boolean fast = !writeGfa && !writeUnitigs && readsFile == null; if (!fast) { Pantools.logger.warn("Using a slow method for extraction."); } final RegionExtractor regionExtractor = new RegionExtractor(outputPath); - regionExtractor.extractFunctions(functions, flanking, writeAnnotations, writeGfa, writeHomology, writeUnitigs, writeRepeats, fast); + regionExtractor.extractFunctions(functions, flanking, writeAnnotations, writeGfa, writeHomology, writeUnitigs, writeRepeats, FileUtils.parseReadsFile(readsFile), filterHighFrequency, fast); return 0; } private void setGlobalParameters() { setGenomeSelectionOptions(selectGenomes); + THREADS = threadNumber.getnThreads(); } private void crossValidate() { @@ -126,6 +145,11 @@ public class ExtractFunctionsCLI implements Callable<Integer> { throw new ParameterException(spec.commandLine(), "Functions must be GO, InterPro, Pfam or TIGRFAM"); } } + + // Assert that K is smaller than 64 when using reads (because of storing 2bit version of k-mers in long) + if (readsFile != null && Globals.K_SIZE > 63) { + throw new ParameterException(spec.commandLine(), "K should be smaller than 63 when using reads"); + } } public Pantools getPantools() { diff --git a/src/main/java/nl/wur/bif/pantools/analysis/region_analysis/ExtractGenesCLI.java b/src/main/java/nl/wur/bif/pantools/analysis/region_analysis/ExtractGenesCLI.java index e4657fbdd37f911bbf156c027dcc719683b379d7..7e7343de9b72325a42b73b142040c2e3f56da7be 100644 --- a/src/main/java/nl/wur/bif/pantools/analysis/region_analysis/ExtractGenesCLI.java +++ b/src/main/java/nl/wur/bif/pantools/analysis/region_analysis/ExtractGenesCLI.java @@ -3,14 +3,18 @@ package nl.wur.bif.pantools.analysis.region_analysis; import jakarta.validation.constraints.Min; import nl.wur.bif.pantools.Pantools; import nl.wur.bif.pantools.utils.BeanUtils; +import nl.wur.bif.pantools.utils.FileUtils; +import nl.wur.bif.pantools.utils.Globals; import nl.wur.bif.pantools.utils.GraphUtils; import nl.wur.bif.pantools.utils.cli.mixins.SelectGenomes; +import nl.wur.bif.pantools.utils.cli.mixins.ThreadNumber; import nl.wur.bif.pantools.utils.cli.validation.Constraints.OutputDirectory; import picocli.CommandLine.*; import java.nio.file.Path; import java.util.concurrent.Callable; +import static nl.wur.bif.pantools.utils.Globals.THREADS; import static nl.wur.bif.pantools.utils.Globals.setGenomeSelectionOptions; /** @@ -21,6 +25,7 @@ import static nl.wur.bif.pantools.utils.Globals.setGenomeSelectionOptions; @OutputDirectory(directory = "outputPath") public class ExtractGenesCLI implements Callable<Integer> { @Spec Model.CommandSpec spec; + @Mixin private ThreadNumber threadNumber; @ArgGroup private SelectGenomes selectGenomes; @ParentCommand @@ -83,6 +88,19 @@ public class ExtractGenesCLI implements Callable<Integer> { ) private boolean writeRepeats; + @Option(names = {"--reads-file"}, + description = "Tab-delimited file with library names and corresponding read files. Default: none" + ) + private Path readsFile; + + @Option(names = {"--filter-high-frequency"}, + defaultValue = "true", + negatable = true, + fallbackValue = "true", + description = "Filter out high-frequency k-mers for read reconstruction. Default: true" + ) + private boolean filterHighFrequency; + //TODO: add --write-blast option to visualize BLASTP and BLASTN results private String[] genes; @@ -98,6 +116,7 @@ public class ExtractGenesCLI implements Callable<Integer> { BeanUtils.argValidation(spec, this, selectGenomes); pantools.setPangenomeGraph("pangenome"); + crossValidate(); setGlobalParameters(); if (writeAnnotations) { @@ -110,20 +129,28 @@ public class ExtractGenesCLI implements Callable<Integer> { GraphUtils.validateRepeats(); // check that repeats are present } - // determine whether to use the fast option - boolean fast = !writeGfa && !writeUnitigs; + // always use fast option, except when writing GFA or unitigs, or using read files + boolean fast = !writeGfa && !writeUnitigs && readsFile == null; if (!fast) { Pantools.logger.warn("Using a slow method for extraction."); } final RegionExtractor regionExtractor = new RegionExtractor(outputPath); - regionExtractor.extractGenes(genes, context, flanking, writeAnnotations, writeGfa, writeHomology, writeUnitigs, writeRepeats, fast); + regionExtractor.extractGenes(genes, context, flanking, writeAnnotations, writeGfa, writeHomology, writeUnitigs, writeRepeats, FileUtils.parseReadsFile(readsFile), filterHighFrequency, fast); return 0; } private void setGlobalParameters() { setGenomeSelectionOptions(selectGenomes); + THREADS = threadNumber.getnThreads(); + } + + private void crossValidate() { + // Assert that K is smaller than 64 when using reads (because of storing 2bit version of k-mers in long) + if (readsFile != null && Globals.K_SIZE > 63) { + throw new ParameterException(spec.commandLine(), "K should be smaller than 63 when using reads"); + } } public Pantools getPantools() { diff --git a/src/main/java/nl/wur/bif/pantools/analysis/region_analysis/ExtractHomologyGroupCLI.java b/src/main/java/nl/wur/bif/pantools/analysis/region_analysis/ExtractHomologyGroupCLI.java index 84f551656798e41a92000660628594ad9c177845..715bbbb4d32e10659d1663e4b6a843dfffdd73eb 100644 --- a/src/main/java/nl/wur/bif/pantools/analysis/region_analysis/ExtractHomologyGroupCLI.java +++ b/src/main/java/nl/wur/bif/pantools/analysis/region_analysis/ExtractHomologyGroupCLI.java @@ -2,15 +2,19 @@ package nl.wur.bif.pantools.analysis.region_analysis; import nl.wur.bif.pantools.Pantools; import nl.wur.bif.pantools.utils.BeanUtils; +import nl.wur.bif.pantools.utils.FileUtils; +import nl.wur.bif.pantools.utils.Globals; import nl.wur.bif.pantools.utils.GraphUtils; import nl.wur.bif.pantools.utils.cli.mixins.SelectGenomes; import nl.wur.bif.pantools.utils.cli.mixins.SelectHmGroups; +import nl.wur.bif.pantools.utils.cli.mixins.ThreadNumber; import nl.wur.bif.pantools.utils.cli.validation.Constraints.OutputDirectory; import picocli.CommandLine.*; import java.nio.file.Path; import java.util.concurrent.Callable; +import static nl.wur.bif.pantools.utils.Globals.THREADS; import static nl.wur.bif.pantools.utils.Globals.setGenomeSelectionOptions; /** @@ -21,6 +25,7 @@ import static nl.wur.bif.pantools.utils.Globals.setGenomeSelectionOptions; @OutputDirectory(directory = "outputPath") public class ExtractHomologyGroupCLI implements Callable<Integer> { @Spec Model.CommandSpec spec; + @Mixin private ThreadNumber threadNumber; @ArgGroup private SelectGenomes selectGenomes; @ArgGroup private SelectHmGroups selectHmGroups = new SelectHmGroups(); @@ -77,6 +82,19 @@ public class ExtractHomologyGroupCLI implements Callable<Integer> { ) private boolean writeRepeats; + @Option(names = {"--reads-file"}, + description = "Tab-delimited file with library names and corresponding read files. Default: none" + ) + private Path readsFile; + + @Option(names = {"--filter-high-frequency"}, + defaultValue = "true", + negatable = true, + fallbackValue = "true", + description = "Filter out high-frequency k-mers for read reconstruction. Default: true" + ) + private boolean filterHighFrequency; + //TODO: add --write-blast option to visualize BLASTP and BLASTN results @Override @@ -85,13 +103,14 @@ public class ExtractHomologyGroupCLI implements Callable<Integer> { BeanUtils.argValidation(spec, this, selectGenomes, selectHmGroups); pantools.setPangenomeGraph("pangenome"); + crossValidate(); setGlobalParameters(); if (writeRepeats) { GraphUtils.validateRepeats(); } - // determine whether to use the fast option - boolean fast = !writeGfa && !writeUnitigs; + // always use fast option, except when writing GFA or unitigs, or using read files + boolean fast = !writeGfa && !writeUnitigs && readsFile == null; if (!fast) { Pantools.logger.warn("Using a slow method for extraction."); } @@ -100,13 +119,21 @@ public class ExtractHomologyGroupCLI implements Callable<Integer> { GraphUtils.validateHomologyGroups(); // check that homology groups are present final RegionExtractor regionExtractor = new RegionExtractor(outputPath); - regionExtractor.extractHmGroups(selectHmGroups.getHomologyGroups(), flanking, writeAnnotations, writeGfa, writeHomology, writeUnitigs, writeRepeats, fast); + regionExtractor.extractHmGroups(selectHmGroups.getHomologyGroups(), flanking, writeAnnotations, writeGfa, writeHomology, writeUnitigs, writeRepeats, FileUtils.parseReadsFile(readsFile), filterHighFrequency, fast); return 0; } private void setGlobalParameters() { setGenomeSelectionOptions(selectGenomes); + THREADS = threadNumber.getnThreads(); + } + + private void crossValidate() { + // Assert that K is smaller than 64 when using reads (because of storing 2bit version of k-mers in long) + if (readsFile != null && Globals.K_SIZE > 63) { + throw new ParameterException(spec.commandLine(), "K should be smaller than 63 when using reads"); + } } public Pantools getPantools() { diff --git a/src/main/java/nl/wur/bif/pantools/analysis/region_analysis/ExtractRegionCLI.java b/src/main/java/nl/wur/bif/pantools/analysis/region_analysis/ExtractRegionCLI.java index a8b356c7c4ab498a1a04fabb21fb33e65805c828..aad3c5fc585ffb0c6507a466068130e451fe3ad9 100644 --- a/src/main/java/nl/wur/bif/pantools/analysis/region_analysis/ExtractRegionCLI.java +++ b/src/main/java/nl/wur/bif/pantools/analysis/region_analysis/ExtractRegionCLI.java @@ -6,6 +6,9 @@ import jakarta.validation.constraints.NotNull; import jakarta.validation.constraints.Pattern; import nl.wur.bif.pantools.Pantools; import nl.wur.bif.pantools.utils.BeanUtils; +import nl.wur.bif.pantools.utils.FileUtils; +import nl.wur.bif.pantools.utils.Globals; +import nl.wur.bif.pantools.utils.FileUtils; import nl.wur.bif.pantools.utils.GraphUtils; import nl.wur.bif.pantools.utils.cli.mixins.SelectGenomes; import nl.wur.bif.pantools.utils.cli.mixins.ThreadNumber; @@ -158,6 +161,19 @@ public class ExtractRegionCLI implements Callable<Integer> { ) private boolean writeRepeats; + @Option(names = {"--reads-file"}, + description = "Tab-delimited file with library names and corresponding read files. Default: none" + ) + private Path readsFile; + + @Option(names = {"--filter-high-frequency"}, + defaultValue = "true", + negatable = true, + fallbackValue = "true", + description = "Filter out high-frequency k-mers for read reconstruction. Default: true" + ) + private boolean filterHighFrequency; + //TODO: add --write-blast option to visualize BLASTP and BLASTN results @NotNull(message = "Please provide a valid genome range: genomeNr#sequenceName:start-end") @@ -184,8 +200,8 @@ public class ExtractRegionCLI implements Callable<Integer> { GraphUtils.validateRepeats(); // check that repeat annotations are present } - // determine whether to use the fast option - boolean fast = !writeGfa && !writeUnitigs; + // always use fast option, except when writing GFA or unitigs, or using read files + boolean fast = !writeGfa && !writeUnitigs && readsFile == null; if (!fast) { Pantools.logger.warn("Using a slow method for extraction."); } @@ -195,7 +211,7 @@ public class ExtractRegionCLI implements Callable<Integer> { regionExtractor.setGenomeRange(inputGenomeRange); regionExtractor.extractRegion(useKmers, useHomology, useBlast, allowSinglePositions, searchSelf, kmerDensity, maxEvalue, maxDistance, fractPositions, maxFrequency, flanking, - writeAnnotations, writeGfa, writeHomology, writeUnitigs, writeRepeats, fast); + writeAnnotations, writeGfa, writeHomology, writeUnitigs, writeRepeats, FileUtils.parseReadsFile(readsFile), filterHighFrequency, fast); return 0; } @@ -203,6 +219,7 @@ public class ExtractRegionCLI implements Callable<Integer> { private void setGlobalParameters() { THREADS = threadNumber.getnThreads(); setGenomeSelectionOptions(selectGenomes); + THREADS = threadNumber.getnThreads(); } private void crossValidate() { @@ -219,6 +236,11 @@ public class ExtractRegionCLI implements Callable<Integer> { throw new ParameterException(spec.commandLine(), "--max-evalue can only be used with --use-blast=true"); } } + + // Assert that K is smaller than 64 when using reads (because of storing 2bit version of k-mers in long) + if (readsFile != null && Globals.K_SIZE > 63) { + throw new ParameterException(spec.commandLine(), "K should be smaller than 63 when using reads"); + } } public Pantools getPantools() { diff --git a/src/main/java/nl/wur/bif/pantools/analysis/region_analysis/ExtractRegionGeneCLI.java b/src/main/java/nl/wur/bif/pantools/analysis/region_analysis/ExtractRegionGeneCLI.java index db53e83613029d31d5158b7125d547ca4dc12e4e..fa768a16d638ad049fe0979e9f723941ad3765f7 100644 --- a/src/main/java/nl/wur/bif/pantools/analysis/region_analysis/ExtractRegionGeneCLI.java +++ b/src/main/java/nl/wur/bif/pantools/analysis/region_analysis/ExtractRegionGeneCLI.java @@ -5,6 +5,9 @@ import jakarta.validation.constraints.Min; import jakarta.validation.constraints.NotNull; import nl.wur.bif.pantools.Pantools; import nl.wur.bif.pantools.utils.BeanUtils; +import nl.wur.bif.pantools.utils.FileUtils; +import nl.wur.bif.pantools.utils.Globals; +import nl.wur.bif.pantools.utils.FileUtils; import nl.wur.bif.pantools.utils.GraphUtils; import nl.wur.bif.pantools.utils.cli.mixins.SelectGenomes; import nl.wur.bif.pantools.utils.cli.mixins.ThreadNumber; @@ -163,6 +166,19 @@ public class ExtractRegionGeneCLI implements Callable<Integer> { ) private boolean writeRepeats; + @Option(names = {"--reads-file"}, + description = "Tab-delimited file with library names and corresponding read files. Default: none" + ) + private Path readsFile; + + @Option(names = {"--filter-high-frequency"}, + defaultValue = "true", + negatable = true, + fallbackValue = "true", + description = "Filter out high-frequency k-mers for read reconstruction. Default: true" + ) + private boolean filterHighFrequency; + //TODO: add --write-blast option to visualize BLASTP and BLASTN results @NotNull(message = "Please provide a gene name") @@ -186,8 +202,8 @@ public class ExtractRegionGeneCLI implements Callable<Integer> { GraphUtils.validateRepeats(); // check that repeat annotations are present } - // determine whether to use the fast option - boolean fast = !writeGfa && !writeUnitigs; + // always use fast option, except when writing GFA or unitigs, or using read files + boolean fast = !writeGfa && !writeUnitigs && readsFile == null; if (!fast) { Pantools.logger.warn("Using a slow method for extraction."); } @@ -196,7 +212,7 @@ public class ExtractRegionGeneCLI implements Callable<Integer> { regionExtractor.setGene(gene, context); regionExtractor.extractRegion(useKmers, useHomology, useBlast, allowSinglePositions, searchSelf, kmerDensity, maxEvalue, maxDistance, fractPositions, maxFrequency, flanking, - writeAnnotations, writeGfa, writeHomology, writeUnitigs, writeRepeats, fast); + writeAnnotations, writeGfa, writeHomology, writeUnitigs, writeRepeats, FileUtils.parseReadsFile(readsFile), filterHighFrequency, fast); return 0; } @@ -220,6 +236,11 @@ public class ExtractRegionGeneCLI implements Callable<Integer> { throw new ParameterException(spec.commandLine(), "--max-evalue can only be used with --use-blast=true"); } } + + // Assert that K is smaller than 64 when using reads (because of storing 2bit version of k-mers in long) + if (readsFile != null && Globals.K_SIZE > 63) { + throw new ParameterException(spec.commandLine(), "K should be smaller than 63 when using reads"); + } } public Pantools getPantools() { diff --git a/src/main/java/nl/wur/bif/pantools/analysis/region_analysis/ReadReconstructor.java b/src/main/java/nl/wur/bif/pantools/analysis/region_analysis/ReadReconstructor.java new file mode 100644 index 0000000000000000000000000000000000000000..c9a1d5c9554e50535634278eecc23825af2538e4 --- /dev/null +++ b/src/main/java/nl/wur/bif/pantools/analysis/region_analysis/ReadReconstructor.java @@ -0,0 +1,455 @@ +package nl.wur.bif.pantools.analysis.region_analysis; + +import com.google.common.collect.ImmutableList; +import htsjdk.samtools.fastq.FastqRecord; +import htsjdk.samtools.fastq.FastqWriterFactory; +import htsjdk.samtools.fastq.FastqWriter; +import htsjdk.samtools.fastq.AsyncFastqWriter; +import me.tongfei.progressbar.ProgressBar; +import me.tongfei.progressbar.ProgressBarBuilder; +import me.tongfei.progressbar.ProgressBarStyle; +import nl.wur.bif.pantools.Pantools; +import nl.wur.bif.pantools.utils.Globals; +import nl.wur.bif.pantools.utils.Utils; +import nl.wur.bif.pantools.utils.graph.graph_objects.GraphRange; +import nl.wur.bif.pantools.utils.reader.FastqReader; + +import java.io.*; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.*; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; +import java.util.stream.LongStream; + +public class ReadReconstructor { + private static final int HIGH_KMER_FREQUENCY_THRESHOLD = Globals.GENOME_DB.num_genomes * 100; // 100 times the number of genomes; definition from GenomeLayer.java + private final double MINIMUM_MATCHING_KMERS_FRAC = 0.6; + private static final int BATCH_SIZE = 10_000; + private static final int QUEUE_CAPACITY = 10_000_000; + private static final int PROGRESS_BAR_UPDATE_INTERVAL = 100; + + private final ImmutableList<Long> allKmers; // all k-mers stored in 2bit format and sorted for binary search + private final ImmutableList<Long> highFrequencyKmers; // high frequency k-mers stored in 2bit format and sorted for binary search + private BlockingQueue<FastqRecord[]> readQueue; + private volatile boolean finishedReadingFromQueue = false; + private final AtomicInteger activeFilterThreads; + private final AtomicInteger totalFilterCount; + private BlockingQueue<FastqRecord[]> writeQueue; + private boolean finishedWritingToQueue = false; + private ProgressBarManager pbm; + + private String libraryName; + + private static final AtomicInteger preFilterCount = new AtomicInteger(0); + private static final AtomicInteger postFilterCount = new AtomicInteger(0); + + public ReadReconstructor(GraphRange graphRange, boolean filterHighFrequency) { + // obtain all k-mers from the graph range + Pantools.logger.info("Obtaining k-mers from graph range."); + this.allKmers = ImmutableList.copyOf(LongStream.of(graphRange.getKmersAs2bit()).boxed().collect(Collectors.toList())); + if (filterHighFrequency) { + this.highFrequencyKmers = ImmutableList.copyOf(LongStream.of(graphRange.getKmersAs2bit(HIGH_KMER_FREQUENCY_THRESHOLD)).boxed().collect(Collectors.toList())); + } else { + this.highFrequencyKmers = ImmutableList.of(); + } + Pantools.logger.info("Obtained {} k-mers from graph range (of which {} are high-frequency).", allKmers.size(), highFrequencyKmers.size()); + + // check that high-frequency k-mers are a strict subset of all k-mers + if (!allKmers.containsAll(highFrequencyKmers)) { + Pantools.logger.error("High-frequency k-mers are not a subset of all k-mers."); + Pantools.logger.debug("All k-mers: {}", allKmers); + Pantools.logger.debug("High-frequency k-mers: {}", highFrequencyKmers); + Pantools.logger.debug("High-frequency k-mers not in all k-mers: {}", highFrequencyKmers.stream().filter(k -> !allKmers.contains(k)).collect(Collectors.toList())); + throw new IllegalArgumentException("High-frequency k-mers are not a subset of all k-mers"); + } + + this.activeFilterThreads = new AtomicInteger(0); + this.totalFilterCount = new AtomicInteger(0); + } + + private void reset() { + this.readQueue = null; + this.finishedReadingFromQueue = false; + this.activeFilterThreads.set(0); + this.totalFilterCount.set(0); + this.writeQueue = null; + this.finishedWritingToQueue = false; + this.pbm = null; + + preFilterCount.set(0); + postFilterCount.set(0); + } + + public void reconstruct(String libraryName, Path[] readFiles, Path outputPath) { + Pantools.logger.info("Reconstructing region for {} from {} read files.", libraryName, readFiles.length); + reset(); + this.libraryName = libraryName; + + // create output directory if it does not exist + createOutputDirectory(outputPath); + + // set read length + if (readFiles.length != 1 && readFiles.length != 2) { + Pantools.logger.error("Invalid number of read files for library {}: {}", libraryName, readFiles.length); + throw new IllegalArgumentException("Invalid number of read files."); + } + boolean pairedReads = readFiles.length == 2; + + // process the reads using a thread pool + try { + pbm = new ProgressBarManager(); + + // Use separate thread pools for different tasks + ExecutorService readerExecutor = Executors.newSingleThreadExecutor(); + final int filterThreadCount = Math.max(1, Globals.THREADS - 2); + ExecutorService filterExecutor = Executors.newFixedThreadPool( + filterThreadCount, + r -> { + Thread t = new Thread(r); + t.setPriority(Thread.MAX_PRIORITY); // Prioritize filtering threads + return t; + } + ); + ExecutorService writerExecutor = Executors.newSingleThreadExecutor(); + Pantools.logger.debug("Created thread pools for reader (1), filter ({}), and writer (1) tasks.", filterThreadCount); + + // Configure larger queue sizes + readQueue = new LinkedBlockingQueue<>(QUEUE_CAPACITY); + writeQueue = new LinkedBlockingQueue<>(QUEUE_CAPACITY); + + // Start reader thread + readerExecutor.submit(new fastqFileReader(readFiles, pairedReads)); + + // Start filter threads + Pantools.logger.trace("READSTATS\tlibrary\tread_name\tnumber_k-mers\tthreshold_matches\tmatches\thigh_frequency_matches\trelevant_matches"); + activeFilterThreads.set(filterThreadCount); + for (int i = 0; i < filterThreadCount; i++) { + filterExecutor.submit(new fastqReadFilter(pairedReads)); + } + + // Start writer thread + writerExecutor.submit(new fastqFileWriter(outputPath, libraryName, pairedReads)); + + // Shutdown executors + readerExecutor.shutdown(); + filterExecutor.shutdown(); + writerExecutor.shutdown(); + + // Wait for completion + readerExecutor.awaitTermination(10, TimeUnit.DAYS); + filterExecutor.awaitTermination(10, TimeUnit.DAYS); + writerExecutor.awaitTermination(10, TimeUnit.DAYS); + + Pantools.logger.debug("Filtered {} reads down to {} reads", preFilterCount.get(), postFilterCount.get()); + } catch (InterruptedException e) { + Pantools.logger.error("Error while waiting for tasks to complete: {}", e.getMessage()); + throw new RuntimeException(e); + } + + // reconstruct the region through assembly + //TODO: filtering the reads as above works only so-so for lettuce + + // write the reconstructed region to the output path + + } + + private void createOutputDirectory(Path outputPath) { + if (!Files.exists(outputPath)) { + try { + Files.createDirectories(outputPath); + } catch (IOException ioe) { + Pantools.logger.error("Error while creating output directory: {}", ioe.getMessage()); + throw new RuntimeException(ioe); + } + } + } + + private static class ProgressBarManager { + private final ProgressBar filteringBar; + private volatile long totalReadsProcessed = 0; + + public ProgressBarManager() { + filteringBar = new ProgressBarBuilder() + .setTaskName("Filtering fastq") + .setInitialMax(1) + .setStyle(ProgressBarStyle.ASCII) + .setUpdateIntervalMillis(PROGRESS_BAR_UPDATE_INTERVAL) + .build(); + } + + public ProgressBar getFilteringBar() { + return filteringBar; + } + + public void updateReadCount(long count) { + totalReadsProcessed = count; + filteringBar.maxHint(totalReadsProcessed); + } + } + + private class fastqFileReader implements Runnable { + private final Path[] readFiles; + private final boolean pairedReads; + private long readCount; + + public fastqFileReader(Path[] readFiles, boolean pairedReads) { + this.readFiles = readFiles; + this.pairedReads = pairedReads; + this.readCount = 0; + } + + @Override + public void run() { + readFastqFiles(readFiles, pairedReads); + } + + private void readFastqFiles(Path[] readFiles, boolean pairedReads) { + try { + if (pairedReads) { + readPairedEndFiles(readFiles); + } else { + readSingleEndFile(readFiles[0]); + } + } catch (IOException e) { + Pantools.logger.error("Error while reading fastq files: {}", e.getMessage()); + throw new RuntimeException(e); + } + } + + private void readPairedEndFiles(Path[] readFiles) throws IOException { + try (FastqReader reader1 = new FastqReader(readFiles[0]); + FastqReader reader2 = new FastqReader(readFiles[1])) { + while (reader1.hasNext() && reader2.hasNext()) { + FastqRecord read1 = reader1.next(); + FastqRecord read2 = reader2.next(); + try { + readQueue.put(new FastqRecord[]{read1, read2}); + } catch (InterruptedException ie) { + Pantools.logger.error("Error while adding reads to queue: {}", ie.getMessage()); + throw new RuntimeException(ie); + } + readCount++; + } + pbm.updateReadCount(readCount); + } finally { + finishedReadingFromQueue = true; + } + } + + private void readSingleEndFile(Path readFile) throws IOException { + try (FastqReader reader = new FastqReader(readFile)) { + while (reader.hasNext()) { + FastqRecord read1 = reader.next(); + try { + readQueue.put(new FastqRecord[]{read1}); + } catch (InterruptedException ie) { + Pantools.logger.error("Error while adding reads to queue: {}", ie.getMessage()); + throw new RuntimeException(ie); + } + readCount++; + } + pbm.updateReadCount(readCount); + } finally { + finishedReadingFromQueue = true; + } + } + } + + private class fastqReadFilter implements Runnable { + private final boolean pairedReads; + private final List<FastqRecord[]> batch; + + public fastqReadFilter(boolean pairedReads) { + this.pairedReads = pairedReads; + this.batch = new ArrayList<>(BATCH_SIZE); + } + + @Override + public void run() { + filterReadsForKmers(pairedReads); + } + + private void filterReadsForKmers(boolean pairedReads) { + try { + ProgressBar pb2 = pbm.getFilteringBar(); + if (pairedReads) { + filterPairedEndReads(pb2); + } else { + filterSingleEndReads(pb2); + } + } catch (InterruptedException e) { + Pantools.logger.error("Error while filtering reads: {}", e.getMessage()); + throw new RuntimeException(e); + } + } + + private void filterPairedEndReads(ProgressBar pb) throws InterruptedException { + try { + while (!finishedReadingFromQueue || !readQueue.isEmpty()) { + // Batch processing + batch.clear(); + readQueue.drainTo(batch, BATCH_SIZE); + + if (batch.isEmpty()) { + FastqRecord[] reads = readQueue.poll(100, TimeUnit.MILLISECONDS); + if (reads != null) { + batch.add(reads); + } + } + + if (!batch.isEmpty()) { + processBatch(batch, pb); + } + } + } finally { + // Last filter thread marks writing as complete + if (activeFilterThreads.decrementAndGet() == 0) { + finishedWritingToQueue = true; + } + } + } + + private void processBatch(List<FastqRecord[]> batch, ProgressBar pb) throws InterruptedException { + for (FastqRecord[] reads : batch) { + preFilterCount.incrementAndGet(); + if (reads != null && (containsEnoughMatchingKmers(reads[0]) || + containsEnoughMatchingKmers(reads[1]))) { + writeQueue.put(reads); + postFilterCount.incrementAndGet(); + } + pb.step(); + } + } + + private void filterSingleEndReads(ProgressBar pb) throws InterruptedException { + try { + while (!finishedReadingFromQueue || !readQueue.isEmpty()) { + FastqRecord[] reads = readQueue.poll(100, TimeUnit.MILLISECONDS); + if (reads != null && containsEnoughMatchingKmers(reads[0])) { + writeQueue.put(reads); + } + pb.step(); + } + } finally { + // Last filter thread marks writing as complete + if (activeFilterThreads.decrementAndGet() == 0) { + finishedWritingToQueue = true; + } + } + } + + /** + * Main method to check if a read contains enough matching kmers. + * Strategy here is to count how many k-mers in the read are part of the + * allKmers list (all k-mers in a region) and how many are part of the + * highFrequencyKmers list (high frequency k-mers in the same region). + * If the number of informative (relevant) k-mers is above a certain + * threshold, the read is considered to be informative. + * @param read input read + * @return true if the read contains enough matching kmers, false otherwise + */ + private boolean containsEnoughMatchingKmers(FastqRecord read) { + // Get all kmers at once to reduce method calls + final long[] readKmers = Utils.kmerizeAs2bit(read.getReadString(), Globals.K_SIZE); + + // Count matching kmers + int matchCount = 0; + int highFrequencyCount = 0; + + for (long kmer : readKmers) { + if (Collections.binarySearch(allKmers, kmer) >= 0) { + matchCount++; + } + + // Check if we found a high frequency kmer for logging + if (Collections.binarySearch(highFrequencyKmers, kmer) >= 0) { + highFrequencyCount++; + } + } + + // Return false if no matching kmers are found + if (matchCount == 0) { + return false; + } + + // Log matching kmers + Pantools.logger.trace("READSTATS\t{}\t{}\t{}\t{}\t{}\t{}\t{}", libraryName, + read.getReadName(), readKmers.length, MINIMUM_MATCHING_KMERS_FRAC * readKmers.length, matchCount, highFrequencyCount, matchCount - highFrequencyCount); + + // Return true if enough matching kmers are found + return (matchCount - highFrequencyCount) >= MINIMUM_MATCHING_KMERS_FRAC * readKmers.length; + } + } + + private class fastqFileWriter implements Runnable { + private final Path outputPath; + private final String libraryName; + private final boolean pairedReads; + + public fastqFileWriter(Path outputPath, String libraryName, boolean pairedReads) { + this.outputPath = outputPath; + this.libraryName = libraryName; + this.pairedReads = pairedReads; + } + + @Override + public void run() { + writeReadsToTempFile(outputPath, libraryName, pairedReads); + } + + private void writeReadsToTempFile(Path outputPath, String libraryName, boolean pairedReads) { + FastqWriterFactory writerFactory = new FastqWriterFactory(); + try { + if (pairedReads) { + writePairedEndReads(outputPath, libraryName, writerFactory); + } else { + writeSingleEndReads(outputPath, libraryName, writerFactory); + } + } catch (IOException | InterruptedException e) { + Pantools.logger.error("Error while writing reads to temp file: {}", e.getMessage()); + throw new RuntimeException(e); + } + } + + private void writePairedEndReads(Path outputPath, String libraryName, FastqWriterFactory writerFactory) throws IOException, InterruptedException { + int readCount = 0; + + Path temporaryReadsFile1 = outputPath.resolve(String.format("%s.matching_1.fq.gz", libraryName)); + Path temporaryReadsFile2 = outputPath.resolve(String.format("%s.matching_2.fq.gz", libraryName)); + try (FastqWriter writer1 = new AsyncFastqWriter(writerFactory.newWriter(temporaryReadsFile1.toFile()), QUEUE_CAPACITY); + FastqWriter writer2 = new AsyncFastqWriter(writerFactory.newWriter(temporaryReadsFile2.toFile()), QUEUE_CAPACITY)) { + while (!finishedWritingToQueue || !writeQueue.isEmpty()) { + FastqRecord[] reads = writeQueue.poll(100, TimeUnit.MILLISECONDS); + if (reads != null) { + writer1.write(reads[0]); + writer2.write(reads[1]); + readCount++; + } + } + } + + Pantools.logger.debug("Wrote {} reads to temporary files.", readCount); + } + + private void writeSingleEndReads(Path outputPath, String libraryName, FastqWriterFactory writerFactory) throws IOException, InterruptedException { + int readCount = 0; + + Path temporaryReadsFile = outputPath.resolve(String.format("%s.matching.fq.gz", libraryName)); + try (FastqWriter writer = new AsyncFastqWriter(writerFactory.newWriter(temporaryReadsFile.toFile()), QUEUE_CAPACITY)) { + while (!finishedWritingToQueue || !writeQueue.isEmpty()) { + FastqRecord[] reads = writeQueue.poll(100, TimeUnit.MILLISECONDS); + if (reads != null) { + writer.write(reads[0]); + readCount++; + } + } + } + + Pantools.logger.debug("Wrote {} reads to temporary file.", readCount); + } + } +} diff --git a/src/main/java/nl/wur/bif/pantools/analysis/region_analysis/RegionExtractor.java b/src/main/java/nl/wur/bif/pantools/analysis/region_analysis/RegionExtractor.java index 2f6304df88e9400b9bdba9373303289004fee7fc..7147d1517cb846c5116816ebbfad96c9ced165bd 100644 --- a/src/main/java/nl/wur/bif/pantools/analysis/region_analysis/RegionExtractor.java +++ b/src/main/java/nl/wur/bif/pantools/analysis/region_analysis/RegionExtractor.java @@ -11,7 +11,9 @@ import nl.wur.bif.pantools.utils.writer.*; import java.io.IOException; import java.nio.file.Path; import java.util.Arrays; +import java.util.HashMap; import java.util.List; +import java.util.Map; /** * Extracts a region in one genome from the pangenome graph. @@ -23,6 +25,7 @@ public class RegionExtractor { /** * Constructor. + * * @param outputPath The path to the output file. */ public RegionExtractor(Path outputPath) { @@ -32,6 +35,7 @@ public class RegionExtractor { /** * Sets a genome range. * Should be formatted as: genome#sequence:start-end. + * * @param genomeRange The genome range. */ public void setGenomeRange(GenomeRange genomeRange) { @@ -41,7 +45,8 @@ public class RegionExtractor { /** * Sets a gene name to be used for the range. - * @param gene The gene name. + * + * @param gene The gene name. * @param context The number of flanking genes to include. */ public void setGene(String gene, int context) { @@ -67,16 +72,19 @@ public class RegionExtractor { * @param writeHomology Whether to write the homology groups. * @param writeUnitigs Whether to write the unitig (node) presence/absence. * @param writeRepeats Whether to write the repeats. + * @param readFiles The read files to use for reconstruction. + * @param filterHighFrequency Whether to filter out high frequency for read reconstruction. + * @param fast Whether to use the fast version of the algorithm (without obtaining all nodes). * @throws IOException If an IO error occurs during conversion. */ public void extractRegion(final boolean useKmers, final boolean useHomology, final boolean useBlast, final boolean allowSinglePositions, final boolean searchSelf, final int kmerDensity, final double maxEvalue, final int maxDistance, final double fractPositions, final int maxFrequency, final int flanking, - final boolean writeAnnotations, final boolean writeGfa, final boolean writeHomology, final boolean writeUnitigs, final boolean writeRepeats, - final boolean fast) + final boolean writeAnnotations, final boolean writeGfa, final boolean writeHomology, final boolean writeUnitigs, final boolean writeRepeats, final HashMap<String, Path[]> readFiles, + final boolean filterHighFrequency, final boolean fast) throws IOException { Pantools.logger.debug("Extracting region from pangenome for {}", this); - Pantools.logger.debug("useKmers: {}; useHomology: {}; useBlast: {}; allowSinglePositions: {}; searchSelf: {}; kmerDensity: {}; maxDistance: {}; fractPositions: {}; maxFrequency: {}; flanking: {}; writeAnnotations: {}; writeGfa: {}; writeHomology: {}; writeUnitigs: {}; writeRepeats: {}; fast: {}", - useKmers, useHomology, useBlast, allowSinglePositions, searchSelf, kmerDensity, maxDistance, fractPositions, maxFrequency, flanking, writeAnnotations, writeGfa, writeHomology, writeUnitigs, writeRepeats, fast); + Pantools.logger.debug("useKmers: {}; useHomology: {}; useBlast: {} allowSinglePositions: {}; searchSelf: {}; kmerDensity: {}; maxDistance: {}; fractPositions: {}; maxFrequency: {}; flanking: {}; writeAnnotations: {}; writeGfa: {}; writeHomology: {}; writeUnitigs: {}; writeRepeats: {}; readFiles: {}; fast: {}", + useKmers, useHomology, useBlast, allowSinglePositions, searchSelf, kmerDensity, maxDistance, fractPositions, maxFrequency, flanking, writeAnnotations, writeGfa, writeHomology, writeUnitigs, writeRepeats, readFiles, fast); // make sure that genomeRange has been set if (genomeRange == null) { @@ -89,21 +97,28 @@ public class RegionExtractor { // write the sequences + annotations writeRegion(writeAnnotations, writeGfa, writeHomology, writeUnitigs, writeRepeats, useBlast, graphRange); + + // reconstruct reads + reconstructRegion(graphRange, readFiles, filterHighFrequency); } /** * Extracts multiple genes from the pangenome graph by searching for all homologues of the gene. - * @param genes The gene names. - * @param context The number of flanking genes to search for. - * @param flanking The number of flanking bases to include in the output. + * + * @param genes The gene names. + * @param context The number of flanking genes to search for. + * @param flanking The number of flanking bases to include in the output. * @param writeAnnotations Whether to write the annotations in GFF3 format. - * @param writeGfa Whether to write the GFA file. - * @param writeHomology Whether to write the homology groups. - * @param writeUnitigs Whether to write the unitig (node) presence/absence. - * @param writeRepeats Whether to write the repeats. + * @param writeGfa Whether to write the GFA file. + * @param writeHomology Whether to write the homology groups. + * @param writeUnitigs Whether to write the unitig (node) presence/absence. + * @param writeRepeats Whether to write the repeats. + * @param readFiles The read files to use for reconstruction. + * @param filterHighFrequency Whether to filter out high frequency for read reconstruction. + * @param fast Whether to use the fast version of the algorithm (without obtaining all nodes). * @throws IOException If an IO error occurs during conversion. */ - public void extractGenes(String[] genes, int context, int flanking, boolean writeAnnotations, boolean writeGfa, boolean writeHomology, boolean writeUnitigs, boolean writeRepeats, boolean fast) throws IOException { + public void extractGenes(String[] genes, int context, int flanking, boolean writeAnnotations, boolean writeGfa, boolean writeHomology, boolean writeUnitigs, boolean writeRepeats, final HashMap<String, Path[]> readFiles, boolean filterHighFrequency, boolean fast) throws IOException { if (genes == null || genes.length == 0) { Pantools.logger.error("No genes provided."); throw new IllegalArgumentException("No genes provided."); @@ -119,20 +134,27 @@ public class RegionExtractor { // write the sequences + annotations writeRegion(writeAnnotations, writeGfa, writeHomology, writeUnitigs, writeRepeats, false, graphRange); + + // reconstruct reads + reconstructRegion(graphRange, readFiles, filterHighFrequency); } /** * Extracts multiple homology groups from the pangenome graph. - * @param hmGroups The homology group node identifiers. - * @param flanking The number of flanking bases to include in the output. + * + * @param hmGroups The homology group node identifiers. + * @param flanking The number of flanking bases to include in the output. * @param writeAnnotations Whether to write the annotations in GFF3 format. - * @param writeGfa Whether to write the GFA file. - * @param writeHomology Whether to write the homology groups. - * @param writeUnitigs Whether to write the unitig (node) presence/absence. - * @param writeRepeats Whether to write the repeats. + * @param writeGfa Whether to write the GFA file. + * @param writeHomology Whether to write the homology groups. + * @param writeUnitigs Whether to write the unitig (node) presence/absence. + * @param writeRepeats Whether to write the repeats. + * @param readFiles The read files to use for reconstruction. + * @param filterHighFrequency Whether to filter out high frequency for read reconstruction. + * @param fast Whether to use the fast version of the algorithm (without obtaining all nodes). * @throws IOException If an IO error occurs during conversion. */ - public void extractHmGroups(List<Long> hmGroups, int flanking, boolean writeAnnotations, boolean writeGfa, boolean writeHomology, boolean writeUnitigs, boolean writeRepeats, boolean fast) throws IOException { + public void extractHmGroups(List<Long> hmGroups, int flanking, boolean writeAnnotations, boolean writeGfa, boolean writeHomology, boolean writeUnitigs, boolean writeRepeats, final HashMap<String, Path[]> readFiles, boolean filterHighFrequency, boolean fast) throws IOException { if (hmGroups == null || hmGroups.isEmpty()) { Pantools.logger.error("No homology groups provided."); throw new IllegalArgumentException("No homology groups provided."); @@ -148,19 +170,27 @@ public class RegionExtractor { // write the sequences + annotations writeRegion(writeAnnotations, writeGfa, writeHomology, writeUnitigs, writeRepeats, false, graphRange); + + // reconstruct reads + reconstructRegion(graphRange, readFiles, filterHighFrequency); } /** * Extracts genes from the pangenome graph by searching for functional annotations. - * @param functions The functional annotations. - * @param flanking The number of flanking bases to include in the output. + * + * @param functions The functional annotations. + * @param flanking The number of flanking bases to include in the output. * @param writeAnnotations Whether to write the annotations in GFF3 format. - * @param writeGfa Whether to write the GFA file. - * @param writeHomology Whether to write the homology groups. - * @param writeUnitigs Whether to write the unitig (node) presence/absence. + * @param writeGfa Whether to write the GFA file. + * @param writeHomology Whether to write the homology groups. + * @param writeUnitigs Whether to write the unitig (node) presence/absence. + * @param writeRepeats Whether to write the repeats. + * @param readFiles The read files to use for reconstruction. + * @param filterHighFrequency Whether to filter out high frequency for read reconstruction. + * @param fast Whether to use the fast version of the algorithm (without obtaining all nodes). * @throws IOException If an IO error occurs during conversion. */ - public void extractFunctions(String[] functions, int flanking, boolean writeAnnotations, boolean writeGfa, boolean writeHomology, boolean writeUnitigs, boolean writeRepeats, boolean fast) throws IOException { + public void extractFunctions(String[] functions, int flanking, boolean writeAnnotations, boolean writeGfa, boolean writeHomology, boolean writeUnitigs, boolean writeRepeats, final HashMap<String, Path[]> readFiles, boolean filterHighFrequency, boolean fast) throws IOException { if (functions == null || functions.length == 0) { Pantools.logger.error("No functions provided."); throw new IllegalArgumentException("No functions provided."); @@ -176,10 +206,14 @@ public class RegionExtractor { // write the sequences + annotations writeRegion(writeAnnotations, writeGfa, writeHomology, writeUnitigs, writeRepeats, false, graphRange); + + // reconstruct reads + reconstructRegion(graphRange, readFiles, filterHighFrequency); } /** * Writes the region to the output path. + * * @param writeAnnotations Whether to write the annotations in GFF3 format. * @param writeGfa Whether to write the GFA file. * @param writeHomology Whether to write the homology groups. @@ -248,6 +282,25 @@ public class RegionExtractor { } } + /** + * Reconstructs reads from the pangenome graph if read files are provided. + * @param graphRange The graph range to reconstruct. + * @param readFiles The read files to use for reconstruction. + * @param filterHighFrequency Whether to filter out high frequency for read reconstruction. + */ + private void reconstructRegion(GraphRange graphRange, HashMap<String, Path[]> readFiles, boolean filterHighFrequency) { + // if there are read files, reconstruct their sequence for the region + if (readFiles !=null && !readFiles.isEmpty()) { + final ReadReconstructor readReconstructor = new ReadReconstructor(graphRange, filterHighFrequency); + for (Map.Entry<String, Path[]> entry : readFiles.entrySet()) { + final String libraryName = entry.getKey(); + final Path[] files = entry.getValue(); + final Path outputDirectory = outputPath.resolve(libraryName); + readReconstructor.reconstruct(libraryName, files, outputDirectory); + } + } + } + /** * toString method. * @return A string representation of this object. diff --git a/src/main/java/nl/wur/bif/pantools/construction/index/IndexScanner.java b/src/main/java/nl/wur/bif/pantools/construction/index/IndexScanner.java index 8dea7b2ba06014b281b9fea5130b57dfd78abc38..3c8b7305fd910a768a31e7407cf36539f62377be 100644 --- a/src/main/java/nl/wur/bif/pantools/construction/index/IndexScanner.java +++ b/src/main/java/nl/wur/bif/pantools/construction/index/IndexScanner.java @@ -104,7 +104,7 @@ public final class IndexScanner { pos = (int) (number * suffix_record_size % full_suffix_page_size); get_suffix(database.suf_buff[(int) (number * suffix_record_size / full_suffix_page_size)], pos, key.get_fwd_suffix()); } - + /** * Reads a suffix as a byte array from th index database. * diff --git a/src/main/java/nl/wur/bif/pantools/construction/index/kmer.java b/src/main/java/nl/wur/bif/pantools/construction/index/kmer.java index 162ee54183b0d6c730278ff5c1511b8631bc35f5..88ca3c8d531bc12028d48699f8196feeef24ff4e 100755 --- a/src/main/java/nl/wur/bif/pantools/construction/index/kmer.java +++ b/src/main/java/nl/wur/bif/pantools/construction/index/kmer.java @@ -273,7 +273,7 @@ public class kmer { for(int i=0; i < rev_suffix.length; ++i) build_sequence(sym, rev_suffix[i], 4, rev_seq); return fwd_seq.append(" ").append(rev_seq).toString(); - } + } /** * Recursively decodes the binary kmer into a string @@ -289,5 +289,5 @@ public class kmer { build_sequence(sym, word >> 2,k-1,seq); seq.append(sym[word & 0x00000003]); } - } + } } \ No newline at end of file diff --git a/src/main/java/nl/wur/bif/pantools/utils/FileUtils.java b/src/main/java/nl/wur/bif/pantools/utils/FileUtils.java index a3c232123ea6ac0962aec94783b8805d76fa69f1..56f3ca96c115c8a6919443bf8f57c41c5b825bf9 100644 --- a/src/main/java/nl/wur/bif/pantools/utils/FileUtils.java +++ b/src/main/java/nl/wur/bif/pantools/utils/FileUtils.java @@ -11,12 +11,8 @@ import java.nio.file.Files; import java.nio.file.NotDirectoryException; import java.nio.file.Path; import java.nio.file.Paths; -import java.util.ArrayList; -import java.util.HashMap; +import java.util.*; -import java.util.Comparator; - -import java.util.Scanner; import java.util.stream.Stream; import java.util.zip.GZIPInputStream; @@ -269,4 +265,45 @@ public final class FileUtils { return size / 1048576 + 1; } + /** + * Parses a tab-delimited file with library names and corresponding read files. + * @param readsFile tab-delimited file with library names and corresponding read files (single or paired) + * @return a map with library names as keys and read files as values + */ + public static HashMap<String, Path[]> parseReadsFile(Path readsFile) { + HashMap<String, Path[]> readFiles = new HashMap<>(); + try (BufferedReader reader = Files.newBufferedReader(readsFile)) { + String line; + while ((line = reader.readLine()) != null) { + String[] fields = line.split("\t"); + if (fields.length < 2) { + Pantools.logger.error("Reads file must contain at least two columns: library name and read file(s)."); + throw new IllegalArgumentException("Invalid reads file format"); + } + String library = fields[0]; + Path[] files = new Path[fields.length - 1]; + for (int i = 1; i < fields.length; i++) { + files[i - 1] = Paths.get(fields[i]); + if (!Files.exists(files[i - 1])) { + Pantools.logger.error("Read file does not exist: {}", files[i - 1]); + throw new IllegalArgumentException("Non-existent read file"); + } + } + + Pantools.logger.debug("Found library {} with read files: {}", library, Arrays.toString(files)); + if (!readFiles.containsKey(library)) { + readFiles.put(library, files); + } else { + Pantools.logger.error("Duplicate library name in reads file: {}", library); + throw new IllegalArgumentException("Duplicate library name"); + } + } + } catch (IOException e) { + Pantools.logger.error("Unable to read reads file: {}", readsFile); + System.exit(1); + } catch (NullPointerException ignored) { + // thrown when the file is empty or does not exist + } + return readFiles; + } } diff --git a/src/main/java/nl/wur/bif/pantools/utils/Utils.java b/src/main/java/nl/wur/bif/pantools/utils/Utils.java index 73c4068f5e5906061953dd4585c0ff738bb0bcbd..3812dd5fc66b7eb1d4f71f6d3c3991c270b44ba7 100644 --- a/src/main/java/nl/wur/bif/pantools/utils/Utils.java +++ b/src/main/java/nl/wur/bif/pantools/utils/Utils.java @@ -2169,6 +2169,114 @@ public final class Utils { return map; } + /** + * Kmerize a sequence to 2bit format (A=00, C=01, G=10, T=11). + * This method uses a two-pass approach to skip over invalid nucleotides but still keep the code fast. + * @param sequence sequence to kmerize + * @param k kmer size + * @return an array of kmers + */ + public static long[] kmerizeAs2bit(final String sequence, final int k) { + int validKmerCount = 0; + final long mask = (1L << (2 * k)) - 1; // Mask to keep only the last k nucleotides + long currentKmer = 0; + int validKmerLength = 0; + + // First pass to count valid k-mers + for (int i = 0; i < sequence.length(); ++i) { + try { + currentKmer <<= 2; + currentKmer |= binaryCode(sequence.charAt(i)) & 3; + currentKmer &= mask; + validKmerLength++; + + if (validKmerLength >= k) { + validKmerCount++; + } + } catch (IllegalArgumentException e) { + // Reset the current k-mer and valid k-mer length + currentKmer = 0; + validKmerLength = 0; + } + } + + // Allocate array for valid k-mers + long[] kmersArray = new long[validKmerCount]; + currentKmer = 0; + validKmerLength = 0; + int index = 0; + + // Second pass to store valid k-mers + for (int i = 0; i < sequence.length(); ++i) { + try { + currentKmer <<= 2; + currentKmer |= binaryCode(sequence.charAt(i)) & 3; + currentKmer &= mask; + validKmerLength++; + + if (validKmerLength >= k) { + kmersArray[index++] = Math.min(currentKmer, reverseComplement(currentKmer)); + } + } catch (IllegalArgumentException e) { + // Reset the current k-mer and valid k-mer length + currentKmer = 0; + validKmerLength = 0; + } + } + + return kmersArray; + } + + /** + * Return the binary code of a nucleotide + * @param nucleotide nucleotide + * @return binary code (A=00, C=01, G=10, T=11) + * @throws IllegalArgumentException if the nucleotide is invalid (not A, C, G, or T) + */ + public static int binaryCode(char nucleotide) throws IllegalArgumentException { + switch (nucleotide) { //compiles to a lookup switch (which should have O(log n) complexity; n = number of cases) + case 'A': + return 0; + case 'C': + return 1; + case 'G': + return 2; + case 'T': + return 3; + default: + throw new IllegalArgumentException("Invalid nucleotide: " + nucleotide); + } + } + + /** + * Get the sequence corresponding to a kmer. + * @param kmer The kmer. + * @return The sequence. + */ + public static String kmer2bitToSequence(long kmer) { + final StringBuilder sequence = new StringBuilder(); + for (int i = 0; i < Globals.K_SIZE; ++i) { + sequence.append(Globals.sym[(int) (kmer & 3)]); + kmer >>= 2; + } + return sequence.reverse().toString(); + } + + /** + * Get the reverse complement of a 2bit kmer + * @param kmer 2bit kmer + * @return reverse complement of the kmer + */ + public static long reverseComplement(long kmer) { + long reverseComplement = 0; + for (int i = 0; i < Globals.K_SIZE; ++i) { + reverseComplement <<= 2; + reverseComplement |= 3 - (kmer & 3); + kmer >>= 2; + } + return reverseComplement; + } + /** * Obtain a sequence name from the genome database * @param genomeNr genome number diff --git a/src/main/java/nl/wur/bif/pantools/utils/graph/graph_objects/GraphRange.java b/src/main/java/nl/wur/bif/pantools/utils/graph/graph_objects/GraphRange.java index 7917848fead7e1d0ccf0175230290cf830c02c76..6027f5711c3561b4e9166440567df23c82d9c1c7 100644 --- a/src/main/java/nl/wur/bif/pantools/utils/graph/graph_objects/GraphRange.java +++ b/src/main/java/nl/wur/bif/pantools/utils/graph/graph_objects/GraphRange.java @@ -1,5 +1,8 @@ package nl.wur.bif.pantools.utils.graph.graph_objects; +import me.tongfei.progressbar.ProgressBar; +import me.tongfei.progressbar.ProgressBarBuilder; +import me.tongfei.progressbar.ProgressBarStyle; import nl.wur.bif.pantools.Pantools; import nl.wur.bif.pantools.construction.index.kmer; import nl.wur.bif.pantools.utils.FileUtils; @@ -7,6 +10,7 @@ import nl.wur.bif.pantools.utils.Globals; import nl.wur.bif.pantools.utils.GraphUtils; import nl.wur.bif.pantools.utils.Utils; import nl.wur.bif.pantools.utils.graph.GenomeRange; +import org.apache.commons.lang.ArrayUtils; import org.neo4j.graphdb.*; import java.io.IOException; @@ -14,7 +18,7 @@ import java.nio.file.Path; import java.util.*; import java.util.concurrent.ExecutionException; -import static nl.wur.bif.pantools.utils.Globals.skip; +import static nl.wur.bif.pantools.utils.Globals.*; public class GraphRange { private final Set<Long> nucleotideNodes; // node IDs @@ -717,6 +721,61 @@ public class GraphRange { return graphRange; } + /** + * Obtain k-mers in the graph range in 2bit format using neo4j. + * @return sorted array of k-mers in 2bit format (long) + */ + public long[] getKmersAs2bit() { + if (this.nucleotideNodes == null) { + throw new IllegalStateException("Nucleotide nodes are not set."); + } + + final List<Long> kmers = new ArrayList<>(); + try (Transaction ignored = GRAPH_DB.beginTx()) { + for (Long nucleotideNodeId : this.nucleotideNodes) { + final Node nucleotideNode = GRAPH_DB.getNodeById(nucleotideNodeId); + for (long kmer : Utils.kmerizeAs2bit((String) nucleotideNode.getProperty("sequence"), Globals.K_SIZE)) { + kmers.add(Math.min(kmer, Utils.reverseComplement(kmer))); // add minimum of k-mer and reverse complement to list + } + } + } + + Pantools.logger.debug("Found {} distinct k-mers for graph range.", kmers.size()); + Collections.sort(kmers); + + // return the array + return ArrayUtils.toPrimitive(kmers.toArray(new Long[0])); + } + + /** + * Obtain k-mers in the graph range in 2bit format using neo4j. + * @param minFrequency The minimum frequency of a k-mer to be included + * @return sorted array of k-mers in 2bit format (long) + */ + public long[] getKmersAs2bit(final int minFrequency) { + if (this.nucleotideNodes == null) { + throw new IllegalStateException("Nucleotide nodes are not set."); + } + + final List<Long> kmers = new ArrayList<>(); + try (Transaction ignored = GRAPH_DB.beginTx()) { + for (Long nucleotideNodeId : this.nucleotideNodes) { + final Node nucleotideNode = GRAPH_DB.getNodeById(nucleotideNodeId); + if ((long) nucleotideNode.getProperty("frequency") >= minFrequency) { + for (long kmer : Utils.kmerizeAs2bit((String) nucleotideNode.getProperty("sequence"), Globals.K_SIZE)) { + kmers.add(Math.min(kmer, Utils.reverseComplement(kmer))); // add minimum of k-mer and reverse complement to list + } + } + } + } + + Pantools.logger.debug("Found {} distinct k-mers for graph range (min frequency = {}).", kmers.size(), minFrequency); + Collections.sort(kmers); + + // return the array + return ArrayUtils.toPrimitive(kmers.toArray(new Long[0])); + } + /** * Obtain the genome ranges of the range. * @return a set of genome ranges diff --git a/src/main/java/nl/wur/bif/pantools/utils/reader/FastqReader.java b/src/main/java/nl/wur/bif/pantools/utils/reader/FastqReader.java new file mode 100644 index 0000000000000000000000000000000000000000..73cabc1536a35237e4cda88fe5e0cc18ad030c81 --- /dev/null +++ b/src/main/java/nl/wur/bif/pantools/utils/reader/FastqReader.java @@ -0,0 +1,221 @@ +package nl.wur.bif.pantools.utils.reader; + +import htsjdk.samtools.fastq.FastqRecord; +import nl.wur.bif.pantools.Pantools; + +import java.io.Closeable; +import java.io.IOException; +import java.nio.channels.Channels; +import java.nio.file.*; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.util.Iterator; +import java.util.NoSuchElementException; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.zip.GZIPInputStream; + +/** + * FastqReader is a class that reads Fastq files in the fastest way possible to my knowledge. + * It uses NIO to read the file and a BlockingQueue to store the records. + * The records are read in a separate thread to allow for parallel processing of the records. + * NB: most of this class was written with the help of claude.ai and two days of prompting. + */ +public class FastqReader implements Iterator<FastqRecord>, Iterable<FastqRecord>, Closeable { + private static final int BUFFER_SIZE = 4 * 1024 * 1024; // 4MB buffer + private static final int QUEUE_SIZE = 1_000_000; + private static final FastqRecord POISON_PILL = new FastqRecord("", "", "", ""); + + private final Path filePath; + private FastqRecord nextRecord; + private final BlockingQueue<FastqRecord> queue; + private final Thread readerThread; + private volatile boolean closed = false; + + public FastqReader(Path filePath) { + this.filePath = filePath; + this.queue = new LinkedBlockingQueue<>(QUEUE_SIZE); + + // Create and start the reader thread + this.readerThread = new Thread(this::processFile); + this.readerThread.setDaemon(true); + this.readerThread.start(); + + // Get the first record + this.nextRecord = this.readNextRecord(); + } + + /** + * Process the file and add the records to the queue. + */ + private void processFile() { + try { + final boolean isGzipped = filePath.toString().endsWith(".gz"); + if (isGzipped) { + this.readFileGzipNIO(); + } else { + this.readFileNIO(); + } + } catch (IOException | InterruptedException e) { + Pantools.logger.error("Error processing the file: {}", e.getMessage()); + } finally { + try { + queue.put(POISON_PILL); + } catch (InterruptedException e) { + Pantools.logger.error("Error adding end marker to queue: {}", e.getMessage()); + } + } + } + + @Override + public boolean hasNext() { + return this.nextRecord != null && this.nextRecord != POISON_PILL; + } + + @Override + public FastqRecord next() { + if (!this.hasNext()) { + throw new NoSuchElementException("next() called when !hasNext()"); + } + FastqRecord record = this.nextRecord; + this.nextRecord = this.readNextRecord(); + return record; + } + + @Override + public Iterator<FastqRecord> iterator() { + return this; + } + + @Override + public void close() { + if (!closed) { + closed = true; + readerThread.interrupt(); + queue.clear(); + } + } + + /** + * Read the next record from the queue. + * @return The next FastqRecord or an empty FastqRecord if the queue is empty. + */ + private FastqRecord readNextRecord() { + try { + return queue.take(); + } catch (InterruptedException e) { + Pantools.logger.error("Error reading the next record: {}", e.getMessage()); + return POISON_PILL; + } + } + + /** + * Read a Fastq file using NIO. + * @throws IOException If an I/O error occurs + * @throws InterruptedException If the thread is interrupted + */ + private void readFileGzipNIO() throws IOException, InterruptedException { + try (FileChannel fileChannel = FileChannel.open(this.filePath, StandardOpenOption.READ); + GZIPInputStream gzipInputStream = new GZIPInputStream(Channels.newInputStream(fileChannel))) { + + StringBuilder currentLine = new StringBuilder(); + String[] readComponents = new String[4]; + + int lineCount = 0; + byte lastByte = 0; + + ByteBuffer buffer = ByteBuffer.allocate(BUFFER_SIZE); + int bytesRead; + byte[] byteArray = new byte[BUFFER_SIZE]; + + while ((bytesRead = gzipInputStream.read(byteArray)) != -1) { + buffer.put(byteArray, 0, bytesRead); + buffer.flip(); + while (buffer.hasRemaining()) { + byte currentByte = buffer.get(); + if (currentByte == '\n' && lastByte != '\r') { + // Process completed line + readComponents[lineCount & 3] = currentLine.toString(); + currentLine.setLength(0); + + // When we've collected all 4 lines, create a FastqRecord object + if (lineCount % 4 == 3) { + addNewFastqRecord(readComponents); + } + + lineCount++; + } else if (currentByte != '\r') { + currentLine.append((char) currentByte); + } + lastByte = currentByte; + } + buffer.clear(); + } + + // handle last bit + if (currentLine.length() > 0) { + readComponents[lineCount & 3] = currentLine.toString(); + if (lineCount % 4 == 3) { + addNewFastqRecord(readComponents); + } + } + } + } + + /** + * Read a Fastq file using NIO. + * @throws IOException If an I/O error occurs + * @throws InterruptedException If the thread is interrupted + */ + private void readFileNIO() throws IOException, InterruptedException { + try (FileChannel fileChannel = FileChannel.open(this.filePath, StandardOpenOption.READ)) { + ByteBuffer buffer = ByteBuffer.allocateDirect(BUFFER_SIZE); + + StringBuilder currentLine = new StringBuilder(); + String[] readComponents = new String[4]; + + int lineCount = 0; + byte lastByte = 0; + + while (fileChannel.read(buffer) > 0) { + buffer.flip(); + while (buffer.hasRemaining()) { + byte currentByte = buffer.get(); + if (currentByte == '\n' && lastByte != '\r') { + // Process completed line + readComponents[lineCount & 3] = currentLine.toString(); + currentLine.setLength(0); + + // When we've collected all 4 lines, create a FastqRecord object + if (lineCount % 4 == 3) { + addNewFastqRecord(readComponents); + } + + lineCount++; + } else if (currentByte != '\r') { + currentLine.append((char) currentByte); + } + lastByte = currentByte; + } + buffer.clear(); + } + + // handle last bit + if (currentLine.length() > 0) { + readComponents[lineCount & 3] = currentLine.toString(); + if (lineCount % 4 == 3) { + addNewFastqRecord(readComponents); + } + } + } + } + + private void addNewFastqRecord(final String[] readComponents) throws InterruptedException { + queue.put(new FastqRecord( + readComponents[0].substring(1), + readComponents[1], + readComponents[2].substring(1), + readComponents[3] + )); + } +} \ No newline at end of file diff --git a/src/test/java/nl/wur/bif/pantools/utils/UtilsTest.java b/src/test/java/nl/wur/bif/pantools/utils/UtilsTest.java new file mode 100644 index 0000000000000000000000000000000000000000..253cc4e0c5bc4b58308f0dcc8c08fcb24b16b1b5 --- /dev/null +++ b/src/test/java/nl/wur/bif/pantools/utils/UtilsTest.java @@ -0,0 +1,281 @@ +package nl.wur.bif.pantools.utils; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import java.util.Arrays; + +class UtilsTest { + + @BeforeAll + static void setup() { + Globals.sym = new char[]{'A', 'C', 'G', 'T', 'M', 'R', 'W', 'S', 'Y', 'K', 'V', 'H', 'D', 'B', 'N'}; + Globals.K_SIZE = 7; + } + + // For every test, I write down the 2bit representation of the input string + // kmerized and the 2bit representation of the canonical k-mers. + + @Test + void kmerizeAs2bit1() { + String input = "AAAAAAA"; // 00 00 00 00 00 00 00 + long[] expected = {0}; // 00 00 00 00 00 00 00 + long[] actual = Utils.kmerizeAs2bit(input, Globals.K_SIZE); + System.out.println(Arrays.toString(actual)); + assert (Arrays.equals(expected, actual)); + } + + @Test + void kmerizeAs2bit2() { + String input = "TTTTTTT"; // 11 11 11 11 11 11 11 + long[] expected = {0}; // 00 00 00 00 00 00 00 + long[] actual = Utils.kmerizeAs2bit(input, Globals.K_SIZE); + System.out.println(Arrays.toString(actual)); + assert (Arrays.equals(expected, actual)); + } + + @Test + void kmerizeAs2bit3() { + String input = "ACGTCGT"; // 00 01 10 11 01 10 11 + long[] expected = {1563}; // 00 01 10 00 01 10 11 + long[] actual = Utils.kmerizeAs2bit(input, Globals.K_SIZE); + System.out.println(Arrays.toString(actual)); + assert (Arrays.equals(expected, actual)); + } + + @Test + void kmerizeAs2bit4() { + String input = "ACGTACGT"; // 00 01 10 11 00 01 10, 01 10 11 00 01 10 11 + long[] expected = {1734, 1734}; // 00 01 10 11 00 01 10, 00 01 10 11 00 01 10 + long[] actual = Utils.kmerizeAs2bit(input, Globals.K_SIZE); + System.out.println(Arrays.toString(actual)); + assert (Arrays.equals(expected, actual)); + } + + @Test + void kmerizeAs2bit5() { + String input = "ACGTACGTNACGTACGT"; // 00 01 10 11 00 01 10, 01 10 11 00 01 10 11, 00 01 10 11 00 01 10, 01 10 11 00 01 10 11 + long[] expected = {1734, 1734, 1734, 1734}; // 00 01 10 11 00 01 10, 00 01 10 11 00 01 10, 00 01 10 11 00 01 10, 00 01 10 11 00 01 10 + long[] actual = Utils.kmerizeAs2bit(input, Globals.K_SIZE); + System.out.println(Arrays.toString(actual)); + assert (Arrays.equals(expected, actual)); + } + + /** + * Test a real sequencing read. + * The input is "@SRR13153715.1 1/1". + * The expected output was computed using the following bash code: + * ``` + * 1 │ #!/bin/bash + * 2 │ sequence="TGGTCATACAGCAAAGCATAATTGTCACCATTACTATGGCAATCAAGCCAGCTATAAAACCTAGCCAAATGTACCATGGCCATTTTATATACTGCTCATACTTTCCAAGTTCTTGGAGATCGAT" + * 3 │ k=13 + * 4 │ + * 5 │ # Function to compute reverse complement + * 6 │ rev_comp() { + * 7 │ echo "$1" | tr "ACGT" "TGCA" | rev + * 8 │ } + * 9 │ + * 10 │ # Generate k-mers and find canonical k-mers + * 11 │ canonical_kmers=() + * 12 │ for ((i=0; i<=${#sequence}-k; i++)); do + * 13 │ kmer=${sequence:i:k} + * 14 │ rev_kmer=$(rev_comp "$kmer") + * 15 │ if [[ "$kmer" < "$rev_kmer" ]]; then + * 16 │ canonical_kmers+=("$kmer") + * 17 │ else + * 18 │ canonical_kmers+=("$rev_kmer") + * 19 │ fi + * 20 │ done + * 21 │ + * 22 │ # Convert canonical k-mers to 2-bit representation and compute decimal values + * 23 │ for kmer in "${canonical_kmers[@]}"; do + * 24 │ binary=$(echo "$kmer" | awk 'BEGIN{FS=""; printf "obase=10;ibase=2;"} { + * 25 │ for (i=1; i<=NF; i++) { + * 26 │ if ($i=="A") printf "00"; + * 27 │ else if ($i=="C") printf "01"; + * 28 │ else if ($i=="G") printf "10"; + * 29 │ else if ($i=="T") printf "11"; + * 30 │ } + * 31 │ printf "\n"; + * 32 │ }') + * 33 │ echo "$binary" | bc + * 34 │ done + * ``` + */ + @Test + void kmerizeAs2bit6() { + Globals.K_SIZE = 13; + String input = "TGGTCATACAGCAAAGCATAATTGTCACCATTACTATGGCAATCAAGCCAGCTATAAAACCTAGCCAAATGTACCATGGCCATTTTATATACTGCTCATACTTTCCAAGTTCTTGGAGATCGAT"; + long[] expected = { + 60739092, + 45401232, + 47387200, + 33454904, + 19997705, + 12881956, + 15202796, + 4784716, + 19138864, + 9446595, + 3991545, + 16927806, + 602363, + 2409453, + 9637812, + 38551249, + 19987269, + 12840212, + 15434812, + 3858703, + 16467260, + 46378512, + 11594628, + 47271708, + 13307576, + 18149838, + 5490490, + 21961961, + 20738980, + 15847056, + 16338096, + 37638956, + 7578676, + 30314704, + 33093964, + 15273225, + 44011412, + 43045012, + 32110841, + 17631561, + 3417383, + 13669532, + 13232632, + 17385932, + 2434864, + 9739456, + 38957824, + 21613569, + 19345413, + 10272791, + 41091164, + 30080818, + 41074636, + 12588837, + 50355348, + 94800, + 379200, + 1516803, + 6067214, + 5237194, + 29966572, + 46464668, + 9703109, + 38812436, + 15406074, + 17019214, + 967994, + 3871977, + 15487909, + 61160132, + 15290033, + 3822508, + 955627, + 238906, + 20599804, + 12597843, + 53481108, + 13370277, + 39058636, + 22016817, + 11743246, + 16724766, + 38482688, + 9620672, + 35959600, + 53681780, + 13400531, + 53602124, + 13081905, + 11766060, + 2941515, + 735378, + 33738276, + 41207797, + 30613460, + 55344976, + 20053314, + 8290483, + 2072620, + 8343741, + 8518146, + 2129536, + 17309600, + 21104616, + 21166056, + 17555362, + 3112584, + 12450339, + 37179521, + 26072096, + 56849672, + 14212418 + }; + long[] actual = Utils.kmerizeAs2bit(input, Globals.K_SIZE); + System.out.println(Arrays.toString(actual)); + assert (Arrays.equals(expected, actual)); + } + + @Test + void kmer2bitToSequence1() { + long input = 0; // 00 00 00 00 00 00 00 + String expected = "AAAAAAA"; + String actual = Utils.kmer2bitToSequence(input); + System.out.println(actual); + assert (expected.equals(actual)); + } + + @Test + void kmer2bitToSequence2() { + long input = 16383; // 11 11 11 11 11 11 11 + String expected = "TTTTTTT"; + String actual = Utils.kmer2bitToSequence(input); + System.out.println(actual); + assert (expected.equals(actual)); + } + + @Test + void kmer2bitToSequence3() { + long input = 1755; // 00 01 10 11 01 10 11 + String expected = "ACGTCGT"; + String actual = Utils.kmer2bitToSequence(input); + System.out.println(actual); + assert (expected.equals(actual)); + } + + @Test + void reverseComplement1() { + long input = 0b00000000000000; // 00 00 00 00 00 00 00 // AAAAAAA + long expected = 0b11111111111111; // 11 11 11 11 11 11 11 // TTTTTTT + long actual = Utils.reverseComplement(input); + System.out.println(actual); + assert (expected == actual); + } + + @Test + void reverseComplement2() { + long input = 0b00011011011011; // 00 01 10 11 01 10 11 // ACGTCGT + long expected = 0b00011000011011; // 00 01 10 00 01 10 11 // ACGACGT + long actual = Utils.reverseComplement(input); + System.out.println(actual); + assert (expected == actual); + } + + @Test + void reverseComplement3() { + long input = 0b00000101101011; // 00 00 01 01 10 10 11 // AACCGGT + long expected = 0b00010110101111; // 00 01 01 10 10 11 11 // ACCGGTT + long actual = Utils.reverseComplement(input); + System.out.println(actual); + assert (expected == actual); + } +} \ No newline at end of file