Commit ce5f7c4a authored by Jonkheer, Eef's avatar Jonkheer, Eef
Browse files

Busco protein now works on panproteomes

parent 67eef18c
Pipeline #57606 failed with stage
in 26 seconds
......@@ -44,7 +44,7 @@ public class BuscoProtein implements Callable<Integer> {
@Pattern(regexp = "BUSCO[345]", flags = CASE_INSENSITIVE, message = "{pattern.version.busco-version}")
String buscoVersion;
@ArgGroup(multiplicity = "1") BuscoDatasets buscoDatasets;
@ArgGroup() BuscoDatasets buscoDatasets; // multiplicity = "1", stond tussen haken
static class BuscoDatasets {
@Option(names = {"--busco9"})
@InputFile(message = "{file.busco}")
......@@ -67,6 +67,9 @@ public class BuscoProtein implements Callable<Integer> {
@Option(names = "--phasing")
boolean phasing;
@Option(names = "--log")
boolean log;
@Override
public Integer call() {
new BeanValidation().argValidation(spec, this, selectGenomes, buscoDatasets);
......@@ -79,7 +82,12 @@ public class BuscoProtein implements Callable<Integer> {
private void setGlobalParameters() {
setGenomeSelectionOptions(selectGenomes);
THREADS = threadNumber.getnThreads();
INPUT_FILE = (buscoDatasets.busco9 != null) ? buscoDatasets.busco9.toString() : buscoDatasets.busco10;
if (buscoDatasets != null) { // either --busco9 or --busco10 was set
if (buscoDatasets.busco9 != null && buscoDatasets.busco10 != null) {
INPUT_FILE = (buscoDatasets.busco9 != null) ? buscoDatasets.busco9.toString() : buscoDatasets.busco10;
}
}
if (annotationsFile != null) PATH_TO_THE_ANNOTATIONS_FILE = annotationsFile.toString();
BUSCO_VERSION = buscoVersion;
if (longestTranscript) longest_transcripts = true;
......@@ -87,6 +95,9 @@ public class BuscoProtein implements Callable<Integer> {
if (phasing){
PHASED_ANALYSIS = true;
}
if (log){
LOG = true;
}
}
}
......@@ -45,9 +45,6 @@ public class GeneClassification implements Callable<Integer> {
@Option(names = {"--sequence"})
boolean sequence;
//@Option(names = {"-sf","--selection-file"})
//Path selectionFile;
@Option(names = "--mlsa")
boolean mlsa;
......@@ -71,7 +68,6 @@ public class GeneClassification implements Callable<Integer> {
new BeanValidation().argValidation(spec, this, selectGenomes);
pantools.setPangenomeGraph();
setGlobalParameters(); //TODO: use local parameters instead
//pf.countGeneHaplotypes(true);
newClassification.geneClassification();
return 0;
}
......@@ -86,9 +82,6 @@ public class GeneClassification implements Callable<Integer> {
if (phasing) {
PHASED_ANALYSIS = true;
}
//if (selectionFile != null) {
// SELECTION_FILE = selectionFile;
//}
}
}
......@@ -37,7 +37,6 @@ public class StartCalculateDnDS implements Callable<Integer> {
@Mixin private ThreadNumber threadNumber;
/*
@CommandLine.ArgGroup(multiplicity = "1") PairwiseOrMSa PairwiseOrMSa;
private static class PairwiseOrMSa {
......
......@@ -35,19 +35,6 @@ public class Busco {
public void buscoProtein() {
System.out.println("\nRunning BUSCO in protein mode\n");
reportBuscoMode(); // check if busco3 4 or 5 was selected
if (INPUT_FILE == null) {
if (BUSCO_VERSION.equals("BUSCO4") || BUSCO_VERSION.equals("BUSCO5")) {
System.out.println("\nNo BUSCO dataset was selected via --input-file\n"
+ "Provide the full path to the database or just give the odb10 dataset name and it is automatically downloaded. "
+ "To display all available datasets, run the following command: busco --list-datasets\n"
+ "BUSCO will now be run with --auto-lineage, where the lineage is determined by the tool itself\n");
INPUT_FILE = "";
} else { // busco3
System.out.println("No BUSCO dataset was selected via -if\nProvide the full path to a odb9 database\n");
System.exit(1);
}
}
HashMap<String, ArrayList<String>> hmmMap = new HashMap<>();
HashMap<String, String> buscoScoresMap = new HashMap<>();
int totalBuscos = 0, counter = 0;
......@@ -167,12 +154,26 @@ public class Busco {
* @return
*/
private String getBuscoDatabaseName() {
if (INPUT_FILE == null) {
if (BUSCO_VERSION.equals("BUSCO4") || BUSCO_VERSION.equals("BUSCO5")) {
System.out.println("\nNo BUSCO dataset was selected via --input-file\n"
+ "Provide the full path to the database or just give the odb10 dataset name and it is automatically downloaded. "
+ "To display all available datasets, run the following command: busco --list-datasets\n"
+ "BUSCO will now be run with --auto-lineage, where the lineage is determined by the tool itself\n");
INPUT_FILE = "";
} else { // busco3
System.out.println("No BUSCO dataset was selected via --busco9\n" +
"Please provide the full path to an odb9 database\n");
System.exit(1);
}
}
String[] inPathArray = INPUT_FILE.split("/");
String dbName = inPathArray[inPathArray.length-1];
if (INPUT_FILE.equals("")) {
dbName = "automatic_lineage";
} else {
System.out.println("\rSelected BUSCO database: " + dbName);
}
System.out.println("\rSelected BUSCO database: " + dbName);
return dbName;
}
......@@ -277,7 +278,7 @@ public class Busco {
* @param buscoScores
* @return
*/
private String calculateAverageMedianStatisticsBusco( double[] buscoLowestHighest, double[][] buscoScores){
private String calculateAverageMedianStatisticsBusco(double[] buscoLowestHighest, double[][] buscoScores){
DecimalFormat df = new DecimalFormat("0.000");
ArrayList<Double> completeList = new ArrayList<>();
ArrayList<Double> singleCopyList = new ArrayList<>();
......@@ -618,7 +619,9 @@ public class Busco {
* @param genomeOrSubgenome
*/
private void runBusco(String buscoPath, String DbName, String outputName, String genomeOrSubgenome) {
delete_file_full_path(genomeOrSubgenome); // prevents crash when a file exists with an identical name to the (sub)genome number
String proteinInputFile = determineProteinInputFile(genomeOrSubgenome);
String[] command = null;
if (BUSCO_VERSION.equals("BUSCO3")) {
command = new String[]{"python3", buscoPath, "-f", "-i", proteinInputFile, "-o", genomeOrSubgenome,
......@@ -645,7 +648,9 @@ public class Busco {
command = new String[]{"busco", "-f", "-i", proteinInputFile, "-o", genomeOrSubgenome,
"-l", INPUT_FILE, "-m", "proteins", "--cpu", THREADS + "", "--tar"};
}
//System.out.println(Arrays.toString(command));
if (LOG) {
//System.out.println(Arrays.toString(command));
}
ExecCommand.ExecCommand(command);
copy_directory(genomeOrSubgenome, WORKING_DIRECTORY + "/busco/" + DbName + "/" + outputName + "/results/" + genomeOrSubgenome);
delete_directory(genomeOrSubgenome);
......@@ -798,18 +803,26 @@ public class Busco {
continue;
}
}
int[] address = (int[]) mrnaNode.getProperty("address");
int genomeNr = address[0];
if (proteinWriters[genomeNr-1] == null){
continue;
}
if (skip_array[genomeNr-1] || skip_seq_array[genomeNr-1][address[1]-1]) {
continue;
int genomeNr;
if (PROTEOME) {
genomeNr = (int) mrnaNode.getProperty("genome");
} else {
int[] address = (int[]) mrnaNode.getProperty("address");
genomeNr = address[0];
if (proteinWriters[genomeNr - 1] == null) {
continue;
}
if (skip_array[genomeNr - 1] || skip_seq_array[genomeNr - 1][address[1] - 1]) {
continue;
}
String requiredAnnotationId = annotation_identifiers.get(genomeNr - 1);
String annotationId = (String) mrnaNode.getProperty("annotation_id");
if (!requiredAnnotationId.equals(annotationId)) {
continue;
}
}
String requiredAnnotationId = annotation_identifiers.get(genomeNr-1);
String annotationId = (String) mrnaNode.getProperty("annotation_id");
if (!requiredAnnotationId.equals(annotationId)){
if (proteinWriters[genomeNr - 1] == null) {
continue;
}
......@@ -1185,7 +1198,6 @@ public class Busco {
writeStringToFile(rscript.toString(), buscoDirectory + "upset/upset_plot.R", false, false);
}
/**
*
* @param questionableBuscos
......@@ -1200,28 +1212,28 @@ public class Busco {
ArrayList<String> identifiersForBusco) {
String[] categories = {"Single copy", "Missing", "Fragmented", "Duplicated"};
StringBuilder outputBuilder = new StringBuilder("#split file on semicolon. Total BUSCOs: " + totalBuscos
+ "\nGenome;Complete single copy (total);Complete single copy (total percentage);Missing HMMs (total);"
+ "Missing HMMs (total percentage);Missing HMMs;Fragmented HMMs (total);Fragmented HMMs (total percentage);"
+ "Fragmented HMMs;Duplicated HMMs (total);Duplicated HMMs (total percentage);Duplicated HMMs");
StringBuilder outputBuilder = new StringBuilder("#Total BUSCOs: " + totalBuscos
+ "\nGenome,Complete single copy (total),Complete single copy (total percentage),Missing HMMs (total),"
+ "Missing HMMs (total percentage),Missing HMMs,Fragmented HMMs (total),Fragmented HMMs (total percentage),"
+ "Fragmented HMMs,Duplicated HMMs (total),Duplicated HMMs (total percentage),Duplicated HMMs");
if (questionableBuscos.length > 0) {
outputBuilder.append(";questionable BUSCOs not correct (max ").append(questionableBuscos.length).append(")\n");
outputBuilder.append(",Questionable BUSCOs not correct (max ").append(questionableBuscos.length).append(")\n");
} else {
outputBuilder.append("\n");
}
for (String genomeOrSubgenome : identifiersForBusco) {
outputBuilder.append(genomeOrSubgenome).append(";");
outputBuilder.append(genomeOrSubgenome).append(",");
int wrongBuscoCounter = 0;
for (int j=0; j < categories.length; j++) {
StringBuilder value_builder = hmmPerGenome.get(genomeOrSubgenome + "#" + categories[j]);
if (value_builder == null) {
outputBuilder.append("-;-;-");
outputBuilder.append("-,-,-");
} else if (categories[j].equals("Single copy")) {
String[] value_array = value_builder.toString().split(",");
String percentage = get_percentage_str(value_array.length, totalBuscos, 3);
outputBuilder.append(value_array.length).append(";").append(percentage);
String[] valueArray = value_builder.toString().split(",");
String percentage = get_percentage_str(valueArray.length, totalBuscos, 3);
outputBuilder.append(valueArray.length).append(",").append(percentage);
} else {
String value = value_builder.toString().replaceFirst(".$","");
String[] value_array = value.split(",");
......@@ -1231,9 +1243,9 @@ public class Busco {
}
}
String percentage = get_percentage_str(value_array.length, totalBuscos, 3);
outputBuilder.append(value_array.length).append(";").append(percentage).append(";").append(value);
outputBuilder.append(value_array.length).append(",").append(percentage).append(",").append(value.replace(","," "));
}
outputBuilder.append(";");
outputBuilder.append(",");
}
if (questionableBuscos.length > 0) {
outputBuilder.append(wrongBuscoCounter);
......
......@@ -26,7 +26,6 @@ import static nl.wur.bif.pantools.pantools.Pantools.*;
import static nl.wur.bif.pantools.utils.Globals.*;
import static nl.wur.bif.pantools.utils.Utils.*;
import static nl.wur.bif.pantools.pangenome.phased_functionalities.*;
import static nl.wur.bif.pantools.pangenome.create_skip_arrays.updateSkipArraysSequenceSelection;
/**
*
......@@ -2223,7 +2222,7 @@ public class Phylogeny {
String selectedStr = readOneLineFile(WORKING_DIRECTORY + "gene_classification/selected_sequences.info", false);
String[] sequenceIds = selectedStr.split(",");
selectedSequences.addAll(Arrays.asList(sequenceIds));
updateSkipArraysSequenceSelection(selectedSequences);
skip.updateSkipArraysSequenceSelection(selectedSequences);
pf.preparePhasedGenomeInformation(false);
for (String sequenceId : selectedSequences) {
String genomePhase = pf.getSubgenomeIdentifier(sequenceId);
......
......@@ -9,7 +9,6 @@ import java.util.*;
import static nl.wur.bif.pantools.pangenome.Classification.*;
import static nl.wur.bif.pantools.pangenome.Repeats.calculate_density_per_window;
import static nl.wur.bif.pantools.pangenome.create_skip_arrays.updateSkipArraysSequenceSelection;
import static nl.wur.bif.pantools.utils.Utils.*;
import static nl.wur.bif.pantools.utils.Globals.*;
import static nl.wur.bif.pantools.utils.Utils.checkIfFileExists;
......@@ -786,7 +785,7 @@ public class SequenceVisualization {
return;
}
updateSkipArraysSequenceSelection(sequencesWithoutOutput); // to only retrieve the gene order of the target sequences
skip.updateSkipArraysSequenceSelection(sequencesWithoutOutput); // to only retrieve the gene order of the target sequences
HashSet<String> repeat_types = repeatClass.retrieveRepeatTypes();
ArrayList<String> selected_repeat_types = repeatClass.update_repeat_types(repeat_types, false, false);
boolean check_types = false;
......@@ -803,7 +802,7 @@ public class SequenceVisualization {
} else if (phased_functionalities.selectedSequences.size() == sequencesWithoutOutput.size()) {
System.out.println("\r No repeats found (for sequence selection)");
}
updateSkipArraysSequenceSelection(phased_functionalities.selectedSequences); // return skip_array to original state
skip.updateSkipArraysSequenceSelection(phased_functionalities.selectedSequences); // return skip_array to original state
}
......
......@@ -76,12 +76,14 @@ public class create_skip_arrays {
}
skip_array[i-1] = true;
}
for (int i = 1; i <= total_genomes; i++) { // i is a genome number
int numberSequences = sequencesPerGenome.get(i);
for (int j = 0; j < numberSequences; j++) {
if (skip_array[i-1]) {
skip_seq_array[i-1][j] = true;
}
if (!PROTEOME) {
for (int i = 1; i <= total_genomes; i++) { // i is a genome number
int numberSequences = sequencesPerGenome.get(i);
for (int j = 0; j < numberSequences; j++) {
if (skip_array[i - 1]) {
skip_seq_array[i - 1][j] = true;
}
}
}
}
} else if (skip_genomes != null) { // --skip argument was provided
......@@ -141,11 +143,42 @@ public class create_skip_arrays {
updateAnnotationIdentifiers();
}
private HashMap<Integer, Integer> retrieveSequencesPerGenome() {
HashMap<Integer, Integer> sequencesPerGenome = new HashMap<>();
if (PROTEOME) {
return null;
}
int genomeCounter = 0;
try (Transaction tx = GRAPH_DB.beginTx()) { // start database transaction
ResourceIterator<Node> genomeNodes = GRAPH_DB.findNodes(genome_label);
while (genomeNodes.hasNext()) {
genomeCounter ++;
if (genomeCounter % 100 == 0 && genomeCounter > 1) {
System.out.print("\rGathering genomes: " + genomeCounter);
}
Node genomeNode = genomeNodes.next();
int genomeNr = (int) genomeNode.getProperty("number");
int num_sequences = (int) genomeNode.getProperty("num_sequences");
skip_seq_array[genomeNr-1] = new boolean[num_sequences];
for (int j = 1; j <= num_sequences; j++) {
skip_seq_array[genomeNr-1][j-1] = false;
}
sequencesPerGenome.put(genomeNr, num_sequences);
}
tx.success();
}
System.out.print("\r ");
return sequencesPerGenome;
}
/**
* annotation_identifiers list is initialized in 'setAnnotationIdentifiers' but the genome selection might be different now.
* Updates annotation_identifiers by changing the identifiers of skipped genomes to end with '_0'
*/
private void updateAnnotationIdentifiers() {
if (PROTEOME) { // panproteome does not have annotations
return;
}
for (int i = 1; i <= total_genomes; i++) { // i is a genome number
if (skip_array[i-1]) {
String annotationId = annotation_identifiers.get(i-1);
......@@ -669,7 +702,7 @@ public class create_skip_arrays {
* Sequences in 'sequenceIdentifiers' are set to false.
* @param sequenceIdentifiers
*/
public static void updateSkipArraysSequenceSelection(ArrayList<String> sequenceIdentifiers) {
public void updateSkipArraysSequenceSelection(ArrayList<String> sequenceIdentifiers) {
skip_array = new boolean[total_genomes];
skip_seq_array = new boolean[total_genomes][0];
HashMap<Integer, Integer> sequencesPerGenome = retrieveSequencesPerGenome();
......
......@@ -12,7 +12,6 @@ import java.util.concurrent.atomic.AtomicLong;
import static nl.wur.bif.pantools.pangenome.Classification.*;
import static nl.wur.bif.pantools.pangenome.Classification.try_incr_hashset_hashmap;
import static nl.wur.bif.pantools.pangenome.create_skip_arrays.updateSkipArraysSequenceSelection;
import static nl.wur.bif.pantools.utils.Utils.*;
import static nl.wur.bif.pantools.pangenome.phased_functionalities.*;
import static nl.wur.bif.pantools.utils.Globals.*;
......@@ -82,10 +81,10 @@ public class geneRetention {
}
ArrayList<String> targetSequenceIds = readInputFileRetention(inputSequences); // fills selectedSequences
updateSkipArraysSequenceSelection(targetSequenceIds); // to only retrieve the gene order of the target sequences
skip.updateSkipArraysSequenceSelection(targetSequenceIds); // to only retrieve the gene order of the target sequences
HashMap<String, String> renamedIdentifiersMap = phased_functionalities.retrieveRenamedSyntentyIdentifiers(false);
HashMap<String, Node[]> mrnaNodesPerSequence = phased_functionalities.get_ordered_mrna_per_seq(false, true, null); // key is sequence identifier, value is array with 'gene' nodes
updateSkipArraysSequenceSelection(selectedSequences); // return skip_array to original state
skip.updateSkipArraysSequenceSelection(selectedSequences); // return skip_array to original state
stringQueue = new LinkedBlockingQueue<>();
String[] modes;
......
......@@ -1602,30 +1602,7 @@ public class Utils {
return genomeNodesMap;
}
public static HashMap<Integer, Integer> retrieveSequencesPerGenome() {
HashMap<Integer, Integer> sequencesPerGenome = new HashMap<>();
int genomeCounter = 0;
try (Transaction tx = GRAPH_DB.beginTx()) { // start database transaction
ResourceIterator<Node> genomeNodes = GRAPH_DB.findNodes(genome_label);
while (genomeNodes.hasNext()) {
genomeCounter ++;
if (genomeCounter % 1000 == 0 && genomeCounter > 1){
System.out.print("\rGathering genomes: " + genomeCounter);
}
Node genomeNode = genomeNodes.next();
int genomeNr = (int) genomeNode.getProperty("number");
int num_sequences = (int) genomeNode.getProperty("num_sequences");
skip_seq_array[genomeNr-1] = new boolean[num_sequences];
for (int j = 1; j <= num_sequences; j++) {
skip_seq_array[genomeNr-1][j-1] = false;
}
sequencesPerGenome.put(genomeNr, num_sequences);
}
tx.success();
}
System.out.print("\r ");
return sequencesPerGenome;
}
public static int[] sequenceIdToIntegers(String sequenceId) {
String[] seq_array = sequenceId.split("_"); // 1_1 becomes [1,1]
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment