diff --git a/docs/source/tables/relaxation.csv b/docs/source/tables/relaxation.csv new file mode 100644 index 0000000000000000000000000000000000000000..e00d2e84ffddec1e13c1913860480123a3d96548 --- /dev/null +++ b/docs/source/tables/relaxation.csv @@ -0,0 +1,5 @@ +Relaxation;1;2;3;4;5;6;7;8 +Intersection rate;0.08;0.07;0.06;0.05;0.04;0.03;0.02;0.01 +Similarity threshold;95;85;75;65;55;45;35;25 +Mcl inflation;10.8;9.6;8.4;7.2;6.0;4.8;3.6;2.4 +Contrast;8;7;6;5;4;3;2;1 \ No newline at end of file diff --git a/docs/source/user_guide/construct.rst b/docs/source/user_guide/construct.rst index 38ede3f87bfa95be525416e285d54efdcf73a255..598e8d31bf77a51e40b1787c3569eac4baa7545b 100644 --- a/docs/source/user_guide/construct.rst +++ b/docs/source/user_guide/construct.rst @@ -203,20 +203,35 @@ of the sequence. When you want to run **group** another time but with different parameters, the currently active grouping must first either be moved or removed. This can be achieved with the -:ref:`move- or remove_homology_groups <construct:move or remove grouping>` +:ref:`move or remove grouping <construct:move or remove grouping>` functions. -| **Method** -| Here, we explain a simplified version of the original algorithm, - please take a look at our publication for an extensive explanation. - First, potential similar sequences are identified by counting shared - *k*-mer (protein) sequences. Similarity between the selected protein - sequences is calculated through (local) Smith-Waterman alignments. - When the (normalized) similarity score of two sequences is above a - given threshold (controlled by ``--relaxation``), the proteins are - connected with each other in the similarity graph. Every similarity - component is then passed to the MCL (Markov clustering) algorithm to - be possibly broken into several homology groups. +Method +"""""" +Here, we explain a simplified version of the original algorithm, +please take a look at our publication for an extensive explanation. 
+First, potentially similar sequences are identified by counting shared +*k*-mer (protein) sequences. Similarity between the selected protein +sequences is calculated through (local) Smith-Waterman alignments. +When the (normalized) similarity score of two sequences is above a +given threshold (controlled by ``--relaxation``), the proteins are +connected with each other in the similarity graph. Every similarity +component is then passed to the MCL (Markov clustering) algorithm to +be possibly broken into several homology groups. + +Relaxation +"""""""""" +The ``relaxation`` parameter is a combination of four sub-parameters: +``intersection rate``, ``similarity threshold``, ``mcl inflation`` +and ``contrast``. The values for these parameters for each relaxation +setting can be seen in the table below. We recommend using the +``--relaxation`` option if there is any uncertainty about its +sub-parameters. + +.. csv-table:: + :file: tables/relaxation.csv + :header-rows: 1 + :delim: ; Required software """"""""""""""""" @@ -253,18 +268,18 @@ Options - Only cluster protein sequences of the longest transcript per gene. * - ``--relaxation`` - The relaxation in homology calls. Should be in range [1-8], from - strict to relaxed (default 1). This argument automatically sets + strict to relaxed. This argument automatically sets the four remaining arguments stated below. * - ``--intersection-rate`` - The fraction of *k*-mers that needs to be shared by two intersecting - proteins. Should be in range [0.001,0.1] (default: 0.08). + proteins. Should be in range [0.001,0.1]. * - ``--similarity-threshold`` - The minimum normalized similarity score of two proteins. Should be in - range [1..99] (default: 95). + range [1..99]. * - ``--mcl-inflation`` - - The MCL inflation. Should be in range [1,19] (default: 10.8). + - The MCL inflation. Should be in range [1,19]. * - ``--contrast`` - - The contrast factor. 
Should be in range [0,10]. Example commands """""""""""""""" diff --git a/src/main/java/nl/wur/bif/pantools/cli/Group.java b/src/main/java/nl/wur/bif/pantools/cli/Group.java index 393dbc646825e8962382d60cf2906790c4fe6454..dfa0d01feeef4d9cdd08d8b92d02de1f15ff84b4 100644 --- a/src/main/java/nl/wur/bif/pantools/cli/Group.java +++ b/src/main/java/nl/wur/bif/pantools/cli/Group.java @@ -43,19 +43,19 @@ public class Group implements Callable<Integer> { @DecimalMin(value = "0.001", message = "{range.ir}") @DecimalMax(value = "0.1", message = "{range.ir") - static double intersectionRate = 0.05; + static double intersectionRate; @Min(value = 1, message = "{min.st}") @Max(value = 99, message = "{max.st}") - static int similarityThreshold = 65; + static int similarityThreshold; @Min(value = 1, message = "{min.mcl-i}") @Max(value = 99, message = "{max.mcl-i}") - static double mclInflation = 7.2; + static double mclInflation; @DecimalMin(value = "0", message = "{min.contrast}") @DecimalMax(value = "10", message = "{max.contrast}") - static double contrast = 5; + static double contrast; @ArgGroup(multiplicity = "1") RelaxationSettings relaxationSettings; private static class RelaxationSettings { @@ -75,16 +75,16 @@ public class Group implements Callable<Integer> { } private static class RelaxationParameters { - @Option(names = "--intersection-rate") + @Option(names = "--intersection-rate", required = true) void setIntersectionRate(double value) {intersectionRate = value;} - @Option(names = "--similarity-threshold") + @Option(names = "--similarity-threshold", required = true) void setSimilarityThreshold(int value) {similarityThreshold = value;} - @Option(names = "--mcl-inflation") + @Option(names = "--mcl-inflation", required = true) void setMclInflation(double value) {mclInflation = value;} - @Option(names = "--contrast") + @Option(names = "--contrast", required = false) void setContrast(double value) {contrast = value;} } diff --git a/src/main/resources/Defaults.properties 
b/src/main/resources/Defaults.properties index 7399c2833f70630a51d489c3ad715a835d5bd108..761cc2a70dbc1c8086cb79b438b1094aed1c810f 100644 --- a/src/main/resources/Defaults.properties +++ b/src/main/resources/Defaults.properties @@ -18,11 +18,6 @@ pantools.core_phylogeny.clustering-mode = ML pantools.create_tree_template.color = 2 # GoEnrichment pantools.go_enrichment.fdr = 5 -# Group -pantools.group.intersection-rate = 0.05 -pantools.group.similarity-threshold = 65 -pantools.group.mcl-inflation = 7.2 -pantools.group.contrast = 5 # LocateGenes pantools.locate_genes.core-threshold = 0 pantools.locate_genes.gap-open = 0 diff --git a/src/main/resources/MessageBundle.properties b/src/main/resources/MessageBundle.properties index b092e139ff27157121ba1aa514d6bb56115fd4cd..54845de012df3cdc6da2ad130ced4e2d978592c6 100644 --- a/src/main/resources/MessageBundle.properties +++ b/src/main/resources/MessageBundle.properties @@ -169,8 +169,6 @@ proteomes-file = A text file containing paths to FASTA files of proteins to be a include = Only include a selection of genomes. regions-file = A text file containing genome locations with on each line: a genome number, sequence number, begin and \ end position, separated by a space. -similarity-threshold = The minimum normalized similarity score of two proteins. Should be in range [1..99] (default: \ - ${DEFAULT-VALUE}). threads = Number of parallel working threads, default is the number of cores or 8, whichever is lower. unique-threshold = Threshold (%%) for unique/cloud genes. Default is a single genome, not a percentage. @@ -244,10 +242,12 @@ pantools.gene_classification.phenotype = A phenotype name, used to find genes sp # GoEnrichment pantools.go_enrichment.fdr = The false discovery rate (%%) (default: ${DEFAULT-VALUE}%%). # Group -pantools.group.contrast = The contrast factor. Should be in range [0,10] (default: ${DEFAULT-VALUE}). +pantools.group.contrast = The contrast factor. Should be in range [0,10]. 
pantools.group.intersection-rate = The fraction of k-mers that needs to be shared by two intersecting proteins. Should \ - be in range [0.001,0.1] (default: ${DEFAULT-VALUE}). -pantools.group.mcl-inflation = The MCL inflation. Should be in range [1,19] (default: ${DEFAULT-VALUE}). + be in range [0.001,0.1]. +pantools.group.mcl-inflation = The MCL inflation. Should be in range [1,19]. +pantools.group.similarity-threshold = The minimum normalized similarity score of two proteins. \ + Should be in range [1..99]. pantools.group.relaxation = The relaxation in homology calls. Should be in range [1..8], from strict to relaxed. \ Use optimal_grouping to determine the best relaxation setting. # GroupInfo