Compare revisions

Brankovics, Balazs · Brankovics, Balazs · af785f0e · af785f0e · af785f0e · af785f0e
--- a/README.md
+++ b/README.md
@@ -5,10 +5,12 @@ that programs can use the output of other programs. The main concept in the
 setup and design of this toolbox is to convert the output of all the programs
 that we want to use for primary analysis (read-mapping or homology tools) to
 a standard format.
+
 We have chosen for the SAM (or BAM which is the compressed form) format,
 because it is the standard output for read-mapping tools, it is a sufficiently
 flexible and robust format, and many programs accept it as input (including
 visualization tools such as [pavian](https://github.com/fbreitwieser/pavian)).
+
 Since read-mapping tools already produce correct SAM output, we only needed to
 make sure that outputs of the 3 most commonly used (nucleotide to nucleotide)
 homology tools (BLAST, nucmer and exonerate) are converted in to correct SAM
@@ -20,7 +22,8 @@ The "middle part" of workflows consists of filtering steps to process your raw
 output from the primary analysis (mapping, alignment or other homology tools).
 Since [samtools](http://samtools.github.io/) already provides many useful tools
 for this. The goal for this project was to add tools for common filtering
-steps that are not covered by samtools (but maybe included in CLC genomic workbench).
+steps that are not covered by samtools (but maybe included in CLC genomic
+workbench and similar tools).

 These different filtering options are covered by two scripts (sam-filter.pl & sam-keep-best.pl).
 The first (sam-filter.pl) filter is based on thresholds:
@@ -175,10 +178,10 @@ flowchart LR
 ```


-Metadata aspects (to do):
+Metadata aspects for future considerations:
 - Comment lines could be used to store input file information
 - Reference sequence can hold assembly ID information
- We could add Line to the header for storing provenance info of the runs
+

 ## SAM format

@@ -291,7 +294,7 @@ It specifically relies on finding `$subject-db\.ndb` and `$subject-db\.nin`. The
 Alignment score is stored in the optional field in SAM with `AS:i:` tag as an integer.
 Different programs calculate alignment scores in different ways.
 To combine the results from different methods, scores have to be calculated in the same way.
-Nucmer does not calculate alignment scores, so during conversion from delta it would be useful to calculate a score.
+Nucmer does not calculate alignment scores, so during conversion from delta a score is calculated using the default use by these tools (see below).

 ### Defalut scoring in exonerate


--- a/src/sam-filter.pl
+++ b/src/sam-filter.pl
@@ -7,6 +7,10 @@ use lib "$FindBin::RealBin/lib";  # use the lib directory
 use biointsam;
 use biointbasics;

+my $programname = "sam-filer.pl";
+my $version = "1.1";
+my $cmd = join(" ", $programname, @ARGV);
+
 #===DESCRIPTION=================================================================

 my $description = 
@@ -24,6 +28,8 @@ my $options =
    "\t-qcov=<float> | --query-coverage=<float>\n\t\tKeep only query sequences that have at least <float> part covered by the hit.\n" .
    "\t-rcov=<float> | --reference-coverage=<float>\n\t\tKeep only query sequences that cover at least <float> part of the reference sequence.\n" .
    "\t-mincomplexity=<float> | --mincomplexity=<float>\n\t\tKeep only hits where the aligned sequence has at least <float> 7-mer complexity (observed k-mers/maximal possible k-mers).\n" .
+    "\t-dust | --dust=<int>\n\t\tKeep only hits where the aligned sequence has at most <int> (7 if not specified) dust score. (Algorithm adapted from PRINSEQ-lite).\n" .
+    "\t-entropy | --entropy=<int>\n\t\tKeep only hits where the aligned sequence has at least <int> (70 if not specified) entropy score. (Algorithm adapted from PRINSEQ-lite).\n" .
    "\t-v\n\t\tInvert selection: report sequence that fail the requirement.\n" .
    "\n";
 my $info = {
@@ -41,6 +47,8 @@ my $minlen;
 my $minaln;
 my $minsim;
 my $mincomplexity;
+my $dust;
+my $entropy;
 my $invert;
 my $unmapped;
 my $qcov;
@@ -58,6 +66,12 @@ for (@ARGV) {
 	$minaln = $1;
    } elsif (/^--?mincomplexity=(\d\.\d+)$/) {
 	$mincomplexity = $1;
+    } elsif (/^--?dust(=\d+)?$/) {
+	$dust = 7;
+	$dust = $1 if /^--?dust=(\d+)$/;
+    } elsif (/^--?entropy(=\d+)?$/) {
+	$entropy = 7;
+	$entropy = $1 if /^--?entropy=(\d+)$/;
    } elsif (/^--?minsim=(\S+)$/) {
 	$minsim = $1;
    } elsif (/^-v$/) {
@@ -78,13 +92,25 @@ biointbasics::print_help(\@ARGV, $info);
 my $qname = "";
 my $mem = "";
 my $memflag = 0;
+my $header = "true";
+my $previousprogram = "";
 while(<>) {
    my %hit;
    biointsam::parse_sam($_, \%ref, \%hit); 
    unless (%hit) {
 	print "$_\n";
+	if (/^\@PG\tID:(\S+)/) {
+	    $previousprogram = $1;
+	}
 	next;
    }
+    if ($header) { # First line after the header section
+	my $text = "\@PG\tID:$programname\tPN:$programname";
+	$text .= "\tPP:$previousprogram" if $previousprogram;
+	print $text . "\tVN:$version\tCL:$cmd\n";
+	$header = undef;
+    }
+    
    if ($hit{"FLAG"} & 4 || $hit{'RNAME'} eq "*") {
 	print biointsam::sam_string(\%hit), "\n" if $unmapped;
 	next;
@@ -137,6 +163,24 @@ while(<>) {
 	    $failed++ if $mincomplexity > $complexity[0];
 	}
    }
+    if ($dust && ! $failed) {
+	my $score = &dust_score($aln->{'seq'});
+	if ($score eq "NA") {
+	    print STDERR "WARNING: could not check DUST score for $_\n";
+	} else {
+	    $hit{"ds:i"} = $score;
+	    $failed++ if $dust < $score;
+	}
+    }
+    if ($entropy && ! $failed) {
+	my $score = &entropy_score($aln->{'seq'});
+	if ($score eq "NA") {
+	    print STDERR "WARNING: could not check DUST score for $_\n";
+	} else {
+	    $hit{"ce:i"} = $score;
+	    $failed++ if $entropy > $score;
+	}
+    }
    if ($qcov && ! $failed) {
 	my $len = $aln->{'length'} - $aln->{'deletion'} + $aln->{'unmapped'};
 	my $covered = $aln->{'length'} - $aln->{'deletion'};
@@ -208,3 +252,144 @@ sub reverse_seq {
    $complement =~ tr/ACGTacgtWwMmRrSsKkYyBbVvDdHh/TGCAtgcaWwKkYySsMmRrVvBbHhDd/;
    return $complement;
 }
+
+sub dust_score {
+    my ($seq) = @_;
+    unless ($seq) {
+	return "NA";
+    }
+
+    my $length = length($seq);
+    
+    my $WINDOWSIZE = 64;
+    my $WINDOWSTEP = 32;
+    my $WORDSIZE = 3;
+    my @WINDOWSIZEARRAY = (0..61);
+    my $POINTFIVE = 1/2;
+
+    my $dustscore;
+    my ($rest,$steps,@vals,$str,$num,$bynum);
+    if($length <= $WINDOWSIZE) {
+	$rest = $length;
+	$steps = 0;
+    } else {
+	$steps = int(($length - $WINDOWSIZE) / $WINDOWSTEP) + 1;
+	$rest = $length - $steps * $WINDOWSTEP;
+	unless($rest > $WINDOWSTEP) {
+	    $rest += $WINDOWSTEP;
+	    $steps--;
+	}
+    }
+    $num = $WINDOWSIZE-2;
+    $bynum = 1/$num;
+    $num--;
+    my $mean = 0;
+    foreach my $i (0..$steps-1) {
+	$str = substr($seq,($i * $WINDOWSTEP),$WINDOWSIZE);
+	my %counts = ();
+	foreach my $i (@WINDOWSIZEARRAY) {
+	    $counts{substr($str,$i,3)}++;
+	}
+	$dustscore = 0;
+	foreach(values %counts) {
+	    $dustscore += ($_ * ($_ - 1) * $POINTFIVE);
+	}
+	push(@vals,($dustscore * $bynum));
+    }
+    #last step
+    if($rest > 5) {
+	$str = substr($seq,($steps * $WINDOWSTEP),$rest);
+	my %counts = ();
+	$num = $rest-2;
+	foreach my $i (0..($num - 1)) {
+	    $counts{substr($str,$i,3)}++;
+	}
+	$dustscore = 0;
+	foreach(values %counts) {
+	    $dustscore += ($_ * ($_ - 1) * $POINTFIVE);
+	}
+	push(@vals,(($dustscore / ($num-1)) * (($WINDOWSIZE - 2) / $num)));
+    } else {
+	push(@vals,31); #to assign a maximum score based on the scaling factor 100/31
+    }
+    $mean = &getArrayMean(@vals);
+    my $score = int($mean * 100 / 31);
+    return $score;
+
+    sub getArrayMean {
+	return @_ ? sum(@_) / @_ : 0;
+    }
+    sub sum {
+	my $sum = 0;
+	for (@_) {
+	    $sum += $_;
+	}
+	return $sum;
+    }
+}
+
+sub entropy_score {
+    my ($seq) = @_;
+    unless ($seq) {
+	return "NA";
+    }
+
+    my $length = length($seq);
+    
+    my $WINDOWSIZE = 64;
+    my $WINDOWSTEP = 32;
+    my $WORDSIZE = 3;
+    my @WINDOWSIZEARRAY = (0..61);
+    my $LOG62 = log(62);
+    my $ONEOVERLOG62 = 1/log(62);
+
+    my ($rest,$steps,@vals,$str,$num,$bynum);
+    if($length <= $WINDOWSIZE) {
+	$rest = $length;
+	$steps = 0;
+    } else {
+	$steps = int(($length - $WINDOWSIZE) / $WINDOWSTEP) + 1;
+	$rest = $length - $steps * $WINDOWSTEP;
+	unless($rest > $WINDOWSTEP) {
+	    $rest += $WINDOWSTEP;
+	    $steps--;
+	}
+    }
+    $num = $WINDOWSIZE-2;
+    $bynum = 1/$num;
+    $num--;
+    my $mean = 0;
+    
+    my $entropyval;
+    foreach my $i (0..$steps-1) {
+	$str = substr($seq,($i * $WINDOWSTEP),$WINDOWSIZE);
+	my %counts = ();
+	foreach my $i (@WINDOWSIZEARRAY) {
+	    $counts{substr($str,$i,3)}++;
+	}
+	$entropyval = 0;
+	foreach(values %counts) {
+	    $entropyval -= ($_ * $bynum) * log($_ * $bynum);
+	}
+	push(@vals,($entropyval * $ONEOVERLOG62));
+    }
+    #last step
+    if($rest > 5) {
+	$str = substr($seq,($steps * $WINDOWSTEP),$rest);
+	my %counts = ();
+	$num = $rest-2;
+	foreach my $i (0..($num - 1)) {
+	    $counts{substr($str,$i,3)}++;
+	}
+	$entropyval = 0;
+	$bynum = 1/$num;
+	foreach(values %counts) {
+	    $entropyval -= ($_ * $bynum) * log($_ * $bynum);
+	}
+	push(@vals,($entropyval / log($num)));
+    } else {
+	push(@vals,0);
+    }
+    $mean = &getArrayMean(@vals);
+    return int($mean * 100);
+}
--- a/src/sam-flip.pl
+++ b/src/sam-flip.pl
@@ -5,20 +5,69 @@ use strict;
 use FindBin;                     # locate this script
 use lib "$FindBin::RealBin/lib";  # use the lib directory
 use biointsam;
+use biointbasics;
+
+my $programname = "sam-flip.pl";
+my $version = "1.1";
+my $cmd = join(" ", $programname, @ARGV);
+
+#===DESCRIPTION=================================================================
+
+my $description = 
+    "Description:\n\t" .
+    "A tool to filter SAM files based on user specifications.\n" .
+    "\tThe tool either opens the file specified as input or reads from STDIN when no file is given.\n";
+my $usage = 
+    "Usage:\n\t$0 [OPTIONS]  [SAM file]\n";
+my $options = 
+    "Options:\n" .
+    "\t-h | --help\n\t\tPrint the help message; ignore other arguments.\n" .
+    "\t-minlen=<int> | --minlen=<int>\n\t\tKeep only query sequences that at least <int> long.\n" .
+    "\t-minaln=<int> | --minaln=<int>\n\t\tKeep only query sequences that have at least <int> long alignment with the reference.\n" .
+    "\t-minsim=<float> | --minsim=<float>\n\t\tKeep only query sequences that have at least <float> similarity score.\n" .
+    "\t-qcov=<float> | --query-coverage=<float>\n\t\tKeep only query sequences that have at least <float> part covered by the hit.\n" .
+    "\t-rcov=<float> | --reference-coverage=<float>\n\t\tKeep only query sequences that cover at least <float> part of the reference sequence.\n" .
+    "\t-mincomplexity=<float> | --mincomplexity=<float>\n\t\tKeep only hits where the aligned sequence has at least <float> 7-mer complexity (observed k-mers/maximal possible k-mers).\n" .
+    "\t-dust | --dust=<int>\n\t\tKeep only hits where the aligned sequence has at most <int> (7 if not specified) dust score. (Algorithm adapted from PRINSEQ-lite).\n" .
+    "\t-entropy | --entropy=<int>\n\t\tKeep only hits where the aligned sequence has at least <int> (70 if not specified) entropy score. (Algorithm adapted from PRINSEQ-lite).\n" .
+    "\t-v\n\t\tInvert selection: report sequence that fail the requirement.\n" .
+    "\n";
+my $info = {
+    description => $description,
+    usage => $usage,
+    options => $options,
+};
+
+#===MAIN========================================================================
+
+# Print help if needed
+biointbasics::print_help(\@ARGV, $info);

 my %ref;
 my %query;

 open(my $temp, '>', 'temp~') || die $!;

+my $header = "true";
+my $previousprogram = "";

 while(<>) {
    my %hit;
    biointsam::parse_sam($_, \%ref, \%hit); 
    unless (%hit) {
 	print {$temp} "$_\n" unless /^\@SQ/;
+	if (/^\@PG\tID:(\S+)/) {
+	    $previousprogram = $1;
+	}
 	next;
    }
+    if ($header) { # First line after the header section
+	my $text = "\@PG\tID:$programname\tPN:$programname";
+	$text .= "\tPP:$previousprogram" if $previousprogram;
+	print {$temp} $text . "\tVN:$version\tCL:$cmd\n";
+	$header = undef;
+    }
+
    next if $hit{"FLAG"} & 4 || $hit{'RNAME'} eq "*";
    # Get alginment data
    my $aln = biointsam::parse_cigar($hit{'CIGAR'}, $hit{'FLAG'}, $hit{'SEQ'});

--- a/src/sam-keep-best.pl
+++ b/src/sam-keep-best.pl
@@ -9,9 +9,12 @@ use biointsam;
 use File::Basename;
 use Data::Dumper;

+my $programname = "sam-keep-best.pl";
+my $version = "1.1";
+my $cmd = join(" ", $programname, @ARGV);
+
 #===DESCRIPTION=================================================================

-my $version = '1.0';
 my $description = 
    "Description:\n\t" .
    "A tool to filter SAM files and keep the best match/mapping for each query entry and\n" .
@@ -64,7 +67,6 @@ my $term = 'AS:i';
 my $maxoverlap = 0;
 my $percentoverlap = 0;

-my $cmd = "$0 @ARGV";
 my @keep;
 for (@ARGV) {
    if (/^--?bs$/ || /^--?bitscore$/) {
@@ -142,24 +144,26 @@ my $q2r = {};
 # Store information for local best hits
 my $location = {};

-my $pp;
-my $pg;
+my $header = "true";
+my $previousprogram = "";
 while(<>) {
    my %hit;
    biointsam::parse_sam($_, \%ref, \%hit); 
    unless (%hit) {
 	if (/^\@PG\tID:(\S+)/) {
-	    $pp = $1;
-	}
-	if ($pp && ! $pg && $_ !~ /^\@PG/) {
-	    my $tool = $0;
-	    ($tool) = fileparse($tool);
-	    print join("\t", '@PG', "ID:$tool", "PN:$tool", "PP:$pp", "VN:$version", "CL:$cmd"), "\n";
-	    $pg++;
+	    $previousprogram = $1;
 	}
 	print "$_\n";
 	next;
    }
+
+    if ($header) { # First line after the header section
+	my $text = "\@PG\tID:$programname\tPN:$programname";
+	$text .= "\tPP:$previousprogram" if $previousprogram;
+	print $text . "\tVN:$version\tCL:$cmd\n";
+	$header = undef;
+    }
+
    next if $hit{"FLAG"} & 4 || $hit{'RNAME'} eq "*";
    # Get alginment data
    my $aln = biointsam::parse_cigar($hit{'CIGAR'}, $hit{'FLAG'}, $hit{'SEQ'});

--- a/src/sam-update-cigar.pl
+++ b/src/sam-update-cigar.pl
@@ -10,6 +10,10 @@ use biointbasics;
 use Bio::SeqIO;
 use Bio::Seq;

+my $programname = "sam-update-cigar.pl";
+my $version = "1.1";
+my $cmd = join(" ", $programname, @ARGV);
+
 #===DESCRIPTION=================================================================

 my $description = 
@@ -42,14 +46,9 @@ biointbasics::print_help(\@ARGV, $info);
 my %ref;
 my %query;
 my $hits;
-my $samfh;
-if ($sam eq '-') {
-    $samfh = *STDIN;
-} else {
-    open($samfh, '<', $sam) || die $!;
-}
-
-while(<$samfh>) {
+my $header = "true";
+my $previousprogram = "";
+while(<>) {
    #print "$_";
    my %hit;
    biointsam::parse_sam($_, \%ref, \%hit);
@@ -57,8 +56,18 @@ while(<$samfh>) {
    unless (%hit) {
 	# Print header to maintain a valid SAM output
 	print "$_\n";
+	if (/^\@PG\tID:(\S+)/) {
+	    $previousprogram = $1;
+	}
 	next;
    }
+    if ($header) { # First line after the header section
+	my $text = "\@PG\tID:$programname\tPN:$programname";
+	$text .= "\tPP:$previousprogram" if $previousprogram;
+	print $text . "\tVN:$version\tCL:$cmd\n";
+	$header = undef;
+    }
+
    # Nothing to do if there is no hit for the query sequence, so skip it
    unless($hit{"FLAG"} & 4 || $hit{'RNAME'} eq "*") {

@@ -95,4 +104,4 @@ while(<$samfh>) {
    print biointsam::sam_string(\%hit), "\n";
 }

-print "No hits to plot in the SAM file\n" unless $hits;
+
--- a/src/sam-update-iupac.pl
+++ b/src/sam-update-iupac.pl
@@ -10,6 +10,10 @@ use biointbasics;
 use Bio::SeqIO;
 use Bio::Seq;

+my $programname = "sam-update-iupac.pl";
+my $version = "1.1";
+my $cmd = join(" ", $programname, @ARGV);
+
 #===DESCRIPTION=================================================================

 my $description = 
@@ -35,11 +39,11 @@ my $info = {
 # Print help if needed
 biointbasics::print_help(\@ARGV, $info);

-my $header;
+my $updateheader;
 my @keep;
 for (@ARGV) {
    if (/^--?head(er)?$/) {
-	$header++;
+	$updateheader++;
    } else {
 	push @keep, $_;
    }
@@ -64,7 +68,7 @@ if ($ref) {
 #print '%', ("123456789^" x 10), "\n";
 my %ref;
 # Update header if asked
-if ($header) {
+if ($updateheader) {
    my $rio = Bio::SeqIO->new(-file=>$ref,-format=>'fasta');
    while ( my $rseq = $rio->next_seq() ) {
 	my $id = $rseq->id();
@@ -86,17 +90,28 @@ if ($sam eq '-') {
 my $qname = "";
 my $mem = "";
 my $memflag = 0;
+my $header = "true";
+my $previousprogram = "";
 while(<$samfh>) {
    #print "$_";
    my %hit;
-    next if /^\@SQ/ && $header;
+    next if /^\@SQ/ && $updateheader;
    biointsam::parse_sam($_, \%ref, \%hit);
    # Header lines contain no hits
    unless (%hit) {
 	# Print header to maintain a valid SAM output
 	print "$_\n";
+	if (/^\@PG\tID:(\S+)/) {
+	    $previousprogram = $1;
+	}
 	next;
    }
+    if ($header) { # First line after the header section
+	my $text = "\@PG\tID:$programname\tPN:$programname";
+	$text .= "\tPP:$previousprogram" if $previousprogram;
+	print $text . "\tVN:$version\tCL:$cmd\n";
+	$header = undef;
+    }
    # Nothing to do if there is no hit for the query sequence, so skip it
    next if $hit{"FLAG"} & 4 || $hit{'RNAME'} eq "*";

@@ -115,7 +130,7 @@ while(<$samfh>) {
 	    if ($hit{'CIGAR'} =~ /(\d+)H$/) {
 		$seq = reverse substr(reverse($seq), $1);
 	    }
-	    $hit{'SEQ'} = $mem;
+	    $hit{'SEQ'} = $seq;
 	}
    } else {
 	unless ($hit{'CIGAR'} =~ /H/) {
@@ -133,7 +148,6 @@ while(<$samfh>) {
    $hits++;
 }

-print "No hits to plot in the SAM file\n" unless $hits;

 sub updatehit{
    # Update the alignment info for ambiguity sites

--- a/src/sam-update-seq.pl
+++ b/src/sam-update-seq.pl
@@ -10,7 +10,9 @@ use biointbasics;
 use Bio::SeqIO;
 use Bio::Seq;

-use Data::Dump qw(dump);
+my $programname = "sam-update-seq.pl";
+my $version = "1.1";
+my $cmd = join(" ", $programname, @ARGV);

 #===DESCRIPTION=================================================================

@@ -118,6 +120,8 @@ my $qname = "";
 my $mem = "";
 my $memflag = 0;
 my $qseq;
+my $header = "true";
+my $previousprogram = "";
 while(<$samfh>) {
    #print "$_";
    my %hit;
@@ -127,8 +131,17 @@ while(<$samfh>) {
    unless (%hit) {
 	# Print header to maintain a valid SAM output
 	print "$_\n";
+	if (/^\@PG\tID:(\S+)/) {
+	    $previousprogram = $1;
+	}
 	next;
    }
+    if ($header) { # First line after the header section
+	my $text = "\@PG\tID:$programname\tPN:$programname";
+	$text .= "\tPP:$previousprogram" if $previousprogram;
+	print $text . "\tVN:$version\tCL:$cmd\n";
+	$header = undef;
+    }
    unless ($qname eq $hit{'QNAME'}) {
 	# Look up seq in FASTQ
 	while ($qseq = $qio->next_seq()) {
No results found