Skip to content
Snippets Groups Projects
Commit 5514e136 authored by Aflitos, Saulo Alves's avatar Aflitos, Saulo Alves
Browse files

vcf stats

parent da7a0666
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/python
import os
import sys
import gzip
import simplejson as json
from pprint import pprint as pp
def openfile(infile):
if infile.endswith( '.gz' ):
return gzip.open(infie, mode='rb')
else:
return open(infile, 'r')
class stats( object ):
def __init__(self):
self.min = sys.maxint
self.max = (sys.maxint-1) * -1
self.count = 0
self.sum = 0
self.Asum = 0
self.Raverage = 0
self.variance = 0
def add( self, v ):
if self.min > v:
self.min = v
if self.max < v:
self.max = v
self.count += 1
self.sum += v
self.Asum += abs(v)
if self.count == 1:
self.Raverage = v
else:
self.Raverage = (( self.Raverage * self.count) + v ) / (self.count + 1)
self.variance = (v - self.Raverage) ** 2
#dev = ( self.variance'] / self.count']) ** 0.5
def get( self ):
return {
'average' : self.sum / self.count,
'Aaverage': self.Asum / self.count,
'stddev' : (self.variance / self.count) ** 0.5,
'min' : self.min,
'max' : self.max,
'count' : self.count,
'sum' : self.sum,
'Asum' : self.Asum,
'Raverage': self.Raverage,
'variance': self.variance
}
def main():
infile = sys.argv[1]
#$v=0;
#$c=0;
#$min=10000000;
#$max=0;
#$av=0;
#$var=0;
#
#$a=$v/$c;
#$dev=sqrt($var/$c);
#$mx=$a+(2*$dev);
#
#print "$c\t$v\t$a\t$min\t$max\t$av\t$var\t$dev\t$mx\n"; }
#
#$c+=1;
#$v+=$_;
#if ($_ < $min) { $min = $_; };
#if ($_ > $max) { $max = $_; };
#if ($c==1) {
# $av = $_; $var=0; }
#
#else {
# $av=(($av*$c)+$_)/($c+1);
# $var+=($_-$av)**2;
# $dev=sqrt($var/$c);
# print "$av\t$var\t$dev\n";}' | \
values = {
'count' : { },
'info' : { },
'format': { },
'qual' : { 'all': stats() },
'dist' : { 'all': stats() },
}
codes = { 'info': {}, 'format': {} }
count = 0
with openfile(infile) as fhd:
for line in fhd:
line = line.strip()
if len(line) == 0:
continue
if line[0] == "#":
print line
if '##FORMAT' in line:
lp = line[13:]
code = lp[:2]
nfo = lp[lp.find('Description="')+13:-2]
print 'formal', code, nfo
codes['format'][code] = nfo
elif '##INFO' in line:
lp = line[11:]
code = lp[:lp.index(',')]
nfo = lp[lp.find('Description="')+13:-2]
print 'info', code, nfo
codes['info'][code] = nfo
continue
#sys.exit(0)
count += 1
if count % 1000 != 0:
continue
pass
#print line
cols = line.split()
chrom = cols[0]
pos = int(cols[1])
qual = float(cols[5])
info = cols[7].split(";")
fmtLbl = cols[8].split(":")
fmtVal = cols[9].split(":")
if chrom not in values[ 'dist' ]:
values[ 'dist' ][ chrom ] = stats()
values[ 'qual' ][ chrom ] = stats()
values[ 'count' ][ chrom ] = 0
values[ 'count' ][ chrom ] += 1
if values[ 'count' ][ chrom ] == 1:
print 'first', chrom
values[ 'dist' ][ chrom ].prev = pos
#print values[ 'dist' ][ chrom ].prev
else:
#print 'consecutive', values[ 'dist' ][ chrom ].prev, pos,
dist = pos - values[ 'dist' ][ chrom ].prev
values[ 'dist' ][ 'all' ].add( dist )
values[ 'dist' ][ chrom ].add( dist )
values[ 'dist' ][ chrom ].prev = pos
#print values[ 'dist' ][ chrom ].prev
values['qual']['all'].add( qual )
values['qual'][chrom].add( qual )
for i in info:
if "=" in i:
k, v = i.split("=")
#print "K %s V %s" % (k ,v)
if k in codes['info']:
k = codes['info'][k]
try:
v = float(v)
except:
pass
if type(v) is float:
#print "float", v
if k not in values['info']:
values['info'][k] = { 'all': stats() }
if chrom not in values['info'][k]:
values['info'][k][chrom] = stats()
values['info'][k][chrom].add( v )
values['info'][k]['all'].add( v )
for fp in range(len(fmtLbl)):
f = fmtLbl[fp]
v = fmtVal[fp]
if f in codes['format']:
f = codes['format'][f]
#print f
if '/' in v:
continue
if ',' in v:
pass
else:
v = float(v)
#print fp, f, v
if f not in values['format']:
values['format'][f] = { 'all': stats() }
if chrom not in values['format'][f]:
values['format'][f][chrom] = stats()
values['format'][f][chrom].add( v )
values['format'][f]['all'].add( v )
if count == 1000:
#break
pass
for k in values:
if k in ['count']:
pass
elif k in ['dist', 'qual']:
for chrom in values[k]:
#print "chrom", chrom
values[k][chrom] = values[k][chrom].get()
#elif k in []:
#values[k] = values[k].get()
else:
for chrom in values[k]:
for kk in values[k][chrom]:
values[k][chrom][kk] = values[k][chrom][kk].get()
pp( values )
json.dump(values, open(os.path.basename(infile)+'.json', 'w'), sort_keys=True, indent=' ')
if __name__ == '__main__':
main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment