Commit 4dde25c4 authored by Aflitos, Saulo Alves's avatar Aflitos, Saulo Alves
Browse files

fixed but stddev

parent 5514e136
......@@ -39,14 +39,14 @@ class stats( object ):
self.Raverage = v
else:
self.Raverage = (( self.Raverage * self.count) + v ) / (self.count + 1)
self.variance = (v - self.Raverage) ** 2
self.Raverage = (( self.Raverage * self.count) + v ) / (self.count + 1)
self.variance += (v - self.Raverage) ** 2
#dev = ( self.variance'] / self.count']) ** 0.5
def get( self ):
return {
'average' : self.sum / self.count,
'Aaverage': self.Asum / self.count,
'average' : self.sum / self.count,
'Aaverage': self.Asum / self.count,
'stddev' : (self.variance / self.count) ** 0.5,
'min' : self.min,
'max' : self.max,
......@@ -58,196 +58,195 @@ class stats( object ):
}
def main():
infile = sys.argv[1]
#$v=0;
#$c=0;
#$min=10000000;
#$max=0;
#$av=0;
#$var=0;
#
#$a=$v/$c;
#$dev=sqrt($var/$c);
#$mx=$a+(2*$dev);
#
#print "$c\t$v\t$a\t$min\t$max\t$av\t$var\t$dev\t$mx\n"; }
#
#$c+=1;
#$v+=$_;
#if ($_ < $min) { $min = $_; };
#if ($_ > $max) { $max = $_; };
#if ($c==1) {
# $av = $_; $var=0; }
#
#else {
# $av=(($av*$c)+$_)/($c+1);
# $var+=($_-$av)**2;
# $dev=sqrt($var/$c);
# print "$av\t$var\t$dev\n";}' | \
values = {
'count' : { },
'info' : { },
'format': { },
'qual' : { 'all': stats() },
'dist' : { 'all': stats() },
}
codes = { 'info': {}, 'format': {} }
count = 0
with openfile(infile) as fhd:
for line in fhd:
line = line.strip()
if len(line) == 0:
continue
if line[0] == "#":
print line
if '##FORMAT' in line:
lp = line[13:]
code = lp[:2]
nfo = lp[lp.find('Description="')+13:-2]
print 'formal', code, nfo
codes['format'][code] = nfo
elif '##INFO' in line:
lp = line[11:]
code = lp[:lp.index(',')]
nfo = lp[lp.find('Description="')+13:-2]
print 'info', code, nfo
codes['info'][code] = nfo
continue
#sys.exit(0)
count += 1
if count % 1000 != 0:
continue
pass
def main(infiles):
for infile in infiles:
#$v=0;
#$c=0;
#$min=10000000;
#$max=0;
#$av=0;
#$var=0;
#
#$a=$v/$c;
#$dev=sqrt($var/$c);
#$mx=$a+(2*$dev);
#
#print "$c\t$v\t$a\t$min\t$max\t$av\t$var\t$dev\t$mx\n"; }
#
#$c+=1;
#$v+=$_;
#if ($_ < $min) { $min = $_; };
#if ($_ > $max) { $max = $_; };
#if ($c==1) {
# $av = $_; $var=0; }
#
#else {
# $av=(($av*$c)+$_)/($c+1);
# $var+=($_-$av)**2;
# $dev=sqrt($var/$c);
# print "$av\t$var\t$dev\n";}' | \
values = {
'count' : { },
'info' : { },
'format': { },
'qual' : { 'all': stats() },
'dist' : { 'all': stats() },
}
#print line
codes = { 'info': {}, 'format': {} }
count = 0
with openfile(infile) as fhd:
for line in fhd:
line = line.strip()
cols = line.split()
chrom = cols[0]
pos = int(cols[1])
qual = float(cols[5])
info = cols[7].split(";")
fmtLbl = cols[8].split(":")
fmtVal = cols[9].split(":")
if len(line) == 0:
continue
if line[0] == "#":
#print line
if '##FORMAT' in line:
lp = line[13:]
code = lp[:2]
nfo = lp[lp.find('Description="')+13:-2]
print 'format\t', code, "\t", nfo
codes['format'][code] = nfo
elif '##INFO' in line:
lp = line[11:]
code = lp[:lp.index(',')]
nfo = lp[lp.find('Description="')+13:-2]
print 'info\t', code, "\t", nfo
codes['info'][code] = nfo
continue
if chrom not in values[ 'dist' ]:
values[ 'dist' ][ chrom ] = stats()
values[ 'qual' ][ chrom ] = stats()
values[ 'count' ][ chrom ] = 0
#sys.exit(0)
count += 1
if count % 1000 != 0:
#continue
pass
values[ 'count' ][ chrom ] += 1
#print line
if values[ 'count' ][ chrom ] == 1:
print 'first', chrom
values[ 'dist' ][ chrom ].prev = pos
#print values[ 'dist' ][ chrom ].prev
cols = line.split()
chrom = cols[0]
pos = int(cols[1])
qual = float(cols[5])
info = cols[7].split(";")
fmtLbl = cols[8].split(":")
fmtVal = cols[9].split(":")
else:
#print 'consecutive', values[ 'dist' ][ chrom ].prev, pos,
dist = pos - values[ 'dist' ][ chrom ].prev
values[ 'dist' ][ 'all' ].add( dist )
values[ 'dist' ][ chrom ].add( dist )
values[ 'dist' ][ chrom ].prev = pos
#print values[ 'dist' ][ chrom ].prev
if chrom not in values[ 'dist' ]:
values[ 'dist' ][ chrom ] = stats()
values[ 'qual' ][ chrom ] = stats()
values[ 'count' ][ chrom ] = 0
values['qual']['all'].add( qual )
values['qual'][chrom].add( qual )
values[ 'count' ][ chrom ] += 1
if values[ 'count' ][ chrom ] == 1:
print 'first', chrom
values[ 'dist' ][ chrom ].prev = pos
#print values[ 'dist' ][ chrom ].prev
else:
#print 'consecutive', values[ 'dist' ][ chrom ].prev, pos,
dist = pos - values[ 'dist' ][ chrom ].prev
values[ 'dist' ][ 'all' ].add( dist )
values[ 'dist' ][ chrom ].add( dist )
values[ 'dist' ][ chrom ].prev = pos
#print values[ 'dist' ][ chrom ].prev
for i in info:
if "=" in i:
k, v = i.split("=")
#print "K %s V %s" % (k ,v)
if k in codes['info']:
k = codes['info'][k]
values['qual']['all'].add( qual )
values['qual'][chrom].add( qual )
try:
v = float(v)
except:
pass
if type(v) is float:
#print "float", v
if k not in values['info']:
values['info'][k] = { 'all': stats() }
for i in info:
if "=" in i:
k, v = i.split("=")
#print "K %s V %s" % (k ,v)
if chrom not in values['info'][k]:
values['info'][k][chrom] = stats()
if k in codes['info']:
k = codes['info'][k]
values['info'][k][chrom].add( v )
values['info'][k]['all'].add( v )
try:
v = float(v)
except:
pass
if type(v) is float:
#print "float", v
if k not in values['info']:
values['info'][k] = { 'all': stats() }
for fp in range(len(fmtLbl)):
f = fmtLbl[fp]
v = fmtVal[fp]
if chrom not in values['info'][k]:
values['info'][k][chrom] = stats()
if f in codes['format']:
f = codes['format'][f]
values['info'][k][chrom].add( v )
values['info'][k]['all'].add( v )
#print f
if '/' in v:
continue
if ',' in v:
pass
for fp in range(len(fmtLbl)):
f = fmtLbl[fp]
v = fmtVal[fp]
else:
v = float(v)
#print fp, f, v
if f not in values['format']:
values['format'][f] = { 'all': stats() }
if f in codes['format']:
f = codes['format'][f]
if chrom not in values['format'][f]:
values['format'][f][chrom] = stats()
#print f
values['format'][f][chrom].add( v )
values['format'][f]['all'].add( v )
if '/' in v:
continue
if ',' in v:
pass
else:
v = float(v)
#print fp, f, v
if f not in values['format']:
values['format'][f] = { 'all': stats() }
if count == 1000:
#break
pass
if chrom not in values['format'][f]:
values['format'][f][chrom] = stats()
values['format'][f][chrom].add( v )
values['format'][f]['all'].add( v )
if count == 1000:
#break
pass
for k in values:
if k in ['count']:
pass
elif k in ['dist', 'qual']:
for chrom in values[k]:
#print "chrom", chrom
values[k][chrom] = values[k][chrom].get()
#elif k in []:
#values[k] = values[k].get()
else:
for chrom in values[k]:
for kk in values[k][chrom]:
values[k][chrom][kk] = values[k][chrom][kk].get()
pp( values )
json.dump(values, open(os.path.basename(infile)+'.json', 'w'), sort_keys=True, indent=' ')
for k in values:
if k in ['count']:
pass
elif k in ['dist', 'qual']:
for chrom in values[k]:
#print "chrom", chrom
values[k][chrom] = values[k][chrom].get()
#elif k in []:
#values[k] = values[k].get()
else:
for chrom in values[k]:
for kk in values[k][chrom]:
values[k][chrom][kk] = values[k][chrom][kk].get()
pp( values )
json.dump(values, open(os.path.basename(infile)+'.json', 'w'), sort_keys=True, indent=' ')
if __name__ == '__main__':
main()
main(sys.argv[1:])
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment