Commit 53def274 authored by Aflitos, Saulo Alves's avatar Aflitos, Saulo Alves
Browse files

more columns

parent 4dde25c4
......@@ -9,7 +9,7 @@ from pprint import pprint as pp
def openfile(infile):
if infile.endswith( '.gz' ):
return gzip.open(infie, mode='rb')
return gzip.open(infile, mode='rb')
else:
return open(infile, 'r')
......@@ -44,22 +44,42 @@ class stats( object ):
#dev = ( self.variance'] / self.count']) ** 0.5
def get( self ):
return {
'average' : self.sum / self.count,
'Aaverage': self.Asum / self.count,
'stddev' : (self.variance / self.count) ** 0.5,
'min' : self.min,
'max' : self.max,
'count' : self.count,
'sum' : self.sum,
'Asum' : self.Asum,
'Raverage': self.Raverage,
'variance': self.variance
}
if self.count == 0:
return {
'average' : 0,
'Aaverage': 0,
'stddev' : 0,
'min' : 0,
'max' : 0,
'count' : 0,
'sum' : 0,
'Asum' : 0,
'Raverage': 0,
'variance': 0
}
else:
avg = self.sum / self.count
Aavg = self.Asum / self.count
stddev = (self.variance / self.count) ** 0.5
return {
'average' : avg,
'Aaverage': Aavg,
'stddev' : stddev,
'min' : self.min,
'max' : self.max,
'count' : self.count,
'sum' : self.sum,
'Asum' : self.Asum,
'Raverage': self.Raverage,
'variance': self.variance
}
def main(infiles):
for infile in infiles:
print "\n\nANALYZING", infile
#$v=0;
#$c=0;
#$min=10000000;
......@@ -87,11 +107,12 @@ def main(infiles):
# print "$av\t$var\t$dev\n";}' | \
values = {
'count' : { },
'info' : { },
'format': { },
'qual' : { 'all': stats() },
'dist' : { 'all': stats() },
'count' : { },
'info' : { },
'format' : { },
'polytype': { },
'qual' : { 'all': stats() },
'dist' : { 'all': stats() },
}
codes = { 'info': {}, 'format': {} }
......@@ -105,58 +126,97 @@ def main(infiles):
if line[0] == "#":
#print line
if '##FORMAT' in line:
lp = line[13:]
code = lp[:2]
nfo = lp[lp.find('Description="')+13:-2]
print 'format\t', code, "\t", nfo
codes['format'][code] = nfo
elif '##INFO' in line:
lp = line[11:]
code = lp[:lp.index(',')]
nfo = lp[lp.find('Description="')+13:-2]
print 'info\t', code, "\t", nfo
codes['info'][code] = nfo
continue
#sys.exit(0)
count += 1
if count % 1000 != 0:
#continue
continue
pass
#print line
cols = line.split()
chrom = cols[0]
pos = int(cols[1])
chrom = cols[0]
pos = int( cols[1])
ref = cols[3]
alt = cols[4]
qual = float(cols[5])
info = cols[7].split(";")
fmtLbl = cols[8].split(":")
fmtVal = cols[9].split(":")
info = cols[7].split(";")
fmtLbl = cols[8].split(":")
fmtVal = cols[9].split(":")
if chrom not in values[ 'dist' ]:
values[ 'dist' ][ chrom ] = stats()
values[ 'qual' ][ chrom ] = stats()
values[ 'count' ][ chrom ] = 0
values[ 'dist' ][ chrom ] = stats()
values[ 'qual' ][ chrom ] = stats()
values[ 'count' ][ chrom ] = 0
values[ 'count' ][ chrom ] += 1
polytype = None
if len(ref) == 1:
if len(alt) == 1:
polytype = 'SNP'
else:
polytype = 'INS'
else:
if len(alt) == 1:
polytype = 'DEL'
else:
if len(ref) == len(alt):
polytype = 'MNP'
else:
polytype = 'REP'
if values[ 'count' ][ chrom ] == 1:
print 'first', chrom
values[ 'dist' ][ chrom ].prev = pos
values[ 'dist' ][ chrom ].prev = pos
#print values[ 'dist' ][ chrom ].prev
else:
#print 'consecutive', values[ 'dist' ][ chrom ].prev, pos,
dist = pos - values[ 'dist' ][ chrom ].prev
values[ 'dist' ][ 'all' ].add( dist )
values[ 'dist' ][ chrom ].add( dist )
values[ 'dist' ][ chrom ].prev = pos
values[ 'dist' ][ 'all' ].add( dist )
values[ 'dist' ][ chrom ].add( dist )
values[ 'dist' ][ chrom ].prev = pos
#print values[ 'dist' ][ chrom ].prev
if polytype not in values[ 'polytype' ]:
values[ 'polytype' ][ polytype ] = { 'all': stats() }
if chrom not in values[ 'polytype' ][ polytype ]:
values[ 'polytype' ][ polytype ][ chrom ] = stats()
values[ 'polytype' ][ polytype ][ chrom ].prev = pos
else:
dist = pos - values[ 'polytype' ][ polytype ][ chrom ].prev
values[ 'polytype' ][ polytype ][ 'all' ].add( dist )
values[ 'polytype' ][ polytype ][ chrom ].add( dist )
values[ 'polytype' ][ polytype ][ chrom ].prev = pos
values['qual']['all'].add( qual )
values['qual'][chrom].add( qual )
......@@ -186,6 +246,20 @@ def main(infiles):
values['info'][k][chrom].add( v )
values['info'][k]['all'].add( v )
elif i == "INDEL":
if i not in values['info']:
values['info'][i] = { 'all': stats() }
if chrom not in values['info'][i]:
values['info'][i][chrom] = stats()
values['info'][i][chrom].prev = pos
continue
dist = pos - values['info'][i][chrom].prev
values['info'][i][chrom].add( dist )
values['info'][i]['all'].add( dist )
values['info'][i][chrom].prev = pos
......@@ -205,7 +279,11 @@ def main(infiles):
pass
else:
v = float(v)
try:
v = float(v)
except:
continue
#print fp, f, v
if f not in values['format']:
values['format'][f] = { 'all': stats() }
......@@ -229,10 +307,11 @@ def main(infiles):
for k in values:
if k in ['count']:
#print "k",k
if k in [ 'count' ]:
pass
elif k in ['dist', 'qual']:
elif k in [ 'dist', 'qual' ]:
for chrom in values[k]:
#print "chrom", chrom
values[k][chrom] = values[k][chrom].get()
......@@ -242,11 +321,19 @@ def main(infiles):
else:
for chrom in values[k]:
#print " chrom", chrom
for kk in values[k][chrom]:
values[k][chrom][kk] = values[k][chrom][kk].get()
#print " kk", kk
v = values[k][chrom][kk].get()
#pp(v)
values[k][chrom][kk] = v
pp( values )
#pp( values )
json.dump(values, open(os.path.basename(infile)+'.json', 'w'), sort_keys=True, indent=' ')
if __name__ == '__main__':
main(sys.argv[1:])
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment