Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Aflitos, Saulo Alves
mapper
Commits
5514e136
Commit
5514e136
authored
Jul 01, 2014
by
Aflitos, Saulo Alves
Browse files
vcf stats
parent
da7a0666
Changes
1
Hide whitespace changes
Inline
Side-by-side
vcf_stats.py
0 → 100755
View file @
5514e136
#!/usr/bin/python
import
os
import
sys
import
gzip
import
simplejson
as
json
from
pprint
import
pprint
as
pp
def
openfile
(
infile
):
if
infile
.
endswith
(
'.gz'
):
return
gzip
.
open
(
infie
,
mode
=
'rb'
)
else
:
return
open
(
infile
,
'r'
)
class
stats
(
object
):
def
__init__
(
self
):
self
.
min
=
sys
.
maxint
self
.
max
=
(
sys
.
maxint
-
1
)
*
-
1
self
.
count
=
0
self
.
sum
=
0
self
.
Asum
=
0
self
.
Raverage
=
0
self
.
variance
=
0
def
add
(
self
,
v
):
if
self
.
min
>
v
:
self
.
min
=
v
if
self
.
max
<
v
:
self
.
max
=
v
self
.
count
+=
1
self
.
sum
+=
v
self
.
Asum
+=
abs
(
v
)
if
self
.
count
==
1
:
self
.
Raverage
=
v
else
:
self
.
Raverage
=
((
self
.
Raverage
*
self
.
count
)
+
v
)
/
(
self
.
count
+
1
)
self
.
variance
=
(
v
-
self
.
Raverage
)
**
2
#dev = ( self.variance'] / self.count']) ** 0.5
def
get
(
self
):
return
{
'average'
:
self
.
sum
/
self
.
count
,
'Aaverage'
:
self
.
Asum
/
self
.
count
,
'stddev'
:
(
self
.
variance
/
self
.
count
)
**
0.5
,
'min'
:
self
.
min
,
'max'
:
self
.
max
,
'count'
:
self
.
count
,
'sum'
:
self
.
sum
,
'Asum'
:
self
.
Asum
,
'Raverage'
:
self
.
Raverage
,
'variance'
:
self
.
variance
}
def
main
():
infile
=
sys
.
argv
[
1
]
#$v=0;
#$c=0;
#$min=10000000;
#$max=0;
#$av=0;
#$var=0;
#
#$a=$v/$c;
#$dev=sqrt($var/$c);
#$mx=$a+(2*$dev);
#
#print "$c\t$v\t$a\t$min\t$max\t$av\t$var\t$dev\t$mx\n"; }
#
#$c+=1;
#$v+=$_;
#if ($_ < $min) { $min = $_; };
#if ($_ > $max) { $max = $_; };
#if ($c==1) {
# $av = $_; $var=0; }
#
#else {
# $av=(($av*$c)+$_)/($c+1);
# $var+=($_-$av)**2;
# $dev=sqrt($var/$c);
# print "$av\t$var\t$dev\n";}' | \
values
=
{
'count'
:
{
},
'info'
:
{
},
'format'
:
{
},
'qual'
:
{
'all'
:
stats
()
},
'dist'
:
{
'all'
:
stats
()
},
}
codes
=
{
'info'
:
{},
'format'
:
{}
}
count
=
0
with
openfile
(
infile
)
as
fhd
:
for
line
in
fhd
:
line
=
line
.
strip
()
if
len
(
line
)
==
0
:
continue
if
line
[
0
]
==
"#"
:
print
line
if
'##FORMAT'
in
line
:
lp
=
line
[
13
:]
code
=
lp
[:
2
]
nfo
=
lp
[
lp
.
find
(
'Description="'
)
+
13
:
-
2
]
print
'formal'
,
code
,
nfo
codes
[
'format'
][
code
]
=
nfo
elif
'##INFO'
in
line
:
lp
=
line
[
11
:]
code
=
lp
[:
lp
.
index
(
','
)]
nfo
=
lp
[
lp
.
find
(
'Description="'
)
+
13
:
-
2
]
print
'info'
,
code
,
nfo
codes
[
'info'
][
code
]
=
nfo
continue
#sys.exit(0)
count
+=
1
if
count
%
1000
!=
0
:
continue
pass
#print line
cols
=
line
.
split
()
chrom
=
cols
[
0
]
pos
=
int
(
cols
[
1
])
qual
=
float
(
cols
[
5
])
info
=
cols
[
7
].
split
(
";"
)
fmtLbl
=
cols
[
8
].
split
(
":"
)
fmtVal
=
cols
[
9
].
split
(
":"
)
if
chrom
not
in
values
[
'dist'
]:
values
[
'dist'
][
chrom
]
=
stats
()
values
[
'qual'
][
chrom
]
=
stats
()
values
[
'count'
][
chrom
]
=
0
values
[
'count'
][
chrom
]
+=
1
if
values
[
'count'
][
chrom
]
==
1
:
print
'first'
,
chrom
values
[
'dist'
][
chrom
].
prev
=
pos
#print values[ 'dist' ][ chrom ].prev
else
:
#print 'consecutive', values[ 'dist' ][ chrom ].prev, pos,
dist
=
pos
-
values
[
'dist'
][
chrom
].
prev
values
[
'dist'
][
'all'
].
add
(
dist
)
values
[
'dist'
][
chrom
].
add
(
dist
)
values
[
'dist'
][
chrom
].
prev
=
pos
#print values[ 'dist' ][ chrom ].prev
values
[
'qual'
][
'all'
].
add
(
qual
)
values
[
'qual'
][
chrom
].
add
(
qual
)
for
i
in
info
:
if
"="
in
i
:
k
,
v
=
i
.
split
(
"="
)
#print "K %s V %s" % (k ,v)
if
k
in
codes
[
'info'
]:
k
=
codes
[
'info'
][
k
]
try
:
v
=
float
(
v
)
except
:
pass
if
type
(
v
)
is
float
:
#print "float", v
if
k
not
in
values
[
'info'
]:
values
[
'info'
][
k
]
=
{
'all'
:
stats
()
}
if
chrom
not
in
values
[
'info'
][
k
]:
values
[
'info'
][
k
][
chrom
]
=
stats
()
values
[
'info'
][
k
][
chrom
].
add
(
v
)
values
[
'info'
][
k
][
'all'
].
add
(
v
)
for
fp
in
range
(
len
(
fmtLbl
)):
f
=
fmtLbl
[
fp
]
v
=
fmtVal
[
fp
]
if
f
in
codes
[
'format'
]:
f
=
codes
[
'format'
][
f
]
#print f
if
'/'
in
v
:
continue
if
','
in
v
:
pass
else
:
v
=
float
(
v
)
#print fp, f, v
if
f
not
in
values
[
'format'
]:
values
[
'format'
][
f
]
=
{
'all'
:
stats
()
}
if
chrom
not
in
values
[
'format'
][
f
]:
values
[
'format'
][
f
][
chrom
]
=
stats
()
values
[
'format'
][
f
][
chrom
].
add
(
v
)
values
[
'format'
][
f
][
'all'
].
add
(
v
)
if
count
==
1000
:
#break
pass
for
k
in
values
:
if
k
in
[
'count'
]:
pass
elif
k
in
[
'dist'
,
'qual'
]:
for
chrom
in
values
[
k
]:
#print "chrom", chrom
values
[
k
][
chrom
]
=
values
[
k
][
chrom
].
get
()
#elif k in []:
#values[k] = values[k].get()
else
:
for
chrom
in
values
[
k
]:
for
kk
in
values
[
k
][
chrom
]:
values
[
k
][
chrom
][
kk
]
=
values
[
k
][
chrom
][
kk
].
get
()
pp
(
values
)
json
.
dump
(
values
,
open
(
os
.
path
.
basename
(
infile
)
+
'.json'
,
'w'
),
sort_keys
=
True
,
indent
=
' '
)
if
__name__
==
'__main__'
:
main
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment