Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
E
eQTL_Analysis
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Luna de Haro, Antonio
eQTL_Analysis
Commits
cb2d9e7f
Commit
cb2d9e7f
authored
7 years ago
by
Luna de Haro, Antonio
Browse files
Options
Downloads
Patches
Plain Diff
Add new file
parent
8eb380ed
Branches
master
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
SNPs_in_motif.py
+331
-0
331 additions, 0 deletions
SNPs_in_motif.py
with
331 additions
and
0 deletions
SNPs_in_motif.py
0 → 100644
+
331
−
0
View file @
cb2d9e7f
#!usr/bin/env python
from
sys
import
argv
from
pandas
import
read_table
import
time
"""
Script to find all cis-eQTLs and non cis-eQTLs, and then place SNPs in the promoter region of that
cis-eQTL.
Also finds motifs in the promoter region and how many SNPs where found inside the motif
This script writes a tab delimited table with all values claculated for each gene
To execute it from the command line:
python SNPs_in_ciseQTL_finder.py genes.gtf markerfile.txt LODscoreFile.txt
\
snps, mast_output_varant1, mast_output_variant2 output.csv
"""
def
parse_snp
(
snpfile
):
snips
=
[]
for
line
in
snpfile
:
line
=
line
.
strip
()
if
not
line
.
startswith
(
'
#
'
):
record
=
line
.
split
(
"
\t
"
)
record
[
1
]
=
int
(
record
[
1
])
snips
+=
[
record
]
return
snips
def
parse_markers
(
markerfile
):
markers
=
[]
for
line
in
markerfile
:
line
=
line
.
strip
()
if
not
line
.
startswith
(
'
marker
'
):
record
=
line
.
split
(
"
\t
"
)
markers
+=
[
record
]
print
'
Number of markers:
'
,
len
(
markers
)
return
markers
def
parse_GTF
(
GTFfile
):
genes
=
[]
for
line
in
GTFfile
:
line
=
line
.
strip
()
record
=
line
.
split
(
"
\t
"
)
genes
+=
[
record
]
return
genes
def
parse_mastoutput
(
mastfile
):
tfbs
=
[]
for
line
in
mastfile
:
line
=
line
.
strip
()
if
not
line
.
startswith
(
'
#
'
):
record
=
line
.
split
()
tfbs
+=
[
record
]
return
tfbs
def
get_motifs
(
mastfile1
,
mastfile2
):
first_file
=
[]
second_file
=
[]
for
i
in
range
(
len
(
mastfile1
)):
chr1
=
mastfile1
[
i
][
0
]
motif1
=
mastfile1
[
i
][
1
]
start1
=
mastfile1
[
i
][
2
]
end1
=
mastfile1
[
i
][
3
]
pvalue1
=
mastfile1
[
i
][
-
1
]
record1
=
(
chr1
,
motif1
,
int
(
start1
),
int
(
end1
),
float
(
pvalue1
))
first_file
.
append
(
record1
)
for
i
in
range
(
len
(
mastfile2
)):
chr2
=
mastfile2
[
i
][
0
]
motif2
=
mastfile2
[
i
][
1
]
start2
=
mastfile2
[
i
][
2
]
end2
=
mastfile2
[
i
][
3
]
pvalue2
=
mastfile2
[
i
][
-
1
]
record2
=
(
chr2
,
motif2
,
int
(
start2
),
int
(
end2
),
float
(
pvalue2
))
second_file
.
append
(
record2
)
in1_not2
=
set
(
first_file
)
-
set
(
second_file
)
in2_not1
=
set
(
second_file
)
-
set
(
first_file
)
diff_motifs
=
list
(
in1_not2
)
+
list
(
in2_not1
)
repetitions
=
set
(
first_file
)
&
set
(
second_file
)
total_motifs
=
list
(
repetitions
)
+
diff_motifs
print
'
Number of motifs in first file:
'
,
len
(
mastfile1
)
print
'
Number of motifs in second file:
'
,
len
(
mastfile2
)
print
'
Number of identical motifs between them:
'
,
len
(
list
(
repetitions
))
print
'
Number of different motifs between them (if P-value is different is
\
also considered as a different motif):
'
,
len
(
diff_motifs
)
print
'
Number of total motifs:
'
,
len
(
total_motifs
)
return
list
(
repetitions
),
diff_motifs
def
get_tss
(
genes
):
tss_dict
=
{}
for
gene
in
genes
:
if
gene
[
2
]
==
'
5UTR
'
:
chromosome
=
gene
[
0
]
tss
=
int
(
gene
[
3
])
raw_gene
=
gene
[
-
1
]
genename
=
raw_gene
.
split
(
"
\"
"
)[
3
]
if
genename
in
tss_dict
:
new_tss
=
tss
old_tss
=
tss_dict
[
genename
][
1
]
if
old_tss
>
new_tss
:
tss_dict
[
genename
]
=
[
chromosome
[
-
1
],
new_tss
]
else
:
pass
else
:
tss_dict
[
genename
]
=
[
chromosome
[
-
1
],
tss
]
return
tss_dict
def
get_marker
(
markers_parsed
,
tss_dict
):
marker_dict
=
{}
for
gene
in
tss_dict
:
chromosome_gene
=
tss_dict
[
gene
][
0
]
tss
=
tss_dict
[
gene
][
1
]
for
entry
in
markers_parsed
:
marker
=
entry
[
0
]
chromosome_marker
=
entry
[
1
]
position
=
int
(
entry
[
2
])
*
10
if
chromosome_gene
==
chromosome_marker
:
distance
=
abs
(
tss
-
position
)
if
gene
in
marker_dict
:
old
=
marker_dict
[
gene
][
3
]
new
=
distance
if
old
>
new
:
marker_dict
[
gene
]
=
[
chromosome_marker
,
tss
,
marker
,
new
]
else
:
pass
else
:
marker_dict
[
gene
]
=
[
chromosome_marker
,
tss
,
marker
,
distance
]
return
marker_dict
def
get_ciseQTLs
(
lod_table
,
marker_dict
,
lodthreshold
=
5
,
distancethreshold
=
5000000000000000000000000000000000000000000000000
):
print
'
LOD threshold:
'
,
lodthreshold
eQTLs_dict
=
{}
genes_wo_LOD
=
[]
for
gene
in
marker_dict
:
if
gene
in
lod_table
.
index
:
marker
=
marker_dict
[
gene
][
2
]
distance
=
marker_dict
[
gene
][
3
]
lodscore
=
lod_table
.
get_value
(
gene
,
marker
)
if
lodscore
>
lodthreshold
and
distance
<
distancethreshold
:
eQTLs_dict
[
gene
]
=
[
marker_dict
[
gene
][
0
],
marker_dict
[
gene
][
1
],
\
marker
,
distance
,
lodscore
]
else
:
pass
else
:
genes_wo_LOD
+=
[
gene
]
return
eQTLs_dict
,
genes_wo_LOD
def
get_all_genes
(
lod_table
,
marker_dict
,
lodthreshold
=
5
,
distancethreshold
=
5000000000000000000000000000000000000000000000000
):
print
'
LOD threshold:
'
,
lodthreshold
noneQTLs_dict
=
{}
genes_wo_LOD
=
[]
for
gene
in
marker_dict
:
if
gene
in
lod_table
.
index
:
marker
=
marker_dict
[
gene
][
2
]
distance
=
marker_dict
[
gene
][
3
]
lodscore
=
lod_table
.
get_value
(
gene
,
marker
)
if
distance
<
distancethreshold
:
noneQTLs_dict
[
gene
]
=
[
marker_dict
[
gene
][
0
],
marker_dict
[
gene
][
1
],
\
marker
,
distance
,
lodscore
]
else
:
pass
else
:
genes_wo_LOD
+=
[
gene
]
return
noneQTLs_dict
,
genes_wo_LOD
def
place_SNPs_in_motifs
(
snips
,
motifs
,
eQTLs_dict
,
filename
,
promoter_negative
=
1000
,
\
promoter_positive
=
200
,
threshold
=
float
(
1e-1
)):
print
'
Promoter region: -%i +%i
'
%
(
promoter_negative
,
promoter_positive
)
print
'
P-value threshold for the motif:
'
,
threshold
snips
=
sorted
(
snips
,
key
=
lambda
x
:
(
x
[
0
],
x
[
1
]))
motifs
=
sorted
(
motifs
,
key
=
lambda
x
:
(
x
[
0
],
x
[
2
]))
genes
=
sorted
(
eQTLs_dict
.
keys
(),
key
=
lambda
x
:
(
eQTLs_dict
[
x
][
0
],
eQTLs_dict
[
x
][
1
]))
motif_dict
=
{}
i_motifs
=
0
i_snips_g
=
0
i_snips_m
=
0
index_reset
=
10000
fil
=
open
(
filename
,
'
w
'
)
fil
.
write
(
'
Gene
\t
Chr
\t
TSS
\t
Motif
\t
Start
\t
End
\t
P-value
\t
SNP position
'
)
fil
.
write
(
'
\n
'
)
for
gene
in
genes
:
if
i_motifs
>
index_reset
:
i_motifs
-=
index_reset
else
:
i_motifs
=
0
if
i_snips_g
>
index_reset
:
i_snips_g
-=
index_reset
else
:
i_snips_g
=
0
if
i_snips_m
>
index_reset
:
i_snips_m
-=
index_reset
else
:
i_snips_m
=
0
amount_motifs
=
0
amount_snps_in_motif
=
0
amount_snps
=
0
chr_g
=
int
(
eQTLs_dict
[
gene
][
0
])
tss
=
eQTLs_dict
[
gene
][
1
]
start
=
tss
-
promoter_negative
if
start
<
0
:
start
=
0
end
=
tss
+
promoter_positive
nt_motif_in_promoter
=
0
#scanning for the next SNP in the gene
while
int
(
snips
[
i_snips_g
][
0
])
<
chr_g
or
int
(
snips
[
i_snips_g
][
1
])
<
start
:
i_snips_g
+=
1
#counting the SNPs in the gene
while
int
(
snips
[
i_snips_g
][
0
])
==
chr_g
and
int
(
snips
[
i_snips_g
][
1
])
<=
end
:
amount_snps
+=
1
i_snips_g
+=
1
#scanning for the next motif
while
int
(
motifs
[
i_motifs
][
0
])
<
chr_g
or
int
(
motifs
[
i_motifs
][
2
])
<
start
:
i_motifs
+=
1
#analysing the motifs in the gene
while
int
(
motifs
[
i_motifs
][
0
])
==
chr_g
and
int
(
motifs
[
i_motifs
][
3
])
<=
end
:
positions
=
[]
motifname
=
motifs
[
i_motifs
][
1
]
start_motif
=
int
(
motifs
[
i_motifs
][
2
])
end_motif
=
int
(
motifs
[
i_motifs
][
3
])
pvalue
=
motifs
[
i_motifs
][
-
1
]
if
pvalue
<=
threshold
:
amount_motifs
+=
1
#scanning for the next SNP in the motif
while
int
(
snips
[
i_snips_m
][
0
])
<
chr_g
or
int
(
snips
[
i_snips_m
][
1
])
<
start_motif
:
i_snips_m
+=
1
#counting the SNPs in the motif
while
int
(
snips
[
i_snips_m
][
0
])
==
chr_g
and
int
(
snips
[
i_snips_m
][
1
])
<=
end_motif
:
positions
.
append
(
int
(
snips
[
i_snips_m
][
1
]))
amount_snps_in_motif
+=
1
i_snips_m
+=
1
if
len
(
positions
)
!=
0
:
fil
.
write
(
'
%s
\t
%i
\t
%i
\t
%s
\t
%i
\t
%i
\t
%e
\t
'
\
%
(
gene
,
chr_g
,
tss
,
motifname
,
start_motif
,
end_motif
,
pvalue
))
for
i
in
range
(
len
(
positions
)):
fil
.
write
(
'
%i
'
%
(
positions
[
i
]))
if
len
(
positions
)
>
1
and
i
!=
len
(
positions
)
-
1
:
fil
.
write
(
'
,
'
)
fil
.
write
(
'
\n
'
)
else
:
fil
.
write
(
'
%s
\t
%i
\t
%i
\t
%s
\t
%i
\t
%i
\t
%e
\t
-
\n
'
\
%
(
gene
,
chr_g
,
tss
,
motifname
,
start_motif
,
end_motif
,
pvalue
))
i_motifs
+=
1
#Directamente escribir
motif_dict
[
gene
]
=
[
chr_g
,
motifname
,
amount_snps
,
amount_motifs
,
amount_snps_in_motif
,
nt_motif_in_promoter
]
print
'
Amount of motifs:
'
,
len
(
motifs
),
'
\n
'
return
motif_dict
def
create_table_from_dict
(
dictionary
,
filename
):
fil
=
open
(
filename
,
'
w
'
)
fil
.
write
(
'
Gene
\t
Chr
\t
TSS
\t
Marker
\t
Distance
\t
LODscore
\t
#SNPs
\t
#Motifs
\t
SNPs_in_motif
\t
NT_of_motif
'
)
fil
.
write
(
'
\n
'
)
for
entry
in
sorted
(
dictionary
):
fil
.
write
(
'
%s
\t
%s
\t
%i
\t
%s
\t
%i
\t
%s
\t
%i
\t
%i
\t
%i
\t
%i
\n
'
\
%
(
entry
,
dictionary
[
entry
][
0
],
dictionary
[
entry
][
1
],
dictionary
[
entry
][
2
],
dictionary
[
entry
][
3
],
\
dictionary
[
entry
][
4
],
dictionary
[
entry
][
5
],
dictionary
[
entry
][
6
],
dictionary
[
entry
][
7
],
dictionary
[
entry
][
8
]))
if
__name__
==
'
__main__
'
:
startTime
=
time
.
clock
()
directory
=
'
/home/luna003/Thesis/Data/
'
gtf_file
=
open
(
directory
+
argv
[
1
])
genes
=
parse_GTF
(
gtf_file
)
tss_dict
=
get_tss
(
genes
)
print
'
Number of genes got from the .gtf file: %i
'
%
(
len
(
tss_dict
))
#This genes is just the earliest 5UTR of each gene
marker_file
=
open
(
directory
+
argv
[
2
])
markers
=
parse_markers
(
marker_file
)
genes_dict_withmarker
=
get_marker
(
markers
,
tss_dict
)
print
'
Number of chromosomal genes: %i
'
%
(
len
(
genes_dict_withmarker
))
print
'
--------------
'
lodfile
=
open
(
directory
+
argv
[
3
])
lod_table
=
read_table
(
lodfile
,
index_col
=
0
)
all_dict
,
genes_nf
=
get_all_genes
(
lod_table
,
genes_dict_withmarker
)
print
'
There are %i eQTLs. %i genes where found in the gtf.file but not
\
in the lod score file.
'
%
(
len
(
all_dict
),
len
(
genes_nf
))
#print genes_nf2
snpsfile
=
open
(
directory
+
argv
[
4
])
snips
=
parse_snp
(
snpsfile
)
print
'
Number of SNPs:
'
,
len
(
snips
)
motiffile1
=
open
(
directory
+
argv
[
5
])
motiffile2
=
open
(
directory
+
argv
[
6
])
motifs1
=
parse_mastoutput
(
motiffile1
)
motifs2
=
parse_mastoutput
(
motiffile2
)
rep_motifs
,
diff_motifs
=
get_motifs
(
motifs1
,
motifs2
)
table_file_name
=
argv
[
7
]
place_SNPs_in_motifs
(
snips
,
diff_motifs
,
all_dict
,
table_file_name
)
spent_time
=
time
.
clock
()
-
startTime
print
'
Time: %f min
'
%
(
spent_time
/
60
)
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment