Skip to content
Snippets Groups Projects
Commit cb2d9e7f authored by Luna de Haro, Antonio's avatar Luna de Haro, Antonio
Browse files

Add new file

parent 8eb380ed
Branches master
No related tags found
No related merge requests found
#!usr/bin/env python
from sys import argv
from pandas import read_table
import time
"""
Script to find all cis-eQTLs and non cis-eQTLs, and then place SNPs in the promoter region of that
cis-eQTL.
Also finds motifs in the promoter region and how many SNPs where found inside the motif
This script writes a tab delimited table with all values claculated for each gene
To execute it from the command line:
python SNPs_in_ciseQTL_finder.py genes.gtf markerfile.txt LODscoreFile.txt\
snps, mast_output_varant1, mast_output_variant2 output.csv
"""
def parse_snp(snpfile):
snips = []
for line in snpfile:
line = line.strip()
if not line.startswith('#'):
record = line.split("\t")
record[1] = int(record[1])
snips += [record]
return snips
def parse_markers(markerfile):
markers = []
for line in markerfile:
line = line.strip()
if not line.startswith('marker'):
record = line.split("\t")
markers += [record]
print 'Number of markers: ', len(markers)
return markers
def parse_GTF(GTFfile):
genes = []
for line in GTFfile:
line = line.strip()
record = line.split("\t")
genes += [record]
return genes
def parse_mastoutput(mastfile):
tfbs = []
for line in mastfile:
line = line.strip()
if not line.startswith('#'):
record = line.split()
tfbs += [record]
return tfbs
def get_motifs (mastfile1, mastfile2):
first_file = []
second_file = []
for i in range(len(mastfile1)):
chr1 = mastfile1[i][0]
motif1 = mastfile1[i][1]
start1 = mastfile1[i][2]
end1 = mastfile1[i][3]
pvalue1 = mastfile1[i][-1]
record1 = (chr1, motif1, int(start1), int(end1), float(pvalue1))
first_file.append(record1)
for i in range(len(mastfile2)):
chr2 = mastfile2[i][0]
motif2 = mastfile2[i][1]
start2 = mastfile2[i][2]
end2 = mastfile2[i][3]
pvalue2 = mastfile2[i][-1]
record2 = (chr2, motif2, int(start2), int(end2), float(pvalue2))
second_file.append(record2)
in1_not2 = set(first_file) - set(second_file)
in2_not1 = set(second_file) - set(first_file)
diff_motifs = list(in1_not2) + list(in2_not1)
repetitions = set(first_file) & set(second_file)
total_motifs = list(repetitions) + diff_motifs
print 'Number of motifs in first file: ', len(mastfile1)
print 'Number of motifs in second file: ', len(mastfile2)
print 'Number of identical motifs between them: ',len(list(repetitions))
print 'Number of different motifs between them (if P-value is different is \
also considered as a different motif): ',len(diff_motifs)
print 'Number of total motifs: ', len(total_motifs)
return list(repetitions), diff_motifs
def get_tss(genes):
tss_dict ={}
for gene in genes:
if gene[2] == '5UTR':
chromosome = gene[0]
tss = int(gene[3])
raw_gene = gene[-1]
genename = raw_gene.split("\"")[3]
if genename in tss_dict:
new_tss = tss
old_tss = tss_dict[genename][1]
if old_tss > new_tss:
tss_dict[genename] = [chromosome[-1], new_tss]
else:
pass
else:
tss_dict[genename] = [chromosome[-1], tss]
return tss_dict
def get_marker(markers_parsed, tss_dict):
marker_dict={}
for gene in tss_dict:
chromosome_gene = tss_dict[gene][0]
tss = tss_dict[gene][1]
for entry in markers_parsed:
marker = entry[0]
chromosome_marker = entry[1]
position = int(entry[2])*10
if chromosome_gene == chromosome_marker:
distance = abs(tss-position)
if gene in marker_dict:
old = marker_dict[gene][3]
new = distance
if old>new:
marker_dict[gene] =[chromosome_marker, tss, marker, new]
else:
pass
else:
marker_dict[gene] =[chromosome_marker, tss, marker, distance]
return marker_dict
def get_ciseQTLs(lod_table, marker_dict, lodthreshold=5, distancethreshold = 5000000000000000000000000000000000000000000000000):
print 'LOD threshold:', lodthreshold
eQTLs_dict = {}
genes_wo_LOD = []
for gene in marker_dict:
if gene in lod_table.index:
marker = marker_dict[gene][2]
distance = marker_dict[gene][3]
lodscore = lod_table.get_value(gene, marker)
if lodscore > lodthreshold and distance < distancethreshold:
eQTLs_dict[gene] = [marker_dict[gene][0], marker_dict[gene][1], \
marker, distance, lodscore]
else:
pass
else:
genes_wo_LOD += [gene]
return eQTLs_dict, genes_wo_LOD
def get_all_genes(lod_table, marker_dict, lodthreshold=5, distancethreshold = 5000000000000000000000000000000000000000000000000):
print 'LOD threshold:', lodthreshold
noneQTLs_dict = {}
genes_wo_LOD = []
for gene in marker_dict:
if gene in lod_table.index:
marker = marker_dict[gene][2]
distance = marker_dict[gene][3]
lodscore = lod_table.get_value(gene, marker)
if distance < distancethreshold:
noneQTLs_dict[gene] = [marker_dict[gene][0], marker_dict[gene][1], \
marker, distance, lodscore]
else:
pass
else:
genes_wo_LOD += [gene]
return noneQTLs_dict, genes_wo_LOD
def place_SNPs_in_motifs(snips, motifs, eQTLs_dict, filename, promoter_negative = 1000, \
promoter_positive = 200, threshold = float(1e-1)):
print 'Promoter region: -%i +%i' %(promoter_negative, promoter_positive)
print 'P-value threshold for the motif: ', threshold
snips = sorted(snips,key = lambda x: (x[0], x[1]))
motifs = sorted(motifs,key = lambda x: (x[0], x[2]))
genes = sorted(eQTLs_dict.keys(),key = lambda x: (eQTLs_dict[x][0], eQTLs_dict[x][1]))
motif_dict = {}
i_motifs = 0
i_snips_g = 0
i_snips_m = 0
index_reset = 10000
fil = open(filename, 'w')
fil.write('Gene\tChr\tTSS\tMotif\tStart\tEnd\tP-value\tSNP position')
fil.write('\n')
for gene in genes:
if i_motifs > index_reset:
i_motifs-= index_reset
else :
i_motifs = 0
if i_snips_g > index_reset:
i_snips_g-= index_reset
else :
i_snips_g = 0
if i_snips_m > index_reset:
i_snips_m-= index_reset
else :
i_snips_m = 0
amount_motifs = 0
amount_snps_in_motif = 0
amount_snps = 0
chr_g = int(eQTLs_dict[gene][0])
tss = eQTLs_dict[gene][1]
start = tss -promoter_negative
if start < 0:
start = 0
end = tss + promoter_positive
nt_motif_in_promoter = 0
#scanning for the next SNP in the gene
while int(snips[i_snips_g][0]) < chr_g or int(snips[i_snips_g][1]) < start:
i_snips_g += 1
#counting the SNPs in the gene
while int(snips[i_snips_g][0]) == chr_g and int(snips[i_snips_g][1]) <= end:
amount_snps += 1
i_snips_g += 1
#scanning for the next motif
while int(motifs[i_motifs][0]) < chr_g or int(motifs[i_motifs][2]) < start:
i_motifs += 1
#analysing the motifs in the gene
while int(motifs[i_motifs][0]) == chr_g and int(motifs[i_motifs][3]) <= end:
positions = []
motifname = motifs[i_motifs][1]
start_motif = int(motifs[i_motifs][2])
end_motif = int(motifs[i_motifs][3])
pvalue = motifs[i_motifs][-1]
if pvalue <= threshold:
amount_motifs += 1
#scanning for the next SNP in the motif
while int(snips[i_snips_m][0]) < chr_g or int(snips[i_snips_m][1]) < start_motif:
i_snips_m += 1
#counting the SNPs in the motif
while int(snips[i_snips_m][0]) == chr_g and int(snips[i_snips_m][1]) <= end_motif:
positions.append(int(snips[i_snips_m][1]))
amount_snps_in_motif += 1
i_snips_m += 1
if len(positions) != 0:
fil.write('%s\t%i\t%i\t%s\t%i\t%i\t%e\t'\
%(gene, chr_g, tss, motifname, start_motif, end_motif, pvalue))
for i in range(len(positions)):
fil.write('%i' %(positions[i]))
if len(positions) > 1 and i != len(positions)-1:
fil.write(',')
fil.write('\n')
else:
fil.write('%s\t%i\t%i\t%s\t%i\t%i\t%e\t-\n'\
%(gene, chr_g, tss, motifname, start_motif, end_motif, pvalue))
i_motifs += 1
#Directamente escribir
motif_dict[gene] = [chr_g, motifname, amount_snps, amount_motifs, amount_snps_in_motif, nt_motif_in_promoter]
print 'Amount of motifs: ', len(motifs), '\n'
return motif_dict
def create_table_from_dict (dictionary, filename):
fil = open(filename, 'w')
fil.write('Gene\tChr\tTSS\tMarker\tDistance\tLODscore\t#SNPs\t#Motifs\tSNPs_in_motif\tNT_of_motif')
fil.write('\n')
for entry in sorted(dictionary):
fil.write('%s\t%s\t%i\t%s\t%i\t%s\t%i\t%i\t%i\t%i\n'\
%(entry, dictionary[entry][0], dictionary[entry][1], dictionary[entry][2], dictionary[entry][3],\
dictionary[entry][4], dictionary[entry][5], dictionary[entry][6], dictionary[entry][7], dictionary[entry][8]))
if __name__ == '__main__':
startTime = time.clock()
directory = '/home/luna003/Thesis/Data/'
gtf_file = open(directory + argv[1])
genes = parse_GTF(gtf_file)
tss_dict = get_tss(genes)
print 'Number of genes got from the .gtf file: %i'%(len(tss_dict))
#This genes is just the earliest 5UTR of each gene
marker_file = open(directory + argv[2])
markers = parse_markers(marker_file)
genes_dict_withmarker = get_marker(markers, tss_dict)
print 'Number of chromosomal genes: %i' %(len(genes_dict_withmarker))
print '--------------'
lodfile = open(directory + argv[3])
lod_table = read_table(lodfile, index_col=0)
all_dict, genes_nf = get_all_genes(lod_table, genes_dict_withmarker)
print 'There are %i eQTLs. %i genes where found in the gtf.file but not\
in the lod score file. '%(len(all_dict), len(genes_nf))
#print genes_nf2
snpsfile = open(directory + argv[4])
snips = parse_snp(snpsfile)
print 'Number of SNPs: ', len(snips)
motiffile1 = open(directory + argv[5])
motiffile2 = open(directory + argv[6])
motifs1 = parse_mastoutput(motiffile1)
motifs2 = parse_mastoutput(motiffile2)
rep_motifs, diff_motifs = get_motifs(motifs1, motifs2)
table_file_name = argv[7]
place_SNPs_in_motifs(snips, diff_motifs, all_dict, table_file_name)
spent_time = time.clock() - startTime
print 'Time: %f min' %(spent_time/60)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment