Add new file

cb2d9e7f · Luna de Haro, Antonio · 8eb380ed · cb2d9e7f
Commit cb2d9e7f authored 7 years ago by Luna de Haro, Antonio
--- a/SNPs_in_motif.py
+++ b/SNPs_in_motif.py
+#!usr/bin/env python 
+
+from sys import argv
+from pandas import read_table
+import time
+
+
+"""
+Script to find all cis-eQTLs and non cis-eQTLs, and then place SNPs in the promoter region of that
+cis-eQTL. 
+Also finds motifs in the promoter region and how many SNPs where found inside the motif
+This script writes a tab delimited table with all values claculated for each gene
+
+To execute it from the command line:
+    python SNPs_in_ciseQTL_finder.py genes.gtf markerfile.txt LODscoreFile.txt\
+    snps, mast_output_varant1, mast_output_variant2 output.csv
+
+"""
+def parse_snp(snpfile):
+    snips = []
+    for line in snpfile:
+        line = line.strip()
+        if not line.startswith('#'):
+            record = line.split("\t")
+            record[1] = int(record[1])
+            snips += [record]
+    return snips
+
+
+def parse_markers(markerfile):
+    markers = []
+    for line in markerfile:
+        line = line.strip()
+        if not line.startswith('marker'):
+            record = line.split("\t")
+            markers += [record]
+    print 'Number of markers: ', len(markers)
+    return markers
+
+
+def parse_GTF(GTFfile):
+    genes = []
+    for line in GTFfile:
+        line = line.strip()
+        record = line.split("\t")
+        genes += [record]
+    return genes
+
+
+def parse_mastoutput(mastfile):
+    tfbs = []
+    for line in mastfile:
+        line = line.strip()
+        if not line.startswith('#'):
+            record = line.split()
+            tfbs += [record]
+    return tfbs
+
+
+def get_motifs (mastfile1, mastfile2):
+    first_file = []
+    second_file = []
+    for i in range(len(mastfile1)):
+        chr1 = mastfile1[i][0]
+        motif1 = mastfile1[i][1]
+        start1 = mastfile1[i][2]
+        end1 = mastfile1[i][3]
+        pvalue1 = mastfile1[i][-1]
+        record1 = (chr1, motif1, int(start1), int(end1), float(pvalue1))
+        first_file.append(record1)    
+    for i in range(len(mastfile2)):
+        chr2 = mastfile2[i][0]
+        motif2 = mastfile2[i][1]
+        start2 = mastfile2[i][2]
+        end2 = mastfile2[i][3]
+        pvalue2 = mastfile2[i][-1]
+        record2 = (chr2, motif2, int(start2), int(end2), float(pvalue2))
+        second_file.append(record2)      
+    in1_not2 = set(first_file) - set(second_file)
+    in2_not1 = set(second_file) - set(first_file)
+    diff_motifs = list(in1_not2) + list(in2_not1)
+    repetitions = set(first_file) & set(second_file)
+    total_motifs = list(repetitions) + diff_motifs
+    print 'Number of motifs in first file: ', len(mastfile1)
+    print 'Number of motifs in second file: ', len(mastfile2)
+    print 'Number of identical motifs between them: ',len(list(repetitions))
+    print 'Number of different motifs between them (if P-value is different is \
+also considered as a different motif): ',len(diff_motifs)
+    print 'Number of total motifs: ', len(total_motifs)
+    return list(repetitions), diff_motifs
+
+
+def get_tss(genes):
+    tss_dict ={}
+    for gene in genes:
+        if gene[2] == '5UTR':
+            chromosome = gene[0]
+            tss = int(gene[3])
+            raw_gene = gene[-1]
+            genename = raw_gene.split("\"")[3]
+            if genename in tss_dict:
+                new_tss = tss
+                old_tss = tss_dict[genename][1]
+                if old_tss > new_tss:
+                    tss_dict[genename] = [chromosome[-1], new_tss]
+                else:
+                    pass
+            else:
+                tss_dict[genename] = [chromosome[-1], tss]
+    return tss_dict
+ 
+           
+def get_marker(markers_parsed, tss_dict):
+    marker_dict={}
+    for gene in tss_dict:
+        chromosome_gene = tss_dict[gene][0]
+        tss = tss_dict[gene][1]
+        for entry in markers_parsed:
+            marker = entry[0]
+            chromosome_marker = entry[1]
+            position = int(entry[2])*10
+            if chromosome_gene == chromosome_marker:
+                distance = abs(tss-position)
+                if gene in marker_dict:
+                    old = marker_dict[gene][3]
+                    new = distance
+                    if old>new:
+                        marker_dict[gene] =[chromosome_marker, tss, marker, new]
+                    else:
+                        pass
+                else:
+                    marker_dict[gene] =[chromosome_marker, tss, marker, distance]
+    return marker_dict
+
+
+def get_ciseQTLs(lod_table, marker_dict, lodthreshold=5, distancethreshold = 5000000000000000000000000000000000000000000000000):
+    print 'LOD threshold:', lodthreshold
+    eQTLs_dict = {}
+    genes_wo_LOD = []
+    for gene in marker_dict:
+        if gene in lod_table.index:
+            marker = marker_dict[gene][2]
+            distance = marker_dict[gene][3]
+            lodscore =  lod_table.get_value(gene, marker)
+            if lodscore > lodthreshold and distance < distancethreshold:
+                eQTLs_dict[gene] = [marker_dict[gene][0], marker_dict[gene][1], \
+                       marker, distance, lodscore]
+            else:
+                pass
+        else:
+            genes_wo_LOD += [gene]
+    return eQTLs_dict, genes_wo_LOD
+
+
+def get_all_genes(lod_table, marker_dict, lodthreshold=5, distancethreshold = 5000000000000000000000000000000000000000000000000):
+    print 'LOD threshold:', lodthreshold
+    noneQTLs_dict = {}
+    genes_wo_LOD = []
+    for gene in marker_dict:
+        if gene in lod_table.index:
+            marker = marker_dict[gene][2]
+            distance = marker_dict[gene][3]
+            lodscore =  lod_table.get_value(gene, marker)
+            if distance < distancethreshold:
+                noneQTLs_dict[gene] = [marker_dict[gene][0], marker_dict[gene][1], \
+                       marker, distance, lodscore]
+            else:
+                pass
+        else:
+            genes_wo_LOD += [gene]
+    return noneQTLs_dict, genes_wo_LOD
+
+
+def place_SNPs_in_motifs(snips, motifs, eQTLs_dict, filename, promoter_negative = 1000, \
+promoter_positive = 200, threshold = float(1e-1)):
+    print 'Promoter region: -%i +%i' %(promoter_negative, promoter_positive)
+    print 'P-value threshold for the motif: ', threshold
+    
+    
+    snips = sorted(snips,key = lambda x: (x[0], x[1]))
+    motifs = sorted(motifs,key = lambda x: (x[0], x[2]))
+    genes = sorted(eQTLs_dict.keys(),key = lambda x: (eQTLs_dict[x][0], eQTLs_dict[x][1]))
+    
+    motif_dict = {}
+    
+    i_motifs = 0
+    i_snips_g = 0
+    i_snips_m = 0
+    index_reset = 10000
+    
+    
+    fil = open(filename, 'w')
+    fil.write('Gene\tChr\tTSS\tMotif\tStart\tEnd\tP-value\tSNP position')
+    fil.write('\n')
+    
+    for gene in genes:
+        if i_motifs > index_reset:
+            i_motifs-= index_reset
+        else :
+            i_motifs = 0
+        if i_snips_g > index_reset:
+            i_snips_g-= index_reset
+        else :
+            i_snips_g = 0
+        if i_snips_m > index_reset:
+            i_snips_m-= index_reset
+        else :
+            i_snips_m = 0
+        amount_motifs = 0
+        amount_snps_in_motif = 0
+        amount_snps = 0
+        
+        chr_g = int(eQTLs_dict[gene][0])
+        tss = eQTLs_dict[gene][1]
+        start = tss -promoter_negative
+        if start < 0:
+            start = 0
+        end = tss + promoter_positive
+        
+        nt_motif_in_promoter = 0
+        
+        #scanning for the next SNP in the gene
+        while int(snips[i_snips_g][0]) < chr_g or int(snips[i_snips_g][1]) < start:
+            i_snips_g += 1
+
+        #counting the SNPs in the gene
+        while int(snips[i_snips_g][0]) == chr_g and int(snips[i_snips_g][1]) <= end:
+            amount_snps += 1
+            i_snips_g += 1
+
+        #scanning for the next motif
+        while int(motifs[i_motifs][0]) < chr_g or int(motifs[i_motifs][2]) < start:
+            i_motifs += 1
+
+        #analysing the motifs in the gene
+        while int(motifs[i_motifs][0]) == chr_g and int(motifs[i_motifs][3]) <= end:
+            positions = []
+            motifname = motifs[i_motifs][1]
+            start_motif = int(motifs[i_motifs][2])
+            end_motif = int(motifs[i_motifs][3])
+            pvalue = motifs[i_motifs][-1]
+            
+            if pvalue <= threshold:
+                
+                amount_motifs += 1
+                
+
+                #scanning for the next SNP in the motif
+                while int(snips[i_snips_m][0]) < chr_g or int(snips[i_snips_m][1]) < start_motif:
+                    i_snips_m += 1
+        
+                #counting the SNPs in the motif
+                while int(snips[i_snips_m][0]) == chr_g and int(snips[i_snips_m][1]) <= end_motif:
+                    positions.append(int(snips[i_snips_m][1]))
+                    amount_snps_in_motif += 1
+                    i_snips_m += 1
+            
+            if len(positions) != 0:
+                fil.write('%s\t%i\t%i\t%s\t%i\t%i\t%e\t'\
+%(gene, chr_g, tss, motifname, start_motif, end_motif, pvalue))
+
+                for i in range(len(positions)):
+                    
+                    fil.write('%i' %(positions[i]))
+                    if len(positions) > 1 and i != len(positions)-1:
+                        fil.write(',')
+                fil.write('\n')
+            else:
+                fil.write('%s\t%i\t%i\t%s\t%i\t%i\t%e\t-\n'\
+%(gene, chr_g, tss, motifname, start_motif, end_motif, pvalue))
+
+            
+                
+            
+            i_motifs += 1
+        
+        #Directamente escribir
+        motif_dict[gene] = [chr_g, motifname, amount_snps, amount_motifs, amount_snps_in_motif, nt_motif_in_promoter]
+    print 'Amount of motifs: ', len(motifs), '\n'
+    return motif_dict
+
+
+def create_table_from_dict (dictionary, filename):
+    fil = open(filename, 'w')
+    fil.write('Gene\tChr\tTSS\tMarker\tDistance\tLODscore\t#SNPs\t#Motifs\tSNPs_in_motif\tNT_of_motif')
+    fil.write('\n')
+    for entry in sorted(dictionary):
+        fil.write('%s\t%s\t%i\t%s\t%i\t%s\t%i\t%i\t%i\t%i\n'\
+%(entry, dictionary[entry][0], dictionary[entry][1], dictionary[entry][2], dictionary[entry][3],\
+dictionary[entry][4], dictionary[entry][5], dictionary[entry][6], dictionary[entry][7], dictionary[entry][8]))
+
+        
+if __name__ == '__main__':
+    startTime = time.clock()
+    directory = '/home/luna003/Thesis/Data/'
+    gtf_file = open(directory + argv[1])
+    genes = parse_GTF(gtf_file)
+    tss_dict = get_tss(genes)
+    print 'Number of genes got from the .gtf file: %i'%(len(tss_dict))
+#This genes is just the earliest 5UTR of each gene
+    marker_file = open(directory + argv[2]) 
+    markers = parse_markers(marker_file)
+    genes_dict_withmarker = get_marker(markers, tss_dict)
+    print 'Number of chromosomal genes: %i' %(len(genes_dict_withmarker))
+    print '--------------'
+    lodfile = open(directory + argv[3])
+    lod_table = read_table(lodfile, index_col=0)
+    all_dict, genes_nf = get_all_genes(lod_table, genes_dict_withmarker)
+
+    print 'There are %i eQTLs. %i genes where found in the gtf.file but not\
+ in the lod score file. '%(len(all_dict), len(genes_nf))
+    #print genes_nf2
+    snpsfile = open(directory + argv[4])
+    snips = parse_snp(snpsfile)
+    print 'Number of SNPs: ', len(snips)
+    motiffile1 = open(directory + argv[5])
+    motiffile2 = open(directory + argv[6])
+    motifs1 = parse_mastoutput(motiffile1)
+    motifs2 = parse_mastoutput(motiffile2)
+    rep_motifs, diff_motifs = get_motifs(motifs1, motifs2)  
+    
+    table_file_name = argv[7]
+    place_SNPs_in_motifs(snips, diff_motifs, all_dict, table_file_name)
+    
+    
+    spent_time = time.clock() - startTime
+    print 'Time: %f min' %(spent_time/60)
+
+    
+   
+    
\ No newline at end of file