Skip to content
Snippets Groups Projects
Commit fd61f420 authored by Siu, Pui Chung's avatar Siu, Pui Chung
Browse files

Replace generatekmercomposition.py

parent bada710e
No related branches found
No related tags found
No related merge requests found
......@@ -3,27 +3,65 @@
"""
Name = Siu Pui Chung Jacky
Student number = 1047527
Script for algorithm for bioinformatics preparation
input: integer k and string Text
output: composition of all kmers of text
Script for k-Mer Composition
input: A DNA string s in FASTA format (having length at most 100 kbp).
output: The 4-mer composition of s.
"""
from itertools import product
from sys import argv
def composition(k, string):
    """Return every k-mer of ``string`` in order of occurrence.

    A window of width ``k`` is slid over ``string`` one position at a
    time, so repeated k-mers are kept and the result holds
    ``len(string) - k + 1`` entries.

    Args:
        k: k-mer length; must be a positive integer.
        string: text to cut into overlapping k-mers.

    Returns:
        List of the substrings of length ``k``; empty when ``string`` is
        shorter than ``k``.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        # A non-positive width would produce empty or truncated slices
        # rather than k-mers, so reject it explicitly.
        raise ValueError("k must be a positive integer, got {!r}".format(k))
    return [string[i:i + k] for i in range(len(string) - k + 1)]
# The DNA alphabet; passed to basescombinations() in the __main__ block.
bases = 'ACGT'
def fastaparser(filetext):
    """Split FASTA-formatted lines into (header, sequence) records.

    input: filetext, an iterable of lines (e.g. an open file)
    output: list of tuples (header, sequence); the header keeps its
            leading ">" and the sequence is the concatenation of all
            stripped lines that follow it up to the next header

    Lines that appear before the first header are discarded, so input
    without any ">" line yields an empty list.
    """
    records = []
    header = None
    chunks = []
    for raw in filetext:
        text = raw.strip()
        if not text.startswith(">"):
            # Sequence (or blank) line: accumulate for the current record.
            chunks.append(text)
            continue
        # A new header closes the record collected so far, if any.
        if header:
            records.append((header, ''.join(chunks)))
        header = text
        chunks = []
    # Flush the last record, which has no following header to close it.
    if header:
        records.append((header, ''.join(chunks)))
    return records
def basescombinations(bases, k=4):
    """
    generate all k-mers over the given bases and a zeroed count table
    input: bases, the alphabet to combine (e.g. the four DNA bases)
           k, length of each combination (default 4)
    output: kmers, list of all combinations of bases of length k, ordered
            so that the last position varies fastest (lexicographic when
            bases is sorted)
            combinationdict, dictionary mapping each k-mer to 0
    """
    # product() iterates exactly like k nested loops over `bases`, which
    # removes the hard-coded nesting depth.
    kmers = [''.join(letters) for letters in product(bases, repeat=k)]
    combinationdict = dict.fromkeys(kmers, 0)
    return kmers, combinationdict
def frequency(string, combinationdict, k=4):
    """
    count how often each k-mer of combinationdict occurs in string
    input: string, sequence to scan with a sliding window
           combinationdict, dictionary of k-mers to count; it is updated
           in place, so pass it with all values = 0 for plain frequencies
           k, window length (default 4); should match the key length
    output: combinationdict, the same dictionary with each value
            increased by the number of occurrences of its k-mer in string

    Windows that are not keys of combinationdict (for example ones that
    contain an ambiguity code such as N) are skipped instead of raising
    KeyError.
    """
    for start in range(len(string) - k + 1):
        window = string[start:start + k]
        if window in combinationdict:
            combinationdict[window] += 1
    return combinationdict
if __name__ == "__main__":
    # Two input layouts are supported and told apart by the first line:
    #   * FASTA (first non-blank line starts with ">"): print the 4-mer
    #     composition of the first record to stdout.
    #   * legacy "k on line 1, text on line 2": write every k-mer of the
    #     text to answer.txt, one per line.
    # Running both paths unconditionally on the same file cannot succeed,
    # because each path rejects the other's format.
    if len(argv) < 2:
        raise SystemExit("usage: {} <input file>".format(argv[0]))
    filename = argv[1]
    with open(filename, 'r') as f:
        lines = f.readlines()

    first = next((ln.strip() for ln in lines if ln.strip()), "")
    if first.startswith(">"):
        fastalist = fastaparser(lines)
        if not fastalist:
            raise SystemExit("no FASTA record found in " + filename)
        kmers, dictionary = basescombinations(bases)
        dictionary = frequency(fastalist[0][1], dictionary)
        print(' '.join(str(dictionary[kmer]) for kmer in kmers))
    else:
        if len(lines) < 2:
            raise SystemExit(
                "expected k on line 1 and the text on line 2 of " + filename)
        kmer = int(lines[0].strip())
        string = lines[1].strip()
        comp = composition(kmer, string)
        # Context manager guarantees the file is closed even on error.
        with open("answer.txt", "w") as out:
            out.write("\n".join(comp))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment