Commit fd61f420 authored by Siu, Pui Chung
Browse files

Replace generatekmercomposition.py

parent bada710e
...@@ -3,27 +3,65 @@ ...@@ -3,27 +3,65 @@
""" """
Name = Siu Pui Chung Jacky Name = Siu Pui Chung Jacky
Student number = 1047527 Student number = 1047527
Script for algorithm for bioinformatics preparation Script for k-Mer Composition
input: integer k and string Text input: A DNA string s in FASTA format (having length at most 100 kbp).
output: composition of all kmers of text output: The 4-mer composition of s.
""" """
from sys import argv from sys import argv
def composition(k, string): bases = 'ACGT'
comp = []
n = len(string) def fastaparser(filetext):
for i in range(n - k + 1): #sliding window ID, seq, fastas = None, [], []
comp.append(string[i: i+k])#size as kmer size for line in filetext:
return comp line = line.strip()
if line.startswith(">"):
if ID:
fastas.append((ID, ''.join(seq)))
ID, seq = line, []
else:
seq.append(line)
if ID:
fastas.append((ID, ''.join(seq)))
return fastas
def basescombinations(bases, k=4):
    """
    generate every k-mer over the given alphabet, plus a zeroed count table
    input: bases, iterable of strings to combine (e.g. 'ACGT')
           k, number of symbols per k-mer (default 4, the previously fixed size)
    output: kmers, list of all combinations of k symbols; the last position
                   varies fastest and symbols follow the order given in bases
            combinationdict, dictionary mapping every k-mer to 0
    """
    # Local import keeps the function usable without touching the file header.
    from itertools import product

    # product(..., repeat=k) yields tuples in the same order as k nested loops
    # over bases, so the output order matches the original implementation.
    kmers = [''.join(combo) for combo in product(bases, repeat=k)]
    combinationdict = dict.fromkeys(kmers, 0)
    return kmers, combinationdict
def frequency(string, combinationdict, k=4):
    """
    count how often each k-mer of the table occurs in string
    input: string, sequence to scan with a sliding window of width k
           combinationdict, dictionary of k-mers to count; it is updated in place
           k, window width (default 4, the previously fixed size)
    output: combinationdict, the same dictionary with each k-mer's count
            increased by its number of (overlapping) occurrences in string
    Windows that are not keys of combinationdict (for example ones containing
    an ambiguous base such as N) are skipped instead of raising KeyError.
    Matching is case-sensitive.
    """
    # range() is empty when string is shorter than k, so short input is a no-op.
    for start in range(len(string) - k + 1):
        window = string[start:start + k]
        if window in combinationdict:
            combinationdict[window] += 1
    return combinationdict
if __name__ == "__main__": if __name__ == "__main__":
filename = argv[1] with open(argv[1]) as f:
with open(filename, 'r') as f: fastalist = fastaparser(f)
lines = f.readlines() kmers, dictionary = basescombinations(bases)
kmer = int(lines[0].strip()) dictionary = frequency(fastalist[0][1], dictionary)
string = lines[1].strip() print(' '.join(str(dictionary[kmer]) for kmer in kmers))
comp = composition(kmer, string)
f = open("answer.txt", "w")
f.write("\n".join(comp))
f.close()
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment