Skip to content
Snippets Groups Projects
Commit fd61f420 authored by Siu, Pui Chung's avatar Siu, Pui Chung
Browse files

Replace generatekmercomposition.py

parent bada710e
Branches
No related tags found
No related merge requests found
...@@ -3,27 +3,65 @@ ...@@ -3,27 +3,65 @@
""" """
Name = Siu Pui Chung Jacky Name = Siu Pui Chung Jacky
Student number = 1047527 Student number = 1047527
Script for algorithm for bioinformatics preparation Script for k-Mer Composition
input: integer k and string Text input: A DNA string s in FASTA format (having length at most 100 kbp).
output: composition of all kmers of text output: The 4-mer composition of s.
""" """
from itertools import product
from sys import argv
# alphabet the k-mer dictionary is built from
bases = 'ACGT'


def composition(k, string):
    """
    list every k-mer of string using a sliding window
    input: k, integer size of the k-mer
           string, text to cut into k-mers
    output: list of the len(string) - k + 1 substrings of length k,
            in order of occurrence (empty when k > len(string))
    """
    return [string[i:i + k] for i in range(len(string) - k + 1)]


def fastaparser(filetext):
    """
    parse FASTA formatted text into records
    input: filetext, iterable of lines (an open file or a list of strings)
    output: fastas, list of (ID, sequence) tuples in file order; ID is the
            whole header line including the leading ">", sequence is the
            concatenation of the lines that follow that header
    """
    record_id, chunks, fastas = None, [], []
    for line in filetext:
        line = line.strip()
        if not line:
            # blank lines carry no sequence data
            continue
        if line.startswith(">"):
            # a new header closes the record that was being collected
            if record_id is not None:
                fastas.append((record_id, ''.join(chunks)))
            # anything collected before the first header is dropped here
            record_id, chunks = line, []
        else:
            chunks.append(line)
    # the last record has no following header to close it
    if record_id is not None:
        fastas.append((record_id, ''.join(chunks)))
    return fastas
def basescombinations(bases, k=4):
    """
    generate all bases combination to form dictionary
    input: bases, the letters to combine (the four bases)
           k, length of every combination (default 4, i.e. 4-mers)
    output: kmers, list of all len(bases)**k combinations of bases, ordered
                   with the first letter varying slowest and following the
                   order of the letters in bases
            combinationdict, dictionary of combinations of bases, values all = 0
    """
    # product(..., repeat=k) yields tuples in the same order as k nested loops
    kmers = [''.join(combo) for combo in product(bases, repeat=k)]
    combinationdict = dict.fromkeys(kmers, 0)
    return kmers, combinationdict
def frequency(string, combinationdict, k=4):
    """
    calculate the frequency of every k-mer (4-mer by default) in string
    input: string, sequence to scan with a sliding window
           combinationdict, dictionary of k-mers consisting of combinations of bases, values all = 0
           k, window size (default 4); should match the length of the dictionary keys
    output: combinationdict, the same dictionary (updated in place) with the
            frequency of each k-mer found in string
    """
    for i in range(len(string) - k + 1):
        kmer = string[i:i + k]
        # windows that are not a key (e.g. containing N or lower-case
        # letters) are skipped instead of raising KeyError
        if kmer in combinationdict:
            combinationdict[kmer] += 1
    return combinationdict
if __name__ == "__main__":
    # script entry point: print the 4-mer composition of the first FASTA
    # record in the file named by the first command-line argument
    if len(argv) < 2:
        raise SystemExit("usage: " + argv[0] + " <fasta file>")
    with open(argv[1]) as f:
        # the parser consumes the whole file, so it must run while f is open
        fastalist = fastaparser(f)
    if not fastalist:
        raise SystemExit("no FASTA record found in " + argv[1])
    kmers, dictionary = basescombinations(bases)
    # only the sequence of the first record is counted
    dictionary = frequency(fastalist[0][1], dictionary)
    # counts are printed in the order of kmers, separated by single spaces
    print(' '.join(str(dictionary[kmer]) for kmer in kmers))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment