Commit e3408d3e authored by Siu, Pui Chung
parent 1f3c365c
"""
Name = Siu Pui Chung Jacky
Student number = 1047527
Script for Overlap Graphs
input: A collection of DNA strings in FASTA format having total length at most 10 kbp.
output: The adjacency list corresponding to the overlap graph O_3 (overlap length k = 3). Edges may be returned in any order.
"""
from sys import argv
def fastaparser(filetext):
    """
    Description: parse FASTA text into a list of [ID, sequence] records
    Input: filetext, an iterable of text lines (an open file or a list of
           strings)
    Output: list of two-item lists [ID, seq]; ID is the full header line
            (leading ">" included) and seq is all sequence lines of that
            record joined into one string
    """
    header, chunks, fastas = None, [], []
    for line in filetext:
        line = line.strip()
        if not line:
            # Blank lines carry no sequence data.
            continue
        if line.startswith(">"):
            # A new header closes the record collected so far.
            if header is not None:
                fastas.append([header, ''.join(chunks)])
            header, chunks = line, []
        elif header is not None:
            # Sequence line of the current record; data appearing before the
            # first header has no ID to attach to and is ignored.
            chunks.append(line)
    # Flush the last record, which has no following header to close it.
    if header is not None:
        fastas.append([header, ''.join(chunks)])
    return fastas
def overlapgraph(fasta_list, k):
    """
    Description: find directed overlaps of length k between the sequences in
                 fasta_list; an edge s -> t is reported when the last k
                 characters of s equal the first k characters of t
    Input: fasta_list, list of records whose item 0 is the ID and item 1 is
           the sequence
           k, length of overlaps (positive integer)
    Output: adjlist, list of [source ID, target ID] pairs, ordered by source
            position and then target position in fasta_list; a record is
            never paired with itself
    Raises: ValueError if k is smaller than 1
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Index every sequence by its k-length prefix so each suffix needs only
    # one dictionary lookup instead of a scan over all records. Sequences
    # shorter than k cannot take part in an overlap of length k; slicing
    # them would silently yield a shorter string and cause false matches.
    by_prefix = {}
    for idx, record in enumerate(fasta_list):
        seq = record[1]
        if len(seq) >= k:
            by_prefix.setdefault(seq[:k], []).append(idx)

    adjlist = []
    for i, record in enumerate(fasta_list):
        seq = record[1]
        if len(seq) < k:
            continue
        for j in by_prefix.get(seq[-k:], ()):
            # A sequence is not compared with itself.
            if i != j:
                adjlist.append([record[0], fasta_list[j][0]])
    return adjlist
if __name__ == "__main__":
    # Command-line entry point: read the FASTA file named by the first
    # argument, compute the overlap graph for k = 3 and write one
    # "source target" edge per line to answer.txt.
    if len(argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        raise SystemExit("usage: python <script> <fasta_file>")
    with open(argv[1]) as f:
        fastalist = fastaparser(f)
    adjlist = overlapgraph(fastalist, 3)
    with open("answer.txt", "w") as v:
        # [1:] drops the leading ">" of each FASTA header.
        v.writelines(
            overlaps[0][1:] + " " + overlaps[1][1:] + "\n"
            for overlaps in adjlist
        )
