Commit 7c8b1c79 authored by Siu, Pui Chung's avatar Siu, Pui Chung
Browse files

Upload New File

parent 56c19691
#!/usr/bin/env python3
"""
Author: Jacky Siu Pui Chung Student number = 1047527
Description: this is a script to replicate needleman-wunsch algorithm in
algorithm in bioinformatics assignment2
Usage:
"""
#import statements here
# functions between here and __main__
blosum = """
# http://www.ncbi.nlm.nih.gov/Class/FieldGuide/BLOSUM62.txt
# Matrix made by matblas from blosum62.iij
# * column uses minimum score
# BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
# Blocks Database = /data/blocks_5.0/blocks.dat
# Cluster Percentage: >= 62
# Entropy = 0.6979, Expected = -0.5209
A R N D C Q E G H I L K M F P S T W Y V B Z X *
A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4
R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4
N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4
D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4
C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4
Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4
E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4
H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4
I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4
L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4
K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4
M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4
F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4
S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4
T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4
Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4
V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4
B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4
Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4
* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1
"""
def blosum62():
"""Return order and similarity scores from BLOSUM62 matrix
order: dict of {res: idx_in_matrix}
blosum_matrix: list of lists with similarity scores
"""
order = {}
blosum_matrix = []
for line in blosum.split('\n'):
if line.startswith('#'):
continue
if not line.strip():
continue
parts = line.strip().split()
if len(parts) == 24:
for idx, sym in enumerate(parts):
order[sym] = idx
else:
# list around the map construction for python3 compatibility
blosum_matrix.append(list(map(int,parts[1:])))
return order, blosum_matrix
BLOSUM62_ORDER, BLOSUM62_MATRIX = blosum62()
def score(res1, res2):
"""Return similarity score from BLOSUM62 matrix for two residues
res1: string, amino acid
res2: string, amino acid
"""
lookup1 = BLOSUM62_ORDER[res1]
lookup2 = BLOSUM62_ORDER[res2]
return BLOSUM62_MATRIX[lookup1][lookup2]
# write your own functions below here
def initialmatrix(seq1, seq2, end_gap):
"""Return initial matrix with penality scores at (i,0) and (0,j)
input:
seq1: string, sequence of interest 1
seq2: string, sequence of interest 2
end_gap: end gap penalty score, positive value
Output:
matrix: list of list, initliazed matrix with multiples of end gap
penality score on the top and left edges
"""
#initialize matrix dimension
matrix = [[0 for j in range(len(seq1) + 1)] for i in range(len(seq2) + 1)]
for i in range(len(seq1)+1): # initialize column gap penalities
matrix[0][i] = -i * end_gap
for j in range(len(seq2)+1): # initialize row gap penalities
matrix[j][0] = -j * end_gap
return matrix
def needleman_wunsch(seq1, seq2, penality, matrix):
"""Return matrix with penality scores, and pointer matrix
for traceback
input:
seq1: sequence of interest 1
seq2: sequence of interest 2
penalty: gap penalty score, positive value
matrix: initialized matrix
Output:
matrix: filled in matrix with all penalty scores
Ptr: pointer matrix with strings of three direction for traceback
"""
gap_pen = matrix[0][1] * -1
print(gap_pen)
traceback_dir = ["diag", "left", "up"]
Ptr = [[None for j in range(len(seq1) + 1)] for i in range(len(seq2) + 1)]
for i in range(len(seq2)+1)[1:]: # exclude matrix(i = 0, j = 0)
for j in range(len(seq1)+1)[1:]:
#maximise matrix element in EQ5.18
if i != (len(seq2)) or j != (len(seq1)):
f = [matrix[i - 1][j - 1] + score(seq2[i - 1],seq1[j - 1]),
matrix[i - 1][j] - penality, matrix[i][j - 1] - penality]
matrix[i][j] = max(f)
#assign direction labels in matrix Ptr
Ptr[i][j] = traceback_dir[f.index(max(f))]
else:
f = [matrix[i - 1][j - 1] + score(seq2[i - 1],seq1[j - 1]),
matrix[i - 1][j] - gap_pen, matrix[i][j - 1] - gap_pen]
matrix[i][j] = max(f)
Ptr[i][j] = traceback_dir[f.index(max(f))]
algn_score = matrix[-1][-1]
return matrix, Ptr, algn_score
def traceback(seq1, seq2, matrix, Ptr):
"""Return string alignments for seq1 and seq2, as well as
string matches in readable format
input:
seq1: sequence of interest 1
seq2: sequence of interest 2
matrix: filled in matrix with all penalty scores
Ptr: pointer matrix with strings of three direction for traceback
Output:
alignment_1: string of alignment of sequence 1 resulted from traceback
alignment_2: string alignment of sequence 2 resulted from traceback
matches: string of connections between alignment_1 and alignment_2
"""
alignment_1, alignment_2 = "", ""
i = len(seq1)
j = len(seq2)
matches = ""
traceback_vector = {"diag":[-1, -1],"left":[-1, 0],"up":[0, -1]} #diag, left, up
while i > 0 and j > 0:
direction = Ptr[i][j]
x, y = traceback_vector[direction][0], traceback_vector[direction][1]
if direction == "diag":
alignment_1 += seq1[i - 1]
alignment_2 += seq2[j - 1]
if seq1[i - 1] == seq2[j - 1]:
matches += "|"
else:
matches += " "
elif direction == "left":
alignment_1 += seq1[i - 1]
alignment_2 += "-"
matches += " "
elif direction == "up":
alignment_1 += "-"
alignment_2 += seq2[j - 1]
matches += " "
i,j = i + x, j + y
if i != 0:
alignment_1 += seq1[:i][::-1]
alignment_2 += "-" * i
matches += " " * i
elif j != 0:
alignment_1 += "-" * j
alignment_2 += seq2[:j][::-1]
matches += " " * j
return(alignment_1[::-1], matches[::-1], alignment_2[::-1])
def identity_per(matches):
return (matches.count("|") / len(matches)) *100
if __name__ == "__main__":
seq1 = "THISLINE" # column
seq2 = "ISALIGNED" # row
# seq3: GPA1_ARATH
seq3 = ("MGLLCSRSRHHTEDTDENTQAAEIERRIEQEAKAEKHIRKLLLLGAGESGKSTIFKQIKLLFQ"
"TGFDEGELKSYVPVIHANVYQTIKLLHDGTKEFAQNETDSAKYMLSSESIAIGEKLSEIGGRLDYPRLTKD"
"IAEGIETLWKDPAIQETCARGNELQVPDCTKYLMENLKRLSDINYIPTKEDVLYARVRTTGVVEIQFSPVG"
"ENKKSGEVYRLFDVGGQRNERRKWIHLFEGVTAVIFCAAISEYDQTLFEDEQKNRMMETKELFDWVLKQPC"
"FEKTSFMLFLNKFDIFEKKVLDVPLNVCEWFRDYQPVSSGKQEIEHAYEFVKKKFEELYYQNTAPDRVDRV"
"FKIYRTTALDQKLVKKTFKLVDETLRRRNLLEA")
# seq4: GPA1 BRANA
seq4=(\
"MGLLCSRSRHHTEDTDENAQAAEIERRIEQEAKAEKHIRKLLLLGAGESGKSTIFKQASS"
"DKRKIIKLLFQTGFDEGELKSYVPVIHANVYQTIKLLHDGTKEFAQNETDPAKYTLSSEN"
"MAIGEKLSEIGARLDYPRLTKDLAEGIETLWNDPAIQETCSRGNELQVPDCTKYLMENLK"
"RLSDVNYIPTKEDVLYARVRTTGVVEIQFSPVGENKKSGEVYRLFDVGGQRNERRKWIHL"
"FEGVTAVIFCAAISEYDQTLFEDEQKNRMMETKELFDWVLKQPCFEKTSIMLFLNKFDIF"
"EKKVLDVPLNVCEWFRDYQPVSSGKQEIEHAYEFVKKKFEELYYQNTAPDRVDRVFKIYR"
"TTALDQKLVKKTFKLVDETLRRRNLLEAGLL")
global_matrix_8 = initialmatrix(seq2, seq1, 8)
global_matrix_4 = initialmatrix(seq2, seq1, 4)
semi_global_matrix_8 = initialmatrix(seq2, seq1, 0)
print("Q1")
matrix_8, ptr_8, score_8 = needleman_wunsch(seq2, seq1, 8, global_matrix_8)
align_1, matches, align_2 = traceback(seq1, seq2, matrix_8, ptr_8)
for rows in range(len(matrix_8)):
print(matrix_8[rows]) #figure 5.9
print("global alignment between seq2 and seq1 within gap penality of 8:")
print(align_1)
print(matches)
print(align_2)
print(score_8)
print(identity_per(matches))
matrix_4, ptr_4, score_4 = needleman_wunsch(seq2, seq1, 4, global_matrix_4)
align_1, matches, align_2= traceback(seq1, seq2, matrix_4, ptr_4)
for rows in range(len(matrix_4)):
print(matrix_4[rows]) #figure 5.11
print("global alignment between seq2 and seq1 within gap penality of 4:")
print(align_1)
print(matches)
print(align_2)
print(score_4)
print(identity_per(matches))
matrix_0, ptr_0, score_0 = needleman_wunsch(seq2, seq1, 8, semi_global_matrix_8)
align_1, matches, align_2= traceback(seq1, seq2, matrix_0, ptr_0)
for rows in range(len(matrix_0)):
print(matrix_0[rows]) #figure 5.12
print("semi global alignment between seq2 and seq1 within gap penality of 8:")
print(align_1)
print(matches)
print(align_2)
print(score_0)
print(identity_per(matches))
seq3, seq4 = "".join(seq3), "".join(seq4)
print(seq3, seq4)
print("Q3")
global_matrix_GPA1 = initialmatrix(seq3, seq4, 1) #end gap penalty
matrix_GPA1, ptr_GPA1, score_GPA1 = needleman_wunsch(seq3, seq4, 5, global_matrix_GPA1)
align_1, matches, align_2= traceback(seq4, seq3, matrix_GPA1, ptr_GPA1)
print("global alignment between GPA1_ARATH and GPA1_BRANA within gap penality of 5 and end gap of 1:")
print(score_GPA1)
print(identity_per(matches))
global_matrix_GPA1_b = initialmatrix(seq3, seq4, 10)#end gap penalty
matrix_GPA1_b, ptr_GPA1_b, score_GPA1_b = needleman_wunsch(seq3, seq4, 5, global_matrix_GPA1_b)
align_1, matches, align_2 = traceback(seq4, seq3, matrix_GPA1_b, ptr_GPA1_b)
print("global alignment between GPA1_ARATH and GPA1_BRANA within gap penality of 5 and end gap of 10:")
print(score_GPA1_b)
print(identity_per(matches))
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment