Here I prepare some simple (i.e. not very efficient) implementations of the two algorithms presented in class. The first two versions use linear-gap penalties, the latter two are more general, and can be used to model affine gaps as well.
1Global alignment: the Needleman-Wunsch algorithm¶
def needleman_wunsch(seq1, seq2, match_score=1, mismatch_cost=-1, gap_cost=-1):
# Initialize matrices
m, n = len(seq1), len(seq2)
score_matrix = [[0] * (n + 1) for _ in range(m + 1)]
traceback_matrix = [[0] * (n + 1) for _ in range(m + 1)]
# Directions for traceback
MATCH, DELETE, INSERT = 1, 2, 3
# Initialize scoring and traceback matrices
for i in range(1, m + 1):
score_matrix[i][0] = i * gap_cost
traceback_matrix[i][0] = DELETE
for j in range(1, n + 1):
score_matrix[0][j] = j * gap_cost
traceback_matrix[0][j] = INSERT
# Fill the scoring and traceback matrices
for i in range(1, m + 1):
for j in range(1, n + 1):
match = score_matrix[i - 1][j - 1] + (match_score if seq1[i - 1] == seq2[j - 1] else mismatch_cost)
delete = score_matrix[i - 1][j] + gap_cost
insert = score_matrix[i][j - 1] + gap_cost
score_matrix[i][j] = max(match, delete, insert)
# Determine traceback direction
if score_matrix[i][j] == match:
traceback_matrix[i][j] = MATCH
elif score_matrix[i][j] == delete:
traceback_matrix[i][j] = DELETE
elif score_matrix[i][j] == insert:
traceback_matrix[i][j] = INSERT
# Traceback to get the optimal global alignment
aligned_seq1 = []
aligned_seq2 = []
i, j = m, n
while i > 0 or j > 0:
if traceback_matrix[i][j] == MATCH:
aligned_seq1.append(seq1[i - 1])
aligned_seq2.append(seq2[j - 1])
i -= 1
j -= 1
elif traceback_matrix[i][j] == DELETE:
aligned_seq1.append(seq1[i - 1])
aligned_seq2.append('-')
i -= 1
elif traceback_matrix[i][j] == INSERT:
aligned_seq1.append('-')
aligned_seq2.append(seq2[j - 1])
j -= 1
aligned_seq1.reverse()
aligned_seq2.reverse()
return score_matrix[m][n], ''.join(aligned_seq1), ''.join(aligned_seq2)
# Example usage
seq1 = "ACGTCATCA"
seq2 = "TAGTGTCA"
score, aligned_seq1, aligned_seq2 = needleman_wunsch(seq1, seq2)
print("Score:", score)
print("Aligned Sequence 1:", aligned_seq1)
print("Aligned Sequence 2:", aligned_seq2)
Score: 2
Aligned Sequence 1: -ACGTCATCA
Aligned Sequence 2: TA-GT-GTCA
2Local alignment: the Smith-Waterman algorithm¶
def smith_waterman(seq1, seq2, match_score=1, mismatch_cost=-1, gap_cost=-1):
# Initialize matrices
m, n = len(seq1), len(seq2)
score_matrix = [[0] * (n + 1) for _ in range(m + 1)]
traceback_matrix = [[0] * (n + 1) for _ in range(m + 1)]
# Directions for traceback
MATCH, DELETE, INSERT, STOP = 1, 2, 3, 0
max_score = 0
max_pos = (0, 0)
# Fill the scoring and traceback matrices
for i in range(1, m + 1):
for j in range(1, n + 1):
match = score_matrix[i - 1][j - 1] + (match_score if seq1[i - 1] == seq2[j - 1] else mismatch_cost)
delete = score_matrix[i - 1][j] + gap_cost
insert = score_matrix[i][j - 1] + gap_cost
score_matrix[i][j] = max(match, delete, insert, 0)
# Determine traceback direction
if score_matrix[i][j] == match:
traceback_matrix[i][j] = MATCH
elif score_matrix[i][j] == delete:
traceback_matrix[i][j] = DELETE
elif score_matrix[i][j] == insert:
traceback_matrix[i][j] = INSERT
else:
traceback_matrix[i][j] = STOP
# Track the maximum score
if score_matrix[i][j] > max_score:
max_score = score_matrix[i][j]
max_pos = (i, j)
# Traceback to get the optimal local alignment
aligned_seq1 = []
aligned_seq2 = []
i, j = max_pos
while traceback_matrix[i][j] != STOP:
if traceback_matrix[i][j] == MATCH:
aligned_seq1.append(seq1[i - 1])
aligned_seq2.append(seq2[j - 1])
i -= 1
j -= 1
elif traceback_matrix[i][j] == DELETE:
aligned_seq1.append(seq1[i - 1])
aligned_seq2.append('-')
i -= 1
elif traceback_matrix[i][j] == INSERT:
aligned_seq1.append('-')
aligned_seq2.append(seq2[j - 1])
j -= 1
aligned_seq1.reverse()
aligned_seq2.reverse()
return max_score, ''.join(aligned_seq1), ''.join(aligned_seq2)
# Example usage
seq1 = "TACGGGCCCGCTAC"
seq2 = "TAGCCCTATCGGTCA"
score, aligned_seq1, aligned_seq2 = smith_waterman(seq1, seq2)
print("Score:", score)
print("Aligned Sequence 1:", aligned_seq1)
print("Aligned Sequence 2:", aligned_seq2)
Score: 4
Aligned Sequence 1: TA-CGG
Aligned Sequence 2: TATCGG
3Global alignment with affine gaps¶
def needleman_wunsch_affine(seq1, seq2, match_score=1, mismatch_cost=-1, gap_open=-5, gap_extend=-1):
m, n = len(seq1), len(seq2)
# Initialize matrices
score_matrix = [[0] * (n + 1) for _ in range(m + 1)]
traceback_matrix = [[0] * (n + 1) for _ in range(m + 1)]
gap_seq1 = [[0] * (n + 1) for _ in range(m + 1)] # keep track of gaps in seq1 (insertions)
gap_seq2 = [[0] * (n + 1) for _ in range(m + 1)] # keep track of gaps in seq2 (deletions)
# Define directions for traceback
MATCH, DELETE, INSERT = 1, 2, 3
# Initialize scoring matrices
for i in range(1, m + 1):
score_matrix[i][0] = gap_open + (i - 1) * gap_extend
gap_seq2[i][0] = score_matrix[i][0]
traceback_matrix[i][0] = DELETE
for j in range(1, n + 1):
score_matrix[0][j] = gap_open + (j - 1) * gap_extend
gap_seq1[0][j] = score_matrix[0][j]
traceback_matrix[0][j] = INSERT
# Fill the scoring and traceback matrices
for i in range(1, m + 1):
for j in range(1, n + 1):
# Calculate match/mismatch score
match = score_matrix[i - 1][j - 1] + (match_score if seq1[i - 1] == seq2[j - 1] else mismatch_cost)
# Calculate gap scores
delete = max(score_matrix[i - 1][j] + gap_open, gap_seq2[i - 1][j] + gap_extend)
insert = max(score_matrix[i][j - 1] + gap_open, gap_seq1[i][j - 1] + gap_extend)
# Update matrices
score_matrix[i][j] = max(match, delete, insert)
gap_seq2[i][j] = delete
gap_seq1[i][j] = insert
# Set traceback direction
if score_matrix[i][j] == match:
traceback_matrix[i][j] = MATCH
elif score_matrix[i][j] == delete:
traceback_matrix[i][j] = DELETE
elif score_matrix[i][j] == insert:
traceback_matrix[i][j] = INSERT
# Traceback to get the optimal global alignment
aligned_seq1 = []
aligned_seq2 = []
i, j = m, n
while i > 0 or j > 0:
if traceback_matrix[i][j] == MATCH:
aligned_seq1.append(seq1[i - 1])
aligned_seq2.append(seq2[j - 1])
i -= 1
j -= 1
elif traceback_matrix[i][j] == DELETE:
aligned_seq1.append(seq1[i - 1])
aligned_seq2.append('-')
i -= 1
elif traceback_matrix[i][j] == INSERT:
aligned_seq1.append('-')
aligned_seq2.append(seq2[j - 1])
j -= 1
aligned_seq1.reverse()
aligned_seq2.reverse()
return score_matrix[m][n], ''.join(aligned_seq1), ''.join(aligned_seq2)
# Example usage
seq1 = "ACGTCATCA"
seq2 = "TAGTGTCA"
score, aligned_seq1, aligned_seq2 = needleman_wunsch_affine(seq1, seq2)
print("Score:", score)
print("Aligned Sequence 1:", aligned_seq1)
print("Aligned Sequence 2:", aligned_seq2)
Score: -2
Aligned Sequence 1: AC-GTCATCA
Aligned Sequence 2: --TAGTGTCA
4Local alignment with affine gaps¶
def smith_waterman_affine(seq1, seq2, match_score=5, mismatch_cost=-4, gap_open=-5, gap_extend=-1):
m, n = len(seq1), len(seq2)
# Initialize matrices
score_matrix = [[0] * (n + 1) for _ in range(m + 1)]
traceback_matrix = [[0] * (n + 1) for _ in range(m + 1)]
gap_seq1 = [[0] * (n + 1) for _ in range(m + 1)]
gap_seq2 = [[0] * (n + 1) for _ in range(m + 1)]
# Define directions for traceback
MATCH, DELETE, INSERT, STOP = 1, 2, 3, 0
max_score = 0
max_pos = (0, 0)
# Fill the scoring and traceback matrices
for i in range(1, m + 1):
for j in range(1, n + 1):
match = score_matrix[i - 1][j - 1] + (match_score if seq1[i - 1] == seq2[j - 1] else mismatch_cost)
# Calculate gap scores
delete = max(score_matrix[i - 1][j] + gap_open, gap_seq2[i - 1][j] + gap_extend)
insert = max(score_matrix[i][j - 1] + gap_open, gap_seq1[i][j - 1] + gap_extend)
# Choose the best score for the current position
score_matrix[i][j] = max(match, delete, insert, 0)
gap_seq2[i][j] = delete
gap_seq1[i][j] = insert
# Determine the traceback direction
if score_matrix[i][j] == match:
traceback_matrix[i][j] = MATCH
elif score_matrix[i][j] == delete:
traceback_matrix[i][j] = DELETE
elif score_matrix[i][j] == insert:
traceback_matrix[i][j] = INSERT
else:
traceback_matrix[i][j] = STOP
# Track the maximum score
if score_matrix[i][j] > max_score:
max_score = score_matrix[i][j]
max_pos = (i, j)
# Traceback to get the optimal local alignment
aligned_seq1 = []
aligned_seq2 = []
i, j = max_pos
while traceback_matrix[i][j] != STOP and i > 0 and j > 0:
if traceback_matrix[i][j] == MATCH:
aligned_seq1.append(seq1[i - 1])
aligned_seq2.append(seq2[j - 1])
i -= 1
j -= 1
elif traceback_matrix[i][j] == DELETE:
aligned_seq1.append(seq1[i - 1])
aligned_seq2.append('-')
i -= 1
elif traceback_matrix[i][j] == INSERT:
aligned_seq1.append('-')
aligned_seq2.append(seq2[j - 1])
j -= 1
aligned_seq1.reverse()
aligned_seq2.reverse()
return max_score, ''.join(aligned_seq1), ''.join(aligned_seq2)
# Example usage
seq1 = "TACGGGCCCGCTAC"
seq2 = "TAGCCCTATCGGTCA"
score, aligned_seq1, aligned_seq2 = smith_waterman_affine(seq1, seq2)
print("Score:", score)
print("Aligned Sequence 1:", aligned_seq1)
print("Aligned Sequence 2:", aligned_seq2)
Score: 27
Aligned Sequence 1: TACGGGCCCGCTA
Aligned Sequence 2: TA---G-CC-CTA