Sequence alignment - Notes for the Computational Biophysics class

Here I prepare some simple (i.e. not very efficient) implementations of the two algorithms presented in class. The first two versions use linear-gap penalties, the latter two are more general, and can be used to model affine gaps as well.

1Global alignment: the Needleman-Wunsch algorithm¶

def needleman_wunsch(seq1, seq2, match_score=1, mismatch_cost=-1, gap_cost=-1):
    # Initialize matrices
    m, n = len(seq1), len(seq2)
    score_matrix = [[0] * (n + 1) for _ in range(m + 1)]
    traceback_matrix = [[0] * (n + 1) for _ in range(m + 1)]
    
    # Directions for traceback
    MATCH, DELETE, INSERT = 1, 2, 3

    # Initialize scoring and traceback matrices
    for i in range(1, m + 1):
        score_matrix[i][0] = i * gap_cost
        traceback_matrix[i][0] = DELETE
    for j in range(1, n + 1):
        score_matrix[0][j] = j * gap_cost
        traceback_matrix[0][j] = INSERT

    # Fill the scoring and traceback matrices
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            match = score_matrix[i - 1][j - 1] + (match_score if seq1[i - 1] == seq2[j - 1] else mismatch_cost)
            delete = score_matrix[i - 1][j] + gap_cost
            insert = score_matrix[i][j - 1] + gap_cost
            score_matrix[i][j] = max(match, delete, insert)

            # Determine traceback direction
            if score_matrix[i][j] == match:
                traceback_matrix[i][j] = MATCH
            elif score_matrix[i][j] == delete:
                traceback_matrix[i][j] = DELETE
            elif score_matrix[i][j] == insert:
                traceback_matrix[i][j] = INSERT

    # Traceback to get the optimal global alignment
    aligned_seq1 = []
    aligned_seq2 = []
    i, j = m, n

    while i > 0 or j > 0:
        if traceback_matrix[i][j] == MATCH:
            aligned_seq1.append(seq1[i - 1])
            aligned_seq2.append(seq2[j - 1])
            i -= 1
            j -= 1
        elif traceback_matrix[i][j] == DELETE:
            aligned_seq1.append(seq1[i - 1])
            aligned_seq2.append('-')
            i -= 1
        elif traceback_matrix[i][j] == INSERT:
            aligned_seq1.append('-')
            aligned_seq2.append(seq2[j - 1])
            j -= 1

    aligned_seq1.reverse()
    aligned_seq2.reverse()

    return score_matrix[m][n], ''.join(aligned_seq1), ''.join(aligned_seq2)

# Example usage
seq1 = "ACGTCATCA"
seq2 = "TAGTGTCA"
score, aligned_seq1, aligned_seq2 = needleman_wunsch(seq1, seq2)
print("Score:", score)
print("Aligned Sequence 1:", aligned_seq1)
print("Aligned Sequence 2:", aligned_seq2)

Score: 2
Aligned Sequence 1: -ACGTCATCA
Aligned Sequence 2: TA-GT-GTCA

2Local alignment: the Smith-Waterman algorithm¶

def smith_waterman(seq1, seq2, match_score=1, mismatch_cost=-1, gap_cost=-1):
    # Initialize matrices
    m, n = len(seq1), len(seq2)
    score_matrix = [[0] * (n + 1) for _ in range(m + 1)]
    traceback_matrix = [[0] * (n + 1) for _ in range(m + 1)]
    
    # Directions for traceback
    MATCH, DELETE, INSERT, STOP = 1, 2, 3, 0

    max_score = 0
    max_pos = (0, 0)

    # Fill the scoring and traceback matrices
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            match = score_matrix[i - 1][j - 1] + (match_score if seq1[i - 1] == seq2[j - 1] else mismatch_cost)
            delete = score_matrix[i - 1][j] + gap_cost
            insert = score_matrix[i][j - 1] + gap_cost
            score_matrix[i][j] = max(match, delete, insert, 0)

            # Determine traceback direction
            if score_matrix[i][j] == match:
                traceback_matrix[i][j] = MATCH
            elif score_matrix[i][j] == delete:
                traceback_matrix[i][j] = DELETE
            elif score_matrix[i][j] == insert:
                traceback_matrix[i][j] = INSERT
            else:
                traceback_matrix[i][j] = STOP

            # Track the maximum score
            if score_matrix[i][j] > max_score:
                max_score = score_matrix[i][j]
                max_pos = (i, j)

    # Traceback to get the optimal local alignment
    aligned_seq1 = []
    aligned_seq2 = []
    i, j = max_pos

    while traceback_matrix[i][j] != STOP:
        if traceback_matrix[i][j] == MATCH:
            aligned_seq1.append(seq1[i - 1])
            aligned_seq2.append(seq2[j - 1])
            i -= 1
            j -= 1
        elif traceback_matrix[i][j] == DELETE:
            aligned_seq1.append(seq1[i - 1])
            aligned_seq2.append('-')
            i -= 1
        elif traceback_matrix[i][j] == INSERT:
            aligned_seq1.append('-')
            aligned_seq2.append(seq2[j - 1])
            j -= 1
            
    aligned_seq1.reverse()
    aligned_seq2.reverse()

    return max_score, ''.join(aligned_seq1), ''.join(aligned_seq2)

# Example usage
seq1 = "TACGGGCCCGCTAC"
seq2 = "TAGCCCTATCGGTCA"

score, aligned_seq1, aligned_seq2 = smith_waterman(seq1, seq2)
print("Score:", score)
print("Aligned Sequence 1:", aligned_seq1)
print("Aligned Sequence 2:", aligned_seq2)

Score: 4
Aligned Sequence 1: TA-CGG
Aligned Sequence 2: TATCGG

3Global alignment with affine gaps¶

def needleman_wunsch_affine(seq1, seq2, match_score=1, mismatch_cost=-1, gap_open=-5, gap_extend=-1):
    m, n = len(seq1), len(seq2)

    # Initialize matrices
    score_matrix = [[0] * (n + 1) for _ in range(m + 1)]
    traceback_matrix = [[0] * (n + 1) for _ in range(m + 1)]
    gap_seq1 = [[0] * (n + 1) for _ in range(m + 1)] # keep track of gaps in seq1 (insertions)
    gap_seq2 = [[0] * (n + 1) for _ in range(m + 1)] # keep track of gaps in seq2 (deletions)
    
    # Define directions for traceback
    MATCH, DELETE, INSERT = 1, 2, 3    

    # Initialize scoring matrices
    for i in range(1, m + 1):
        score_matrix[i][0] = gap_open + (i - 1) * gap_extend
        gap_seq2[i][0] = score_matrix[i][0]
        traceback_matrix[i][0] = DELETE

    for j in range(1, n + 1):
        score_matrix[0][j] = gap_open + (j - 1) * gap_extend
        gap_seq1[0][j] = score_matrix[0][j]
        traceback_matrix[0][j] = INSERT

    # Fill the scoring and traceback matrices
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            # Calculate match/mismatch score
            match = score_matrix[i - 1][j - 1] + (match_score if seq1[i - 1] == seq2[j - 1] else mismatch_cost)
            
            # Calculate gap scores
            delete = max(score_matrix[i - 1][j] + gap_open, gap_seq2[i - 1][j] + gap_extend)
            insert = max(score_matrix[i][j - 1] + gap_open, gap_seq1[i][j - 1] + gap_extend)

            # Update matrices
            score_matrix[i][j] = max(match, delete, insert)
            gap_seq2[i][j] = delete
            gap_seq1[i][j] = insert

            # Set traceback direction
            if score_matrix[i][j] == match:
                traceback_matrix[i][j] = MATCH
            elif score_matrix[i][j] == delete:
                traceback_matrix[i][j] = DELETE
            elif score_matrix[i][j] == insert:
                traceback_matrix[i][j] = INSERT

    # Traceback to get the optimal global alignment
    aligned_seq1 = []
    aligned_seq2 = []
    i, j = m, n

    while i > 0 or j > 0:
        if traceback_matrix[i][j] == MATCH:
            aligned_seq1.append(seq1[i - 1])
            aligned_seq2.append(seq2[j - 1])
            i -= 1
            j -= 1
        elif traceback_matrix[i][j] == DELETE:
            aligned_seq1.append(seq1[i - 1])
            aligned_seq2.append('-')
            i -= 1
        elif traceback_matrix[i][j] == INSERT:
            aligned_seq1.append('-')
            aligned_seq2.append(seq2[j - 1])
            j -= 1

    aligned_seq1.reverse()
    aligned_seq2.reverse()

    return score_matrix[m][n], ''.join(aligned_seq1), ''.join(aligned_seq2)

# Example usage
seq1 = "ACGTCATCA"
seq2 = "TAGTGTCA"

score, aligned_seq1, aligned_seq2 = needleman_wunsch_affine(seq1, seq2)
print("Score:", score)
print("Aligned Sequence 1:", aligned_seq1)
print("Aligned Sequence 2:", aligned_seq2)

Score: -2
Aligned Sequence 1: AC-GTCATCA
Aligned Sequence 2: --TAGTGTCA

4Local alignment with affine gaps¶

def smith_waterman_affine(seq1, seq2, match_score=5, mismatch_cost=-4, gap_open=-5, gap_extend=-1):
    m, n = len(seq1), len(seq2)

    # Initialize matrices
    score_matrix = [[0] * (n + 1) for _ in range(m + 1)]
    traceback_matrix = [[0] * (n + 1) for _ in range(m + 1)]
    gap_seq1 = [[0] * (n + 1) for _ in range(m + 1)]
    gap_seq2 = [[0] * (n + 1) for _ in range(m + 1)]

    # Define directions for traceback
    MATCH, DELETE, INSERT, STOP = 1, 2, 3, 0

    max_score = 0
    max_pos = (0, 0)

    # Fill the scoring and traceback matrices
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            match = score_matrix[i - 1][j - 1] + (match_score if seq1[i - 1] == seq2[j - 1] else mismatch_cost)
            
            # Calculate gap scores
            delete = max(score_matrix[i - 1][j] + gap_open, gap_seq2[i - 1][j] + gap_extend)
            insert = max(score_matrix[i][j - 1] + gap_open, gap_seq1[i][j - 1] + gap_extend)
            
            # Choose the best score for the current position
            score_matrix[i][j] = max(match, delete, insert, 0)
            gap_seq2[i][j] = delete
            gap_seq1[i][j] = insert

            # Determine the traceback direction
            if score_matrix[i][j] == match:
                traceback_matrix[i][j] = MATCH
            elif score_matrix[i][j] == delete:
                traceback_matrix[i][j] = DELETE
            elif score_matrix[i][j] == insert:
                traceback_matrix[i][j] = INSERT
            else:
                traceback_matrix[i][j] = STOP

            # Track the maximum score
            if score_matrix[i][j] > max_score:
                max_score = score_matrix[i][j]
                max_pos = (i, j)

    # Traceback to get the optimal local alignment
    aligned_seq1 = []
    aligned_seq2 = []
    i, j = max_pos

    while traceback_matrix[i][j] != STOP and i > 0 and j > 0:
        if traceback_matrix[i][j] == MATCH:
            aligned_seq1.append(seq1[i - 1])
            aligned_seq2.append(seq2[j - 1])
            i -= 1
            j -= 1
        elif traceback_matrix[i][j] == DELETE:
            aligned_seq1.append(seq1[i - 1])
            aligned_seq2.append('-')
            i -= 1
        elif traceback_matrix[i][j] == INSERT:
            aligned_seq1.append('-')
            aligned_seq2.append(seq2[j - 1])
            j -= 1

    aligned_seq1.reverse()
    aligned_seq2.reverse()

    return max_score, ''.join(aligned_seq1), ''.join(aligned_seq2)

# Example usage
seq1 = "TACGGGCCCGCTAC"
seq2 = "TAGCCCTATCGGTCA"

score, aligned_seq1, aligned_seq2 = smith_waterman_affine(seq1, seq2)
print("Score:", score)
print("Aligned Sequence 1:", aligned_seq1)
print("Aligned Sequence 2:", aligned_seq2)

Score: 27
Aligned Sequence 1: TACGGGCCCGCTA
Aligned Sequence 2: TA---G-CC-CTA