Needleman-Wunsch.py and Smith-Waterman.py on Github

import numpy as np
import matplotlib.pyplot as plt
import random
from datetime import datetime

startTime = datetime.now()

#Running time: 0:11:36.344549 for 10000
length_sequence = 100
number_of_aligments = 1

# scoring scheme
MATCH    =  1; # +1 for letters that match (or BLOSUM62)
MISMATCH = -1; # -1 for letters that mismatch (or BLOSUM62)
GAP      = -2; # gap-creation penalty
GAP2     = -1; # gap-extension penalty
#BLOSUM = {'C':{'C':9, 'S':-1}}, a nested dictionary
BLOSUM62={'C':{'C':9,'S':-1,'T':-1,'P':-3,'A':0,'G':-3,'N':-3,'D':-3,'E':-4,'Q':-3,'H':-3,'R':-3,'K':-3,'M':-1,'I':-1,'L':-1,'V':-1,'F':-2,'Y':-2,'W':-2},
          'S':{'C':-1,'S':4,'T':1,'P':-1,'A':1,'G':0,'N':1,'D':0,'E':0,'Q':0,'H':-1,'R':-1,'K':0,'M':-1,'I':-2,'L':-2,'V':-2,'F':-2,'Y':-2,'W':-3},
          'T':{'C':-1,'S':1,'T':4,'P':1,'A':-1,'G':1,'N':0,'D':1,'E':0,'Q':0,'H':0,'R':-1,'K':0,'M':-1,'I':-2,'L':-2,'V':-2,'F':-2,'Y':-2,'W':-3},
          'P':{'C':-3,'S':-1,'T':1,'P':7,'A':-1,'G':-2,'N':-1,'D':-1,'E':-1,'Q':-1,'H':-2,'R':-2,'K':-1,'M':-2,'I':-3,'L':-3,'V':-2,'F':-4,'Y':-3,'W':-4},
          'A':{'C':0,'S':1,'T':-1,'P':-1,'A':4,'G':0,'N':-1,'D':-2,'E':-1,'Q':-1,'H':-2,'R':-1,'K':-1,'M':-1,'I':-1,'L':-1,'V':-2,'F':-2,'Y':-2,'W':-3},
          'G':{'C':-3,'S':0,'T':1,'P':-2,'A':0,'G':6,'N':-2,'D':-1,'E':-2,'Q':-2,'H':-2,'R':-2,'K':-2,'M':-3,'I':-4,'L':-4,'V':0,'F':-3,'Y':-3,'W':-2},
          'N':{'C':-3,'S':1,'T':0,'P':-2,'A':-2,'G':0,'N':6,'D':1,'E':0,'Q':0,'H':-1,'R':0,'K':0,'M':-2,'I':-3,'L':-3,'V':-3,'F':-3,'Y':-2,'W':-4},
          'D':{'C':-3,'S':0,'T':1,'P':-1,'A':-2,'G':-1,'N':1,'D':6,'E':2,'Q':0,'H':-1,'R':-2,'K':-1,'M':-3,'I':-3,'L':-4,'V':-3,'F':-3,'Y':-3,'W':-4},
          'E':{'C':-4,'S':0,'T':0,'P':-1,'A':-1,'G':-2,'N':0,'D':2,'E':5,'Q':2,'H':0,'R':0,'K':1,'M':-2,'I':-3,'L':-3,'V':-3,'F':-3,'Y':-2,'W':-3},
          'Q':{'C':-3,'S':0,'T':0,'P':-1,'A':-1,'G':-2,'N':0,'D':0,'E':2,'Q':5,'H':0,'R':1,'K':1,'M':0,'I':-3,'L':-2,'V':-2,'F':-3,'Y':-1,'W':-2},
          'H':{'C':-3,'S':-1,'T':0,'P':-2,'A':-2,'G':-2,'N':1,'D':1,'E':0,'Q':0,'H':8,'R':0,'K':-1,'M':-2,'I':-3,'L':-3,'V':-2,'F':-1,'Y':2,'W':-2},
          'R':{'C':-3,'S':-1,'T':-1,'P':-2,'A':-1,'G':-2,'N':0,'D':-2,'E':0,'Q':1,'H':0,'R':5,'K':2,'M':-1,'I':-3,'L':-2,'V':-3,'F':-3,'Y':-2,'W':-3},
          'K':{'C':-3,'S':0,'T':0,'P':-1,'A':-1,'G':-2,'N':0,'D':-1,'E':1,'Q':1,'H':-1,'R':2,'K':5,'M':-1,'I':-3,'L':-2,'V':-3,'F':-3,'Y':-2,'W':-3},
          'M':{'C':-1,'S':-1,'T':-1,'P':-2,'A':-1,'G':-3,'N':-2,'D':-3,'E':-2,'Q':0,'H':-2,'R':-1,'K':-1,'M':5,'I':1,'L':2,'V':-2,'F':0,'Y':-1,'W':-1},
          'I':{'C':-1,'S':-2,'T':-2,'P':-3,'A':-1,'G':-4,'N':-3,'D':-3,'E':-3,'Q':-3,'H':-3,'R':-3,'K':-3,'M':1,'I':4,'L':2,'V':1,'F':0,'Y':-1,'W':-3},
          'L':{'C':-1,'S':-2,'T':-2,'P':-3,'A':-1,'G':-4,'N':-3,'D':-4,'E':-3,'Q':-2,'H':-3,'R':-2,'K':-2,'M':2,'I':2,'L':4,'V':3,'F':0,'Y':-1,'W':-2},
          'V':{'C':-1,'S':-2,'T':-2,'P':-2,'A':0,'G':-3,'N':-3,'D':-3,'E':-2,'Q':-2,'H':-3,'R':-3,'K':-2,'M':1,'I':3,'L':1,'V':4,'F':-1,'Y':-1,'W':-3},
          'F':{'C':-2,'S':-2,'T':-2,'P':-4,'A':-2,'G':-3,'N':-3,'D':-3,'E':-3,'Q':-3,'H':-1,'R':-3,'K':-3,'M':0,'I':0,'L':0,'V':-1,'F':6,'Y':3,'W':1},
          'Y':{'C':-2,'S':-2,'T':-2,'P':-3,'A':-2,'G':-3,'N':-2,'D':-3,'E':-2,'Q':-1,'H':2,'R':-2,'K':-2,'M':-1,'I':-1,'L':-1,'V':-1,'F':3,'Y':7,'W':2},
          'W':{'C':-2,'S':-3,'T':-3,'P':-4,'A':-3,'G':-2,'N':-4,'D':-4,'E':-3,'Q':-2,'H':-2,'R':-3,'K':-3,'M':-1,'I':-3,'L':-2,'V':-3,'F':1,'Y':2,'W':11}
          }

def random_AA_seq(length):
    result='M'
    for i in range(length-1):
        result = result+str(random.choice('ACDEFGHIKLMNPQRSTVWY'))
    return result

x_ax=[]

for i in range(0,number_of_aligments):
    sequence1 = random_AA_seq(length_sequence)
    sequence2 = random_AA_seq(length_sequence)
    
    #>sp|P68871|HBB_HUMAN Hemoglobin subunit beta OS=Homo sapiens GN=HBB PE=1 SV=2
    sequence1 = "MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH"
    #>sp|P68225|HBB_MACNE Hemoglobin subunit beta OS=Macaca nemestrina GN=HBB PE=1 SV=2
    sequence2 = "MVHLTPEEKNAVTTLWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSSPDAVMGNPKVKAHGKKVLGAFSDGLNHLDNLKGTFAQLSELHCDKLHVDPENFKLLGNVLVCVLAHHFGKEFTPQVQAAYQKVVAGVANALAHKYH"
    
    print ("Alignment:",i+1)
    print (sequence1)
    print (sequence2)
    #initialisation
    score_matrix = np.zeros([len(sequence2)+1,len(sequence1)+1])
    trace_matrix = np.zeros([len(sequence2)+1,len(sequence1)+1],dtype=str)
    #creation and/or extension penalty
    for j in range(0,len(sequence1)+1):
        score_matrix[0][j] = GAP2*j
        trace_matrix[0][j] = "L"
    for i in range(0,len(sequence2)+1):
        score_matrix[i][0] = GAP2*i
        trace_matrix[i][0] = "U"
    score_matrix[0][0] = 0
    trace_matrix[0][0] = "N"
    print ("Aligning Needleman-Wunsch...",end=" ")
    for i in range(1,len(sequence2)+1):
        for j in range (1,len(sequence1)+1):
            diagonal_score=0
            left_score=0
            up_score=0
            # calculate match/mismatch score, for blosum62 the condition does not apply
            letter1 = sequence1[j-1:j]
            letter2 = sequence2[i-1:i]        
            if (letter1 == letter2):
                diagonal_score = score_matrix[i-1][j-1] + BLOSUM62[letter1][letter2]
                #diagonal_score = score_matrix[i-1][j-1] + MATCH
            else:
                diagonal_score = score_matrix[i-1][j-1] + BLOSUM62[letter1][letter2]
                #diagonal_score = score_matrix[i-1][j-1] + MISMATCH
            # calculate gap scores
            # only apply gap-creation penalty if previously found a match, otherwise it is an gap-extension
            if (trace_matrix[i-1][j] == "D"):
                up_score   = score_matrix[i-1][j] + GAP
            else:
                up_score   = score_matrix[i-1][j] + GAP2
            if (trace_matrix[i][j-1] == "D"):
                left_score = score_matrix[i][j-1] + GAP
            else:       
                left_score = score_matrix[i][j-1] + GAP2   
            # choose best score
            if (diagonal_score >= up_score):
                if (diagonal_score >= left_score):
                    score_matrix[i][j] = diagonal_score
                    trace_matrix[i][j] = "D"
                else:
                    score_matrix[i][j] = left_score
                    trace_matrix[i][j] = "L"
            else:
                if (up_score >= left_score):
                    score_matrix[i][j] = up_score
                    trace_matrix[i][j] = "U"
                else:
                    score_matrix[i][j] = left_score
                    trace_matrix[i][j] = "L"
                    
    #print ("Score Matrix:")
    #print (score_matrix)
    #print ("Trace Matrix:")
    #print (trace_matrix)
    align1 = ""
    align2 = ""
    tracking = ""
    j = len(sequence1)
    i = len(sequence2)
    #print ("Backtracking:",end="\n")
    tracking_score = []
    while trace_matrix[i][j] != "N":
        tracking = tracking + trace_matrix[i][j]
        tracking_score.append(score_matrix[i][j])
        if (trace_matrix[i][j] == "D"):
            align1 = align1 + sequence1[j-1:j]
            align2 = align2 + sequence2[i-1:i]
            i=i-1
            j=j-1
        elif (trace_matrix[i][j] == "L"):
            align1 = align1 + sequence1[j-1:j]
            align2 = align2 + "-"
            j=j-1
        elif (trace_matrix[i][j] == "U"):
            align1 = align1 + "-"
            align2 = align2 + sequence2[i-1:i]
            i=i-1
    print ("done")
    print ("Scoring - Backtrace - Alignment:",end="\n")
    align1 = align1[::-1]
    align2 = align2[::-1]
    tracking = tracking[::-1]
    print (list(reversed(tracking_score)))
    print (tracking)
    print (align1)
    print (align2)                
    print (" score:",score_matrix[len(sequence2)][len(sequence1)])
    x_ax.append(score_matrix[len(sequence2)][len(sequence1)])

print ("Running time:",datetime.now() - startTime)

if (number_of_aligments >5):
    # the histogram of the data
    plt.hist(x_ax, 50, normed=1, facecolor='green')
    plt.xlabel('Score')
    plt.ylabel('Probability')
    plt.title('Monte-Carlo simulation of '+str(number_of_aligments)+' alignment scores using BLOSUM62 and ('+str(GAP)+','+str(GAP2)+') as (creation,extension penalty)')
    #plt.axis([-80,-40,0,0.2])
    plt.axis([0,200,0,0.1])
    plt.grid(True)
    plt.show()