2014-05-01 1 views
0

나는 Smith-Waterman algorithm을 사용하여 동적 프로그래밍 행렬을 생성하기 위해 Python을 사용하고 있습니다. 여기 Smith-Waterman 알고리즘을 사용하여 파이썬에서 행렬을 생성합니다.

는 지금까지이 작업은 다음과 같습니다 입력으로

def score(base1,base2): 
    base1=base1.upper() 
    base2=base2.upper() 
    if base1 not in 'ACTG' or base2 not in 'ACTG': 
     print 'Not DNA base!' 
     sys.exit() 
    elif base1==base2: 
     return 3 
    elif base1+base2=='AG' or base1+base2=='GA': 
     return -1 
    elif base1+base2=='CT' or base1+base2=='TC': 
     return -1 
    else: 
     return -2 
import sys 

seq1 = sys.argv[1] 
seq2 = sys.argv[2] 
mRows = len(seq1) 
nCols = len(seq2) 
gap = int(sys.argv[3]) 
matrix = [] 

# generate empty matrix 
for x in range(mRows + 1): 
    matrix.append([]) 
    for y in range(nCols + 1): 
     matrix[x].append(0) 

for i in range(1, mRows + 1): 
    for j in range(1, nCols + 1): 
     dscore = matrix[i-1][j-1] + score(seq1[i-1], seq2[j-1]) 
     vscore = matrix[i-1][j] + gap 
     hscore = matrix[i][j-1] + gap 
     matrix[i][j]=max(0, vscore, hscore, dscore) 

: sw.py ATGCAT의 ACCT -1 내가 매트릭스 출력을 얻을

: 몇 가지 문제 해결 I와

0  0  0  0  0 
0  0  0  0  0 
0  0  0  0  3 
0  0  0  0  2 
0  0  0  0  1 
0  0  0  0  0 
0  0  0  0  3 

을 중첩 된 for 루프에서 j의 최종 값 (이 특정 입력에 대해 4)을 사용하는 점수 만이 행렬, 즉 마지막 열에 저장된다는 것을 알 수있었습니다.

제 질문은 왜 이런 일이 일어나고 어떻게 해결할 수 있습니까? for 루프가 왜 돌아가서 변수 점수를 계속하지 않습니까?

내 문제 해결의 일부 :

for i in range(1, mRows + 1): 
    for j in range(1, nCols + 1): 
     print 'this is i', i 
     print 'this is j', j 
     print 'seq1', seq1[i-1], 'seq2', seq2[j-1] 
     dscore = matrix[i-1][j-1] + score(seq1[i-1], seq2[j-1]) 
     vscore = matrix[i-1][j] + gap 
     hscore = matrix[i][j-1] + gap 
     matrix[i][j]=max(0, vscore, hscore, dscore) 
     print 'Vscore = ', vscore 
     print 'Hscore = ', hscore 
     print 'Dscore = ', dscore 
     print '\n' 

을 제공합니다

this is i 1 
this is j 1 
seq1 A seq2 A 
this is i 1 
this is j 2 
seq1 A seq2 C 
this is i 1 
this is j 3 
seq1 A seq2 C 
this is i 1 
this is j 4 
seq1 A seq2 T 
Vscore = -1 
Hscore = -1 
Dscore = -2 


this is i 2 
this is j 1 
seq1 T seq2 A 
this is i 2 
this is j 2 
seq1 T seq2 C 
this is i 2 
this is j 3 
seq1 T seq2 C 
this is i 2 
this is j 4 
seq1 T seq2 T 
Vscore = -1 
Hscore = -1 
Dscore = 3 


this is i 3 
this is j 1 
seq1 G seq2 A 
this is i 3 
this is j 2 
seq1 G seq2 C 
this is i 3 
this is j 3 
seq1 G seq2 C 
this is i 3 
this is j 4 
seq1 G seq2 T 
Vscore = 2 
Hscore = -1 
Dscore = -2 


this is i 4 
this is j 1 
seq1 C seq2 A 
this is i 4 
this is j 2 
seq1 C seq2 C 
this is i 4 
this is j 3 
seq1 C seq2 C 
this is i 4 
this is j 4 
seq1 C seq2 T 
Vscore = 1 
Hscore = -1 
Dscore = -1 


this is i 5 
this is j 1 
seq1 A seq2 A 
this is i 5 
this is j 2 
seq1 A seq2 C 
this is i 5 
this is j 3 
seq1 A seq2 C 
this is i 5 
this is j 4 
seq1 A seq2 T 
Vscore = 0 
Hscore = -1 
Dscore = -2 


this is i 6 
this is j 1 
seq1 T seq2 A 
this is i 6 
this is j 2 
seq1 T seq2 C 
this is i 6 
this is j 3 
seq1 T seq2 C 
this is i 6 
this is j 4 
seq1 T seq2 T 
Vscore = -1 
Hscore = -1 
Dscore = 3 

감사합니다!

답변

1

Smith-Waterman 알고리즘에 대한 클래스를 작성했습니다. 유용하다고 생각합니다. 나는 일반적으로 더 크고 더 큰 시퀀스를위한리스트보다 효율적이기 때문에이 작업을 위해 numpy 배열을 사용할 것을 권한다.

class LocalAligner(object): 
#Constructor 
def __init__(self, match=None, mismatch=None, gap=None, fileP=None, fileQ=None): 
    #Default args 
    #StringQ = GCTGGAAGGCAT 
    #StringP = GCAGAGCACG 
    #Match Score = +5, Mismatch Score = -4, Gap Penalty = -4 
    if match is None and mismatch is None and gap is None and fileP is None and fileQ is None: 
     #string1 
     self.q = "GCTGGAAGGCAT" 
     self.stringQName = "Sequence Q" 

     #string2 
     self.p = "GCAGAGCACG" 
     self.stringPName = "Sequence P" 

     #Scoring parameter 
     self.gapPen = -4 
     self.mismatchPen = -4 
     self.matchScore = 5 

    #User has given sequences and scoring arguments to the object 
    elif match is not None and mismatch is not None and gap is not None and fileP is not None and fileQ is not None: 
     #Default string name if one is not present in the file 
     self.stringQName = "String Q" 
     self.q = self.parseFile(fileQ, 1) 

     #Default string name if one is not present in the file 
     self.stringQName = "String P" 
     self.p = self.parseFile(fileP, 2) 

     #Scoring parameters given at the command line 
     self.gapPen = int(gap) 
     self.mismatchPen = int(mismatch) 
     self.matchScore = int(match) 


    #Final sequence alignments 
    self.finalQ = "" 
    self.finalP = "" 

    #Create a table and initialize to zero 
    #We will use numpy arrays as they are generally more efficient than lists for large amounts of data (ie sequences) 
    self.MatrixA = np.empty(shape=[len(self.p)+1,len(self.q)+1]) 

    #Create b table 
    self.MatrixB = np.empty(shape=[len(self.p)+1,len(self.q)+1]) 

    #Store max score and location 
    self.maxScore = 0 
    self.maxI = None 
    self.maxJ =None 

#Populates the A and B tables 
#A table holds the scores and the B table holds the direction of the optimal solution for each sub problem 
def calcTables(self): 
    #insert initial blank string 1 
    try: 
     self.q = '-' + self.q 
    except IOError: 
     print("Error with sequence 1") 

    #insert initial blank string 2 
    try: 
     self.p = '-' + self.p 
    except IOError: 
     print("Error with sequence 2") 

    #Initialize row and column 0 for A and B tables 
    self.MatrixA[:,0] = 0 
    self.MatrixA[0,:] = 0 
    self.MatrixB[:,0] = 0 
    self.MatrixB[0,:] = 0 

    for i in range(1,len(self.p)): 
     for j in range(1, len(self.q)): 

      #Look for match 
      if self.p[i] == self.q[j]: 
       #Match found 
       self.MatrixA[i][j] = self.MatrixA[i-1][j-1] + self.matchScore 
       #3 == "diagonal" for traversing solution 
       self.MatrixB[i][j] = 3 

       #Check for max score 
       if self.MatrixA[i][j] > self.maxScore: 
        self.maxScore = self.MatrixA[i][j] 
        self.maxI = i 
        self.maxJ = j 

      #Match not found 
      else: 
       self.MatrixA[i][j] = self.findMaxScore(i,j) 


#Finds the maximum score either in the north or west neighbor in the A table 
#Due to the ordering, gaps are checked first 
def findMaxScore(self, i, j): 

    #North score 
    qDelet = self.MatrixA[i-1][j] + self.gapPen 
    #West score 
    pDelet = self.MatrixA[i][j-1] + self.gapPen 
    #Diagonal Score 
    mismatch = self.MatrixA[i-1][j-1] + self.mismatchPen 

    #Determine the max score 
    maxScore = max(qDelet, pDelet, mismatch) 

    #Set B table 
    if qDelet == maxScore: 
     self.MatrixB[i][j] = 2 #2 == "up" for traversing solution 

    elif pDelet == maxScore: 
     self.MatrixB[i][j] = 1 #1 == "left" for traversing solution 

    elif mismatch == maxScore: 
     self.MatrixB[i][j] = 3 #3 == "diagonal" for traversing solution 

    return maxScore 

#Calculate the alignment with the highest score by tracing back the highest scoring local solution 
#Integers: 
#3 -> "DIAGONAL" -> match 
#2 -> "UP" -> gap in string q 
#1 -> "LEFT" -> gap in string p 
#were used in the B table 
def calcAlignemnt(self, i=None, j=None): 

    #Default arguments to the maximum score in the A table 
    if i is None and j is None: 
     i = self.maxI 
     j = self.maxJ 

    #Base case, end of the local alignment 
    if i == 0 or j == 0: 
     return 

    #Optimal solution "DIAGONAL" 
    if self.MatrixB[i][j] == 3: 
     self.calcAlignemnt(i-1 , j-1) 
     self.finalQ += self.q[j] 
     self.finalP += self.p[i] 

    else: 
     #Optimal solution "UP" 
     if self.MatrixB[i][j] == 2: 
      self.calcAlignemnt(i-1, j) 
      self.finalQ += '-' 
      self.finalP += self.p[i] 

     else: 
      #Optimal solution "LEFT" 
      self.calcAlignemnt(i, j-1) 
      self.finalP += '-' 
      self.finalQ += self.p[j] 

#Parse the input sequence file for string 
#Assumes fasta format 
def parseFile(self, filePath, stringNumber): 
    #Empty sequence 
    seq = "" 
    #Parse the file 
    with open(filePath) as f: 
     for line in f: 
      #Remove new line characters 
      line = line.replace('\r',"") #Windows 
      line = line.replace('\n', "") 

      #Header encountered 
      if line.startswith(">"): 
       if stringNumber == 2: 
        self.stringQName = line.replace('>',"") 
        continue 
       elif stringNumber == 1: 
        self.stringPName = line.replace('>',"") 
        continue 
       else: 
        continue 

      #Append line 
      seq += line 
     f.close() 
    return seq 
+0

고마워요! 나는 그 문제가 무엇이든 무작위로 고치려고했다. 귀하의 솔루션은 내 것보다 확실히 우아합니다 : https://github.com/mjmiguel/sequence-alignment – miguel