1 """
2 Collection of sequence alignment algorithms.
3
4 @note: The classes in this module have been optimized for performance.
5 Think twice before switching a field to a generally nicer property
6 access, because it turns out that these things often add significant
7 constants to the running time of a dynamic programming algorithm.
8 """
9
10 from csb.bio.sequence import AbstractSequence, SequenceAlignment, RichSequence, ResidueInfo
11 from abc import ABCMeta, abstractmethod
16
19 """
20 Defines a pairwise sequence alignment scoring function.
21 """
22
23 __metaclass__ = ABCMeta
24
25 @abstractmethod
27 """
28 Return the pairwise score of residues C{x} and C{y}.
29 C{x} and C{y} must be comparable (e.g. implement __eq__ and __hash__).
30
31 @param x: first residue
32 @type x: object
33 @param y: second residue
34 @type y: object
35
36 @rtype: float
37
38 @raise ResidueNotFoundError: if C{x} or C{y} cannot be handled by this
39 scoring matrix
40 """
41 pass
42
44 """
45 Simple identity-based scoring matrix.
46
47 @param match: score for a match
48 @type match: float
49 @param mismatch: score for a mismatch
50 @type mismatch: float
51 """
52
53 - def __init__(self, match=1, mismatch=-1):
54
55 self._match = float(match)
56 self._mismatch = float(mismatch)
57
58 @property
60 """
61 Score for a match
62 @rtype: float
63 """
64 return self._match
65
66 @property
68 """
69 Score for a mismatch
70 @rtype: float
71 """
72 return self._mismatch
73
75
76 if x == y:
77 return self._match
78 else:
79 return self._mismatch
80
82 """
83 Similarity scoring matrix.
84
85 @param matrix:
86 """
87
88 BLOSUM62 = {
89 'A': { 'A': 4.0, 'R':-1.0, 'N':-2.0, 'D':-2.0, 'C': 0.0, 'Q':-1.0, 'E':-1.0, 'G': 0.0, 'H':-2.0, 'I':-1.0, 'L':-1.0, 'K':-1.0, 'M':-1.0, 'F':-2.0, 'P':-1.0, 'S': 1.0, 'T': 0.0, 'W':-3.0, 'Y':-2.0, 'V': 0.0, 'B':-2.0, 'Z':-1.0, 'X': 0.0, '*':-4.0 },
90 'R': { 'A':-1.0, 'R': 5.0, 'N': 0.0, 'D':-2.0, 'C':-3.0, 'Q': 1.0, 'E': 0.0, 'G':-2.0, 'H': 0.0, 'I':-3.0, 'L':-2.0, 'K': 2.0, 'M':-1.0, 'F':-3.0, 'P':-2.0, 'S':-1.0, 'T':-1.0, 'W':-3.0, 'Y':-2.0, 'V':-3.0, 'B':-1.0, 'Z': 0.0, 'X':-1.0, '*':-4.0 },
91 'N': { 'A':-2.0, 'R': 0.0, 'N': 6.0, 'D': 1.0, 'C':-3.0, 'Q': 0.0, 'E': 0.0, 'G': 0.0, 'H': 1.0, 'I':-3.0, 'L':-3.0, 'K': 0.0, 'M':-2.0, 'F':-3.0, 'P':-2.0, 'S': 1.0, 'T': 0.0, 'W':-4.0, 'Y':-2.0, 'V':-3.0, 'B': 3.0, 'Z': 0.0, 'X':-1.0, '*':-4.0 },
92 'D': { 'A':-2.0, 'R':-2.0, 'N': 1.0, 'D': 6.0, 'C':-3.0, 'Q': 0.0, 'E': 2.0, 'G':-1.0, 'H':-1.0, 'I':-3.0, 'L':-4.0, 'K':-1.0, 'M':-3.0, 'F':-3.0, 'P':-1.0, 'S': 0.0, 'T':-1.0, 'W':-4.0, 'Y':-3.0, 'V':-3.0, 'B': 4.0, 'Z': 1.0, 'X':-1.0, '*':-4.0 },
93 'C': { 'A': 0.0, 'R':-3.0, 'N':-3.0, 'D':-3.0, 'C': 9.0, 'Q':-3.0, 'E':-4.0, 'G':-3.0, 'H':-3.0, 'I':-1.0, 'L':-1.0, 'K':-3.0, 'M':-1.0, 'F':-2.0, 'P':-3.0, 'S':-1.0, 'T':-1.0, 'W':-2.0, 'Y':-2.0, 'V':-1.0, 'B':-3.0, 'Z':-3.0, 'X':-2.0, '*':-4.0 },
94 'Q': { 'A':-1.0, 'R': 1.0, 'N': 0.0, 'D': 0.0, 'C':-3.0, 'Q': 5.0, 'E': 2.0, 'G':-2.0, 'H': 0.0, 'I':-3.0, 'L':-2.0, 'K': 1.0, 'M': 0.0, 'F':-3.0, 'P':-1.0, 'S': 0.0, 'T':-1.0, 'W':-2.0, 'Y':-1.0, 'V':-2.0, 'B': 0.0, 'Z': 3.0, 'X':-1.0, '*':-4.0 },
95 'E': { 'A':-1.0, 'R': 0.0, 'N': 0.0, 'D': 2.0, 'C':-4.0, 'Q': 2.0, 'E': 5.0, 'G':-2.0, 'H': 0.0, 'I':-3.0, 'L':-3.0, 'K': 1.0, 'M':-2.0, 'F':-3.0, 'P':-1.0, 'S': 0.0, 'T':-1.0, 'W':-3.0, 'Y':-2.0, 'V':-2.0, 'B': 1.0, 'Z': 4.0, 'X':-1.0, '*':-4.0 },
96 'G': { 'A': 0.0, 'R':-2.0, 'N': 0.0, 'D':-1.0, 'C':-3.0, 'Q':-2.0, 'E':-2.0, 'G': 6.0, 'H':-2.0, 'I':-4.0, 'L':-4.0, 'K':-2.0, 'M':-3.0, 'F':-3.0, 'P':-2.0, 'S': 0.0, 'T':-2.0, 'W':-2.0, 'Y':-3.0, 'V':-3.0, 'B':-1.0, 'Z':-2.0, 'X':-1.0, '*':-4.0 },
97 'H': { 'A':-2.0, 'R': 0.0, 'N': 1.0, 'D':-1.0, 'C':-3.0, 'Q': 0.0, 'E': 0.0, 'G':-2.0, 'H': 8.0, 'I':-3.0, 'L':-3.0, 'K':-1.0, 'M':-2.0, 'F':-1.0, 'P':-2.0, 'S':-1.0, 'T':-2.0, 'W':-2.0, 'Y': 2.0, 'V':-3.0, 'B': 0.0, 'Z': 0.0, 'X':-1.0, '*':-4.0 },
98 'I': { 'A':-1.0, 'R':-3.0, 'N':-3.0, 'D':-3.0, 'C':-1.0, 'Q':-3.0, 'E':-3.0, 'G':-4.0, 'H':-3.0, 'I': 4.0, 'L': 2.0, 'K':-3.0, 'M': 1.0, 'F': 0.0, 'P':-3.0, 'S':-2.0, 'T':-1.0, 'W':-3.0, 'Y':-1.0, 'V': 3.0, 'B':-3.0, 'Z':-3.0, 'X':-1.0, '*':-4.0 },
99 'L': { 'A':-1.0, 'R':-2.0, 'N':-3.0, 'D':-4.0, 'C':-1.0, 'Q':-2.0, 'E':-3.0, 'G':-4.0, 'H':-3.0, 'I': 2.0, 'L': 4.0, 'K':-2.0, 'M': 2.0, 'F': 0.0, 'P':-3.0, 'S':-2.0, 'T':-1.0, 'W':-2.0, 'Y':-1.0, 'V': 1.0, 'B':-4.0, 'Z':-3.0, 'X':-1.0, '*':-4.0 },
100 'K': { 'A':-1.0, 'R': 2.0, 'N': 0.0, 'D':-1.0, 'C':-3.0, 'Q': 1.0, 'E': 1.0, 'G':-2.0, 'H':-1.0, 'I':-3.0, 'L':-2.0, 'K': 5.0, 'M':-1.0, 'F':-3.0, 'P':-1.0, 'S': 0.0, 'T':-1.0, 'W':-3.0, 'Y':-2.0, 'V':-2.0, 'B': 0.0, 'Z': 1.0, 'X':-1.0, '*':-4.0 },
101 'M': { 'A':-1.0, 'R':-1.0, 'N':-2.0, 'D':-3.0, 'C':-1.0, 'Q': 0.0, 'E':-2.0, 'G':-3.0, 'H':-2.0, 'I': 1.0, 'L': 2.0, 'K':-1.0, 'M': 5.0, 'F': 0.0, 'P':-2.0, 'S':-1.0, 'T':-1.0, 'W':-1.0, 'Y':-1.0, 'V': 1.0, 'B':-3.0, 'Z':-1.0, 'X':-1.0, '*':-4.0 },
102 'F': { 'A':-2.0, 'R':-3.0, 'N':-3.0, 'D':-3.0, 'C':-2.0, 'Q':-3.0, 'E':-3.0, 'G':-3.0, 'H':-1.0, 'I': 0.0, 'L': 0.0, 'K':-3.0, 'M': 0.0, 'F': 6.0, 'P':-4.0, 'S':-2.0, 'T':-2.0, 'W': 1.0, 'Y': 3.0, 'V':-1.0, 'B':-3.0, 'Z':-3.0, 'X':-1.0, '*':-4.0 },
103 'P': { 'A':-1.0, 'R':-2.0, 'N':-2.0, 'D':-1.0, 'C':-3.0, 'Q':-1.0, 'E':-1.0, 'G':-2.0, 'H':-2.0, 'I':-3.0, 'L':-3.0, 'K':-1.0, 'M':-2.0, 'F':-4.0, 'P': 7.0, 'S':-1.0, 'T':-1.0, 'W':-4.0, 'Y':-3.0, 'V':-2.0, 'B':-2.0, 'Z':-1.0, 'X':-2.0, '*':-4.0 },
104 'S': { 'A': 1.0, 'R':-1.0, 'N': 1.0, 'D': 0.0, 'C':-1.0, 'Q': 0.0, 'E': 0.0, 'G': 0.0, 'H':-1.0, 'I':-2.0, 'L':-2.0, 'K': 0.0, 'M':-1.0, 'F':-2.0, 'P':-1.0, 'S': 4.0, 'T': 1.0, 'W':-3.0, 'Y':-2.0, 'V':-2.0, 'B': 0.0, 'Z': 0.0, 'X': 0.0, '*':-4.0 },
105 'T': { 'A': 0.0, 'R':-1.0, 'N': 0.0, 'D':-1.0, 'C':-1.0, 'Q':-1.0, 'E':-1.0, 'G':-2.0, 'H':-2.0, 'I':-1.0, 'L':-1.0, 'K':-1.0, 'M':-1.0, 'F':-2.0, 'P':-1.0, 'S': 1.0, 'T': 5.0, 'W':-2.0, 'Y':-2.0, 'V': 0.0, 'B':-1.0, 'Z':-1.0, 'X': 0.0, '*':-4.0 },
106 'W': { 'A':-3.0, 'R':-3.0, 'N':-4.0, 'D':-4.0, 'C':-2.0, 'Q':-2.0, 'E':-3.0, 'G':-2.0, 'H':-2.0, 'I':-3.0, 'L':-2.0, 'K':-3.0, 'M':-1.0, 'F': 1.0, 'P':-4.0, 'S':-3.0, 'T':-2.0, 'W': 11.0,'Y': 2.0, 'V':-3.0, 'B':-4.0, 'Z':-3.0, 'X':-2.0, '*':-4.0 },
107 'Y': { 'A':-2.0, 'R':-2.0, 'N':-2.0, 'D':-3.0, 'C':-2.0, 'Q':-1.0, 'E':-2.0, 'G':-3.0, 'H': 2.0, 'I':-1.0, 'L':-1.0, 'K':-2.0, 'M':-1.0, 'F': 3.0, 'P':-3.0, 'S':-2.0, 'T':-2.0, 'W': 2.0, 'Y': 7.0, 'V':-1.0, 'B':-3.0, 'Z':-2.0, 'X':-1.0, '*':-4.0 },
108 'V': { 'A': 0.0, 'R':-3.0, 'N':-3.0, 'D':-3.0, 'C':-1.0, 'Q':-2.0, 'E':-2.0, 'G':-3.0, 'H':-3.0, 'I': 3.0, 'L': 1.0, 'K':-2.0, 'M': 1.0, 'F':-1.0, 'P':-2.0, 'S':-2.0, 'T': 0.0, 'W':-3.0, 'Y':-1.0, 'V': 4.0, 'B':-3.0, 'Z':-2.0, 'X':-1.0, '*':-4.0 },
109 'B': { 'A':-2.0, 'R':-1.0, 'N': 3.0, 'D': 4.0, 'C':-3.0, 'Q': 0.0, 'E': 1.0, 'G':-1.0, 'H': 0.0, 'I':-3.0, 'L':-4.0, 'K': 0.0, 'M':-3.0, 'F':-3.0, 'P':-2.0, 'S': 0.0, 'T':-1.0, 'W':-4.0, 'Y':-3.0, 'V':-3.0, 'B': 4.0, 'Z': 1.0, 'X':-1.0, '*':-4.0 },
110 'Z': { 'A':-1.0, 'R': 0.0, 'N': 0.0, 'D': 1.0, 'C':-3.0, 'Q': 3.0, 'E': 4.0, 'G':-2.0, 'H': 0.0, 'I':-3.0, 'L':-3.0, 'K': 1.0, 'M':-1.0, 'F':-3.0, 'P':-1.0, 'S': 0.0, 'T':-1.0, 'W':-3.0, 'Y':-2.0, 'V':-2.0, 'B': 1.0, 'Z': 4.0, 'X':-1.0, '*':-4.0 },
111 'X': { 'A': 0.0, 'R':-1.0, 'N':-1.0, 'D':-1.0, 'C':-2.0, 'Q':-1.0, 'E':-1.0, 'G':-1.0, 'H':-1.0, 'I':-1.0, 'L':-1.0, 'K':-1.0, 'M':-1.0, 'F':-1.0, 'P':-2.0, 'S': 0.0, 'T': 0.0, 'W':-2.0, 'Y':-1.0, 'V':-1.0, 'B':-1.0, 'Z':-1.0, 'X':-1.0, '*':-4.0 },
112 '*': { 'A':-4.0, 'R':-4.0, 'N':-4.0, 'D':-4.0, 'C':-4.0, 'Q':-4.0, 'E':-4.0, 'G':-4.0, 'H':-4.0, 'I':-4.0, 'L':-4.0, 'K':-4.0, 'M':-4.0, 'F':-4.0, 'P':-4.0, 'S':-4.0, 'T':-4.0, 'W':-4.0, 'Y':-4.0, 'V':-4.0, 'B':-4.0, 'Z':-4.0, 'X':-4.0, '*': 1.0 }
113 }
114
116 self._matrix = matrix
117
123
124 @staticmethod
126 """
127 Parse a standard scoring matrix file, where the first row and
128 column are residue labels.
129
130 @param string: scoring matrix string
131 @type string: str
132
133 @rtype: L{SimilarityMatrix}
134 """
135
136 residues = {}
137 matrix = {}
138
139 for line in string.splitlines():
140 if not line.strip() or line.startswith("#"):
141 continue
142
143 if not residues:
144 residues = line.split()
145
146 else:
147 items = line.split()
148 if len(items) != len(residues) + 1:
149 raise ValueError("{0} scoring columns expected".format(len(residues)))
150
151 try:
152 aa, scores = items[0], map(float, items[1:])
153 matrix[aa] = dict((residues[i], s) for i, s in enumerate(scores))
154 except (KeyError, ValueError):
155 raise ValueError("Corrupt scoring matrix")
156
157 return SimilarityMatrix(matrix)
158
161 """
162 Base class for all sequence alignment algorithms.
163
164 This class was designed with simple sequence alignment algorithms in mind.
165 Implementors have full control over the behavior of the scoring function and
166 the dynamic programming matrix, but implementing things that require
167 additional matrices (such as affine gap penalties) might be trickier.
168
169 @param scoring: scoring matrix
170 @type scoring: L{AbstractScoringMatrix}
171 @param gap: simple gap penalty
172 @type gap: float
173 """
174
175 __metaclass__ = ABCMeta
176
178
179 if not isinstance(scoring, AbstractScoringMatrix):
180 raise TypeError(scoring)
181
182 self._gap = float(gap)
183 self._scoring = scoring
184
185 @property
187 """
188 Simple gap penalty
189 @rtype: float
190 """
191 return self._gap
192
193 @property
195 """
196 Scoring matrix
197 @rtype: L{AbstractScoringMatrix}
198 """
199 return self._scoring
200
201 - def align(self, query, subject):
202 """
203 Align two sequences and return the optimal alignment.
204
205 @param query: first sequence
206 @type query: L{AbstractSequence}
207 @param subject: second sequence
208 @type subject: L{AbstractSequence}
209
210 @rtype: L{AlignmentResult}
211 """
212 if query.length == 0 or subject.length == 0:
213 raise ValueError("Can't align zero length sequence")
214
215
216 qseq = ["*"] + self._sequence(query)
217 sseq = ["*"] + self._sequence(subject)
218
219
220 matrix = []
221 rows, cols = len(query), len(subject)
222
223 for i in range(rows + 1):
224 matrix.append([])
225 for j in range(cols + 1):
226 matrix[i].append(None)
227
228
229 self._initialize(matrix)
230
231
232 for i in range(1, rows + 1):
233 for j in range(1, cols + 1):
234 score = self._score(qseq[i], sseq[j])
235 self._fill(matrix, i, j, score)
236
237
238 return self._traceback(matrix, query, subject)
239
241 """
242 Extract and return the string sequence of {s}.
243
244 @param s: sequence object
245 @type s: L{AbstractSequence}
246
247 @rtype: list of str
248 """
249 return list(s.sequence)
250
251 @abstractmethod
253 """
254 Initialize (typically the first row and column) of the dynamic
255 programming matrix.
256
257 @param matrix: list (2D)
258 @type matrix: list
259 """
260 pass
261
262 - def _score(self, residue1, residue2):
263 """
264 Retrieve the pairwise score of two residues using the current
265 scoring matrix.
266
267 @rtype: float
268 """
269 return self._scoring.score(residue1, residue2)
270
271 - def _fill(self, matrix, i, j, score):
272 """
273 Compute and set the best score that leads to cell i,j in the dynamic
274 programming matrix: right, down or diagonal.
275
276 See also L{AbstractAlignmentAlgorithm._max}.
277
278 @param score: pairwise score at matrix[i][j]
279 @type score: float
280 @return: the best score
281 @rtype: float
282 """
283
284 match = matrix[i-1][j-1] + score
285 insertion = matrix[i][j-1] + self._gap
286 deletion = matrix[i-1][j] + self._gap
287
288 best = self._max(match, insertion, deletion)
289 matrix[i][j] = best
290
291 return best
292
293 @abstractmethod
294 - def _max(self, match, insertion, deletion):
295 """
296 Choose the best score among all given possibilities:
297 scores for match, insertion and deletion. This will determine
298 the direction taken at each step while building the dynamic programming
299 matrix (diagonal, down or right).
300
301 This is an expected notable point of divergence for most sequence
302 alignment algorithms.
303 """
304 pass
305
307 """
308 Trace back and return the optimal alignment.
309 """
310
311 query = []
312 subject = []
313
314
315 qseq = ["*"] + self._sequence(seq1)
316 sseq = ["*"] + self._sequence(seq2)
317
318 i, j = self._terminus(m)
319 qstart, start = i, j
320 qend, end = i, j
321 score = m[i][j]
322
323 while self._expandable(m, i, j):
324
325 if i > 0 and j > 0 and m[i][j] == (m[i-1][j-1] + self._score(qseq[i], sseq[j])):
326 query.append(seq1.residues[i])
327 subject.append(seq2.residues[j])
328 qstart, start = i, j
329 i, j = i - 1, j - 1
330
331 elif i > 0 and m[i][j] == (m[i-1][j] + self._gap):
332 query.append(seq1.residues[i])
333 subject.append(ResidueInfo(-1, seq2.alphabet.GAP))
334 qstart = i
335 i = i - 1
336
337 elif j > 0 and m[i][j] == (m[i][j-1] + self._gap):
338 query.append(ResidueInfo(-1, seq1.alphabet.GAP))
339 subject.append(seq2.residues[j])
340 start = j
341 j = j - 1
342
343 else:
344 assert False
345
346 query.reverse()
347 subject.reverse()
348
349 aligned_query = RichSequence(seq1.id, seq1.header, query, seq1.type)
350 aligned_subject = RichSequence(seq2.id, seq2.header, subject, seq2.type)
351
352 return AlignmentResult(score, aligned_query, aligned_subject, qstart, qend, start, end)
353
354 @abstractmethod
356 """
357 Find the coordinates of the optimal alignment's right endpoint in the
358 dynamic programming matrix. This is the starting point of a traceback.
359
360 @param matrix: the complete dynamic programming matrix
361 @type matrix: 2D list
362
363 @rtype: 2-tuple (i, j)
364 """
365 pass
366
367 @abstractmethod
369 """
370 Return True if the traceback procedure must not terminate at
371 position C{i,j} in the dynamic programming matrix.
372
373 @rtype: bool
374 """
375 pass
376
379 """
380 Needleman-Wunsch global sequence alignment.
381 """
382
385
387
388 for i in range(len(matrix)):
389 matrix[i][0] = self._gap * i
390 for j in range(len(matrix[0])):
391 matrix[0][j] = self._gap * j
392
393 - def _max(self, match, insertion, deletion):
394 return max(match, insertion, deletion)
395
397
398 i = len(matrix) - 1
399 j = len(matrix[0]) - 1
400
401 return (i, j)
402
404 return i > 0 or j > 0
405
407 """
408 Smith-Waterman local sequence alignment.
409 """
410
411 START = 0
412 """
413 Score for initiation of a new local alignment
414 """
415
418
425
426 - def _max(self, match, insertion, deletion):
428
430
431 maxi, maxj = 0, 0
432
433 for i in range(len(matrix)):
434 for j in range(len(matrix[i])):
435 if matrix[i][j] > matrix[maxi][maxj]:
436 maxi, maxj = i, j
437
438 return (maxi, maxj)
439
442
445 """
446 Represents a pairwise sequence alignment result.
447
448 @param score: raw alignment score
449 @type score: float
450 @param query: aligned query sequence (with gaps)
451 @type query: L{AbstractSequence}
452 @param subject: aligned subject sequence (with gaps)
453 @type subject: L{AbstractSequence}
454 @param qstart: query start position
455 @type qstart: int
456 @param qend: query end position
457 @type qend: int
458 @param start: subject start position
459 @type start: int
460 @param end: subject end position
461 @type end: int
462 """
463
464 - def __init__(self, score, query, subject, qstart, qend, start, end):
507
508
510 string = "{0.qstart:5} {0.query.sequence:} {0.qend:<5}\n"
511 string += "{0.start:5} {0.subject.sequence:} {0.end:<5}"
512 return string.format(self)
513
514 @property
516 """
517 Return True if this is an empty alignment (i.e. no matches)
518 @rtype: bool
519 """
520 return self.length == 0 or self.gaps == self.length
521
522 @property
524 """
525 Raw alignment score
526 @rtype: float
527 """
528 return self._score
529
530 @property
532 """
533 Aligned query sequence (with gaps)
534 @rtype: L{AbstractSequence}
535 """
536 return self._query
537
538 @property
540 """
541 Aligned subject sequence (with gaps)
542 @rtype: L{AbstractSequence}
543 """
544 return self._subject
545
546 @property
548 """
549 Query start position
550 @rtype: int
551 """
552 return self._qstart
553
554 @property
556 """
557 Query end position
558 @rtype: int
559 """
560 return self._qend
561
562 @property
564 """
565 Subject start position
566 @rtype: int
567 """
568 return self._start
569
570 @property
572 """
573 Subject end position
574 @rtype: int
575 """
576 return self._end
577
578 @property
580 """
581 Number of identical residues
582 @rtype: int
583 """
584 return self._identicals
585
586 @property
588 """
589 Percentage of identical residues
590 @rtype: int
591 """
592 return float(self._identicals) / self._length
593
594 @property
596 """
597 Total number of gaps (query + subject)
598 @rtype: int
599 """
600 return self._gaps
601
602 @property
604 """
605 Alignment length (query + subject + gaps / 2)
606 @rtype: int
607 """
608 return self._length
609
611 """
612 @return: as a sequence alignment object
613 @rtype: L{SequenceAlignment}
614 """
615 return SequenceAlignment([self.query, self.subject])
616