csb.bio.sequence.alignment

1 """ 2 Collection of sequence alignment algorithms. 3 4 @note: The classes in this module have been optimized for performance. 5 Think twice before switching a field to a generally nicer property 6 access, because it turns out that these things often add significant 7 constants to the running time of a dynamic programming algorithm. 8 """ 9 10 from csb.bio.sequence import AbstractSequence, SequenceAlignment, RichSequence, ResidueInfo 11 from abc import ABCMeta, abstractmethod

12 13 14 -class ResidueNotFoundError(KeyError):

15 pass

16

17 18 -class AbstractScoringMatrix(object):

19 """ 20 Defines a pairwise sequence alignment scoring function. 21 """ 22 23 __metaclass__ = ABCMeta 24 25 @abstractmethod

26 - def score(self, x, y):

27 """ 28 Return the pairwise score of residues C{x} and C{y}. 29 C{x} and C{y} must be comparable (e.g. implement __eq__ and __hash__). 30 31 @param x: first residue 32 @type x: object 33 @param y: second residue 34 @type y: object 35 36 @rtype: float 37 38 @raise ResidueNotFoundError: if C{x} or C{y} cannot be handled by this 39 scoring matrix 40 """ 41 pass

42

43 -class IdentityMatrix(AbstractScoringMatrix):

44 """ 45 Simple identity-based scoring matrix. 46 47 @param match: score for a match 48 @type match: float 49 @param mismatch: score for a mismatch 50 @type mismatch: float 51 """ 52

53 - def __init__(self, match=1, mismatch=-1):

54 55 self._match = float(match) 56 self._mismatch = float(mismatch)

57 58 @property

59 - def match(self):

60 """ 61 Score for a match 62 @rtype: float 63 """ 64 return self._match

65 66 @property

67 - def mismatch(self):

68 """ 69 Score for a mismatch 70 @rtype: float 71 """ 72 return self._mismatch

73

74 - def score(self, x, y):

75 76 if x == y: 77 return self._match 78 else: 79 return self._mismatch

80

81 -class SimilarityMatrix(AbstractScoringMatrix):

82 """ 83 Similarity scoring matrix. 84 85 @param matrix: 86 """ 87 88 BLOSUM62 = { 89 'A': { 'A': 4.0, 'R':-1.0, 'N':-2.0, 'D':-2.0, 'C': 0.0, 'Q':-1.0, 'E':-1.0, 'G': 0.0, 'H':-2.0, 'I':-1.0, 'L':-1.0, 'K':-1.0, 'M':-1.0, 'F':-2.0, 'P':-1.0, 'S': 1.0, 'T': 0.0, 'W':-3.0, 'Y':-2.0, 'V': 0.0, 'B':-2.0, 'Z':-1.0, 'X': 0.0, '*':-4.0 }, 90 'R': { 'A':-1.0, 'R': 5.0, 'N': 0.0, 'D':-2.0, 'C':-3.0, 'Q': 1.0, 'E': 0.0, 'G':-2.0, 'H': 0.0, 'I':-3.0, 'L':-2.0, 'K': 2.0, 'M':-1.0, 'F':-3.0, 'P':-2.0, 'S':-1.0, 'T':-1.0, 'W':-3.0, 'Y':-2.0, 'V':-3.0, 'B':-1.0, 'Z': 0.0, 'X':-1.0, '*':-4.0 }, 91 'N': { 'A':-2.0, 'R': 0.0, 'N': 6.0, 'D': 1.0, 'C':-3.0, 'Q': 0.0, 'E': 0.0, 'G': 0.0, 'H': 1.0, 'I':-3.0, 'L':-3.0, 'K': 0.0, 'M':-2.0, 'F':-3.0, 'P':-2.0, 'S': 1.0, 'T': 0.0, 'W':-4.0, 'Y':-2.0, 'V':-3.0, 'B': 3.0, 'Z': 0.0, 'X':-1.0, '*':-4.0 }, 92 'D': { 'A':-2.0, 'R':-2.0, 'N': 1.0, 'D': 6.0, 'C':-3.0, 'Q': 0.0, 'E': 2.0, 'G':-1.0, 'H':-1.0, 'I':-3.0, 'L':-4.0, 'K':-1.0, 'M':-3.0, 'F':-3.0, 'P':-1.0, 'S': 0.0, 'T':-1.0, 'W':-4.0, 'Y':-3.0, 'V':-3.0, 'B': 4.0, 'Z': 1.0, 'X':-1.0, '*':-4.0 }, 93 'C': { 'A': 0.0, 'R':-3.0, 'N':-3.0, 'D':-3.0, 'C': 9.0, 'Q':-3.0, 'E':-4.0, 'G':-3.0, 'H':-3.0, 'I':-1.0, 'L':-1.0, 'K':-3.0, 'M':-1.0, 'F':-2.0, 'P':-3.0, 'S':-1.0, 'T':-1.0, 'W':-2.0, 'Y':-2.0, 'V':-1.0, 'B':-3.0, 'Z':-3.0, 'X':-2.0, '*':-4.0 }, 94 'Q': { 'A':-1.0, 'R': 1.0, 'N': 0.0, 'D': 0.0, 'C':-3.0, 'Q': 5.0, 'E': 2.0, 'G':-2.0, 'H': 0.0, 'I':-3.0, 'L':-2.0, 'K': 1.0, 'M': 0.0, 'F':-3.0, 'P':-1.0, 'S': 0.0, 'T':-1.0, 'W':-2.0, 'Y':-1.0, 'V':-2.0, 'B': 0.0, 'Z': 3.0, 'X':-1.0, '*':-4.0 }, 95 'E': { 'A':-1.0, 'R': 0.0, 'N': 0.0, 'D': 2.0, 'C':-4.0, 'Q': 2.0, 'E': 5.0, 'G':-2.0, 'H': 0.0, 'I':-3.0, 'L':-3.0, 'K': 1.0, 'M':-2.0, 'F':-3.0, 'P':-1.0, 'S': 0.0, 'T':-1.0, 'W':-3.0, 'Y':-2.0, 'V':-2.0, 'B': 1.0, 'Z': 4.0, 'X':-1.0, '*':-4.0 }, 96 'G': { 'A': 0.0, 'R':-2.0, 'N': 0.0, 'D':-1.0, 'C':-3.0, 'Q':-2.0, 'E':-2.0, 'G': 6.0, 'H':-2.0, 'I':-4.0, 'L':-4.0, 'K':-2.0, 'M':-3.0, 'F':-3.0, 'P':-2.0, 'S': 0.0, 'T':-2.0, 'W':-2.0, 'Y':-3.0, 'V':-3.0, 'B':-1.0, 'Z':-2.0, 'X':-1.0, '*':-4.0 }, 97 'H': { 'A':-2.0, 'R': 0.0, 'N': 1.0, 'D':-1.0, 'C':-3.0, 'Q': 0.0, 'E': 0.0, 'G':-2.0, 'H': 8.0, 'I':-3.0, 'L':-3.0, 'K':-1.0, 'M':-2.0, 'F':-1.0, 'P':-2.0, 'S':-1.0, 'T':-2.0, 'W':-2.0, 'Y': 2.0, 'V':-3.0, 'B': 0.0, 'Z': 0.0, 'X':-1.0, '*':-4.0 }, 98 'I': { 'A':-1.0, 'R':-3.0, 'N':-3.0, 'D':-3.0, 'C':-1.0, 'Q':-3.0, 'E':-3.0, 'G':-4.0, 'H':-3.0, 'I': 4.0, 'L': 2.0, 'K':-3.0, 'M': 1.0, 'F': 0.0, 'P':-3.0, 'S':-2.0, 'T':-1.0, 'W':-3.0, 'Y':-1.0, 'V': 3.0, 'B':-3.0, 'Z':-3.0, 'X':-1.0, '*':-4.0 }, 99 'L': { 'A':-1.0, 'R':-2.0, 'N':-3.0, 'D':-4.0, 'C':-1.0, 'Q':-2.0, 'E':-3.0, 'G':-4.0, 'H':-3.0, 'I': 2.0, 'L': 4.0, 'K':-2.0, 'M': 2.0, 'F': 0.0, 'P':-3.0, 'S':-2.0, 'T':-1.0, 'W':-2.0, 'Y':-1.0, 'V': 1.0, 'B':-4.0, 'Z':-3.0, 'X':-1.0, '*':-4.0 }, 100 'K': { 'A':-1.0, 'R': 2.0, 'N': 0.0, 'D':-1.0, 'C':-3.0, 'Q': 1.0, 'E': 1.0, 'G':-2.0, 'H':-1.0, 'I':-3.0, 'L':-2.0, 'K': 5.0, 'M':-1.0, 'F':-3.0, 'P':-1.0, 'S': 0.0, 'T':-1.0, 'W':-3.0, 'Y':-2.0, 'V':-2.0, 'B': 0.0, 'Z': 1.0, 'X':-1.0, '*':-4.0 }, 101 'M': { 'A':-1.0, 'R':-1.0, 'N':-2.0, 'D':-3.0, 'C':-1.0, 'Q': 0.0, 'E':-2.0, 'G':-3.0, 'H':-2.0, 'I': 1.0, 'L': 2.0, 'K':-1.0, 'M': 5.0, 'F': 0.0, 'P':-2.0, 'S':-1.0, 'T':-1.0, 'W':-1.0, 'Y':-1.0, 'V': 1.0, 'B':-3.0, 'Z':-1.0, 'X':-1.0, '*':-4.0 }, 102 'F': { 'A':-2.0, 'R':-3.0, 'N':-3.0, 'D':-3.0, 'C':-2.0, 'Q':-3.0, 'E':-3.0, 'G':-3.0, 'H':-1.0, 'I': 0.0, 'L': 0.0, 'K':-3.0, 'M': 0.0, 'F': 6.0, 'P':-4.0, 'S':-2.0, 'T':-2.0, 'W': 1.0, 'Y': 3.0, 'V':-1.0, 'B':-3.0, 'Z':-3.0, 'X':-1.0, '*':-4.0 }, 103 'P': { 'A':-1.0, 'R':-2.0, 'N':-2.0, 'D':-1.0, 'C':-3.0, 'Q':-1.0, 'E':-1.0, 'G':-2.0, 'H':-2.0, 'I':-3.0, 'L':-3.0, 'K':-1.0, 'M':-2.0, 'F':-4.0, 'P': 7.0, 'S':-1.0, 'T':-1.0, 'W':-4.0, 'Y':-3.0, 'V':-2.0, 'B':-2.0, 'Z':-1.0, 'X':-2.0, '*':-4.0 }, 104 'S': { 'A': 1.0, 'R':-1.0, 'N': 1.0, 'D': 0.0, 'C':-1.0, 'Q': 0.0, 'E': 0.0, 'G': 0.0, 'H':-1.0, 'I':-2.0, 'L':-2.0, 'K': 0.0, 'M':-1.0, 'F':-2.0, 'P':-1.0, 'S': 4.0, 'T': 1.0, 'W':-3.0, 'Y':-2.0, 'V':-2.0, 'B': 0.0, 'Z': 0.0, 'X': 0.0, '*':-4.0 }, 105 'T': { 'A': 0.0, 'R':-1.0, 'N': 0.0, 'D':-1.0, 'C':-1.0, 'Q':-1.0, 'E':-1.0, 'G':-2.0, 'H':-2.0, 'I':-1.0, 'L':-1.0, 'K':-1.0, 'M':-1.0, 'F':-2.0, 'P':-1.0, 'S': 1.0, 'T': 5.0, 'W':-2.0, 'Y':-2.0, 'V': 0.0, 'B':-1.0, 'Z':-1.0, 'X': 0.0, '*':-4.0 }, 106 'W': { 'A':-3.0, 'R':-3.0, 'N':-4.0, 'D':-4.0, 'C':-2.0, 'Q':-2.0, 'E':-3.0, 'G':-2.0, 'H':-2.0, 'I':-3.0, 'L':-2.0, 'K':-3.0, 'M':-1.0, 'F': 1.0, 'P':-4.0, 'S':-3.0, 'T':-2.0, 'W': 11.0,'Y': 2.0, 'V':-3.0, 'B':-4.0, 'Z':-3.0, 'X':-2.0, '*':-4.0 }, 107 'Y': { 'A':-2.0, 'R':-2.0, 'N':-2.0, 'D':-3.0, 'C':-2.0, 'Q':-1.0, 'E':-2.0, 'G':-3.0, 'H': 2.0, 'I':-1.0, 'L':-1.0, 'K':-2.0, 'M':-1.0, 'F': 3.0, 'P':-3.0, 'S':-2.0, 'T':-2.0, 'W': 2.0, 'Y': 7.0, 'V':-1.0, 'B':-3.0, 'Z':-2.0, 'X':-1.0, '*':-4.0 }, 108 'V': { 'A': 0.0, 'R':-3.0, 'N':-3.0, 'D':-3.0, 'C':-1.0, 'Q':-2.0, 'E':-2.0, 'G':-3.0, 'H':-3.0, 'I': 3.0, 'L': 1.0, 'K':-2.0, 'M': 1.0, 'F':-1.0, 'P':-2.0, 'S':-2.0, 'T': 0.0, 'W':-3.0, 'Y':-1.0, 'V': 4.0, 'B':-3.0, 'Z':-2.0, 'X':-1.0, '*':-4.0 }, 109 'B': { 'A':-2.0, 'R':-1.0, 'N': 3.0, 'D': 4.0, 'C':-3.0, 'Q': 0.0, 'E': 1.0, 'G':-1.0, 'H': 0.0, 'I':-3.0, 'L':-4.0, 'K': 0.0, 'M':-3.0, 'F':-3.0, 'P':-2.0, 'S': 0.0, 'T':-1.0, 'W':-4.0, 'Y':-3.0, 'V':-3.0, 'B': 4.0, 'Z': 1.0, 'X':-1.0, '*':-4.0 }, 110 'Z': { 'A':-1.0, 'R': 0.0, 'N': 0.0, 'D': 1.0, 'C':-3.0, 'Q': 3.0, 'E': 4.0, 'G':-2.0, 'H': 0.0, 'I':-3.0, 'L':-3.0, 'K': 1.0, 'M':-1.0, 'F':-3.0, 'P':-1.0, 'S': 0.0, 'T':-1.0, 'W':-3.0, 'Y':-2.0, 'V':-2.0, 'B': 1.0, 'Z': 4.0, 'X':-1.0, '*':-4.0 }, 111 'X': { 'A': 0.0, 'R':-1.0, 'N':-1.0, 'D':-1.0, 'C':-2.0, 'Q':-1.0, 'E':-1.0, 'G':-1.0, 'H':-1.0, 'I':-1.0, 'L':-1.0, 'K':-1.0, 'M':-1.0, 'F':-1.0, 'P':-2.0, 'S': 0.0, 'T': 0.0, 'W':-2.0, 'Y':-1.0, 'V':-1.0, 'B':-1.0, 'Z':-1.0, 'X':-1.0, '*':-4.0 }, 112 '*': { 'A':-4.0, 'R':-4.0, 'N':-4.0, 'D':-4.0, 'C':-4.0, 'Q':-4.0, 'E':-4.0, 'G':-4.0, 'H':-4.0, 'I':-4.0, 'L':-4.0, 'K':-4.0, 'M':-4.0, 'F':-4.0, 'P':-4.0, 'S':-4.0, 'T':-4.0, 'W':-4.0, 'Y':-4.0, 'V':-4.0, 'B':-4.0, 'Z':-4.0, 'X':-4.0, '*': 1.0 } 113 } 114

115 - def __init__(self, matrix=BLOSUM62):

116 self._matrix = matrix

117

118 - def score(self, x, y):

119 try: 120 return self._matrix[x][y] 121 except KeyError as ke: 122 raise ResidueNotFoundError(ke.message)

123 124 @staticmethod

125 - def parse(string):

126 """ 127 Parse a standard scoring matrix file, where the first row and 128 column are residue labels. 129 130 @param string: scoring matrix string 131 @type string: str 132 133 @rtype: L{SimilarityMatrix} 134 """ 135 136 residues = {} 137 matrix = {} 138 139 for line in string.splitlines(): 140 if not line.strip() or line.startswith("#"): 141 continue 142 143 if not residues: 144 residues = line.split() 145 146 else: 147 items = line.split() 148 if len(items) != len(residues) + 1: 149 raise ValueError("{0} scoring columns expected".format(len(residues))) 150 151 try: 152 aa, scores = items[0], map(float, items[1:]) 153 matrix[aa] = dict((residues[i], s) for i, s in enumerate(scores)) 154 except (KeyError, ValueError): 155 raise ValueError("Corrupt scoring matrix") 156 157 return SimilarityMatrix(matrix)

158

159 160 -class AbstractAlignmentAlgorithm(object):

161 """ 162 Base class for all sequence alignment algorithms. 163 164 This class was designed with simple sequence alignment algorithms in mind. 165 Implementors have full control over the behavior of the scoring function and 166 the dynamic programming matrix, but implementing things that require 167 additional matrices (such as affine gap penalties) might be trickier. 168 169 @param scoring: scoring matrix 170 @type scoring: L{AbstractScoringMatrix} 171 @param gap: simple gap penalty 172 @type gap: float 173 """ 174 175 __metaclass__ = ABCMeta 176

177 - def __init__(self, scoring=IdentityMatrix(), gap=0):

178 179 if not isinstance(scoring, AbstractScoringMatrix): 180 raise TypeError(scoring) 181 182 self._gap = float(gap) 183 self._scoring = scoring

184 185 @property

186 - def gap(self):

187 """ 188 Simple gap penalty 189 @rtype: float 190 """ 191 return self._gap

192 193 @property

194 - def scoring_matrix(self):

195 """ 196 Scoring matrix 197 @rtype: L{AbstractScoringMatrix} 198 """ 199 return self._scoring

200

201 - def align(self, query, subject):

202 """ 203 Align two sequences and return the optimal alignment. 204 205 @param query: first sequence 206 @type query: L{AbstractSequence} 207 @param subject: second sequence 208 @type subject: L{AbstractSequence} 209 210 @rtype: L{AlignmentResult} 211 """ 212 if query.length == 0 or subject.length == 0: 213 raise ValueError("Can't align zero length sequence") 214 215 # working with string sequences results in a massive speed-up 216 qseq = ["*"] + self._sequence(query) 217 sseq = ["*"] + self._sequence(subject) 218 219 # 1. create a dynamic programming matrix 220 matrix = [] 221 rows, cols = len(query), len(subject) 222 223 for i in range(rows + 1): 224 matrix.append([]) 225 for j in range(cols + 1): 226 matrix[i].append(None) 227 228 # 2. initialize the first row and column 229 self._initialize(matrix) 230 231 # fill 232 for i in range(1, rows + 1): 233 for j in range(1, cols + 1): 234 score = self._score(qseq[i], sseq[j]) 235 self._fill(matrix, i, j, score) 236 237 # 3. compute alignment 238 return self._traceback(matrix, query, subject)

239

240 - def _sequence(self, s):

241 """ 242 Extract and return the string sequence of {s}. 243 244 @param s: sequence object 245 @type s: L{AbstractSequence} 246 247 @rtype: list of str 248 """ 249 return list(s.sequence)

250 251 @abstractmethod

252 - def _initialize(self, matrix):

253 """ 254 Initialize (typically the first row and column) of the dynamic 255 programming matrix. 256 257 @param matrix: list (2D) 258 @type matrix: list 259 """ 260 pass

261

262 - def _score(self, residue1, residue2):

263 """ 264 Retrieve the pairwise score of two residues using the current 265 scoring matrix. 266 267 @rtype: float 268 """ 269 return self._scoring.score(residue1, residue2)

270

271 - def _fill(self, matrix, i, j, score):

272 """ 273 Compute and set the best score that leads to cell i,j in the dynamic 274 programming matrix: right, down or diagonal. 275 276 See also L{AbstractAlignmentAlgorithm._max}. 277 278 @param score: pairwise score at matrix[i][j] 279 @type score: float 280 @return: the best score 281 @rtype: float 282 """ 283 284 match = matrix[i-1][j-1] + score 285 insertion = matrix[i][j-1] + self._gap 286 deletion = matrix[i-1][j] + self._gap 287 288 best = self._max(match, insertion, deletion) 289 matrix[i][j] = best 290 291 return best

292 293 @abstractmethod

294 - def _max(self, match, insertion, deletion):

295 """ 296 Choose the best score among all given possibilities: 297 scores for match, insertion and deletion. This will determine 298 the direction taken at each step while building the dynamic programming 299 matrix (diagonal, down or right). 300 301 This is an expected notable point of divergence for most sequence 302 alignment algorithms. 303 """ 304 pass

305

306 - def _traceback(self, m, seq1, seq2):

307 """ 308 Trace back and return the optimal alignment. 309 """ 310 311 query = [] 312 subject = [] 313 314 # working with string sequences results in a massive speed-up 315 qseq = ["*"] + self._sequence(seq1) 316 sseq = ["*"] + self._sequence(seq2) 317 318 i, j = self._terminus(m) 319 qstart, start = i, j 320 qend, end = i, j 321 score = m[i][j] 322 323 while self._expandable(m, i, j): 324 325 if i > 0 and j > 0 and m[i][j] == (m[i-1][j-1] + self._score(qseq[i], sseq[j])): 326 query.append(seq1.residues[i]) 327 subject.append(seq2.residues[j]) 328 qstart, start = i, j 329 i, j = i - 1, j - 1 330 331 elif i > 0 and m[i][j] == (m[i-1][j] + self._gap): 332 query.append(seq1.residues[i]) 333 subject.append(ResidueInfo(-1, seq2.alphabet.GAP)) 334 qstart = i 335 i = i - 1 336 337 elif j > 0 and m[i][j] == (m[i][j-1] + self._gap): 338 query.append(ResidueInfo(-1, seq1.alphabet.GAP)) 339 subject.append(seq2.residues[j]) 340 start = j 341 j = j - 1 342 343 else: 344 assert False 345 346 query.reverse() 347 subject.reverse() 348 349 aligned_query = RichSequence(seq1.id, seq1.header, query, seq1.type) 350 aligned_subject = RichSequence(seq2.id, seq2.header, subject, seq2.type) 351 352 return AlignmentResult(score, aligned_query, aligned_subject, qstart, qend, start, end)

353 354 @abstractmethod

355 - def _terminus(self, matrix):

356 """ 357 Find the coordinates of the optimal alignment's right endpoint in the 358 dynamic programming matrix. This is the starting point of a traceback. 359 360 @param matrix: the complete dynamic programming matrix 361 @type matrix: 2D list 362 363 @rtype: 2-tuple (i, j) 364 """ 365 pass

366 367 @abstractmethod

368 - def _expandable(self, i, j):

369 """ 370 Return True if the traceback procedure must not terminate at 371 position C{i,j} in the dynamic programming matrix. 372 373 @rtype: bool 374 """ 375 pass

376

377 378 -class GlobalAlignmentAlgorithm(AbstractAlignmentAlgorithm):

379 """ 380 Needleman-Wunsch global sequence alignment. 381 """ 382

383 - def __init__(self, scoring=IdentityMatrix(), gap=0):

384 super(GlobalAlignmentAlgorithm, self).__init__(scoring, gap)

385

386 - def _initialize(self, matrix):

387 388 for i in range(len(matrix)): 389 matrix[i][0] = self._gap * i 390 for j in range(len(matrix[0])): 391 matrix[0][j] = self._gap * j

392

393 - def _max(self, match, insertion, deletion):

394 return max(match, insertion, deletion)

395

396 - def _terminus(self, matrix):

397 398 i = len(matrix) - 1 399 j = len(matrix[0]) - 1 400 401 return (i, j)

402

403 - def _expandable(self, matrix, i, j):

404 return i > 0 or j > 0

405

406 -class LocalAlignmentAlgorithm(AbstractAlignmentAlgorithm):

407 """ 408 Smith-Waterman local sequence alignment. 409 """ 410 411 START = 0 412 """ 413 Score for initiation of a new local alignment 414 """ 415

416 - def __init__(self, scoring=IdentityMatrix(), gap=-1):

417 super(LocalAlignmentAlgorithm, self).__init__(scoring, gap)

418

419 - def _initialize(self, matrix):

420 421 for i in range(len(matrix)): 422 matrix[i][0] = LocalAlignmentAlgorithm.START 423 for j in range(len(matrix[0])): 424 matrix[0][j] = LocalAlignmentAlgorithm.START

425

426 - def _max(self, match, insertion, deletion):

427 return max(match, insertion, deletion, LocalAlignmentAlgorithm.START)

428

429 - def _terminus(self, matrix):

430 431 maxi, maxj = 0, 0 432 433 for i in range(len(matrix)): 434 for j in range(len(matrix[i])): 435 if matrix[i][j] > matrix[maxi][maxj]: 436 maxi, maxj = i, j 437 438 return (maxi, maxj)

439

440 - def _expandable(self, matrix, i, j):

441 return matrix[i][j] != LocalAlignmentAlgorithm.START

442

443 444 -class AlignmentResult(object):

445 """ 446 Represents a pairwise sequence alignment result. 447 448 @param score: raw alignment score 449 @type score: float 450 @param query: aligned query sequence (with gaps) 451 @type query: L{AbstractSequence} 452 @param subject: aligned subject sequence (with gaps) 453 @type subject: L{AbstractSequence} 454 @param qstart: query start position 455 @type qstart: int 456 @param qend: query end position 457 @type qend: int 458 @param start: subject start position 459 @type start: int 460 @param end: subject end position 461 @type end: int 462 """ 463

464 - def __init__(self, score, query, subject, qstart, qend, start, end):

465 466 if not isinstance(query, AbstractSequence): 467 raise TypeError(query) 468 if not isinstance(subject, AbstractSequence): 469 raise TypeError(query) 470 471 if not (len(query) == len(subject)): 472 raise ValueError("Corrupt alignment") 473 474 self._score = float(score) 475 self._query = query 476 self._subject = subject 477 self._qstart = int(qstart) 478 self._qend = int(qend) 479 self._start = int(start) 480 self._end = int(end) 481 self._identicals = 0 482 self._gaps = 0 483 self._length = 0 484 485 if query.length > 0 and subject.length > 0: 486 487 if not 1 <= qstart <= qend: 488 raise ValueError("Invalid query start/end positions") 489 if not 1 <= start <= end: 490 raise ValueError("Invalid subject start/end positions") 491 492 qgap = query.alphabet.GAP 493 sgap = subject.alphabet.GAP 494 495 for q, s in zip(query, subject): 496 if q.type == qgap or s.type == sgap: 497 self._gaps += 1 498 elif q.type == s.type: 499 self._identicals += 1 500 501 self._length = (self._gaps + (qend - qstart + 1) + (end - start + 1)) / 2 502 503 else: 504 if (score + qstart + qend + start + end) != 0: 505 raise ValueError("Corrupt alignment") 506 self._length = 0

507 508

509 - def __str__(self):

510 string = "{0.qstart:5} {0.query.sequence:} {0.qend:<5}\n" 511 string += "{0.start:5} {0.subject.sequence:} {0.end:<5}" 512 return string.format(self)

513 514 @property

515 - def is_empty(self):

516 """ 517 Return True if this is an empty alignment (i.e. no matches) 518 @rtype: bool 519 """ 520 return self.length == 0 or self.gaps == self.length

521 522 @property

523 - def score(self):

524 """ 525 Raw alignment score 526 @rtype: float 527 """ 528 return self._score

529 530 @property

531 - def query(self):

532 """ 533 Aligned query sequence (with gaps) 534 @rtype: L{AbstractSequence} 535 """ 536 return self._query

537 538 @property

539 - def subject(self):

540 """ 541 Aligned subject sequence (with gaps) 542 @rtype: L{AbstractSequence} 543 """ 544 return self._subject

545 546 @property

547 - def qstart(self):

548 """ 549 Query start position 550 @rtype: int 551 """ 552 return self._qstart

553 554 @property

555 - def qend(self):

556 """ 557 Query end position 558 @rtype: int 559 """ 560 return self._qend

561 562 @property

563 - def start(self):

564 """ 565 Subject start position 566 @rtype: int 567 """ 568 return self._start

569 570 @property

571 - def end(self):

572 """ 573 Subject end position 574 @rtype: int 575 """ 576 return self._end

577 578 @property

579 - def identicals(self):

580 """ 581 Number of identical residues 582 @rtype: int 583 """ 584 return self._identicals

585 586 @property

587 - def identity(self):

588 """ 589 Percentage of identical residues 590 @rtype: int 591 """ 592 return float(self._identicals) / self._length

593 594 @property

595 - def gaps(self):

596 """ 597 Total number of gaps (query + subject) 598 @rtype: int 599 """ 600 return self._gaps

601 602 @property

603 - def length(self):

604 """ 605 Alignment length (query + subject + gaps / 2) 606 @rtype: int 607 """ 608 return self._length

609

610 - def alignment(self):

611 """ 612 @return: as a sequence alignment object 613 @rtype: L{SequenceAlignment} 614 """ 615 return SequenceAlignment([self.query, self.subject])

616

Source Code for Module csb.bio.sequence.alignment