Package csb :: Package bio :: Package sequence
[frames] | no frames]

Source Code for Package csb.bio.sequence

   1  """ 
   2  Sequence and sequence alignment APIs. 
   3   
   4  This module defines the base interfaces for biological sequences and alignments: 
   5  L{AbstractSequence} and L{AbstractAlignment}. These are the central abstractions 
   6  here. This module provides also a number of useful enumerations, like L{SequenceTypes} 
   7  and L{SequenceAlphabets}. 
   8   
   9  Sequences 
  10  ========= 
  11  L{AbstractSequence} has a number of implementations. These are of course interchangeable, 
  12  but have different intents and may differ significantly in performance. The standard 
  13  L{Sequence} implementation is what you are after if all you need is high performance 
  14  and efficient storage (e.g. when you are parsing big files). L{Sequence} objects store 
  15  their underlying sequences as strings. L{RichSequence}s on the other hand will store 
  16  their residues as L{ResidueInfo} objects, which have the same basic interface as the  
  17  L{csb.bio.structure.Residue} objects. This of course comes at the expense of degraded 
  18  performance. A L{ChainSequence} is a special case of a rich sequence, whose residue 
  19  objects are I{actually} real L{csb.bio.structure.Residue}s. 
  20   
  21  Basic usage: 
  22   
  23      >>> seq = RichSequence('id', 'desc', 'sequence', SequenceTypes.Protein) 
  24      >>> seq.residues[1] 
  25      <ResidueInfo [1]: SER> 
  26      >>> seq.dump(sys.stdout) 
  27      >desc 
  28      SEQUENCE 
  29   
  30  See L{AbstractSequence} for details.     
  31   
  32  Alignments 
  33  ========== 
  34  L{AbstractAlignment} defines a table-like interface to access the data in an 
  35  alignment: 
  36   
  37      >>> ali = SequenceAlignment.parse(">a\\nABC\\n>b\\nA-C") 
  38      >>> ali[0, 0] 
  39      <SequenceAlignment>   # a new alignment, constructed from row #1, column #1 
  40      >>> ali[0, 1:3] 
  41      <SequenceAlignment>   # a new alignment, constructed from row #1, columns #2..#3 
  42   
  43  which is just a shorthand for using the standard 1-based interface: 
  44   
  45      >>> ali.rows[1] 
  46      <AlignedSequenceAdapter: a, 3>                        # row #1 (first sequence) 
  47      >>> ali.columns[1] 
  48      (<ColumnInfo a [1]: ALA>, <ColumnInfo b [1]: ALA>)    # residues at column #1 
  49   
  50  See L{AbstractAlignment} for all details and more examples. 
  51   
  52  There are a number of L{AbstractAlignment} implementations defined here. 
  53  L{SequenceAlignment} is the default one, nothing surprising. L{A3MAlignment} 
  54  is a more special one: the first sequence in the alignment is a master sequence. 
  55  This alignment is usually used in the context of HHpred. More important is the 
  56  L{StructureAlignment}, which is an alignment of L{csb.bio.structure.Chain} objects. 
  57  The residues in every aligned sequence are really the L{csb.bio.structure.Residue} 
  58  objects taken from those chains. 
  59  """ 
  60   
  61  import re 
  62  import csb.core 
  63  import csb.io 
  64   
  65  from abc import ABCMeta, abstractmethod, abstractproperty 
66 67 68 -class AlignmentFormats(csb.core.enum):
69 """ 70 Enumeration of multiple sequence alignment formats 71 """ 72 A3M='a3m'; FASTA='fa'; PIR='pir'
73
74 -class SequenceTypes(csb.core.enum):
75 """ 76 Enumeration of sequence types 77 """ 78 NucleicAcid='NA'; DNA='DNA'; RNA='RNA'; Protein='Protein'; Unknown='Unknown'
79
80 -class AlignmentTypes(csb.core.enum):
81 """ 82 Enumeration of alignment strategies 83 """ 84 Global='global'; Local='local'
85
86 -class NucleicAlphabet(csb.core.enum):
87 """ 88 Nucleic sequence alphabet 89 """ 90 Adenine='A'; Cytosine='C'; Guanine='G'; Thymine='T'; Uracil='U'; Purine='R'; Pyrimidine='Y'; Ketone='K'; 91 Amino='M'; Strong='S'; Weak='W'; NotA='B'; NotC='D'; NotG='H'; NotT='V'; Any='N'; Masked='X'; GAP='-'; INSERTION='.';
92
93 -class ProteinAlphabet(csb.core.enum):
94 """ 95 Protein sequence alphabet 96 """ 97 ALA='A'; ASX='B'; CYS='C'; ASP='D'; GLU='E'; PHE='F'; GLY='G'; HIS='H'; ILE='I'; LYS='K'; LEU='L'; MET='M'; ASN='N'; 98 PYL='O'; PRO='P'; GLN='Q'; ARG='R'; SER='S'; THR='T'; SEC='U'; VAL='V'; TRP='W'; TYR='Y'; GLX='Z'; UNK='X'; GAP='-'; 99 INSERTION='.'; STOP='*'
100
101 -class StdProteinAlphabet(csb.core.enum):
102 """ 103 Standard protein sequence alphabet 104 """ 105 ALA='A'; CYS='C'; ASP='D'; GLU='E'; PHE='F'; GLY='G'; HIS='H'; ILE='I'; LYS='K'; LEU='L'; MET='M'; ASN='N'; 106 PRO='P'; GLN='Q'; ARG='R'; SER='S'; THR='T'; VAL='V'; TRP='W'; TYR='Y'
107
108 -class UnknownAlphabet(csb.core.enum):
109 """ 110 Unknown sequence alphabet 111 """ 112 UNK='X'; GAP='-'; INSERTION='.'
113
114 -class SequenceAlphabets(object):
115 """ 116 Sequence alphabet enumerations. 117 118 @note: This class is kept for backwards compatibility. The individual 119 alphabet classes must be defined in the top level namespace, 120 otherwise the new enum types cannot be pickled properly. 121 """ 122 Nucleic = NucleicAlphabet 123 Protein = ProteinAlphabet 124 StdProtein = StdProteinAlphabet 125 Unknown = UnknownAlphabet 126 127 MAP = { SequenceTypes.Protein: ProteinAlphabet, 128 SequenceTypes.NucleicAcid: NucleicAlphabet, 129 SequenceTypes.DNA: NucleicAlphabet, 130 SequenceTypes.RNA: NucleicAlphabet, 131 SequenceTypes.Unknown: UnknownAlphabet } 132 133 ALL_ALPHABETS = set([ProteinAlphabet, NucleicAlphabet, UnknownAlphabet]) 134 135 assert set(MAP) == csb.core.Enum.members(SequenceTypes) 136 137 @staticmethod
138 - def get(type):
139 """ 140 Get the alphabet corresponding to the specified sequence C{type} 141 @param type: a member of L{SequenceTypes} 142 @type type: L{csb.core.EnumItem} 143 @rtype: L{csb.core.enum} 144 """ 145 return SequenceAlphabets.MAP[type]
146 147 @staticmethod
148 - def contains(alphabet):
149 """ 150 Return True if C{alphabet} is a sequence alphabet 151 @type alphabet: L{csb.core.enum} 152 @rtype: bool 153 """ 154 return alphabet in SequenceAlphabets.ALL_ALPHABETS
155
156 157 -class SequenceError(ValueError):
158 pass
159
160 -class PositionError(IndexError):
161
162 - def __init__(self, index=None, start=1, end=None):
163 164 if end == 0: 165 start = 0 166 167 self.index = index 168 self.start = start 169 self.end = end 170 171 super(PositionError, self).__init__(index, start, end)
172
173 - def __str__(self):
174 175 if self.index is not None: 176 s = 'Position {0.index} is out of range [{0.start}, {0.end}]' 177 else: 178 s = 'Out of range [{0.start}, {0.end}]' 179 180 return s.format(self)
181
182 -class SequencePositionError(PositionError):
183 pass
184
185 -class ColumnPositionError(PositionError):
186 pass
187
188 -class SequenceNotFoundError(KeyError):
189 pass
190
191 -class DuplicateSequenceError(KeyError):
192 pass
193
194 -class ResidueInfo(object):
195
196 - def __init__(self, rank, type):
197 198 self._type = None 199 self._rank = rank 200 201 self.type = type
202 203 @property
204 - def type(self):
205 """ 206 Residue type - a member of any sequence alphabet 207 @rtype: enum item 208 """ 209 return self._type
210 @type.setter
211 - def type(self, type):
212 if not SequenceAlphabets.contains(type.enum): 213 raise TypeError(type) 214 self._type = type
215 216 @property
217 - def rank(self):
218 """ 219 Residue position (1-based) 220 @rtype: int 221 """ 222 return self._rank
223
224 - def __repr__(self):
225 return '<{1} [{0.rank}]: {0.type!r}>'.format(self, self.__class__.__name__)
226
227 228 -class ColumnInfo(object):
229
230 - def __init__(self, column, id, rank, residue):
231 232 self.column = column 233 self.id = id 234 self.rank = rank 235 self.residue = residue
236
237 - def __repr__(self):
238 return '<{0.__class__.__name__} {0.id} [{0.column}]: {0.residue.type!r}>'.format(self)
239
240 -class SequenceIndexer(object):
241
242 - def __init__(self, container):
243 self._container = container
244
245 - def __getitem__(self, rank):
246 247 if not 1 <= rank <= self._container.length: 248 raise SequencePositionError(rank, 1, self._container.length) 249 250 return self._container._get(rank)
251
252 - def __iter__(self):
253 return iter(self._container)
254
255 -class UngappedSequenceIndexer(SequenceIndexer):
256
257 - def __getitem__(self, rank):
258 try: 259 return self._container._get_ungapped(rank) 260 except SequencePositionError: 261 raise SequencePositionError(rank, 1)
262
263 - def __iter__(self):
264 for c in self._container: 265 if c.residue.type not in (self._container.alphabet.GAP, self._container.alphabet.INSERTION): 266 yield c.residue
267
268 -class ColumnIndexer(SequenceIndexer):
269
270 - def __getitem__(self, column):
271 272 if not 1 <= column <= self._container.length: 273 raise ColumnPositionError(column, 1, self._container.length) 274 275 return self._container._get_column(column)
276
277 278 -class SequenceCollection(csb.core.ReadOnlyCollectionContainer):
279 """ 280 Represents a list of L{AbstractSequence}s. 281 """ 282
283 - def __init__(self, sequences):
284 super(SequenceCollection, self).__init__(items=sequences, type=AbstractSequence)
285
286 - def to_fasta(self, output_file):
287 """ 288 Dump the whole collection in mFASTA format. 289 290 @param output_file: write the output to this file or stream 291 @type output_file: str or stream 292 """ 293 from csb.bio.io.fasta import FASTAOutputBuilder 294 295 with csb.io.EntryWriter(output_file, close=False) as out: 296 builder = FASTAOutputBuilder(out.stream, headers=True) 297 298 for s in self: 299 builder.add_sequence(s)
300
301 302 -class AbstractSequence(object):
303 """ 304 Base abstract class for all Sequence objects. 305 306 Provides 1-based access to the residues in the sequence via the 307 sequence.residues property. The sequence object itself also behaves like 308 a collection and provides 0-based access to its elements (residues). 309 310 @param id: FASTA ID of this sequence (e.g. accession number) 311 @type id: str 312 @param header: FASTA sequence header 313 @type header: str 314 @param residues: sequence residues 315 @type residues: str or collection of L{ResidueInfo} 316 @param type: a L{SequenceTypes} member (defaults to protein) 317 @type type: L{EnumItem} 318 """ 319 320 __metaclass__ = ABCMeta 321 322 DELIMITER = '>' 323
324 - def __init__(self, id, header, residues, type=SequenceTypes.Unknown):
325 326 self._id = None 327 self._header = None 328 self._residues = [] 329 self._type = None 330 331 self.id = id 332 self.header = header 333 self.type = type 334 335 for residue in residues: 336 self._add(residue)
337
338 - def __getitem__(self, spec):
339 340 if isinstance(spec, slice): 341 spec = SliceHelper(spec, 0, self.length) 342 return self.subregion(spec.start + 1, spec.stop) 343 else: 344 if not 0 <= spec < self.length: 345 raise IndexError(spec) 346 return self._get(spec + 1)
347
348 - def __iter__(self):
349 for index in range(self.length): 350 yield self[index]
351 352 @abstractmethod
353 - def _add(self, residue):
354 """ 355 Append a C{residue} to the sequence. 356 357 This is a hook method invoked internally for each residue during object 358 construction. By implementing this method, sub-classes define how 359 residues are attached to the sequence object. 360 """ 361 pass
362 363 @abstractmethod
364 - def _get(self, rank):
365 """ 366 Retrieve the sequence residue at the specified position (1-based, positive). 367 368 This is a hook method which defines the actual behavior of the sequence 369 residue indexer. 370 371 @rtype: L{ResidueInfo} 372 @raise SequencePositionError: when the supplied rank is out of range 373 """ 374 pass
375
376 - def _factory(self, *a, **k):
377 """ 378 Return a new sequence of the current L{AbstractSequence} sub-class. 379 """ 380 return self.__class__(*a, **k)
381
382 - def strip(self):
383 """ 384 Remove all gaps and insertions from the sequence. 385 386 @return: a new sequence instance, containing no gaps 387 @rtype: L{AbstractSequence} 388 """ 389 residues = [r for r in self._residues 390 if r.type not in (self.alphabet.GAP, self.alphabet.INSERTION)] 391 392 return self._factory(self.id, self.header, residues, self.type)
393
394 - def subregion(self, start, end):
395 """ 396 Extract a subsequence, defined by [start, end]. The start and end 397 positions are 1-based, inclusive. 398 399 @param start: start position 400 @type start: int 401 @param end: end position 402 @type end: int 403 404 @return: a new sequence 405 @rtype: L{AbstractSequence} 406 407 @raise SequencePositionError: if start/end positions are out of range 408 """ 409 positions = range(start, end + 1) 410 return self.extract(positions)
411
412 - def extract(self, positions):
413 """ 414 Extract a subsequence, defined by a list of 1-based positions. 415 416 @param positions: positions to extract 417 @type positions: tuple of int 418 419 @return: a new sequence 420 @rtype: L{AbstractSequence} 421 422 @raise SequencePositionError: if any position is out of range 423 """ 424 425 end = self.length 426 residues = [] 427 428 for rank in sorted(set(positions)): 429 if 1 <= rank <= end: 430 residues.append(self._get(rank)) 431 else: 432 raise SequencePositionError(rank, 1, end) 433 434 return self._factory(self.id, self.header, residues, self.type)
435
436 - def dump(self, output_file):
437 """ 438 Dump the sequence in FASTA format. 439 440 @param output_file: write the output to this file or stream 441 @type output_file: str or stream 442 """ 443 from csb.bio.io.fasta import FASTAOutputBuilder 444 445 with csb.io.EntryWriter(output_file, close=False) as out: 446 FASTAOutputBuilder(out.stream, headers=True).add_sequence(self)
447 448 @property
449 - def length(self):
450 """ 451 Number of residues 452 @rtype: int 453 """ 454 return len(self._residues)
455 456 @property
457 - def id(self):
458 """ 459 Sequence identifier 460 @rtype: str 461 """ 462 return self._id
463 @id.setter
464 - def id(self, value):
465 if value is not None: 466 value = str(value).strip() 467 self._id = value
468 469 @property
470 - def header(self):
471 """ 472 Sequence description 473 @rtype: str 474 """ 475 return self._header
476 @header.setter
477 - def header(self, value):
478 if not value: 479 value = 'sequence' 480 else: 481 value = value.strip().lstrip(AbstractSequence.DELIMITER) 482 self._header = value
483 484 @property
485 - def type(self):
486 """ 487 Sequence type - a member of L{SequenceTypes} 488 @rtype: enum item 489 """ 490 return self._type
491 @type.setter
492 - def type(self, value):
493 if isinstance(value, csb.core.string): 494 value = csb.core.Enum.parse(SequenceTypes, value) 495 if value.enum is not SequenceTypes: 496 raise TypeError(value) 497 self._type = value
498 499 @property
500 - def sequence(self):
501 """ 502 The actual sequence 503 @rtype: str 504 """ 505 return ''.join([str(r.type) for r in self._residues])
506 507 @property
508 - def alphabet(self):
509 """ 510 The sequence alphabet corresponding to the current sequence type 511 @rtype: L{csb.core.enum} 512 """ 513 return SequenceAlphabets.get(self._type)
514 515 @property
516 - def residues(self):
517 """ 518 Rank-based access to the underlying L{residues<csb.bio.sequence.ResidueInfo>} 519 @rtype: L{SequenceIndexer} 520 """ 521 return SequenceIndexer(self)
522
523 - def __len__(self):
524 return self.length
525
526 - def __repr__(self):
527 return '<{0.__class__.__name__}: {0.id}, {0.length} residues>'.format(self)
528
529 - def __str__(self):
530 return '{0}{1.header}\n{1.sequence}'.format(AbstractSequence.DELIMITER, self)
531
532 -class Sequence(AbstractSequence):
533 """ 534 High-performance sequence object. The actual sequence is stored internally 535 as a string. The indexer acts as a residue factory, which creates a new 536 L{ResidueInfo} instance each time. 537 538 @note: This class was created with parsing large volumes of data in mind. This 539 comes at the expense of degraded performance of the sequence indexer. 540 541 @param id: FASTA ID of this sequence (e.g. accession number) 542 @type id: str 543 @param header: FASTA sequence header 544 @type header: str 545 @param residues: sequence string 546 @type residues: str 547 @param type: a L{SequenceTypes} member (defaults to protein) 548 @type type: L{EnumItem} 549 """ 550
551 - def __init__(self, id, header, residues, type=SequenceTypes.Unknown):
552 553 self._id = None 554 self._header = None 555 self._residues = '' 556 self._type = None 557 558 self.id = id 559 self.header = header 560 self.type = type 561 562 self._append(residues)
563
564 - def _append(self, string):
565 # this seems to be the fastest method for sanitization and storage 566 self._residues += re.sub('([^\w\-\.])+', '', string)
567
568 - def _add(self, char):
569 self._append(char)
570
571 - def _get(self, rank):
572 573 type = csb.core.Enum.parse(self.alphabet, self._residues[rank - 1]) 574 return ResidueInfo(rank, type)
575
576 - def strip(self):
577 residues = self._residues.replace( 578 str(self.alphabet.GAP), '').replace( 579 str(self.alphabet.INSERTION), '') 580 return self._factory(self.id, self.header, residues, self.type)
581
582 - def subregion(self, start, end):
583 584 if not 1 <= start <= end <= self.length: 585 raise SequencePositionError(None, 1, self.length) 586 587 residues = self._residues[start - 1 : end] 588 return self._factory(self.id, self.header, residues, self.type)
589
590 - def extract(self, positions):
591 592 end = self.length 593 residues = [] 594 595 for rank in sorted(set(positions)): 596 if 1 <= rank <= end: 597 residues.append(self._residues[rank - 1]) 598 else: 599 raise SequencePositionError(rank, 1, end) 600 601 return self._factory(self.id, self.header, ''.join(residues), self.type)
602 603 @property
604 - def sequence(self):
605 return self._residues
606
607 -class RichSequence(AbstractSequence):
608 """ 609 Sequence implementation, which converts the sequence into a list of 610 L{ResidueInfo} objects. See L{AbstractSequence} for details. 611 """ 612
613 - def _add(self, residue):
614 615 if hasattr(residue, 'rank') and hasattr(residue, 'type'): 616 self._residues.append(residue) 617 618 else: 619 if residue.isalpha() or residue in (self.alphabet.GAP, self.alphabet.INSERTION): 620 621 type = csb.core.Enum.parse(self.alphabet, residue) 622 rank = len(self._residues) + 1 623 self._residues.append(ResidueInfo(rank, type))
624
625 - def _get(self, rank):
626 return self._residues[rank - 1]
627 628 @staticmethod
629 - def create(sequence):
630 """ 631 Create a new L{RichSequence} from existing L{AbstractSequence}. 632 633 @type sequence: L{AbstractSequence} 634 @rtype: L{RichSequence} 635 """ 636 return RichSequence( 637 sequence.id, sequence.header, sequence.sequence, sequence.type)
638
639 -class ChainSequence(AbstractSequence):
640 """ 641 Sequence view for L{csb.bio.structure.Chain} objects. 642 See L{AbstractSequence} for details. 643 """ 644
645 - def _add(self, residue):
646 647 if not (hasattr(residue, 'rank') and hasattr(residue, 'type')): 648 raise TypeError(residue) 649 else: 650 self._residues.append(residue)
651
652 - def _get(self, rank):
653 return self._residues[rank - 1]
654 655 @staticmethod
656 - def create(chain):
657 """ 658 Create a new L{ChainSequence} from existing L{Chain} instance. 659 660 @type chain: L{csb.bio.structure.Chain} 661 @rtype: L{ChainSequence} 662 """ 663 return ChainSequence( 664 chain.entry_id, chain.header, chain.residues, chain.type)
665
666 667 -class SequenceAdapter(object):
668 """ 669 Base wrapper class for L{AbstractSequence} objects. 670 Needs to be sub-classed (does not do anything special on its own). 671 672 @param sequence: adaptee 673 @type sequence: L{AbstractSequence} 674 """ 675
676 - def __init__(self, sequence):
677 678 if not isinstance(sequence, AbstractSequence): 679 raise TypeError(sequence) 680 681 self._subject = sequence
682
683 - def __getitem__(self, i):
684 return self._subject[i]
685
686 - def __iter__(self):
687 return iter(self._subject)
688
689 - def __repr__(self):
690 return '<{0.__class__.__name__}: {0.id}, {0.length}>'.format(self)
691
692 - def __str__(self):
693 return str(self._subject)
694
695 - def _add(self):
696 raise NotImplementedError()
697
698 - def _get(self, rank):
699 return self._subject._get(rank)
700
701 - def _factory(self, *a, **k):
702 return self.__class__(self._subject._factory(*a, **k))
703
704 - def strip(self):
705 return self._subject.strip()
706
707 - def subregion(self, start, end):
708 return self._subject.subregion(start, end)
709
710 - def extract(self, positions):
711 return self._subject.extract(positions)
712 713 @property
714 - def id(self):
715 return self._subject.id
716 717 @property
718 - def length(self):
719 return self._subject.length
720 721 @property
722 - def type(self):
723 return self._subject.type
724 725 @property
726 - def header(self):
727 return self._subject.header
728 729 @property
730 - def sequence(self):
731 return self._subject.sequence
732 733 @property
734 - def alphabet(self):
735 return self._subject.alphabet
736
737 -class AlignedSequenceAdapter(SequenceAdapter):
738 """ 739 Adapter, which wraps a gapped L{AbstractSequence} object and makes it 740 compatible with the MSA row/entry interface, expected by L{AbstractAlignment}. 741 742 The C{adapter.residues} property operates with an L{UngappedSequenceIndexer}, 743 which provides a gap-free view of the underlying sequence. 744 745 The C{adapter.columns} property operates with a standard L{ColumnIndexer}, 746 the same indexer which is used to provide the column view in multiple 747 alignments. Adapted sequences therefore act as alignment rows and allow for 748 MSA-column-oriented indexing. 749 750 @param sequence: adaptee 751 @type sequence: L{AbstractSequence} 752 """ 753
754 - def __init__(self, sequence):
755 756 super(AlignedSequenceAdapter, self).__init__(sequence) 757 758 self._fmap = {} 759 self._rmap = {} 760 rank = 0 761 762 for column, residue in enumerate(sequence, start=1): 763 764 if residue.type not in (self.alphabet.GAP, self.alphabet.INSERTION): 765 rank += 1 766 self._fmap[column] = rank 767 self._rmap[rank] = column 768 else: 769 self._fmap[column] = None
770
771 - def __getitem__(self, index):
772 if not 0 <= index < self.length: 773 raise IndexError(index) 774 return self._get_column(index + 1)
775
776 - def __iter__(self):
777 for c in sorted(self._fmap): 778 yield self._get_column(c)
779 780 @property
781 - def columns(self):
782 """ 783 Provides 1-based access to the respective columns in the MSA. 784 @rtype: L{ColumnIndexer} 785 """ 786 return ColumnIndexer(self)
787 788 @property
789 - def residues(self):
790 """ 791 Provides 1-based access to the residues of the unaligned (ungapped) 792 sequence. 793 @rtype: L{UngappedSequenceIndexer} 794 """ 795 return UngappedSequenceIndexer(self)
796
797 - def _get_column(self, column):
798 return ColumnInfo( 799 column, self.id, self._fmap[column], self._subject.residues[column])
800
801 - def _get_ungapped(self, rank):
802 return self._subject.residues[self._rmap[rank]]
803
804 - def map_residue(self, rank):
805 """ 806 Return the MSA column number corresponding to the specified ungapped 807 sequence C{rank}. 808 809 @param rank: 1-based residue rank 810 @type rank: int 811 @rtype: int 812 """ 813 return self._rmap[rank]
814
815 - def map_column(self, column):
816 """ 817 Return the ungapped sequence rank corresponding to the specified MSA 818 C{column} number. 819 820 @param column: 1-based alignment column number 821 @type column: int 822 @rtype: int 823 """ 824 return self._fmap[column]
825
826 -class SliceHelper(object):
827
828 - def __init__(self, slice, start=0, stop=0):
829 830 s, e, t = slice.start, slice.stop, slice.step 831 832 if s is None: 833 s = start 834 if e is None: 835 e = stop 836 if t is None: 837 t = 1 838 839 for value in [s, e, t]: 840 if value < 0: 841 raise IndexError(value) 842 843 self.start = s 844 self.stop = e 845 self.step = t
846
847 -class AlignmentRowsTable(csb.core.BaseDictionaryContainer):
848
849 - def __init__(self, container):
850 851 super(AlignmentRowsTable, self).__init__() 852 853 self._container = container 854 self._map = {}
855
856 - def __getitem__(self, item):
857 858 try: 859 if isinstance(item, int): 860 key = self._map[item] 861 else: 862 key = item 863 864 return super(AlignmentRowsTable, self).__getitem__(key) 865 866 except KeyError: 867 raise SequenceNotFoundError(item)
868
869 - def _append(self, sequence):
870 871 n = 0 872 sequence_id = sequence.id 873 874 while sequence_id in self: 875 n += 1 876 sequence_id = '{0}:A{1}'.format(sequence.id, n) 877 878 super(AlignmentRowsTable, self)._append_item(sequence_id, sequence) 879 self._map[self.length] = sequence_id
880
881 - def __iter__(self):
882 for id in super(AlignmentRowsTable, self).__iter__(): 883 yield self[id]
884
885 886 -class AbstractAlignment(object):
887 """ 888 Base class for all alignment objects. 889 890 Provides 1-based access to the alignment.rows and alignment.columns. 891 Alignment rows can also be accessed by sequence ID. In addition, all 892 alignments support 0-based slicing: 893 894 >>> alignment[rows, columns] 895 AbstractAlignment (sub-alignment) 896 897 where 898 - C{rows} can be a slice, tuple of row indexes or tuple of sequence IDs 899 - columns can be a slice or tuple of column indexes 900 901 For example: 902 903 >>> alignment[:, 2:] 904 AbstractAlignment # all rows, columns [3, alignment.length] 905 >>> alignment[(0, 'seqx'), (3, 5)] 906 AbstractAlignment # rows #1 and 'seq3', columns #4 and #5 907 908 @param sequences: alignment entries (must have equal length) 909 @type sequences: list of L{AbstractSequence}s 910 @param strict: if True, raise {DuplicateSequenceError} when a duplicate ID 911 is found (default=True) 912 @type strict: bool 913 914 @note: if C{strict} is False and there are C{sequences} with redundant identifiers, 915 those sequences will be added to the C{rows} collection with :An suffix, 916 where n is a serial number. Therefore, rows['ID'] will return only one sequence, 917 the first sequence with id=ID. All remaining sequences can be retrieved 918 with C{rows['ID:A1']}, {rows['ID:A2']}, etc. However, the sequence objects will 919 remain intact, e.g. {rows['ID:A1'].id} still returns 'ID' and not 'ID:A1'. 920 """ 921 922 __metaclass__ = ABCMeta 923
924 - def __init__(self, sequences, strict=True):
925 926 self._length = None 927 self._msa = AlignmentRowsTable(self) 928 self._colview = ColumnIndexer(self) 929 self._map = {} 930 self._strict = bool(strict) 931 932 self._construct(sequences)
933
934 - def __getitem__(self, spec):
935 936 # The following code can hardly get more readable than that, sorry. 937 # Don't even think of modifying this before there is a 100% unit test coverage 938 939 # 0. expand the input tuple: (rows/, columns/) => (rows, columns) 940 if not isinstance(spec, tuple) or len(spec) not in (1, 2): 941 raise TypeError('Invalid alignment slice expression') 942 943 if len(spec) == 2: 944 rowspec, colspec = spec 945 else: 946 rowspec, colspec = [spec, slice(None)] 947 948 # 1. interpret the row slice: int, iter(int), iter(str) or slice(int) => list(int, 1-based) 949 if isinstance(rowspec, slice): 950 if isinstance(rowspec.start, csb.core.string) or isinstance(rowspec.stop, csb.core.string): 951 raise TypeError("Invalid row slice: only indexes are supported") 952 rowspec = SliceHelper(rowspec, 0, self.size) 953 rows = range(rowspec.start + 1, rowspec.stop + 1) 954 elif isinstance(rowspec, int): 955 rows = [rowspec + 1] 956 elif csb.core.iterable(rowspec): 957 try: 958 rows = [] 959 for r in rowspec: 960 if isinstance(r, int): 961 rows.append(r + 1) 962 else: 963 rows.append(self._map[r]) 964 except KeyError as ke: 965 raise KeyError('No such Sequence ID: {0!s}'.format(ke)) 966 else: 967 raise TypeError('Unsupported row expression') 968 969 # 2. interpret the column slice: int, iter(int) or slice(int) => list(int, 1-based) 970 if isinstance(colspec, slice): 971 colspec = SliceHelper(colspec, 0, self._length or 0) 972 cols = range(colspec.start + 1, colspec.stop + 1) 973 elif isinstance(colspec, int): 974 cols = [colspec + 1] 975 elif csb.core.iterable(colspec): 976 try: 977 cols = [ c + 1 for c in colspec ] 978 except: 979 raise TypeError('Unsupported column expression') 980 else: 981 raise TypeError('Unsupported column expression') 982 983 # 3. some more checks 984 if len(rows) == 0: 985 raise ValueError("The expression returns zero rows") 986 if len(cols) == 0: 987 raise ValueError("The expression returns zero columns") 988 989 # 4. we are done 990 return self._extract(rows, cols)
991
992 - def _range(self, slice, start, end):
993 994 s, e, t = slice.start, slice.end, slice.step 995 996 if s is None: 997 s = start 998 if e is None: 999 e = end 1000 if t is None: 1001 t = 1 1002 1003 return range(s, e, t)
1004
1005 - def __iter__(self):
1006 for cn in range(1, self.length + 1): 1007 yield self._get_column(cn)
1008 1009 @abstractmethod
1010 - def _construct(self, sequences):
1011 """ 1012 Hook method, called internally upon object construction. Subclasses 1013 define how the source alignment sequences are handled during alignment 1014 construction. 1015 1016 @param sequences: alignment entries 1017 @type sequences: list of L{AbstractSequence}s 1018 """ 1019 pass
1020
1021 - def _initialize(self, rep_sequence):
1022 """ 1023 Hook method, which is used to initialize various alignment properties 1024 (such as length) from the first alignned sequence. 1025 """ 1026 if rep_sequence.length == 0: 1027 raise SequenceError("Sequence '{0}' is empty".format(rep_sequence.id)) 1028 1029 assert self._length is None 1030 self._length = rep_sequence.length
1031
1032 - def _factory(self, *a, **k):
1033 """ 1034 Return a new sequence of the current L{AbstractAlignment} sub-class. 1035 """ 1036 return self.__class__(*a, **k)
1037
1038 - def add(self, sequence):
1039 """ 1040 Append a new sequence to the alignment. 1041 1042 @type sequence: L{AbstractSequence} 1043 @raise SequenceError: if the new sequence is too short/long 1044 @raise DuplicateSequenceError: if a sequence with same ID already exists 1045 """ 1046 1047 if self._msa.length == 0: 1048 self._initialize(sequence) 1049 1050 if sequence.length != self._length: 1051 raise SequenceError('{0!r} is not of the expected length'.format(sequence)) 1052 1053 if self._strict and sequence.id in self._msa: 1054 raise DuplicateSequenceError(sequence.id) 1055 1056 self._msa._append(AlignedSequenceAdapter(sequence)) 1057 self._map[sequence.id] = self._msa.length
1058 1059 @property
1060 - def length(self):
1061 """ 1062 Number of columns in the alignment 1063 @rtype: int 1064 """ 1065 return self._length or 0
1066 1067 @property
1068 - def size(self):
1069 """ 1070 Number of rows (sequences) in the alignment 1071 @rtype: int 1072 """ 1073 return self._msa.length
1074 1075 @property
1076 - def rows(self):
1077 """ 1078 1-based access to the alignment entries (sequences) 1079 @rtype: L{AlignmentRowsTable} 1080 """ 1081 return self._msa
1082 1083 @property
1084 - def columns(self):
1085 """ 1086 1-based access to the alignment columns 1087 @rtype: L{ColumnIndexer} 1088 """ 1089 return self._colview
1090
1091 - def gap_at(self, column):
1092 """ 1093 Return True of C{column} contains at least one gap. 1094 @param column: column number, 1-based 1095 @type column: int 1096 1097 @rtype: bool 1098 """ 1099 1100 for row in self._msa: 1101 if row.columns[column].residue.type == row.alphabet.GAP: 1102 return True 1103 1104 return False
1105
1106 - def _get_column(self, column):
1107 return tuple(row._get_column(column) for row in self.rows)
1108
1109 - def _extract(self, rows, cols):
1110 1111 rows = set(rows) 1112 cols = set(cols) 1113 1114 if not 1 <= min(rows) <= max(rows) <= self.size: 1115 raise IndexError('Row specification out of range') 1116 1117 if not 1 <= min(cols) <= max(cols) <= self.length: 1118 raise IndexError('Column specification out of range') 1119 1120 sequences = [] 1121 1122 for rn, row in enumerate(self.rows, start=1): 1123 if rn in rows: 1124 sequences.append(row.extract(cols)) 1125 1126 return self._factory(sequences, strict=self._strict)
1127
1128 - def subregion(self, start, end):
1129 """ 1130 Extract a sub-alignment, ranging from C{start} to C{end} columns. 1131 1132 @param start: starting column, 1-based 1133 @type start: int 1134 @param end: ending column, 1-based 1135 @type end: int 1136 1137 @return: a new alignment of the current type 1138 @rtype: L{AbstractAlignment} 1139 1140 @raise ColumnPositionError: if start/end is out of range 1141 """ 1142 if not 1 <= start <= end <= self.length: 1143 raise ColumnPositionError(None, 1, self.length) 1144 1145 sequences = [] 1146 1147 for row in self.rows: 1148 sequences.append(row.subregion(start, end)) 1149 1150 return self._factory(sequences, strict=self._strict)
1151
1152 - def format(self, format=AlignmentFormats.FASTA, headers=True):
1153 """ 1154 Format the alignment as a string. 1155 1156 @param format: alignment format type, member of L{AlignmentFormats} 1157 @type format: L{EnumItem} 1158 @param headers: if False, omit headers 1159 @type headers: bool 1160 1161 @rtype: str 1162 """ 1163 from csb.bio.io.fasta import OutputBuilder 1164 1165 temp = csb.io.MemoryStream() 1166 1167 try: 1168 builder = OutputBuilder.create(format, temp, headers=headers) 1169 builder.add_alignment(self) 1170 1171 return temp.getvalue() 1172 1173 finally: 1174 temp.close()
1175
1176 -class SequenceAlignment(AbstractAlignment):
1177 """ 1178 Multiple sequence alignment. See L{AbstractAlignment} for details. 1179 """ 1180
1181 - def _construct(self, sequences):
1182 1183 for sequence in sequences: 1184 self.add(sequence)
1185 1186 @staticmethod
1187 - def parse(string, strict=True):
1188 """ 1189 Create a new L{SequenceAlignment} from an mFASTA string. 1190 1191 @param string: MSA-formatted string 1192 @type string: str 1193 @param strict: see L{AbstractAlignment} 1194 @type strict: bool 1195 1196 @rtype: L{SequenceAlignment} 1197 """ 1198 from csb.bio.io.fasta import SequenceAlignmentReader 1199 return SequenceAlignmentReader(strict=strict).read_fasta(string)
1200
1201 -class StructureAlignment(AbstractAlignment):
1202 """ 1203 Multiple structure alignment. Similar to a L{SequenceAlignment}, but 1204 the alignment holds the actual L{csb.bio.structure.ProteinResidue} objects, 1205 taken from the corresponding source L{csb.bio.structure.Chain}s. 1206 1207 See L{AbstractAlignment} for details. 1208 """ 1209
1210 - def _construct(self, sequences):
1211 1212 for sequence in sequences: 1213 self.add(sequence)
1214 1215 @staticmethod
1216 - def parse(string, provider, id_factory=None, strict=True):
1217 """ 1218 Create a new L{StructureAlignment} from an mFASTA string. See 1219 L{csb.bio.io.fasta.StructureAlignmentFactory} for details. 1220 1221 @param string: MSA-formatted string 1222 @type string: str 1223 @param provider: data source for all structures found in the alignment 1224 @type provider: L{csb.bio.io.wwpdb.StructureProvider} 1225 @param strict: see L{AbstractAlignment} 1226 @type strict: bool 1227 @param id_factory: callable factory, which transforms a sequence ID into 1228 a L{csb.bio.io.wwpdb.EntryID} object. By default 1229 this is L{csb.bio.io.wwpdb.EntryID.create}. 1230 @type id_factory: callable 1231 @rtype: L{StructureAlignment} 1232 """ 1233 from csb.bio.io.fasta import StructureAlignmentFactory 1234 1235 factory = StructureAlignmentFactory( 1236 provider, id_factory=id_factory, strict=strict) 1237 return factory.make_alignment(string)
1238
1239 -class A3MAlignment(AbstractAlignment):
1240 """ 1241 A specific type of multiple alignment, which provides some operations 1242 relative to a master sequence (the first entry in the alignment). 1243 """ 1244
1245 - def __init__(self, sequences, strict=True):
1246 1247 self._master = None 1248 self._matches = 0 1249 self._insertions = set() 1250 1251 super(A3MAlignment, self).__init__(sequences, strict=strict)
1252
1253 - def _initialize(self, rep_sequence):
1254 1255 super(A3MAlignment, self)._initialize(rep_sequence) 1256 self._alphabet = rep_sequence.alphabet
1257
1258 - def _construct(self, sequences):
1259 1260 for sequence in sequences: 1261 1262 self.add(sequence) 1263 1264 for rank, residue in enumerate(sequence, start=1): 1265 if residue.type == self._alphabet.INSERTION: 1266 self._insertions.add(rank) 1267 1268 if self.size == 0: 1269 raise SequenceError("At least one sequence is required") 1270 1271 self._master = list(self._msa)[0] 1272 self._matches = self._master.strip().length
1273 1274 @property
1275 - def master(self):
1276 """ 1277 The master sequence 1278 @rtype: L{AbstractSequence} 1279 """ 1280 return self._master
1281
1282 - def insertion_at(self, column):
1283 """ 1284 Return True of C{column} contains at least one insertion. 1285 1286 @param column: column number, 1-based 1287 @type column: int 1288 @rtype: bool 1289 """ 1290 return column in self._insertions
1291
1292 - def hmm_subregion(self, match_start, match_end):
1293 """ 1294 Same as L{AbstractAlignment.subregion}, but start/end positions are 1295 ranks in the ungapped master sequence. 1296 """ 1297 1298 if not 1 <= match_start <= match_end <= self.matches: 1299 raise ColumnPositionError(None, 1, self.matches) 1300 1301 start = self._master.map_residue(match_start) 1302 end = self._master.map_residue(match_end) 1303 1304 return self.subregion(start, end)
1305
1306 - def format(self, format=AlignmentFormats.A3M, headers=True):
1307 return super(A3MAlignment, self).format(format, headers)
1308 1309 @property
1310 - def matches(self):
1311 """ 1312 Number of match states (residues in the ungapped master). 1313 @rtype: int 1314 """ 1315 return self._matches
1316 1317 @staticmethod
1318 - def parse(string, strict=True):
1319 """ 1320 Create a new L{A3MAlignment} from an A3M string. 1321 1322 @param string: MSA-formatted string 1323 @type string: str 1324 @param strict: see L{AbstractAlignment} 1325 @type strict: bool 1326 1327 @rtype: L{A3MAlignment} 1328 """ 1329 from csb.bio.io.fasta import SequenceAlignmentReader 1330 return SequenceAlignmentReader(strict=strict).read_a3m(string)
1331