csb.bio.sequence

1 """ 2 Sequence and sequence alignment APIs. 3 4 This module defines the base interfaces for biological sequences and alignments: 5 L{AbstractSequence} and L{AbstractAlignment}. These are the central abstractions 6 here. This module provides also a number of useful enumerations, like L{SequenceTypes} 7 and L{SequenceAlphabets}. 8 9 Sequences 10 ========= 11 L{AbstractSequence} has a number of implementations. These are of course interchangeable, 12 but have different intents and may differ significantly in performance. The standard 13 L{Sequence} implementation is what you are after if all you need is high performance 14 and efficient storage (e.g. when you are parsing big files). L{Sequence} objects store 15 their underlying sequences as strings. L{RichSequence}s on the other hand will store 16 their residues as L{ResidueInfo} objects, which have the same basic interface as the 17 L{csb.bio.structure.Residue} objects. This of course comes at the expense of degraded 18 performance. A L{ChainSequence} is a special case of a rich sequence, whose residue 19 objects are I{actually} real L{csb.bio.structure.Residue}s. 20 21 Basic usage: 22 23 >>> seq = RichSequence('id', 'desc', 'sequence', SequenceTypes.Protein) 24 >>> seq.residues[1] 25 <ResidueInfo [1]: SER> 26 >>> seq.dump(sys.stdout) 27 >desc 28 SEQUENCE 29 30 See L{AbstractSequence} for details. 31 32 Alignments 33 ========== 34 L{AbstractAlignment} defines a table-like interface to access the data in an 35 alignment: 36 37 >>> ali = SequenceAlignment.parse(">a\\nABC\\n>b\\nA-C") 38 >>> ali[0, 0] 39 <SequenceAlignment> # a new alignment, constructed from row #1, column #1 40 >>> ali[0, 1:3] 41 <SequenceAlignment> # a new alignment, constructed from row #1, columns #2..#3 42 43 which is just a shorthand for using the standard 1-based interface: 44 45 >>> ali.rows[1] 46 <AlignedSequenceAdapter: a, 3> # row #1 (first sequence) 47 >>> ali.columns[1] 48 (<ColumnInfo a [1]: ALA>, <ColumnInfo b [1]: ALA>) # residues at column #1 49 50 See L{AbstractAlignment} for all details and more examples. 51 52 There are a number of L{AbstractAlignment} implementations defined here. 53 L{SequenceAlignment} is the default one, nothing surprising. L{A3MAlignment} 54 is a more special one: the first sequence in the alignment is a master sequence. 55 This alignment is usually used in the context of HHpred. More important is the 56 L{StructureAlignment}, which is an alignment of L{csb.bio.structure.Chain} objects. 57 The residues in every aligned sequence are really the L{csb.bio.structure.Residue} 58 objects taken from those chains. 59 """ 60 61 import re 62 import csb.core 63 import csb.io 64 65 from abc import ABCMeta, abstractmethod, abstractproperty

66 67 68 -class AlignmentFormats(csb.core.enum):

69 """ 70 Enumeration of multiple sequence alignment formats 71 """ 72 A3M='a3m'; FASTA='fa'; PIR='pir'

73

74 -class SequenceTypes(csb.core.enum):

75 """ 76 Enumeration of sequence types 77 """ 78 NucleicAcid='NA'; DNA='DNA'; RNA='RNA'; Protein='Protein'; Unknown='Unknown'

79

80 -class AlignmentTypes(csb.core.enum):

81 """ 82 Enumeration of alignment strategies 83 """ 84 Global='global'; Local='local'

85

86 -class NucleicAlphabet(csb.core.enum):

87 """ 88 Nucleic sequence alphabet 89 """ 90 Adenine='A'; Cytosine='C'; Guanine='G'; Thymine='T'; Uracil='U'; Purine='R'; Pyrimidine='Y'; Ketone='K'; 91 Amino='M'; Strong='S'; Weak='W'; NotA='B'; NotC='D'; NotG='H'; NotT='V'; Any='N'; Masked='X'; GAP='-'; INSERTION='.';

92

93 -class ProteinAlphabet(csb.core.enum):

94 """ 95 Protein sequence alphabet 96 """ 97 ALA='A'; ASX='B'; CYS='C'; ASP='D'; GLU='E'; PHE='F'; GLY='G'; HIS='H'; ILE='I'; LYS='K'; LEU='L'; MET='M'; ASN='N'; 98 PYL='O'; PRO='P'; GLN='Q'; ARG='R'; SER='S'; THR='T'; SEC='U'; VAL='V'; TRP='W'; TYR='Y'; GLX='Z'; UNK='X'; GAP='-'; 99 INSERTION='.'; STOP='*'

100

101 -class StdProteinAlphabet(csb.core.enum):

102 """ 103 Standard protein sequence alphabet 104 """ 105 ALA='A'; CYS='C'; ASP='D'; GLU='E'; PHE='F'; GLY='G'; HIS='H'; ILE='I'; LYS='K'; LEU='L'; MET='M'; ASN='N'; 106 PRO='P'; GLN='Q'; ARG='R'; SER='S'; THR='T'; VAL='V'; TRP='W'; TYR='Y'

107

108 -class UnknownAlphabet(csb.core.enum):

109 """ 110 Unknown sequence alphabet 111 """ 112 UNK='X'; GAP='-'; INSERTION='.'

113

114 -class SequenceAlphabets(object):

115 """ 116 Sequence alphabet enumerations. 117 118 @note: This class is kept for backwards compatibility. The individual 119 alphabet classes must be defined in the top level namespace, 120 otherwise the new enum types cannot be pickled properly. 121 """ 122 Nucleic = NucleicAlphabet 123 Protein = ProteinAlphabet 124 StdProtein = StdProteinAlphabet 125 Unknown = UnknownAlphabet 126 127 MAP = { SequenceTypes.Protein: ProteinAlphabet, 128 SequenceTypes.NucleicAcid: NucleicAlphabet, 129 SequenceTypes.DNA: NucleicAlphabet, 130 SequenceTypes.RNA: NucleicAlphabet, 131 SequenceTypes.Unknown: UnknownAlphabet } 132 133 ALL_ALPHABETS = set([ProteinAlphabet, NucleicAlphabet, UnknownAlphabet]) 134 135 assert set(MAP) == csb.core.Enum.members(SequenceTypes) 136 137 @staticmethod

138 - def get(type):

139 """ 140 Get the alphabet corresponding to the specified sequence C{type} 141 @param type: a member of L{SequenceTypes} 142 @type type: L{csb.core.EnumItem} 143 @rtype: L{csb.core.enum} 144 """ 145 return SequenceAlphabets.MAP[type]

146 147 @staticmethod

148 - def contains(alphabet):

149 """ 150 Return True if C{alphabet} is a sequence alphabet 151 @type alphabet: L{csb.core.enum} 152 @rtype: bool 153 """ 154 return alphabet in SequenceAlphabets.ALL_ALPHABETS

155

156 157 -class SequenceError(ValueError):

158 pass

159

160 -class PositionError(IndexError):

161

162 - def __init__(self, index=None, start=1, end=None):

163 164 if end == 0: 165 start = 0 166 167 self.index = index 168 self.start = start 169 self.end = end 170 171 super(PositionError, self).__init__(index, start, end)

172

173 - def __str__(self):

174 175 if self.index is not None: 176 s = 'Position {0.index} is out of range [{0.start}, {0.end}]' 177 else: 178 s = 'Out of range [{0.start}, {0.end}]' 179 180 return s.format(self)

181

182 -class SequencePositionError(PositionError):

183 pass

184

185 -class ColumnPositionError(PositionError):

186 pass

187

188 -class SequenceNotFoundError(KeyError):

189 pass

190

191 -class DuplicateSequenceError(KeyError):

192 pass

193

194 -class ResidueInfo(object):

195

196 - def __init__(self, rank, type):

197 198 self._type = None 199 self._rank = rank 200 201 self.type = type

202 203 @property

204 - def type(self):

205 """ 206 Residue type - a member of any sequence alphabet 207 @rtype: enum item 208 """ 209 return self._type

210 @type.setter

211 - def type(self, type):

212 if not SequenceAlphabets.contains(type.enum): 213 raise TypeError(type) 214 self._type = type

215 216 @property

217 - def rank(self):

218 """ 219 Residue position (1-based) 220 @rtype: int 221 """ 222 return self._rank

223

224 - def __repr__(self):

225 return '<{1} [{0.rank}]: {0.type!r}>'.format(self, self.__class__.__name__)

226

227 228 -class ColumnInfo(object):

229

230 - def __init__(self, column, id, rank, residue):

231 232 self.column = column 233 self.id = id 234 self.rank = rank 235 self.residue = residue

236

237 - def __repr__(self):

238 return '<{0.__class__.__name__} {0.id} [{0.column}]: {0.residue.type!r}>'.format(self)

239

240 -class SequenceIndexer(object):

241

242 - def __init__(self, container):

243 self._container = container

244

245 - def __getitem__(self, rank):

246 247 if not 1 <= rank <= self._container.length: 248 raise SequencePositionError(rank, 1, self._container.length) 249 250 return self._container._get(rank)

251

252 - def __iter__(self):

253 return iter(self._container)

254

255 -class UngappedSequenceIndexer(SequenceIndexer):

256

257 - def __getitem__(self, rank):

258 try: 259 return self._container._get_ungapped(rank) 260 except SequencePositionError: 261 raise SequencePositionError(rank, 1)

262

263 - def __iter__(self):

264 for c in self._container: 265 if c.residue.type not in (self._container.alphabet.GAP, self._container.alphabet.INSERTION): 266 yield c.residue

267

268 -class ColumnIndexer(SequenceIndexer):

269

270 - def __getitem__(self, column):

271 272 if not 1 <= column <= self._container.length: 273 raise ColumnPositionError(column, 1, self._container.length) 274 275 return self._container._get_column(column)

276

277 278 -class SequenceCollection(csb.core.ReadOnlyCollectionContainer):

279 """ 280 Represents a list of L{AbstractSequence}s. 281 """ 282

283 - def __init__(self, sequences):

284 super(SequenceCollection, self).__init__(items=sequences, type=AbstractSequence)

285

286 - def to_fasta(self, output_file):

287 """ 288 Dump the whole collection in mFASTA format. 289 290 @param output_file: write the output to this file or stream 291 @type output_file: str or stream 292 """ 293 from csb.bio.io.fasta import FASTAOutputBuilder 294 295 with csb.io.EntryWriter(output_file, close=False) as out: 296 builder = FASTAOutputBuilder(out.stream, headers=True) 297 298 for s in self: 299 builder.add_sequence(s)

300

301 302 -class AbstractSequence(object):

303 """ 304 Base abstract class for all Sequence objects. 305 306 Provides 1-based access to the residues in the sequence via the 307 sequence.residues property. The sequence object itself also behaves like 308 a collection and provides 0-based access to its elements (residues). 309 310 @param id: FASTA ID of this sequence (e.g. accession number) 311 @type id: str 312 @param header: FASTA sequence header 313 @type header: str 314 @param residues: sequence residues 315 @type residues: str or collection of L{ResidueInfo} 316 @param type: a L{SequenceTypes} member (defaults to protein) 317 @type type: L{EnumItem} 318 """ 319 320 __metaclass__ = ABCMeta 321 322 DELIMITER = '>' 323

324 - def __init__(self, id, header, residues, type=SequenceTypes.Unknown):

325 326 self._id = None 327 self._header = None 328 self._residues = [] 329 self._type = None 330 331 self.id = id 332 self.header = header 333 self.type = type 334 335 for residue in residues: 336 self._add(residue)

337

338 - def __getitem__(self, spec):

339 340 if isinstance(spec, slice): 341 spec = SliceHelper(spec, 0, self.length) 342 return self.subregion(spec.start + 1, spec.stop) 343 else: 344 if not 0 <= spec < self.length: 345 raise IndexError(spec) 346 return self._get(spec + 1)

347

348 - def __iter__(self):

349 for index in range(self.length): 350 yield self[index]

351 352 @abstractmethod

353 - def _add(self, residue):

354 """ 355 Append a C{residue} to the sequence. 356 357 This is a hook method invoked internally for each residue during object 358 construction. By implementing this method, sub-classes define how 359 residues are attached to the sequence object. 360 """ 361 pass

362 363 @abstractmethod

364 - def _get(self, rank):

365 """ 366 Retrieve the sequence residue at the specified position (1-based, positive). 367 368 This is a hook method which defines the actual behavior of the sequence 369 residue indexer. 370 371 @rtype: L{ResidueInfo} 372 @raise SequencePositionError: when the supplied rank is out of range 373 """ 374 pass

375

376 - def _factory(self, *a, **k):

377 """ 378 Return a new sequence of the current L{AbstractSequence} sub-class. 379 """ 380 return self.__class__(*a, **k)

381

382 - def strip(self):

383 """ 384 Remove all gaps and insertions from the sequence. 385 386 @return: a new sequence instance, containing no gaps 387 @rtype: L{AbstractSequence} 388 """ 389 residues = [r for r in self._residues 390 if r.type not in (self.alphabet.GAP, self.alphabet.INSERTION)] 391 392 return self._factory(self.id, self.header, residues, self.type)

393

394 - def subregion(self, start, end):

395 """ 396 Extract a subsequence, defined by [start, end]. The start and end 397 positions are 1-based, inclusive. 398 399 @param start: start position 400 @type start: int 401 @param end: end position 402 @type end: int 403 404 @return: a new sequence 405 @rtype: L{AbstractSequence} 406 407 @raise SequencePositionError: if start/end positions are out of range 408 """ 409 positions = range(start, end + 1) 410 return self.extract(positions)

411

412 - def extract(self, positions):

413 """ 414 Extract a subsequence, defined by a list of 1-based positions. 415 416 @param positions: positions to extract 417 @type positions: tuple of int 418 419 @return: a new sequence 420 @rtype: L{AbstractSequence} 421 422 @raise SequencePositionError: if any position is out of range 423 """ 424 425 end = self.length 426 residues = [] 427 428 for rank in sorted(set(positions)): 429 if 1 <= rank <= end: 430 residues.append(self._get(rank)) 431 else: 432 raise SequencePositionError(rank, 1, end) 433 434 return self._factory(self.id, self.header, residues, self.type)

435

436 - def dump(self, output_file):

437 """ 438 Dump the sequence in FASTA format. 439 440 @param output_file: write the output to this file or stream 441 @type output_file: str or stream 442 """ 443 from csb.bio.io.fasta import FASTAOutputBuilder 444 445 with csb.io.EntryWriter(output_file, close=False) as out: 446 FASTAOutputBuilder(out.stream, headers=True).add_sequence(self)

447 448 @property

449 - def length(self):

450 """ 451 Number of residues 452 @rtype: int 453 """ 454 return len(self._residues)

455 456 @property

457 - def id(self):

458 """ 459 Sequence identifier 460 @rtype: str 461 """ 462 return self._id

463 @id.setter

464 - def id(self, value):

465 if value is not None: 466 value = str(value).strip() 467 self._id = value

468 469 @property

470 - def header(self):

471 """ 472 Sequence description 473 @rtype: str 474 """ 475 return self._header

476 @header.setter

477 - def header(self, value):

478 if not value: 479 value = 'sequence' 480 else: 481 value = value.strip().lstrip(AbstractSequence.DELIMITER) 482 self._header = value

483 484 @property

485 - def type(self):

486 """ 487 Sequence type - a member of L{SequenceTypes} 488 @rtype: enum item 489 """ 490 return self._type

491 @type.setter

492 - def type(self, value):

493 if isinstance(value, csb.core.string): 494 value = csb.core.Enum.parse(SequenceTypes, value) 495 if value.enum is not SequenceTypes: 496 raise TypeError(value) 497 self._type = value

498 499 @property

500 - def sequence(self):

501 """ 502 The actual sequence 503 @rtype: str 504 """ 505 return ''.join([str(r.type) for r in self._residues])

506 507 @property

508 - def alphabet(self):

509 """ 510 The sequence alphabet corresponding to the current sequence type 511 @rtype: L{csb.core.enum} 512 """ 513 return SequenceAlphabets.get(self._type)

514 515 @property

516 - def residues(self):

517 """ 518 Rank-based access to the underlying L{residues<csb.bio.sequence.ResidueInfo>} 519 @rtype: L{SequenceIndexer} 520 """ 521 return SequenceIndexer(self)

522

523 - def __len__(self):

524 return self.length

525

526 - def __repr__(self):

527 return '<{0.__class__.__name__}: {0.id}, {0.length} residues>'.format(self)

528

529 - def __str__(self):

530 return '{0}{1.header}\n{1.sequence}'.format(AbstractSequence.DELIMITER, self)

531

532 -class Sequence(AbstractSequence):

533 """ 534 High-performance sequence object. The actual sequence is stored internally 535 as a string. The indexer acts as a residue factory, which creates a new 536 L{ResidueInfo} instance each time. 537 538 @note: This class was created with parsing large volumes of data in mind. This 539 comes at the expense of degraded performance of the sequence indexer. 540 541 @param id: FASTA ID of this sequence (e.g. accession number) 542 @type id: str 543 @param header: FASTA sequence header 544 @type header: str 545 @param residues: sequence string 546 @type residues: str 547 @param type: a L{SequenceTypes} member (defaults to protein) 548 @type type: L{EnumItem} 549 """ 550

551 - def __init__(self, id, header, residues, type=SequenceTypes.Unknown):

552 553 self._id = None 554 self._header = None 555 self._residues = '' 556 self._type = None 557 558 self.id = id 559 self.header = header 560 self.type = type 561 562 self._append(residues)

563

564 - def _append(self, string):

565 # this seems to be the fastest method for sanitization and storage 566 self._residues += re.sub('([^\w\-\.])+', '', string)

567

568 - def _add(self, char):

569 self._append(char)

570

571 - def _get(self, rank):

572 573 type = csb.core.Enum.parse(self.alphabet, self._residues[rank - 1]) 574 return ResidueInfo(rank, type)

575

576 - def strip(self):

577 residues = self._residues.replace( 578 str(self.alphabet.GAP), '').replace( 579 str(self.alphabet.INSERTION), '') 580 return self._factory(self.id, self.header, residues, self.type)

581

582 - def subregion(self, start, end):

583 584 if not 1 <= start <= end <= self.length: 585 raise SequencePositionError(None, 1, self.length) 586 587 residues = self._residues[start - 1 : end] 588 return self._factory(self.id, self.header, residues, self.type)

589

590 - def extract(self, positions):

591 592 end = self.length 593 residues = [] 594 595 for rank in sorted(set(positions)): 596 if 1 <= rank <= end: 597 residues.append(self._residues[rank - 1]) 598 else: 599 raise SequencePositionError(rank, 1, end) 600 601 return self._factory(self.id, self.header, ''.join(residues), self.type)

602 603 @property

604 - def sequence(self):

605 return self._residues

606

607 -class RichSequence(AbstractSequence):

608 """ 609 Sequence implementation, which converts the sequence into a list of 610 L{ResidueInfo} objects. See L{AbstractSequence} for details. 611 """ 612

613 - def _add(self, residue):

614 615 if hasattr(residue, 'rank') and hasattr(residue, 'type'): 616 self._residues.append(residue) 617 618 else: 619 if residue.isalpha() or residue in (self.alphabet.GAP, self.alphabet.INSERTION): 620 621 type = csb.core.Enum.parse(self.alphabet, residue) 622 rank = len(self._residues) + 1 623 self._residues.append(ResidueInfo(rank, type))

624

625 - def _get(self, rank):

626 return self._residues[rank - 1]

627 628 @staticmethod

629 - def create(sequence):

630 """ 631 Create a new L{RichSequence} from existing L{AbstractSequence}. 632 633 @type sequence: L{AbstractSequence} 634 @rtype: L{RichSequence} 635 """ 636 return RichSequence( 637 sequence.id, sequence.header, sequence.sequence, sequence.type)

638

639 -class ChainSequence(AbstractSequence):

640 """ 641 Sequence view for L{csb.bio.structure.Chain} objects. 642 See L{AbstractSequence} for details. 643 """ 644

645 - def _add(self, residue):

646 647 if not (hasattr(residue, 'rank') and hasattr(residue, 'type')): 648 raise TypeError(residue) 649 else: 650 self._residues.append(residue)

651

652 - def _get(self, rank):

653 return self._residues[rank - 1]

654 655 @staticmethod

656 - def create(chain):

657 """ 658 Create a new L{ChainSequence} from existing L{Chain} instance. 659 660 @type chain: L{csb.bio.structure.Chain} 661 @rtype: L{ChainSequence} 662 """ 663 return ChainSequence( 664 chain.entry_id, chain.header, chain.residues, chain.type)

665

666 667 -class SequenceAdapter(object):

668 """ 669 Base wrapper class for L{AbstractSequence} objects. 670 Needs to be sub-classed (does not do anything special on its own). 671 672 @param sequence: adaptee 673 @type sequence: L{AbstractSequence} 674 """ 675

676 - def __init__(self, sequence):

677 678 if not isinstance(sequence, AbstractSequence): 679 raise TypeError(sequence) 680 681 self._subject = sequence

682

683 - def __getitem__(self, i):

684 return self._subject[i]

685

686 - def __iter__(self):

687 return iter(self._subject)

688

689 - def __repr__(self):

690 return '<{0.__class__.__name__}: {0.id}, {0.length}>'.format(self)

691

692 - def __str__(self):

693 return str(self._subject)

694

695 - def _add(self):

696 raise NotImplementedError()

697

698 - def _get(self, rank):

699 return self._subject._get(rank)

700

701 - def _factory(self, *a, **k):

702 return self.__class__(self._subject._factory(*a, **k))

703

704 - def strip(self):

705 return self._subject.strip()

706

707 - def subregion(self, start, end):

708 return self._subject.subregion(start, end)

709

710 - def extract(self, positions):

711 return self._subject.extract(positions)

712 713 @property

714 - def id(self):

715 return self._subject.id

716 717 @property

718 - def length(self):

719 return self._subject.length

720 721 @property

722 - def type(self):

723 return self._subject.type

724 725 @property

726 - def header(self):

727 return self._subject.header

728 729 @property

730 - def sequence(self):

731 return self._subject.sequence

732 733 @property

734 - def alphabet(self):

735 return self._subject.alphabet

736

737 -class AlignedSequenceAdapter(SequenceAdapter):

738 """ 739 Adapter, which wraps a gapped L{AbstractSequence} object and makes it 740 compatible with the MSA row/entry interface, expected by L{AbstractAlignment}. 741 742 The C{adapter.residues} property operates with an L{UngappedSequenceIndexer}, 743 which provides a gap-free view of the underlying sequence. 744 745 The C{adapter.columns} property operates with a standard L{ColumnIndexer}, 746 the same indexer which is used to provide the column view in multiple 747 alignments. Adapted sequences therefore act as alignment rows and allow for 748 MSA-column-oriented indexing. 749 750 @param sequence: adaptee 751 @type sequence: L{AbstractSequence} 752 """ 753

754 - def __init__(self, sequence):

755 756 super(AlignedSequenceAdapter, self).__init__(sequence) 757 758 self._fmap = {} 759 self._rmap = {} 760 rank = 0 761 762 for column, residue in enumerate(sequence, start=1): 763 764 if residue.type not in (self.alphabet.GAP, self.alphabet.INSERTION): 765 rank += 1 766 self._fmap[column] = rank 767 self._rmap[rank] = column 768 else: 769 self._fmap[column] = None

770

771 - def __getitem__(self, index):

772 if not 0 <= index < self.length: 773 raise IndexError(index) 774 return self._get_column(index + 1)

775

776 - def __iter__(self):

777 for c in sorted(self._fmap): 778 yield self._get_column(c)

779 780 @property

781 - def columns(self):

782 """ 783 Provides 1-based access to the respective columns in the MSA. 784 @rtype: L{ColumnIndexer} 785 """ 786 return ColumnIndexer(self)

787 788 @property

789 - def residues(self):

790 """ 791 Provides 1-based access to the residues of the unaligned (ungapped) 792 sequence. 793 @rtype: L{UngappedSequenceIndexer} 794 """ 795 return UngappedSequenceIndexer(self)

796

797 - def _get_column(self, column):

798 return ColumnInfo( 799 column, self.id, self._fmap[column], self._subject.residues[column])

800

801 - def _get_ungapped(self, rank):

802 return self._subject.residues[self._rmap[rank]]

803

804 - def map_residue(self, rank):

805 """ 806 Return the MSA column number corresponding to the specified ungapped 807 sequence C{rank}. 808 809 @param rank: 1-based residue rank 810 @type rank: int 811 @rtype: int 812 """ 813 return self._rmap[rank]

814

815 - def map_column(self, column):

816 """ 817 Return the ungapped sequence rank corresponding to the specified MSA 818 C{column} number. 819 820 @param column: 1-based alignment column number 821 @type column: int 822 @rtype: int 823 """ 824 return self._fmap[column]

825

826 -class SliceHelper(object):

827

828 - def __init__(self, slice, start=0, stop=0):

829 830 s, e, t = slice.start, slice.stop, slice.step 831 832 if s is None: 833 s = start 834 if e is None: 835 e = stop 836 if t is None: 837 t = 1 838 839 for value in [s, e, t]: 840 if value < 0: 841 raise IndexError(value) 842 843 self.start = s 844 self.stop = e 845 self.step = t

846

847 -class AlignmentRowsTable(csb.core.BaseDictionaryContainer):

848

849 - def __init__(self, container):

850 851 super(AlignmentRowsTable, self).__init__() 852 853 self._container = container 854 self._map = {}

855

856 - def __getitem__(self, item):

857 858 try: 859 if isinstance(item, int): 860 key = self._map[item] 861 else: 862 key = item 863 864 return super(AlignmentRowsTable, self).__getitem__(key) 865 866 except KeyError: 867 raise SequenceNotFoundError(item)

868

869 - def _append(self, sequence):

870 871 n = 0 872 sequence_id = sequence.id 873 874 while sequence_id in self: 875 n += 1 876 sequence_id = '{0}:A{1}'.format(sequence.id, n) 877 878 super(AlignmentRowsTable, self)._append_item(sequence_id, sequence) 879 self._map[self.length] = sequence_id

880

881 - def __iter__(self):

882 for id in super(AlignmentRowsTable, self).__iter__(): 883 yield self[id]

884

885 886 -class AbstractAlignment(object):

887 """ 888 Base class for all alignment objects. 889 890 Provides 1-based access to the alignment.rows and alignment.columns. 891 Alignment rows can also be accessed by sequence ID. In addition, all 892 alignments support 0-based slicing: 893 894 >>> alignment[rows, columns] 895 AbstractAlignment (sub-alignment) 896 897 where 898 - C{rows} can be a slice, tuple of row indexes or tuple of sequence IDs 899 - columns can be a slice or tuple of column indexes 900 901 For example: 902 903 >>> alignment[:, 2:] 904 AbstractAlignment # all rows, columns [3, alignment.length] 905 >>> alignment[(0, 'seqx'), (3, 5)] 906 AbstractAlignment # rows #1 and 'seq3', columns #4 and #5 907 908 @param sequences: alignment entries (must have equal length) 909 @type sequences: list of L{AbstractSequence}s 910 @param strict: if True, raise {DuplicateSequenceError} when a duplicate ID 911 is found (default=True) 912 @type strict: bool 913 914 @note: if C{strict} is False and there are C{sequences} with redundant identifiers, 915 those sequences will be added to the C{rows} collection with :An suffix, 916 where n is a serial number. Therefore, rows['ID'] will return only one sequence, 917 the first sequence with id=ID. All remaining sequences can be retrieved 918 with C{rows['ID:A1']}, {rows['ID:A2']}, etc. However, the sequence objects will 919 remain intact, e.g. {rows['ID:A1'].id} still returns 'ID' and not 'ID:A1'. 920 """ 921 922 __metaclass__ = ABCMeta 923

924 - def __init__(self, sequences, strict=True):

925 926 self._length = None 927 self._msa = AlignmentRowsTable(self) 928 self._colview = ColumnIndexer(self) 929 self._map = {} 930 self._strict = bool(strict) 931 932 self._construct(sequences)

933

934 - def __getitem__(self, spec):

935 936 # The following code can hardly get more readable than that, sorry. 937 # Don't even think of modifying this before there is a 100% unit test coverage 938 939 # 0. expand the input tuple: (rows/, columns/) => (rows, columns) 940 if not isinstance(spec, tuple) or len(spec) not in (1, 2): 941 raise TypeError('Invalid alignment slice expression') 942 943 if len(spec) == 2: 944 rowspec, colspec = spec 945 else: 946 rowspec, colspec = [spec, slice(None)] 947 948 # 1. interpret the row slice: int, iter(int), iter(str) or slice(int) => list(int, 1-based) 949 if isinstance(rowspec, slice): 950 if isinstance(rowspec.start, csb.core.string) or isinstance(rowspec.stop, csb.core.string): 951 raise TypeError("Invalid row slice: only indexes are supported") 952 rowspec = SliceHelper(rowspec, 0, self.size) 953 rows = range(rowspec.start + 1, rowspec.stop + 1) 954 elif isinstance(rowspec, int): 955 rows = [rowspec + 1] 956 elif csb.core.iterable(rowspec): 957 try: 958 rows = [] 959 for r in rowspec: 960 if isinstance(r, int): 961 rows.append(r + 1) 962 else: 963 rows.append(self._map[r]) 964 except KeyError as ke: 965 raise KeyError('No such Sequence ID: {0!s}'.format(ke)) 966 else: 967 raise TypeError('Unsupported row expression') 968 969 # 2. interpret the column slice: int, iter(int) or slice(int) => list(int, 1-based) 970 if isinstance(colspec, slice): 971 colspec = SliceHelper(colspec, 0, self._length or 0) 972 cols = range(colspec.start + 1, colspec.stop + 1) 973 elif isinstance(colspec, int): 974 cols = [colspec + 1] 975 elif csb.core.iterable(colspec): 976 try: 977 cols = [ c + 1 for c in colspec ] 978 except: 979 raise TypeError('Unsupported column expression') 980 else: 981 raise TypeError('Unsupported column expression') 982 983 # 3. some more checks 984 if len(rows) == 0: 985 raise ValueError("The expression returns zero rows") 986 if len(cols) == 0: 987 raise ValueError("The expression returns zero columns") 988 989 # 4. we are done 990 return self._extract(rows, cols)

991

992 - def _range(self, slice, start, end):

993 994 s, e, t = slice.start, slice.end, slice.step 995 996 if s is None: 997 s = start 998 if e is None: 999 e = end 1000 if t is None: 1001 t = 1 1002 1003 return range(s, e, t)

1004

1005 - def __iter__(self):

1006 for cn in range(1, self.length + 1): 1007 yield self._get_column(cn)

1008 1009 @abstractmethod

1010 - def _construct(self, sequences):

1011 """ 1012 Hook method, called internally upon object construction. Subclasses 1013 define how the source alignment sequences are handled during alignment 1014 construction. 1015 1016 @param sequences: alignment entries 1017 @type sequences: list of L{AbstractSequence}s 1018 """ 1019 pass

1020

1021 - def _initialize(self, rep_sequence):

1022 """ 1023 Hook method, which is used to initialize various alignment properties 1024 (such as length) from the first alignned sequence. 1025 """ 1026 if rep_sequence.length == 0: 1027 raise SequenceError("Sequence '{0}' is empty".format(rep_sequence.id)) 1028 1029 assert self._length is None 1030 self._length = rep_sequence.length

1031

1032 - def _factory(self, *a, **k):

1033 """ 1034 Return a new sequence of the current L{AbstractAlignment} sub-class. 1035 """ 1036 return self.__class__(*a, **k)

1037

1038 - def add(self, sequence):

1039 """ 1040 Append a new sequence to the alignment. 1041 1042 @type sequence: L{AbstractSequence} 1043 @raise SequenceError: if the new sequence is too short/long 1044 @raise DuplicateSequenceError: if a sequence with same ID already exists 1045 """ 1046 1047 if self._msa.length == 0: 1048 self._initialize(sequence) 1049 1050 if sequence.length != self._length: 1051 raise SequenceError('{0!r} is not of the expected length'.format(sequence)) 1052 1053 if self._strict and sequence.id in self._msa: 1054 raise DuplicateSequenceError(sequence.id) 1055 1056 self._msa._append(AlignedSequenceAdapter(sequence)) 1057 self._map[sequence.id] = self._msa.length

1058 1059 @property

1060 - def length(self):

1061 """ 1062 Number of columns in the alignment 1063 @rtype: int 1064 """ 1065 return self._length or 0

1066 1067 @property

1068 - def size(self):

1069 """ 1070 Number of rows (sequences) in the alignment 1071 @rtype: int 1072 """ 1073 return self._msa.length

1074 1075 @property

1076 - def rows(self):

1077 """ 1078 1-based access to the alignment entries (sequences) 1079 @rtype: L{AlignmentRowsTable} 1080 """ 1081 return self._msa

1082 1083 @property

1084 - def columns(self):

1085 """ 1086 1-based access to the alignment columns 1087 @rtype: L{ColumnIndexer} 1088 """ 1089 return self._colview

1090

1091 - def gap_at(self, column):

1092 """ 1093 Return True of C{column} contains at least one gap. 1094 @param column: column number, 1-based 1095 @type column: int 1096 1097 @rtype: bool 1098 """ 1099 1100 for row in self._msa: 1101 if row.columns[column].residue.type == row.alphabet.GAP: 1102 return True 1103 1104 return False

1105

1106 - def _get_column(self, column):

1107 return tuple(row._get_column(column) for row in self.rows)

1108

1109 - def _extract(self, rows, cols):

1110 1111 rows = set(rows) 1112 cols = set(cols) 1113 1114 if not 1 <= min(rows) <= max(rows) <= self.size: 1115 raise IndexError('Row specification out of range') 1116 1117 if not 1 <= min(cols) <= max(cols) <= self.length: 1118 raise IndexError('Column specification out of range') 1119 1120 sequences = [] 1121 1122 for rn, row in enumerate(self.rows, start=1): 1123 if rn in rows: 1124 sequences.append(row.extract(cols)) 1125 1126 return self._factory(sequences, strict=self._strict)

1127

1128 - def subregion(self, start, end):

1129 """ 1130 Extract a sub-alignment, ranging from C{start} to C{end} columns. 1131 1132 @param start: starting column, 1-based 1133 @type start: int 1134 @param end: ending column, 1-based 1135 @type end: int 1136 1137 @return: a new alignment of the current type 1138 @rtype: L{AbstractAlignment} 1139 1140 @raise ColumnPositionError: if start/end is out of range 1141 """ 1142 if not 1 <= start <= end <= self.length: 1143 raise ColumnPositionError(None, 1, self.length) 1144 1145 sequences = [] 1146 1147 for row in self.rows: 1148 sequences.append(row.subregion(start, end)) 1149 1150 return self._factory(sequences, strict=self._strict)

1151

1152 - def format(self, format=AlignmentFormats.FASTA, headers=True):

1153 """ 1154 Format the alignment as a string. 1155 1156 @param format: alignment format type, member of L{AlignmentFormats} 1157 @type format: L{EnumItem} 1158 @param headers: if False, omit headers 1159 @type headers: bool 1160 1161 @rtype: str 1162 """ 1163 from csb.bio.io.fasta import OutputBuilder 1164 1165 temp = csb.io.MemoryStream() 1166 1167 try: 1168 builder = OutputBuilder.create(format, temp, headers=headers) 1169 builder.add_alignment(self) 1170 1171 return temp.getvalue() 1172 1173 finally: 1174 temp.close()

1175

1176 -class SequenceAlignment(AbstractAlignment):

1177 """ 1178 Multiple sequence alignment. See L{AbstractAlignment} for details. 1179 """ 1180

1181 - def _construct(self, sequences):

1182 1183 for sequence in sequences: 1184 self.add(sequence)

1185 1186 @staticmethod

1187 - def parse(string, strict=True):

1188 """ 1189 Create a new L{SequenceAlignment} from an mFASTA string. 1190 1191 @param string: MSA-formatted string 1192 @type string: str 1193 @param strict: see L{AbstractAlignment} 1194 @type strict: bool 1195 1196 @rtype: L{SequenceAlignment} 1197 """ 1198 from csb.bio.io.fasta import SequenceAlignmentReader 1199 return SequenceAlignmentReader(strict=strict).read_fasta(string)

1200

1201 -class StructureAlignment(AbstractAlignment):

1202 """ 1203 Multiple structure alignment. Similar to a L{SequenceAlignment}, but 1204 the alignment holds the actual L{csb.bio.structure.ProteinResidue} objects, 1205 taken from the corresponding source L{csb.bio.structure.Chain}s. 1206 1207 See L{AbstractAlignment} for details. 1208 """ 1209

1210 - def _construct(self, sequences):

1211 1212 for sequence in sequences: 1213 self.add(sequence)

1214 1215 @staticmethod

1216 - def parse(string, provider, id_factory=None, strict=True):

1217 """ 1218 Create a new L{StructureAlignment} from an mFASTA string. See 1219 L{csb.bio.io.fasta.StructureAlignmentFactory} for details. 1220 1221 @param string: MSA-formatted string 1222 @type string: str 1223 @param provider: data source for all structures found in the alignment 1224 @type provider: L{csb.bio.io.wwpdb.StructureProvider} 1225 @param strict: see L{AbstractAlignment} 1226 @type strict: bool 1227 @param id_factory: callable factory, which transforms a sequence ID into 1228 a L{csb.bio.io.wwpdb.EntryID} object. By default 1229 this is L{csb.bio.io.wwpdb.EntryID.create}. 1230 @type id_factory: callable 1231 @rtype: L{StructureAlignment} 1232 """ 1233 from csb.bio.io.fasta import StructureAlignmentFactory 1234 1235 factory = StructureAlignmentFactory( 1236 provider, id_factory=id_factory, strict=strict) 1237 return factory.make_alignment(string)

1238

1239 -class A3MAlignment(AbstractAlignment):

1240 """ 1241 A specific type of multiple alignment, which provides some operations 1242 relative to a master sequence (the first entry in the alignment). 1243 """ 1244

1245 - def __init__(self, sequences, strict=True):

1246 1247 self._master = None 1248 self._matches = 0 1249 self._insertions = set() 1250 1251 super(A3MAlignment, self).__init__(sequences, strict=strict)

1252

1253 - def _initialize(self, rep_sequence):

1254 1255 super(A3MAlignment, self)._initialize(rep_sequence) 1256 self._alphabet = rep_sequence.alphabet

1257

1258 - def _construct(self, sequences):

1259 1260 for sequence in sequences: 1261 1262 self.add(sequence) 1263 1264 for rank, residue in enumerate(sequence, start=1): 1265 if residue.type == self._alphabet.INSERTION: 1266 self._insertions.add(rank) 1267 1268 if self.size == 0: 1269 raise SequenceError("At least one sequence is required") 1270 1271 self._master = list(self._msa)[0] 1272 self._matches = self._master.strip().length

1273 1274 @property

1275 - def master(self):

1276 """ 1277 The master sequence 1278 @rtype: L{AbstractSequence} 1279 """ 1280 return self._master

1281

1282 - def insertion_at(self, column):

1283 """ 1284 Return True of C{column} contains at least one insertion. 1285 1286 @param column: column number, 1-based 1287 @type column: int 1288 @rtype: bool 1289 """ 1290 return column in self._insertions

1291

1292 - def hmm_subregion(self, match_start, match_end):

1293 """ 1294 Same as L{AbstractAlignment.subregion}, but start/end positions are 1295 ranks in the ungapped master sequence. 1296 """ 1297 1298 if not 1 <= match_start <= match_end <= self.matches: 1299 raise ColumnPositionError(None, 1, self.matches) 1300 1301 start = self._master.map_residue(match_start) 1302 end = self._master.map_residue(match_end) 1303 1304 return self.subregion(start, end)

1305

1306 - def format(self, format=AlignmentFormats.A3M, headers=True):

1307 return super(A3MAlignment, self).format(format, headers)

1308 1309 @property

1310 - def matches(self):

1311 """ 1312 Number of match states (residues in the ungapped master). 1313 @rtype: int 1314 """ 1315 return self._matches

1316 1317 @staticmethod

1318 - def parse(string, strict=True):

1319 """ 1320 Create a new L{A3MAlignment} from an A3M string. 1321 1322 @param string: MSA-formatted string 1323 @type string: str 1324 @param strict: see L{AbstractAlignment} 1325 @type strict: bool 1326 1327 @rtype: L{A3MAlignment} 1328 """ 1329 from csb.bio.io.fasta import SequenceAlignmentReader 1330 return SequenceAlignmentReader(strict=strict).read_a3m(string)

1331

Source Code for Package csb.bio.sequence