1 """
2 Sequence and sequence alignment APIs.
3
4 This module defines the base interfaces for biological sequences and alignments:
5 L{AbstractSequence} and L{AbstractAlignment}. These are the central abstractions
6 here. This module provides also a number of useful enumerations, like L{SequenceTypes}
7 and L{SequenceAlphabets}.
8
9 Sequences
10 =========
11 L{AbstractSequence} has a number of implementations. These are of course interchangeable,
12 but have different intents and may differ significantly in performance. The standard
13 L{Sequence} implementation is what you are after if all you need is high performance
14 and efficient storage (e.g. when you are parsing big files). L{Sequence} objects store
15 their underlying sequences as strings. L{RichSequence}s on the other hand will store
16 their residues as L{ResidueInfo} objects, which have the same basic interface as the
17 L{csb.bio.structure.Residue} objects. This of course comes at the expense of degraded
18 performance. A L{ChainSequence} is a special case of a rich sequence, whose residue
19 objects are I{actually} real L{csb.bio.structure.Residue}s.
20
21 Basic usage:
22
23 >>> seq = RichSequence('id', 'desc', 'sequence', SequenceTypes.Protein)
24 >>> seq.residues[1]
25 <ResidueInfo [1]: SER>
26 >>> seq.dump(sys.stdout)
27 >desc
28 SEQUENCE
29
30 See L{AbstractSequence} for details.
31
32 Alignments
33 ==========
34 L{AbstractAlignment} defines a table-like interface to access the data in an
35 alignment:
36
37 >>> ali = SequenceAlignment.parse(">a\\nABC\\n>b\\nA-C")
38 >>> ali[0, 0]
39 <SequenceAlignment> # a new alignment, constructed from row #1, column #1
40 >>> ali[0, 1:3]
41 <SequenceAlignment> # a new alignment, constructed from row #1, columns #2..#3
42
43 which is just a shorthand for using the standard 1-based interface:
44
45 >>> ali.rows[1]
46 <AlignedSequenceAdapter: a, 3> # row #1 (first sequence)
47 >>> ali.columns[1]
48 (<ColumnInfo a [1]: ALA>, <ColumnInfo b [1]: ALA>) # residues at column #1
49
50 See L{AbstractAlignment} for all details and more examples.
51
52 There are a number of L{AbstractAlignment} implementations defined here.
53 L{SequenceAlignment} is the default one, nothing surprising. L{A3MAlignment}
54 is a more special one: the first sequence in the alignment is a master sequence.
55 This alignment is usually used in the context of HHpred. More important is the
56 L{StructureAlignment}, which is an alignment of L{csb.bio.structure.Chain} objects.
57 The residues in every aligned sequence are really the L{csb.bio.structure.Residue}
58 objects taken from those chains.
59 """
60
61 import re
62 import csb.core
63 import csb.io
64
65 from abc import ABCMeta, abstractmethod, abstractproperty
73
79
81 """
82 Enumeration of alignment strategies
83 """
84 Global='global'; Local='local'
85
92
100
107
109 """
110 Unknown sequence alphabet
111 """
112 UNK='X'; GAP='-'; INSERTION='.'
113
155
159
161
162 - def __init__(self, index=None, start=1, end=None):
172
174
175 if self.index is not None:
176 s = 'Position {0.index} is out of range [{0.start}, {0.end}]'
177 else:
178 s = 'Out of range [{0.start}, {0.end}]'
179
180 return s.format(self)
181
184
187
190
193
195
197
198 self._type = None
199 self._rank = rank
200
201 self.type = type
202
203 @property
205 """
206 Residue type - a member of any sequence alphabet
207 @rtype: enum item
208 """
209 return self._type
210 @type.setter
211 - def type(self, type):
215
216 @property
218 """
219 Residue position (1-based)
220 @rtype: int
221 """
222 return self._rank
223
225 return '<{1} [{0.rank}]: {0.type!r}>'.format(self, self.__class__.__name__)
226
229
230 - def __init__(self, column, id, rank, residue):
236
238 return '<{0.__class__.__name__} {0.id} [{0.column}]: {0.residue.type!r}>'.format(self)
239
241
243 self._container = container
244
251
253 return iter(self._container)
254
267
276
279 """
280 Represents a list of L{AbstractSequence}s.
281 """
282
285
300
303 """
304 Base abstract class for all Sequence objects.
305
306 Provides 1-based access to the residues in the sequence via the
307 sequence.residues property. The sequence object itself also behaves like
308 a collection and provides 0-based access to its elements (residues).
309
310 @param id: FASTA ID of this sequence (e.g. accession number)
311 @type id: str
312 @param header: FASTA sequence header
313 @type header: str
314 @param residues: sequence residues
315 @type residues: str or collection of L{ResidueInfo}
316 @param type: a L{SequenceTypes} member (defaults to protein)
317 @type type: L{EnumItem}
318 """
319
320 __metaclass__ = ABCMeta
321
322 DELIMITER = '>'
323
337
339
340 if isinstance(spec, slice):
341 spec = SliceHelper(spec, 0, self.length)
342 return self.subregion(spec.start + 1, spec.stop)
343 else:
344 if not 0 <= spec < self.length:
345 raise IndexError(spec)
346 return self._get(spec + 1)
347
349 for index in range(self.length):
350 yield self[index]
351
352 @abstractmethod
353 - def _add(self, residue):
354 """
355 Append a C{residue} to the sequence.
356
357 This is a hook method invoked internally for each residue during object
358 construction. By implementing this method, sub-classes define how
359 residues are attached to the sequence object.
360 """
361 pass
362
363 @abstractmethod
364 - def _get(self, rank):
365 """
366 Retrieve the sequence residue at the specified position (1-based, positive).
367
368 This is a hook method which defines the actual behavior of the sequence
369 residue indexer.
370
371 @rtype: L{ResidueInfo}
372 @raise SequencePositionError: when the supplied rank is out of range
373 """
374 pass
375
377 """
378 Return a new sequence of the current L{AbstractSequence} sub-class.
379 """
380 return self.__class__(*a, **k)
381
383 """
384 Remove all gaps and insertions from the sequence.
385
386 @return: a new sequence instance, containing no gaps
387 @rtype: L{AbstractSequence}
388 """
389 residues = [r for r in self._residues
390 if r.type not in (self.alphabet.GAP, self.alphabet.INSERTION)]
391
392 return self._factory(self.id, self.header, residues, self.type)
393
395 """
396 Extract a subsequence, defined by [start, end]. The start and end
397 positions are 1-based, inclusive.
398
399 @param start: start position
400 @type start: int
401 @param end: end position
402 @type end: int
403
404 @return: a new sequence
405 @rtype: L{AbstractSequence}
406
407 @raise SequencePositionError: if start/end positions are out of range
408 """
409 positions = range(start, end + 1)
410 return self.extract(positions)
411
413 """
414 Extract a subsequence, defined by a list of 1-based positions.
415
416 @param positions: positions to extract
417 @type positions: tuple of int
418
419 @return: a new sequence
420 @rtype: L{AbstractSequence}
421
422 @raise SequencePositionError: if any position is out of range
423 """
424
425 end = self.length
426 residues = []
427
428 for rank in sorted(set(positions)):
429 if 1 <= rank <= end:
430 residues.append(self._get(rank))
431 else:
432 raise SequencePositionError(rank, 1, end)
433
434 return self._factory(self.id, self.header, residues, self.type)
435
436 - def dump(self, output_file):
447
448 @property
450 """
451 Number of residues
452 @rtype: int
453 """
454 return len(self._residues)
455
456 @property
458 """
459 Sequence identifier
460 @rtype: str
461 """
462 return self._id
463 @id.setter
464 - def id(self, value):
468
469 @property
471 """
472 Sequence description
473 @rtype: str
474 """
475 return self._header
476 @header.setter
483
484 @property
486 """
487 Sequence type - a member of L{SequenceTypes}
488 @rtype: enum item
489 """
490 return self._type
491 @type.setter
492 - def type(self, value):
498
499 @property
501 """
502 The actual sequence
503 @rtype: str
504 """
505 return ''.join([str(r.type) for r in self._residues])
506
507 @property
509 """
510 The sequence alphabet corresponding to the current sequence type
511 @rtype: L{csb.core.enum}
512 """
513 return SequenceAlphabets.get(self._type)
514
515 @property
517 """
518 Rank-based access to the underlying L{residues<csb.bio.sequence.ResidueInfo>}
519 @rtype: L{SequenceIndexer}
520 """
521 return SequenceIndexer(self)
522
525
527 return '<{0.__class__.__name__}: {0.id}, {0.length} residues>'.format(self)
528
531
533 """
534 High-performance sequence object. The actual sequence is stored internally
535 as a string. The indexer acts as a residue factory, which creates a new
536 L{ResidueInfo} instance each time.
537
538 @note: This class was created with parsing large volumes of data in mind. This
539 comes at the expense of degraded performance of the sequence indexer.
540
541 @param id: FASTA ID of this sequence (e.g. accession number)
542 @type id: str
543 @param header: FASTA sequence header
544 @type header: str
545 @param residues: sequence string
546 @type residues: str
547 @param type: a L{SequenceTypes} member (defaults to protein)
548 @type type: L{EnumItem}
549 """
550
552
553 self._id = None
554 self._header = None
555 self._residues = ''
556 self._type = None
557
558 self.id = id
559 self.header = header
560 self.type = type
561
562 self._append(residues)
563
565
566 self._residues += re.sub('([^\w\-\.])+', '', string)
567
568 - def _add(self, char):
570
571 - def _get(self, rank):
572
573 type = csb.core.Enum.parse(self.alphabet, self._residues[rank - 1])
574 return ResidueInfo(rank, type)
575
581
589
602
603 @property
605 return self._residues
606
608 """
609 Sequence implementation, which converts the sequence into a list of
610 L{ResidueInfo} objects. See L{AbstractSequence} for details.
611 """
612
613 - def _add(self, residue):
624
625 - def _get(self, rank):
626 return self._residues[rank - 1]
627
628 @staticmethod
638
640 """
641 Sequence view for L{csb.bio.structure.Chain} objects.
642 See L{AbstractSequence} for details.
643 """
644
645 - def _add(self, residue):
651
652 - def _get(self, rank):
653 return self._residues[rank - 1]
654
655 @staticmethod
665
668 """
669 Base wrapper class for L{AbstractSequence} objects.
670 Needs to be sub-classed (does not do anything special on its own).
671
672 @param sequence: adaptee
673 @type sequence: L{AbstractSequence}
674 """
675
682
684 return self._subject[i]
685
687 return iter(self._subject)
688
690 return '<{0.__class__.__name__}: {0.id}, {0.length}>'.format(self)
691
693 return str(self._subject)
694
696 raise NotImplementedError()
697
698 - def _get(self, rank):
699 return self._subject._get(rank)
700
702 return self.__class__(self._subject._factory(*a, **k))
703
705 return self._subject.strip()
706
709
711 return self._subject.extract(positions)
712
713 @property
715 return self._subject.id
716
717 @property
719 return self._subject.length
720
721 @property
723 return self._subject.type
724
725 @property
727 return self._subject.header
728
729 @property
732
733 @property
736
738 """
739 Adapter, which wraps a gapped L{AbstractSequence} object and makes it
740 compatible with the MSA row/entry interface, expected by L{AbstractAlignment}.
741
742 The C{adapter.residues} property operates with an L{UngappedSequenceIndexer},
743 which provides a gap-free view of the underlying sequence.
744
745 The C{adapter.columns} property operates with a standard L{ColumnIndexer},
746 the same indexer which is used to provide the column view in multiple
747 alignments. Adapted sequences therefore act as alignment rows and allow for
748 MSA-column-oriented indexing.
749
750 @param sequence: adaptee
751 @type sequence: L{AbstractSequence}
752 """
753
770
772 if not 0 <= index < self.length:
773 raise IndexError(index)
774 return self._get_column(index + 1)
775
777 for c in sorted(self._fmap):
778 yield self._get_column(c)
779
780 @property
782 """
783 Provides 1-based access to the respective columns in the MSA.
784 @rtype: L{ColumnIndexer}
785 """
786 return ColumnIndexer(self)
787
788 @property
790 """
791 Provides 1-based access to the residues of the unaligned (ungapped)
792 sequence.
793 @rtype: L{UngappedSequenceIndexer}
794 """
795 return UngappedSequenceIndexer(self)
796
800
803
805 """
806 Return the MSA column number corresponding to the specified ungapped
807 sequence C{rank}.
808
809 @param rank: 1-based residue rank
810 @type rank: int
811 @rtype: int
812 """
813 return self._rmap[rank]
814
816 """
817 Return the ungapped sequence rank corresponding to the specified MSA
818 C{column} number.
819
820 @param column: 1-based alignment column number
821 @type column: int
822 @rtype: int
823 """
824 return self._fmap[column]
825
827
828 - def __init__(self, slice, start=0, stop=0):
829
830 s, e, t = slice.start, slice.stop, slice.step
831
832 if s is None:
833 s = start
834 if e is None:
835 e = stop
836 if t is None:
837 t = 1
838
839 for value in [s, e, t]:
840 if value < 0:
841 raise IndexError(value)
842
843 self.start = s
844 self.stop = e
845 self.step = t
846
884
887 """
888 Base class for all alignment objects.
889
890 Provides 1-based access to the alignment.rows and alignment.columns.
891 Alignment rows can also be accessed by sequence ID. In addition, all
892 alignments support 0-based slicing:
893
894 >>> alignment[rows, columns]
895 AbstractAlignment (sub-alignment)
896
897 where
898 - C{rows} can be a slice, tuple of row indexes or tuple of sequence IDs
899 - columns can be a slice or tuple of column indexes
900
901 For example:
902
903 >>> alignment[:, 2:]
904 AbstractAlignment # all rows, columns [3, alignment.length]
905 >>> alignment[(0, 'seqx'), (3, 5)]
906 AbstractAlignment # rows #1 and 'seq3', columns #4 and #5
907
908 @param sequences: alignment entries (must have equal length)
909 @type sequences: list of L{AbstractSequence}s
910 @param strict: if True, raise {DuplicateSequenceError} when a duplicate ID
911 is found (default=True)
912 @type strict: bool
913
914 @note: if C{strict} is False and there are C{sequences} with redundant identifiers,
915 those sequences will be added to the C{rows} collection with :An suffix,
916 where n is a serial number. Therefore, rows['ID'] will return only one sequence,
917 the first sequence with id=ID. All remaining sequences can be retrieved
918 with C{rows['ID:A1']}, {rows['ID:A2']}, etc. However, the sequence objects will
919 remain intact, e.g. {rows['ID:A1'].id} still returns 'ID' and not 'ID:A1'.
920 """
921
922 __metaclass__ = ABCMeta
923
924 - def __init__(self, sequences, strict=True):
925
926 self._length = None
927 self._msa = AlignmentRowsTable(self)
928 self._colview = ColumnIndexer(self)
929 self._map = {}
930 self._strict = bool(strict)
931
932 self._construct(sequences)
933
935
936
937
938
939
940 if not isinstance(spec, tuple) or len(spec) not in (1, 2):
941 raise TypeError('Invalid alignment slice expression')
942
943 if len(spec) == 2:
944 rowspec, colspec = spec
945 else:
946 rowspec, colspec = [spec, slice(None)]
947
948
949 if isinstance(rowspec, slice):
950 if isinstance(rowspec.start, csb.core.string) or isinstance(rowspec.stop, csb.core.string):
951 raise TypeError("Invalid row slice: only indexes are supported")
952 rowspec = SliceHelper(rowspec, 0, self.size)
953 rows = range(rowspec.start + 1, rowspec.stop + 1)
954 elif isinstance(rowspec, int):
955 rows = [rowspec + 1]
956 elif csb.core.iterable(rowspec):
957 try:
958 rows = []
959 for r in rowspec:
960 if isinstance(r, int):
961 rows.append(r + 1)
962 else:
963 rows.append(self._map[r])
964 except KeyError as ke:
965 raise KeyError('No such Sequence ID: {0!s}'.format(ke))
966 else:
967 raise TypeError('Unsupported row expression')
968
969
970 if isinstance(colspec, slice):
971 colspec = SliceHelper(colspec, 0, self._length or 0)
972 cols = range(colspec.start + 1, colspec.stop + 1)
973 elif isinstance(colspec, int):
974 cols = [colspec + 1]
975 elif csb.core.iterable(colspec):
976 try:
977 cols = [ c + 1 for c in colspec ]
978 except:
979 raise TypeError('Unsupported column expression')
980 else:
981 raise TypeError('Unsupported column expression')
982
983
984 if len(rows) == 0:
985 raise ValueError("The expression returns zero rows")
986 if len(cols) == 0:
987 raise ValueError("The expression returns zero columns")
988
989
990 return self._extract(rows, cols)
991
992 - def _range(self, slice, start, end):
993
994 s, e, t = slice.start, slice.end, slice.step
995
996 if s is None:
997 s = start
998 if e is None:
999 e = end
1000 if t is None:
1001 t = 1
1002
1003 return range(s, e, t)
1004
1006 for cn in range(1, self.length + 1):
1007 yield self._get_column(cn)
1008
1009 @abstractmethod
1011 """
1012 Hook method, called internally upon object construction. Subclasses
1013 define how the source alignment sequences are handled during alignment
1014 construction.
1015
1016 @param sequences: alignment entries
1017 @type sequences: list of L{AbstractSequence}s
1018 """
1019 pass
1020
1022 """
1023 Hook method, which is used to initialize various alignment properties
1024 (such as length) from the first alignned sequence.
1025 """
1026 if rep_sequence.length == 0:
1027 raise SequenceError("Sequence '{0}' is empty".format(rep_sequence.id))
1028
1029 assert self._length is None
1030 self._length = rep_sequence.length
1031
1033 """
1034 Return a new sequence of the current L{AbstractAlignment} sub-class.
1035 """
1036 return self.__class__(*a, **k)
1037
1038 - def add(self, sequence):
1058
1059 @property
1061 """
1062 Number of columns in the alignment
1063 @rtype: int
1064 """
1065 return self._length or 0
1066
1067 @property
1069 """
1070 Number of rows (sequences) in the alignment
1071 @rtype: int
1072 """
1073 return self._msa.length
1074
1075 @property
1077 """
1078 1-based access to the alignment entries (sequences)
1079 @rtype: L{AlignmentRowsTable}
1080 """
1081 return self._msa
1082
1083 @property
1085 """
1086 1-based access to the alignment columns
1087 @rtype: L{ColumnIndexer}
1088 """
1089 return self._colview
1090
1092 """
1093 Return True of C{column} contains at least one gap.
1094 @param column: column number, 1-based
1095 @type column: int
1096
1097 @rtype: bool
1098 """
1099
1100 for row in self._msa:
1101 if row.columns[column].residue.type == row.alphabet.GAP:
1102 return True
1103
1104 return False
1105
1107 return tuple(row._get_column(column) for row in self.rows)
1108
1110
1111 rows = set(rows)
1112 cols = set(cols)
1113
1114 if not 1 <= min(rows) <= max(rows) <= self.size:
1115 raise IndexError('Row specification out of range')
1116
1117 if not 1 <= min(cols) <= max(cols) <= self.length:
1118 raise IndexError('Column specification out of range')
1119
1120 sequences = []
1121
1122 for rn, row in enumerate(self.rows, start=1):
1123 if rn in rows:
1124 sequences.append(row.extract(cols))
1125
1126 return self._factory(sequences, strict=self._strict)
1127
1129 """
1130 Extract a sub-alignment, ranging from C{start} to C{end} columns.
1131
1132 @param start: starting column, 1-based
1133 @type start: int
1134 @param end: ending column, 1-based
1135 @type end: int
1136
1137 @return: a new alignment of the current type
1138 @rtype: L{AbstractAlignment}
1139
1140 @raise ColumnPositionError: if start/end is out of range
1141 """
1142 if not 1 <= start <= end <= self.length:
1143 raise ColumnPositionError(None, 1, self.length)
1144
1145 sequences = []
1146
1147 for row in self.rows:
1148 sequences.append(row.subregion(start, end))
1149
1150 return self._factory(sequences, strict=self._strict)
1151
1175
1177 """
1178 Multiple sequence alignment. See L{AbstractAlignment} for details.
1179 """
1180
1185
1186 @staticmethod
1187 - def parse(string, strict=True):
1200
1202 """
1203 Multiple structure alignment. Similar to a L{SequenceAlignment}, but
1204 the alignment holds the actual L{csb.bio.structure.ProteinResidue} objects,
1205 taken from the corresponding source L{csb.bio.structure.Chain}s.
1206
1207 See L{AbstractAlignment} for details.
1208 """
1209
1214
1215 @staticmethod
1216 - def parse(string, provider, id_factory=None, strict=True):
1217 """
1218 Create a new L{StructureAlignment} from an mFASTA string. See
1219 L{csb.bio.io.fasta.StructureAlignmentFactory} for details.
1220
1221 @param string: MSA-formatted string
1222 @type string: str
1223 @param provider: data source for all structures found in the alignment
1224 @type provider: L{csb.bio.io.wwpdb.StructureProvider}
1225 @param strict: see L{AbstractAlignment}
1226 @type strict: bool
1227 @param id_factory: callable factory, which transforms a sequence ID into
1228 a L{csb.bio.io.wwpdb.EntryID} object. By default
1229 this is L{csb.bio.io.wwpdb.EntryID.create}.
1230 @type id_factory: callable
1231 @rtype: L{StructureAlignment}
1232 """
1233 from csb.bio.io.fasta import StructureAlignmentFactory
1234
1235 factory = StructureAlignmentFactory(
1236 provider, id_factory=id_factory, strict=strict)
1237 return factory.make_alignment(string)
1238
1240 """
1241 A specific type of multiple alignment, which provides some operations
1242 relative to a master sequence (the first entry in the alignment).
1243 """
1244
1245 - def __init__(self, sequences, strict=True):
1252
1254
1255 super(A3MAlignment, self)._initialize(rep_sequence)
1256 self._alphabet = rep_sequence.alphabet
1257
1273
1274 @property
1276 """
1277 The master sequence
1278 @rtype: L{AbstractSequence}
1279 """
1280 return self._master
1281
1283 """
1284 Return True of C{column} contains at least one insertion.
1285
1286 @param column: column number, 1-based
1287 @type column: int
1288 @rtype: bool
1289 """
1290 return column in self._insertions
1291
1305
1308
1309 @property
1311 """
1312 Number of match states (residues in the ungapped master).
1313 @rtype: int
1314 """
1315 return self._matches
1316
1317 @staticmethod
1318 - def parse(string, strict=True):
1331