1 """
2 FASTA format sequence I/O.
3
4 This module provides parsers and writers for sequences and alignments in
5 FASTA format. The most basic usage is:
6
7 >>> parser = SequenceParser()
8 >>> parser.parse_file('sequences.fa')
9 <SequenceCollection> # collection of L{AbstractSequence}s
10
11 This will load all sequences in memory. If you are parsing a huge file,
12 then you could efficiently read the file sequence by sequence:
13
14 >>> for seq in parser.read('sequences.fa'):
15 ... # seq is an L{AbstractSequence}
16
17 L{BaseSequenceParser} is the central class in this module, which defines a
18 common infrastructure for all sequence readers. L{SequenceParser} is a standard
19 implementation, and L{PDBSequenceParser} is specialized to read FASTA sequences
20 with PDB headers.
21
22 For parsing alignments, have a look at L{SequenceAlignmentReader} and
23 L{StructureAlignmentFactory}.
24
25 Finally, this module provides a number of L{OutputBuilder}s, which know how to
26 write L{AbstractSequence} and L{AbstractAlignment} objects to FASTA files:
27
28 >>> with open('file.fa', 'w') as out:
29 builder = OutputBuilder.create(AlignmentFormats.FASTA, out)
30 builder.add_alignment(alignment)
31 builder.add_sequence(sequence)
32 ...
33
34 or you could instantiate any of the L{OutputBuilder}s directly.
35 """
36
37 import csb.io
38 import csb.core
39
40 from abc import ABCMeta, abstractmethod
41
42 from csb.bio.sequence import SequenceTypes, SequenceAlphabets, AlignmentFormats, SequenceError
43 from csb.bio.sequence import SequenceAlignment, StructureAlignment, A3MAlignment
44 from csb.bio.sequence import SequenceCollection, AbstractSequence, Sequence, RichSequence, ChainSequence
49
52 """
53 FASTA parser template. Subclasses must implement the way FASTA strings are
54 handled by overriding C{BaseSequenceParser.read_sequence}.
55
56 @param product: sequence product factory (an L{AbstractSequence} subclass)
57 @type product: type
58 @param product_type: default L{SequenceTypes} member for the products
59 @type product_type: L{EnumItem}
60 """
61 __metaclass__ = ABCMeta
62
75
76 @property
78 """
79 Factory used to build sequence products
80 @rtype: class
81 """
82 return self._product
83
84 @property
86 """
87 Default sequence type of the products - a member of L{SequenceTypes}
88 @rtype: enum item
89 """
90 return self._type
91
92 @abstractmethod
94 """
95 Parse a single FASTA string
96
97 @return: a new sequence, created with L{BaseSequenceParser.product_factory}
98 @rtype: L{AbstractSequence}
99
100 @raise SequenceFormatError: on parse error
101 """
102 pass
103
105 """
106 Read FASTA sequences from an (m)FASTA-formatted string
107
108 @param fasta_string: FASTA string to parse
109 @type fasta_string: str
110
111 @return: a list of L{Sequence}s
112 @rtype: L{SequenceCollection}
113
114 @raise SequenceFormatError: on parse error
115 """
116
117 stream = csb.io.MemoryStream()
118 stream.write(fasta_string)
119
120 return self.parse_file(stream)
121
123 """
124 Read FASTA sequences from a (m)FASTA file
125
126 @param fasta_file: input FASTA file name or opened stream
127 @type fasta_file: str, file
128
129 @return: a list of L{Sequence}s
130 @rtype: L{SequenceCollection}
131
132 @raise SequenceFormatError: on parse error
133 """
134 if isinstance(fasta_file, csb.core.string):
135 stream = open(fasta_file)
136 else:
137 stream = fasta_file
138
139 seqs = []
140 reader = csb.io.EntryReader(stream, AbstractSequence.DELIMITER, None)
141
142 for entry in reader.entries():
143 seqs.append(self.read_sequence(entry))
144
145 return SequenceCollection(seqs)
146
147 - def read(self, fasta_file):
148 """
149 Read FASTA sequences from an (m)FASTA file.
150
151 @param fasta_file: input FASTA file name or opened stream
152 @type fasta_file: str, file
153
154 @return: efficient cursor over all L{Sequence}s (parse on demand)
155 @rtype: iterator
156
157 @raise SequenceFormatError: on parse error
158 """
159 if isinstance(fasta_file, csb.core.string):
160 stream = open(fasta_file)
161 else:
162 stream = fasta_file
163
164 reader = csb.io.EntryReader(stream, AbstractSequence.DELIMITER, None)
165
166 for entry in reader.entries():
167 yield self.read_sequence(entry)
168
170 """
171 Standard FASTA parser. See L{BaseSequenceParser} for details.
172 """
173
188
190 """
191 PDB FASTA parser. Reads the PDB ID and sequence type from the header.
192 See L{BaseSequenceParser} for more details.
193 """
194
210
213
214 - def __init__(self, sequences, insertion):
224
249
252
255
257 """
258 Sequence alignment parser.
259
260 @param product_type: default L{SequenceTypes} member for the sequence products
261 @type product_type: L{EnumItem}
262 @param strict: if True, raise exception on duplicate sequence identifiers.
263 See L{csb.bio.sequence.AbstractAlignment} for details
264 @type strict: bool
265 """
266
274
275 @property
277 """
278 Default sequence type of the alignment entries - a member of L{SequenceTypes}
279 @rtype: enum item
280 """
281 return self._type
282
283 @property
285 """
286 True if strict mode is enabled
287 @rtype: bool
288 """
289 return self._strict
290
305
341
343 """
344 Protein structure alignment parser.
345
346 In order to construct the structural alignment, this factory needs a PDB
347 structure provider: an object, whose C{provider.get} method returns a
348 L{csb.bio.structute.Structure} for a given sequence identifier. Sequence
349 identifiers on the other hand need to be split into 'accession number'
350 and 'chain ID'. By default this is done using a standard PDB Entry ID
351 factory, but clients are free to provide custom factories. An C{id_factory}
352 must be a callable, which accepts a single string identifier and returns
353 an EntryID object.
354
355 @param provider: data source for all structures found in the alignment
356 @type provider: L{csb.bio.io.wwpdb.StructureProvider}
357 @param id_factory: callable factory, which transforms a sequence ID into
358 a L{csb.bio.io.wwpdb.EntryID} object. By default
359 this is L{csb.bio.io.wwpdb.EntryID.create}.
360 @type id_factory: callable
361 @param strict: if True, raise exception on duplicate sequence identifiers.
362 See L{csb.bio.sequence.AbstractAlignment} for details
363 @type strict: bool
364 """
365
366 - def __init__(self, provider, id_factory=None, strict=True):
382
383 @property
385 """
386 Default sequence type of the alignment rows - a member of L{SequenceTypes}
387 @rtype: enum item
388 """
389 return self._type
390
391 @property
393 """
394 Current L{csb.bio.io.wwpdb.StructureProvider} instance in use
395 @rtype: L{StructureProvider}
396 """
397 return self._provider
398
399 @property
401 """
402 Current L{csb.bio.io.wwpdb.EntryID} factory instance in use
403 @rtype: L{EntryID}
404 """
405 return self._id_factory
406
407 @property
409 """
410 True if strict mode is enabled
411 @rtype: bool
412 """
413 return self._strict
414
444
445 - def make_entry(self, row, chain):
446 """
447 Build a protein structure alignment entry, given a sequence alignment
448 entry and its corresponding source PDB chain.
449
450 @param row: sequence alignment entry (sequence with gaps)
451 @type row: L{AbstractSequence}, L{SequenceAdapter}
452 @param chain: source PDB chain
453 @type chain: L{csb.bio.structure.Chain}
454
455 @return: gapped chain sequence, containing cloned residues from the
456 source chain (except for the gaps)
457 @rtype: L{ChainSequence}
458 @raise SequenceError: when C{row} is not a proper subsequence of C{chain}
459 """
460 offset = 1
461 residues = []
462 sequence = row.strip().sequence.upper()
463
464 try:
465 start = chain.sequence.index(sequence) + 1
466 except ValueError:
467 raise SequenceError('{0}: not a subsequence of {1}'.format(row.id, chain.entry_id))
468
469 for rinfo in row.residues:
470
471 if rinfo.type == row.alphabet.GAP:
472 residues.append(rinfo)
473 continue
474 else:
475 rank = start + offset - 1
476 assert chain.residues[rank].type == rinfo.type
477 residues.append(chain.residues[rank].clone())
478 offset += 1
479 continue
480
481 return ChainSequence(row.id, row.header, residues, chain.type)
482
485 """
486 Base sequence/alignment string format builder.
487
488 @param output: destination stream, where the product is written.
489 @type output: file
490 @param headers: if False, omit headers
491 @type headers: bool
492
493 @note: File builders are not guaranteed to check the correctness of the
494 product. It is assumed that the client of the builder knows what
495 it is doing.
496 """
497 __metaclass__ = ABCMeta
498 _registry = {}
499
500 - def __init__(self, output, headers=True):
501
502 if not hasattr(output, 'write'):
503 raise TypeError(output)
504
505 self._out = output
506 self._headers = bool(headers)
507
508 @staticmethod
510 """
511 Create an output builder, which knows how to handle the specified
512 sequence/alignment C{format}. Additional arguments are passed to the
513 builder's constructor.
514
515 @param format: L{AlignmentFormats} member
516 @type format: L{EnumItem}
517 @rtype: L{OutputBuilder}
518 """
519 if format not in OutputBuilder._registry:
520 raise ValueError('Unhandled format: {0}'.format(format))
521
522 klass = OutputBuilder._registry[format]
523 return klass(*a, **k)
524
525 @staticmethod
527 """
528 Register a new output builder.
529
530 @param format: L{AlignmentFormats} member
531 @type format: L{EnumItem}
532 @param klass: builder class (L{OutputBuilder} sub-class)
533 @type klass: type
534 """
535 assert format not in OutputBuilder._registry
536 assert issubclass(klass, OutputBuilder)
537
538 OutputBuilder._registry[format] = klass
539
540 @property
542 """
543 Destination stream
544 @rtype: stream
545 """
546 return self._out
547
548 @property
550 """
551 True if sequence headers will be written to the destination
552 @rtype: bool
553 """
554 return self._headers
555
557 """
558 Write a chunk of C{text} to the output stream.
559 """
560 self._out.write(text)
561
563 """
564 Write a chunk of C{text}, followed by a newline terminator.
565 """
566 self._out.write(text)
567 self._out.write('\n')
568
569 @abstractmethod
571 """
572 Format and append a new sequence to the product.
573 @type sequence: L{AbstractSequence}
574 """
575 pass
576
578 """
579 Format and append a collection of L{AbstractSequence}s to the product.
580 @type sequences: iterable of L{AbstractSequence}s
581 """
582 for s in sequences:
583 self.add_sequence(s)
584
585 @abstractmethod
587 """
588 Format and append an alignment to the product.
589 @type alignment: L{AbstractAlignment}
590 """
591 pass
592
594 """
595 Append a sequence separator to the product.
596 """
597 self.writeline(separator)
598
615
636
638 """
639 Formats sequences as A3M strings. When appending an alignment, this builder
640 will write all insertion-containing columns in lower case. Also, gap symbols
641 are omitted if the respective columns contain insertions.
642
643 See L{OutputBuilder}.
644 """
645 FORMAT = AlignmentFormats.A3M
646
654
661
685
709
711 """
712 Formats sequences as PIR FASTA strings, recognized by Modeller.
713 See L{OutputBuilder} for general alignment documentation.
714 """
715 FORMAT = AlignmentFormats.PIR
716
719
727
728
731
734
735 - def _add(self, sequence, template=True):
761
762
763
764 for klass in OutputBuilder.__subclasses__():
765 OutputBuilder.register(klass.FORMAT, klass)
766