| Home | Trees | Indices | Help | 
 | 
|---|
|  | 
  1  """ 
  2  FASTA format sequence I/O. 
  3   
  4  This module provides parsers and writers for sequences and alignments in 
  5  FASTA format. The most basic usage is: 
  6   
  7      >>> parser = SequenceParser() 
  8      >>> parser.parse_file('sequences.fa') 
  9      <SequenceCollection>   # collection of L{AbstractSequence}s 
 10   
 11  This will load all sequences in memory. If you are parsing a huge file, 
 12  then you could efficiently read the file sequence by sequence: 
 13   
 14      >>> for seq in parser.read('sequences.fa'): 
 15              ...            # seq is an L{AbstractSequence} 
 16               
 17  L{BaseSequenceParser} is the central class in this module, which defines a 
 18  common infrastructure for all sequence readers. L{SequenceParser} is a standard 
 19  implementation, and L{PDBSequenceParser} is specialized to read FASTA sequences 
 20  with PDB headers. 
 21   
 22  For parsing alignments, have a look at L{SequenceAlignmentReader} and 
 23  L{StructureAlignmentFactory}. 
 24   
 25  Finally, this module provides a number of L{OutputBuilder}s, which know how to 
 26  write L{AbstractSequence} and L{AbstractAlignment} objects to FASTA files: 
 27   
 28      >>> with open('file.fa', 'w') as out: 
 29              builder = OutputBuilder.create(AlignmentFormats.FASTA, out) 
 30              builder.add_alignment(alignment) 
 31              builder.add_sequence(sequence) 
 32              ... 
 33   
 34  or you could instantiate any of the L{OutputBuilder}s directly. 
 35  """ 
 36   
 37  import csb.io 
 38  import csb.core 
 39   
 40  from abc import ABCMeta, abstractmethod 
 41   
 42  from csb.bio.sequence import SequenceTypes, SequenceAlphabets, AlignmentFormats, SequenceError 
 43  from csb.bio.sequence import SequenceAlignment, StructureAlignment, A3MAlignment 
 44  from csb.bio.sequence import SequenceCollection, AbstractSequence, Sequence, RichSequence, ChainSequence 
 49   
 52      """ 
 53      FASTA parser template. Subclasses must implement the way FASTA strings are 
 54      handled by overriding C{BaseSequenceParser.read_sequence}. 
 55       
 56      @param product: sequence product factory (an L{AbstractSequence} subclass) 
 57      @type product: type 
 58      @param product_type: default L{SequenceTypes} member for the products 
 59      @type product_type: L{EnumItem} 
 60      """ 
 61      __metaclass__ = ABCMeta 
 62   
 64           
 65          self._product = None 
 66           
 67          if not issubclass(product, AbstractSequence): 
 68              raise TypeError(product) 
 69           
 70          if not product_type.enum is SequenceTypes: 
 71              raise TypeError(product_type) 
 72           
 73          self._product = product 
 74          self._type = product_type 
 75           
 76      @property 
 83       
 84      @property 
 86          """ 
 87          Default sequence type of the products - a member of L{SequenceTypes} 
 88          @rtype: enum item 
 89          """ 
 90          return self._type 
 91           
 92      @abstractmethod 
 94          """ 
 95          Parse a single FASTA string 
 96           
 97          @return: a new sequence, created with L{BaseSequenceParser.product_factory} 
 98          @rtype: L{AbstractSequence} 
 99           
100          @raise SequenceFormatError: on parse error 
101          """ 
102          pass 
103   
105          """ 
106          Read FASTA sequences from an (m)FASTA-formatted string 
107   
108          @param fasta_string: FASTA string to parse 
109          @type fasta_string: str 
110   
111          @return: a list of L{Sequence}s 
112          @rtype: L{SequenceCollection} 
113           
114          @raise SequenceFormatError: on parse error         
115          """ 
116   
117          stream = csb.io.MemoryStream() 
118          stream.write(fasta_string) 
119   
120          return self.parse_file(stream) 
121   
123          """ 
124          Read FASTA sequences from a (m)FASTA file 
125   
126          @param fasta_file: input FASTA file name or opened stream 
127          @type fasta_file: str, file 
128   
129          @return: a list of L{Sequence}s 
130          @rtype: L{SequenceCollection} 
131           
132          @raise SequenceFormatError: on parse error         
133          """ 
134          if isinstance(fasta_file, csb.core.string): 
135              stream = open(fasta_file) 
136          else: 
137              stream = fasta_file 
138   
139          seqs = [] 
140          reader = csb.io.EntryReader(stream, AbstractSequence.DELIMITER, None) 
141   
142          for entry in reader.entries(): 
143              seqs.append(self.read_sequence(entry)) 
144   
145          return SequenceCollection(seqs) 
146   
148          """ 
149          Read FASTA sequences from an (m)FASTA file. 
150   
151          @param fasta_file: input FASTA file name or opened stream 
152          @type fasta_file: str, file 
153   
154          @return: efficient cursor over all L{Sequence}s (parse on demand) 
155          @rtype: iterator 
156           
157          @raise SequenceFormatError: on parse error         
158          """ 
159          if isinstance(fasta_file, csb.core.string): 
160              stream = open(fasta_file) 
161          else: 
162              stream = fasta_file 
163   
164          reader = csb.io.EntryReader(stream, AbstractSequence.DELIMITER, None) 
165   
166          for entry in reader.entries(): 
167              yield self.read_sequence(entry) 
168       
170      """ 
171      Standard FASTA parser. See L{BaseSequenceParser} for details. 
172      """ 
173       
175   
176          lines = string.strip().splitlines() 
177           
178          if not lines[0].startswith(AbstractSequence.DELIMITER): 
179              lines = [''] + lines 
180          if len(lines) < 2: 
181              raise SequenceFormatError('Empty FASTA entry') 
182   
183          header = lines[0] 
184          id = header[1:].split()[0] 
185          sequence = ''.join(lines[1:]) 
186   
187          return self.product_factory(id, header, sequence, self.product_type) 
188       
190      """ 
191      PDB FASTA parser. Reads the PDB ID and sequence type from the header. 
192      See L{BaseSequenceParser} for more details. 
193      """ 
194           
196   
197          seq = super(PDBSequenceParser, self).read_sequence(string) 
198   
199          if not (seq.header and seq.id) or not (len(seq.id) in(5, 6) and seq.header.find('mol:') != -1): 
200              raise SequenceFormatError('Does not look like a PDB header: {0}'.format(seq.header)) 
201   
202          seq.id = seq.id.replace('_', '') 
203          stype = seq.header.partition('mol:')[2].partition(' ')[0] 
204          try: 
205              seq.type = csb.core.Enum.parse(SequenceTypes, stype) 
206          except csb.core.EnumValueError: 
207              seq.type = SequenceTypes.Unknown 
208   
209          return seq 
210       
213       
215           
216          self._temp = [] 
217          self._insertion = insertion 
218           
219          for sequence in sequences: 
220               
221              row = list(sequence.sequence) 
222              row.reverse() 
223              self._temp.append(row)   
224       
226           
227          sequences = self._temp 
228   
229          column = [ ] 
230          has_insertion = False 
231   
232          for sequence in sequences: 
233              if len(sequence) > 0 and sequence[-1].islower(): 
234                  has_insertion = True 
235   
236          for sequence in sequences: 
237              try: 
238                  if has_insertion and not sequence[-1].islower(): 
239                      column.append(self._insertion) 
240                  else: 
241                      column.append(sequence.pop()) 
242              except IndexError: 
243                  column.append('') 
244   
245          if not any(column): 
246              raise StopIteration() 
247   
248          return column   
249       
252       
254          return self.next()     
255       
257      """ 
258      Sequence alignment parser. 
259       
260      @param product_type: default L{SequenceTypes} member for the sequence products 
261      @type product_type: L{EnumItem} 
262      @param strict: if True, raise exception on duplicate sequence identifiers. 
263                     See L{csb.bio.sequence.AbstractAlignment} for details 
264      @type strict: bool     
265      """ 
266       
268           
269          if not product_type.enum is SequenceTypes: 
270              raise TypeError(product_type) 
271           
272          self._type = product_type 
273          self._strict = bool(strict) 
274   
275      @property 
277          """ 
278          Default sequence type of the alignment entries - a member of L{SequenceTypes} 
279          @rtype: enum item 
280          """         
281          return self._type 
282   
283      @property 
290               
292          """ 
293          Parse an alignment in multi-FASTA format. 
294           
295          @param string: alignment string 
296          @type string: str 
297           
298          @rtype: L{SequenceAlignment} 
299          """ 
300           
301          parser = SequenceParser(RichSequence, self.product_type) 
302          sequences = parser.parse_string(string) 
303           
304          return SequenceAlignment(sequences, strict=self.strict)  
305       
307          """ 
308          Parse an alignment in A3M format. 
309           
310          @param string: alignment string 
311          @type string: str 
312           
313          @rtype: L{A3MAlignment} 
314          """ 
315          alphabet = SequenceAlphabets.get(self.product_type) 
316   
317          # parse all "mis-aligned" sequences as case-sensitive strings         
318          parser = SequenceParser(Sequence, self.product_type) 
319          sequences = parser.parse_string(string) 
320           
321          # storage for expanded sequences 
322          s = [] 
323           
324          for dummy in sequences: 
325              s.append([]) 
326                
327          # expand all sequences with insertion characters and make them equal length 
328          for column in A3MSequenceIterator(sequences, str(alphabet.INSERTION)): 
329              for sn, char in enumerate(column): 
330                  s[sn].append(char) 
331           
332          # build normal sequence objects from the equalized sequence strings 
333          aligned_seqs = [] 
334           
335          for sn, seq in enumerate(sequences): 
336           
337              sequence = RichSequence(seq.id, seq.header, s[sn], self.product_type) 
338              aligned_seqs.append(sequence) 
339               
340          return A3MAlignment(aligned_seqs, strict=self.strict) 
341       
343      """ 
344      Protein structure alignment parser. 
345       
346      In order to construct the structural alignment, this factory needs a PDB 
347      structure provider: an object, whose C{provider.get} method returns a  
348      L{csb.bio.structute.Structure} for a given sequence identifier. Sequence 
349      identifiers on the other hand need to be split into 'accession number' 
350      and 'chain ID'. By default this is done using a standard PDB Entry ID 
351      factory, but clients are free to provide custom factories. An C{id_factory} 
352      must be a callable, which accepts a single string identifier and returns 
353      an EntryID object. 
354       
355      @param provider: data source for all structures found in the alignment 
356      @type provider: L{csb.bio.io.wwpdb.StructureProvider} 
357      @param id_factory: callable factory, which transforms a sequence ID into 
358                         a L{csb.bio.io.wwpdb.EntryID} object. By default 
359                         this is L{csb.bio.io.wwpdb.EntryID.create}. 
360      @type id_factory: callable 
361      @param strict: if True, raise exception on duplicate sequence identifiers. 
362                     See L{csb.bio.sequence.AbstractAlignment} for details 
363      @type strict: bool     
364      """ 
365       
367   
368          from csb.bio.io.wwpdb import EntryID 
369           
370          if id_factory is None: 
371              id_factory = EntryID.create 
372          if not hasattr(id_factory, '__call__'): 
373              raise TypeError(id_factory) 
374           
375          if not hasattr(provider, 'get'): 
376              raise TypeError(provider)         
377                   
378          self._type = SequenceTypes.Protein 
379          self._strict = bool(strict) 
380          self._provider = provider 
381          self._id_factory = id_factory 
382   
383      @property 
385          """ 
386          Default sequence type of the alignment rows - a member of L{SequenceTypes} 
387          @rtype: enum item 
388          """         
389          return self._type 
390       
391      @property 
393          """ 
394          Current L{csb.bio.io.wwpdb.StructureProvider} instance in use 
395          @rtype: L{StructureProvider} 
396          """ 
397          return self._provider   
398       
399      @property 
401          """ 
402          Current L{csb.bio.io.wwpdb.EntryID} factory instance in use 
403          @rtype: L{EntryID}         
404          """         
405          return self._id_factory 
406   
407      @property 
414               
416          """ 
417          Build a protein structure alignment given a multi-FASTA string 
418          and the current structure C{provider}. 
419           
420          @param string: alignment string 
421          @type string: str 
422           
423          @rtype: L{SequenceAlignment} 
424           
425          @raise SequenceError: when an aligned sequence is not a proper 
426                                subsequence of its respective source PDB chain 
427          @raise StructureNotFoundError: if C{provider} can't provide a structure 
428                                         for a given sequence ID 
429          @raise InvalidEntryIDError: if a given sequence ID cannot be parsed         
430          """ 
431   
432          entries = [] 
433          parser = SequenceParser(Sequence, self.product_type) 
434          sequences = parser.parse_string(string) 
435           
436          for row in sequences: 
437              id = self.id_factory(row.id) 
438              chain = self.provider.get(id.accession).chains[id.chain] 
439               
440              entry = self.make_entry(row, chain) 
441              entries.append(entry) 
442               
443          return StructureAlignment(entries, strict=self.strict) 
444       
446          """ 
447          Build a protein structure alignment entry, given a sequence alignment 
448          entry and its corresponding source PDB chain. 
449           
450          @param row: sequence alignment entry (sequence with gaps) 
451          @type row: L{AbstractSequence}, L{SequenceAdapter} 
452          @param chain: source PDB chain 
453          @type chain: L{csb.bio.structure.Chain} 
454           
455          @return: gapped chain sequence, containing cloned residues from the 
456                   source chain (except for the gaps) 
457          @rtype: L{ChainSequence} 
458          @raise SequenceError: when C{row} is not a proper subsequence of C{chain}         
459          """ 
460          offset = 1         
461          residues = [] 
462          sequence = row.strip().sequence.upper() 
463           
464          try: 
465              start = chain.sequence.index(sequence) + 1 
466          except ValueError: 
467              raise SequenceError('{0}: not a subsequence of {1}'.format(row.id, chain.entry_id)) 
468           
469          for rinfo in row.residues: 
470               
471              if rinfo.type == row.alphabet.GAP: 
472                  residues.append(rinfo) 
473                  continue 
474              else: 
475                  rank = start + offset - 1 
476                  assert chain.residues[rank].type == rinfo.type 
477                  residues.append(chain.residues[rank].clone()) 
478                  offset += 1 
479                  continue 
480               
481          return ChainSequence(row.id, row.header, residues, chain.type) 
482           
485      """ 
486      Base sequence/alignment string format builder. 
487       
488      @param output: destination stream, where the product is written. 
489      @type output: file 
490      @param headers: if False, omit headers 
491      @type headers: bool 
492       
493      @note: File builders are not guaranteed to check the correctness of the 
494             product. It is assumed that the client of the builder knows what 
495             it is doing.    
496      """ 
497      __metaclass__ = ABCMeta 
498      _registry = {}     
499           
501   
502          if not hasattr(output, 'write'): 
503              raise TypeError(output)      
504   
505          self._out = output 
506          self._headers = bool(headers) 
507           
508      @staticmethod 
510          """ 
511          Create an output builder, which knows how to handle the specified 
512          sequence/alignment C{format}. Additional arguments are passed to the 
513          builder's constructor. 
514           
515          @param format: L{AlignmentFormats} member 
516          @type format: L{EnumItem} 
517          @rtype: L{OutputBuilder} 
518          """ 
519          if format not in OutputBuilder._registry: 
520              raise ValueError('Unhandled format: {0}'.format(format)) 
521           
522          klass = OutputBuilder._registry[format] 
523          return klass(*a, **k) 
524   
525      @staticmethod 
527          """ 
528          Register a new output builder. 
529           
530          @param format: L{AlignmentFormats} member 
531          @type format: L{EnumItem} 
532          @param klass: builder class (L{OutputBuilder} sub-class) 
533          @type klass: type 
534          """     
535          assert format not in OutputBuilder._registry 
536          assert issubclass(klass, OutputBuilder) 
537           
538          OutputBuilder._registry[format] = klass 
539   
540      @property 
547       
548      @property 
550          """ 
551          True if sequence headers will be written to the destination 
552          @rtype: bool 
553          """ 
554          return self._headers 
555   
561   
563          """ 
564          Write a chunk of C{text}, followed by a newline terminator. 
565          """ 
566          self._out.write(text) 
567          self._out.write('\n')             
568       
569      @abstractmethod 
571          """ 
572          Format and append a new sequence to the product. 
573          @type sequence: L{AbstractSequence}     
574          """ 
575          pass 
576       
578          """ 
579          Format and append a collection of L{AbstractSequence}s to the product. 
580          @type sequences: iterable of L{AbstractSequence}s 
581          """ 
582          for s in sequences: 
583              self.add_sequence(s) 
584               
585      @abstractmethod             
587          """ 
588          Format and append an alignment to the product. 
589          @type alignment: L{AbstractAlignment}  
590          """ 
591          pass 
592       
598   
600          """ 
601          Append a comment line to the product. 
602           
603          @param text: comment text 
604          @type text: str 
605          @param comment: comment prefix 
606          @type comment: str 
607          @param length: maximal comment length 
608          @type length: int 
609          """ 
610          for i in range(0, len(text), length): 
611              self.write(comment) 
612              self.write(' ')             
613              self.write(text[i : i + length]) 
614              self.write('\n') 
615               
617      """ 
618      Formats sequences as standard FASTA strings. See L{OutputBuilder}. 
619      """ 
620      FORMAT = AlignmentFormats.FASTA 
621       
623           
624          if self.headers: 
625              self.write(AbstractSequence.DELIMITER) 
626              self.writeline(sequence.header) 
627   
628          insertion = str(sequence.alphabet.INSERTION) 
629          gap = str(sequence.alphabet.GAP) 
630           
631          self.writeline(sequence.sequence.replace(insertion, gap)) 
632           
636           
638      """ 
639      Formats sequences as A3M strings. When appending an alignment, this builder 
640      will write all insertion-containing columns in lower case. Also, gap symbols 
641      are omitted if the respective columns contain insertions.  
642       
643      See L{OutputBuilder}. 
644      """ 
645      FORMAT = AlignmentFormats.A3M 
646       
648           
649          if self.headers: 
650              self.write(AbstractSequence.DELIMITER) 
651              self.writeline(sequence.header) 
652   
653          self.writeline(sequence.sequence) 
654           
656   
657          if isinstance(alignment, A3MAlignment): 
658              self._add_a3m(alignment) 
659          else: 
660              self._add_proper(alignment) 
661               
663           
664          for s in alignment.rows: 
665               
666              if self.headers: 
667                  self.write(AbstractSequence.DELIMITER) 
668                  self.writeline(s.header) 
669                   
670              sequence = [] 
671               
672              for ci in s.columns: 
673                   
674                  if ci.residue.type != s.alphabet.INSERTION: 
675                      char = str(ci.residue.type) 
676                       
677                      if alignment.insertion_at(ci.column): 
678                          sequence.append(char.lower()) 
679                      else: 
680                          sequence.append(char) 
681                  else: 
682                      continue 
683                           
684              self.writeline(''.join(sequence)) 
685               
687           
688          for s in alignment.rows: 
689               
690              if self.headers: 
691                  self.write(AbstractSequence.DELIMITER) 
692                  self.writeline(s.header) 
693                   
694              master = alignment.rows[1] 
695              sequence = [] 
696               
697              for ci in s.columns: 
698                  char = str(ci.residue.type) 
699                   
700                  if master.columns[ci.column].residue.type == master.alphabet.GAP: 
701                      if ci.residue.type == s.alphabet.GAP: 
702                          continue 
703                      else: 
704                          sequence.append(char.lower()) 
705                  else: 
706                      sequence.append(char) 
707                           
708              self.writeline(''.join(sequence))             
709               
711      """ 
712      Formats sequences as PIR FASTA strings, recognized by Modeller. 
713      See L{OutputBuilder} for general alignment documentation. 
714      """ 
715      FORMAT = AlignmentFormats.PIR 
716           
718          self._add(sequence, template=True) 
719           
721           
722          for n, sequence in enumerate(alignment.rows, start=1): 
723              if n == 1: 
724                  self.add_target(sequence) 
725              else: 
726                  self.add_template(sequence) 
727               
728           
730          self._add(sequence, template=False) 
731       
733          self._add(sequence, template=True) 
734       
736                       
737          if self.headers: 
738               
739              if template: 
740                  type = 'structure' 
741              else: 
742                  type = 'sequence' 
743               
744              id = sequence.id 
745              start = end = '.' 
746   
747              if hasattr(sequence, 'start') and hasattr(sequence, 'end'): 
748                  start = sequence.start 
749                  end = sequence.end 
750                               
751              header = 'P1;{0}\n{2}:{0}:{3}:{1}:{4}:{1}::::'.format(id[:-1], id[-1], type, start, end) 
752                               
753              self.write(AbstractSequence.DELIMITER) 
754              self.writeline(header) 
755   
756          insertion = str(sequence.alphabet.INSERTION) 
757          gap = str(sequence.alphabet.GAP) 
758           
759          self.write(sequence.sequence.replace(insertion, gap)) 
760          self.writeline('*') 
761   
762   
763  # register builders 
764  for klass in OutputBuilder.__subclasses__(): 
765      OutputBuilder.register(klass.FORMAT, klass) 
766   
| Home | Trees | Indices | Help | 
 | 
|---|
| Generated by Epydoc 3.0.1 on Tue Jul 4 20:19:12 2017 | http://epydoc.sourceforge.net |