1 """
2 PDB structure parsers, format builders and database providers.
3
4 The most basic usage is:
5
6 >>> parser = StructureParser('structure.pdb')
7 >>> parser.parse_structure()
8 <Structure> # a Structure object (model)
9
10 or if this is an NMR ensemble:
11
12 >>> parser.parse_models()
13 <Ensemble> # an Ensemble object (collection of alternative Structure-s)
14
15 This module introduces a family of PDB file parsers. The common interface of all
16 parsers is defined in L{AbstractStructureParser}. This class has several
17 implementations:
18
19 - L{RegularStructureParser} - handles normal PDB files with SEQRES fields
20 - L{LegacyStructureParser} - reads structures from legacy or malformed PDB files,
21 which are lacking SEQRES records (initializes all residues from the ATOMs instead)
22 - L{PDBHeaderParser} - reads only the headers of the PDB files and produces structures
23 without coordinates. Useful for reading metadata (e.g. accession numbers or just
24 plain SEQRES sequences) with minimum overhead
25
26 Unless you have a special reason, you should use the L{StructureParser} factory,
27 which returns a proper L{AbstractStructureParser} implementation, depending on the
28 input PDB file. If the input file looks like a regular PDB file, the factory
29 returns a L{RegularStructureParser}, otherwise it instantiates L{LegacyStructureParser}.
30 L{StructureParser} is in fact an alias for L{AbstractStructureParser.create_parser}.
31
32 Writing your own, customized PDB parser is easy. Suppose that you are trying to
33 parse a PDB-like file which misuses the charge column to store custom info. This
34 will certainly crash L{RegularStructureParser} (for good), but you can create your
35 own parser as a workaround. All you need to to is to override the virtual
36 C{_read_charge} hook method::
37
38 class CustomParser(RegularStructureParser):
39
40 def _read_charge(self, line):
41 try:
42 return super(CustomParser, self)._read_charge(line)
43 except StructureFormatError:
44 return None
45
46 Another important abstraction in this module is L{StructureProvider}. It has several
47 implementations which can be used to retrieve PDB L{Structure}s from various sources:
48 file system directories, remote URLs, etc. You can easily create your own provider as
49 well. See L{StructureProvider} for details.
50
51 Finally, this module gives you some L{FileBuilder}s, used for text serialization
52 of L{Structure}s and L{Ensemble}s:
53
54 >>> builder = PDBFileBuilder(stream)
55 >>> builder.add_header(structure)
56 >>> builder.add_structure(structure)
57
58 where stream is any Python stream, e.g. an open file or sys.stdout.
59
60 See L{Ensemble} and L{Structure} from L{csb.bio.structure} for details on these
61 objects.
62 """
63
64 import re
65 import os
66 import numpy
67 import datetime
68 import multiprocessing
69
70 import csb.bio.structure
71 import csb.bio.sequence
72 import csb.bio.sequence.alignment as alignment
73 import csb.core
74 import csb.io
75
76 from abc import ABCMeta, abstractmethod
77 from csb.bio.sequence import SequenceTypes, SequenceAlphabets
78 from csb.bio.structure import ChemElements, SecStructures
79
80
81 PDB_AMINOACIDS = {
82 'PAQ': 'TYR', 'AGM': 'ARG', 'ILE': 'ILE', 'PR3': 'CYS', 'GLN': 'GLN',
83 'DVA': 'VAL', 'CCS': 'CYS', 'ACL': 'ARG', 'GLX': 'GLX', 'GLY': 'GLY',
84 'GLZ': 'GLY', 'DTH': 'THR', 'OAS': 'SER', 'C6C': 'CYS', 'NEM': 'HIS',
85 'DLY': 'LYS', 'MIS': 'SER', 'SMC': 'CYS', 'GLU': 'GLU', 'NEP': 'HIS',
86 'BCS': 'CYS', 'ASQ': 'ASP', 'ASP': 'ASP', 'SCY': 'CYS', 'SER': 'SER',
87 'LYS': 'LYS', 'SAC': 'SER', 'PRO': 'PRO', 'ASX': 'ASX', 'DGN': 'GLN',
88 'DGL': 'GLU', 'MHS': 'HIS', 'ASB': 'ASP', 'ASA': 'ASP', 'NLE': 'LEU',
89 'DCY': 'CYS', 'ASK': 'ASP', 'GGL': 'GLU', 'STY': 'TYR', 'SEL': 'SER',
90 'CGU': 'GLU', 'ASN': 'ASN', 'ASL': 'ASP', 'LTR': 'TRP', 'DAR': 'ARG',
91 'VAL': 'VAL', 'CHG': 'ALA', 'TPO': 'THR', 'CLE': 'LEU', 'GMA': 'GLU',
92 'HAC': 'ALA', 'AYA': 'ALA', 'THR': 'THR', 'TIH': 'ALA', 'SVA': 'SER',
93 'MVA': 'VAL', 'SAR': 'GLY', 'LYZ': 'LYS', 'BNN': 'ALA', '5HP': 'GLU',
94 'IIL': 'ILE', 'SHR': 'LYS', 'HAR': 'ARG', 'FME': 'MET', 'ALO': 'THR',
95 'PHI': 'PHE', 'ALM': 'ALA', 'PHL': 'PHE', 'MEN': 'ASN', 'TPQ': 'ALA',
96 'GSC': 'GLY', 'PHE': 'PHE', 'ALA': 'ALA', 'MAA': 'ALA', 'MET': 'MET',
97 'UNK': 'UNK', 'LEU': 'LEU', 'ALY': 'LYS', 'SET': 'SER', 'GL3': 'GLY',
98 'TRG': 'LYS', 'CXM': 'MET', 'TYR': 'TYR', 'SCS': 'CYS', 'DIL': 'ILE',
99 'TYQ': 'TYR', '3AH': 'HIS', 'DPR': 'PRO', 'PRR': 'ALA', 'CME': 'CYS',
100 'IYR': 'TYR', 'CY1': 'CYS', 'TYY': 'TYR', 'HYP': 'PRO', 'DTY': 'TYR',
101 '2AS': 'ASP', 'DTR': 'TRP', 'FLA': 'ALA', 'DPN': 'PHE', 'DIV': 'VAL',
102 'PCA': 'GLU', 'MSE': 'MET', 'MSA': 'GLY', 'AIB': 'ALA', 'CYS': 'CYS',
103 'NLP': 'LEU', 'CYQ': 'CYS', 'HIS': 'HIS', 'DLE': 'LEU', 'CEA': 'CYS',
104 'DAL': 'ALA', 'LLP': 'LYS', 'DAH': 'PHE', 'HMR': 'ARG', 'TRO': 'TRP',
105 'HIC': 'HIS', 'CYG': 'CYS', 'BMT': 'THR', 'DAS': 'ASP', 'TYB': 'TYR',
106 'BUC': 'CYS', 'PEC': 'CYS', 'BUG': 'LEU', 'CYM': 'CYS', 'NLN': 'LEU',
107 'CY3': 'CYS', 'HIP': 'HIS', 'CSO': 'CYS', 'TPL': 'TRP', 'LYM': 'LYS',
108 'DHI': 'HIS', 'MLE': 'LEU', 'CSD': 'ALA', 'HPQ': 'PHE', 'MPQ': 'GLY',
109 'LLY': 'LYS', 'DHA': 'ALA', 'DSN': 'SER', 'SOC': 'CYS', 'CSX': 'CYS',
110 'OMT': 'MET', 'DSP': 'ASP', 'PTR': 'TYR', 'TRP': 'TRP', 'CSW': 'CYS',
111 'EFC': 'CYS', 'CSP': 'CYS', 'CSS': 'CYS', 'SCH': 'CYS', 'OCS': 'CYS',
112 'NMC': 'GLY', 'SEP': 'SER', 'BHD': 'ASP', 'KCX': 'LYS', 'SHC': 'CYS',
113 'C5C': 'CYS', 'HTR': 'TRP', 'ARG': 'ARG', 'TYS': 'TYR', 'ARM': 'ARG',
114 'DNP': 'ALA'
115 }
116 """
117 Dictionary of non-standard amino acids, which could be found in PDB.
118 """
119
120
121 PDB_NUCLEOTIDES = {
122 'DA': 'Adenine', 'DG': 'Guanine', 'DC': 'Cytosine', 'DT': 'Thymine',
123 'A': 'Adenine', 'G': 'Guanine', 'C': 'Cytosine', 'T': 'Thymine',
124 'U': 'Uracil', 'DOC': 'Cytosine', 'R': 'Purine', 'Y': 'Pyrimidine',
125 'K': 'Ketone', ' M': 'Amino', 'S': 'Strong', 'W': 'Weak',
126 'B': 'NotA', 'D' : 'NotC', 'H': 'NotG', 'V': 'NotT',
127 'N': 'Any', 'X' : 'Masked'
128 }
129 """
130 Dictionary of non-standard nucleotides, which could be found in PDB.
131 """
135
138
141
144
147
150
151 -class InvalidEntryIDError(StructureFormatError):
153
156
157
158 -class EntryID(object):
159 """
160 Represents a PDB Chain identifier. Implementing classes must define
161 how the original ID is split into accession number and chain ID.
162
163 @param id: identifier
164 @type id: str
165 """
166 __metaclass__ = ABCMeta
167
168 - def __init__(self, id):
169
170 self._accession = ''
171 self._chain = ''
172
173 id = id.strip()
174 self._accession, self._chain = self.parse(id)
175
176 @staticmethod
178 """
179 Guess the format of C{id} and parse it.
180
181 @return: a new PDB ID of the appropriate type
182 @rtype: L{EntryID}
183 """
184
185 if len(id) in (4, 5):
186 return StandardID(id)
187 elif len(id) == 6 and id[4] == '_':
188 return SeqResID(id)
189 else:
190 return DegenerateID(id)
191
192 @abstractmethod
193 - def parse(self, id):
194 """
195 Split C{id} into accession number and chain ID.
196
197 @param id: PDB identifier
198 @type id: str
199 @return: (accession, chain)
200 @rtype: tuple of str
201
202 @raise InvalidEntryIDError: when C{id} is in an inappropriate format
203 """
204 pass
205
207 """
208 @return: the identifier in its original format
209 @rtype: str
210 """
211 return self.entry_id
212
213 @property
214 - def accession(self):
215 """
216 Accession number part of the Entry ID
217 @rtype: str
218 """
219 return self._accession
220
221 @property
223 """
224 Chain ID part of the Entry ID
225 @rtype: str
226 """
227 return self._chain
228
229 @property
230 - def entry_id(self):
231 """
232 Accession number + Chain ID
233 @rtype: str
234 """
235 return "{0.accession}{0.chain}".format(self)
236
239
241 """
242 Standard PDB ID in the following form: xxxxY, where xxxx is the accession
243 number (lower case) and Y is an optional chain identifier.
244 """
251
253 """
254 Looks like a L{StandardID}, except that the accession number may have
255 arbitrary length.
256 """
263
265 """
266 Same as a L{StandardID}, but contains an additional underscore between
267 te accession number and the chain identifier.
268 """
275
278
281 """
282 A base PDB structure format-aware parser. Subclasses must implement the
283 internal abstract method C{_parse_header} in order to complete the
284 implementation.
285
286 @param structure_file: the input PD file to parse
287 @type structure_file: str
288 @param check_ss: if True, secondary structure errors in the file will cause
289 L{SecStructureFormatError} exceptions
290 @type check_ss: bool
291 @param mapper: residue mapper, used to align ATOM records to SEQRES.
292 If None, use the default (L{CombinedResidueMapper})
293 @type mapper: L{AbstractResidueMapper}
294
295 @raise IOError: when the input file cannot be found
296 """
297
298 __metaclass__ = ABCMeta
299
300 @staticmethod
302 """
303 A StructureParser factory, which instantiates and returns the proper parser
304 object based on the contents of the PDB file.
305
306 If the file contains a SEQRES section, L{RegularStructureParser} is returned,
307 otherwise L{LegacyStructureParser} is instantiated. In the latter case
308 LegacyStructureParser will read the sequence data directly from the ATOMs.
309
310 @param structure_file: the PDB file to parse
311 @type structure_file: str
312 @param check_ss: if True, secondary structure errors in the file will cause
313 L{SecStructureFormatError} exceptions
314 @type check_ss: bool
315 @param mapper: residue mapper, used to align ATOM records to SEQRES.
316 If None, use the default (L{CombinedResidueMapper})
317 @type mapper: L{AbstractResidueMapper}
318
319 @rtype: L{AbstractStructureParser}
320 """
321 has_seqres = False
322
323 for line in open(structure_file):
324 if line.startswith('SEQRES'):
325 has_seqres = True
326 break
327
328 if has_seqres:
329 return RegularStructureParser(structure_file, check_ss, mapper)
330 else:
331 return LegacyStructureParser(structure_file, check_ss, mapper)
332
333 - def __init__(self, structure_file, check_ss=False, mapper=None):
343
345 try:
346 self._stream.close()
347 except:
348 pass
349
350 @property
352 """
353 Current residue mapping strategy
354 @rtype: L{AbstractResidueMapper}
355 """
356 return self._mapper
357 @mapper.setter
362
363 @property
365 """
366 Current input PDB file name
367 @rtype: str
368 """
369 return self._file
370 @filename.setter
372 try:
373 stream = open(name)
374 except IOError:
375 raise IOError('File not found: {0}'.format(name))
376
377 if self._stream:
378 try:
379 self._stream.close()
380 except:
381 pass
382 self._stream = stream
383 self._file = name
384
386 """
387 Find all available model identifiers in the structure.
388
389 @return: a list of model IDs
390 @rtype: list
391 """
392 models = []
393 check = set()
394
395 with open(self._file, 'r') as f:
396 for line in f:
397 if line.startswith('MODEL'):
398 model_id = int(line[10:14])
399 if model_id in check:
400 raise StructureFormatError('Duplicate model identifier: {0}'.format(model_id))
401 models.append(model_id)
402 check.add(model_id)
403
404 if len(models) > 0:
405 if not(min(check) == 1 and max(check) == len(models)):
406 raise StructureFormatError('Non-consecutive model identifier(s) encountered')
407 return models
408 else:
409 return []
410
412 """
413 Try to guess what is the sequence type of a chunk of PDB
414 C{residue_label}s. The list of labels is probed starting from the middle
415 first, because PDB chains often contain modified / unknown residues at
416 the termini. If none of the probed residues can be used to determine
417 chain's type, just give up and return L{SequenceTypes.Unknown}.
418
419 @param residue_labels: an iterable of PDB residue labels
420 @type residue_labels: iterable
421
422 @return: a L{SequenceTypes} enum member
423 @rtype: L{csb.core.EnumItem}
424 """
425
426 labels = list(residue_labels)
427 middle = int(len(labels) / 2)
428
429 reordered = labels[middle:] + list(reversed(labels[:middle]))
430
431 for label in reordered:
432 try:
433 type = self.guess_sequence_type(label)
434 if type != SequenceTypes.Unknown:
435 return type
436
437 except UnknownPDBResidueError:
438 continue
439
440 return SequenceTypes.Unknown
441
443 """
444 Try to guess what is the sequence type of a PDB C{residue_label}.
445
446 @param residue_label: a PDB-conforming name of a residue
447 @type residue_label: str
448
449 @return: a L{SequenceTypes} enum member
450 @rtype: L{csb.core.EnumItem}
451
452 @raise UnknownPDBResidueError: when there is no such PDB residue name
453 in the catalog tables
454 """
455 if residue_label in PDB_AMINOACIDS:
456 return SequenceTypes.Protein
457 elif residue_label in PDB_NUCLEOTIDES:
458 return SequenceTypes.NucleicAcid
459 else:
460 raise UnknownPDBResidueError(residue_label)
461
463 """
464 Try to parse a PDB C{residue_label} and return its closest 'normal'
465 string representation. If a sequence type (C{as_type}) is defined,
466 guess the alphabet based on that information, otherwise try first to
467 parse it as a protein residue.
468
469 @param residue_label: a PDB-conforming name of a residue
470 @type residue_label: str
471 @param as_type: suggest a sequence type (L{SequenceTypes} member)
472 @type L{scb.core.EnumItem}
473
474 @return: a normalized residue name
475 @rtype: str
476
477 @raise UnknownPDBResidueError: when there is no such PDB residue name
478 in the catalog table(s)
479 """
480 if as_type is None:
481 as_type = self.guess_sequence_type(residue_label)
482
483 try:
484 if as_type == SequenceTypes.Protein:
485 return PDB_AMINOACIDS[residue_label]
486 elif as_type == SequenceTypes.NucleicAcid:
487 return PDB_NUCLEOTIDES[residue_label]
488 else:
489 raise UnknownPDBResidueError(residue_label)
490 except KeyError:
491 raise UnknownPDBResidueError(residue_label)
492
515
516 - def parse(self, filename=None, model=None):
521
523 """
524 Parse and return the L{Structure} with the specified model identifier.
525 If no explicit model is specified, parse the first model in the
526 structure.
527
528 @param model: parse exactly the model with this ID
529 @type model: str
530
531 @return: object representation of the selected model
532 @rtype: L{Structure}
533
534 @raise ValueError: When an invalid model ID is specified
535 @raise PDBParseError: When the input PDB file suffers from unrecoverable
536 corruption. More specialized exceptions will be
537 raised depending on the context (see L{PDBParseError}'s
538 subclasses).
539 """
540 if model is not None:
541 model = int(model)
542
543 try:
544 structure = self._parse_header(model)
545 except PDBParseError:
546 raise
547 except ValueError as ex:
548 raise HeaderFormatError("Malformed header: {0}".format(ex))
549
550 self._parse_atoms(structure, model)
551 self._parse_ss(structure)
552
553 return structure
554
584
586 """
587 Parse and return the L{Structure} of the biological unit (quaternary
588 structure) as annotated by the REMARK 350 BIOMOLECULE record.
589
590 @param number: biomolecule number
591 @type number: int
592
593 @param single: if True, assign new single-letter chain
594 identifiers. If False, assign multi-letter chain identifiers whith a
595 number appended to the original identifier, like "A1", "A2", ...
596 @type single: bool
597
598 @return: structure of biological unit
599 @rtype: L{Structure}
600 """
601 remarks = self._parse_remarks()
602 if 350 not in remarks:
603 raise PDBParseError('There is no REMARK 350')
604
605 current = 1
606 biomt = {current: {}}
607 chains = tuple()
608
609 def split(line):
610 return [c.strip() for c in line.split(',') if c.strip() != '']
611
612 for line in remarks[350]:
613 if line.startswith('BIOMOLECULE:'):
614 current = int(line[12:])
615 biomt[current] = {}
616 elif line.startswith('APPLY THE FOLLOWING TO CHAINS:'):
617 chains = tuple(split(line[30:]))
618 elif line.startswith(' AND CHAINS:'):
619 chains += tuple(split(line[30:]))
620 elif line.startswith(' BIOMT'):
621 num = int(line[8:12])
622 vec = line[12:].split()
623 vec = list(map(float, vec))
624 biomt[current].setdefault(chains, dict()).setdefault(num, []).extend(vec)
625
626 if number not in biomt or len(biomt[number]) == 0:
627 raise KeyError('no BIOMOLECULE number {0}'.format(number))
628
629 asu = self.parse_structure()
630 structure = csb.bio.structure.Structure('{0}_{1}'.format(asu.accession, number))
631
632 newchainiditer = iter('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789')
633
634 for chains, matrices in biomt[number].items():
635 for num in matrices:
636 mat = numpy.array(matrices[num][0:12]).reshape((3,4))
637 R, t = mat[:3,:3], mat[:3,3]
638
639 for chain in chains:
640 if chain not in asu:
641 raise PDBParseError('chain {0} missing'.format(chain))
642 copy = asu[chain].clone()
643 copy.transform(R, t)
644 if single:
645 if len(structure.chains) == 62:
646 raise ValueError('too many chains for single=True')
647 copy.id = next(newchainiditer)
648 else:
649 copy.id = '{0}{1}'.format(chain, num)
650 structure.chains.append(copy)
651
652 return structure
653
654 @abstractmethod
656 """
657 An abstract method, which subclasses must use to customize the way the
658 PDB header (or the absence of a such) is handled. The implementation
659 must rely on reading character data from the current internal
660 self._stream and must return a new L{csb.bio.structure.Structure}
661 instance with properly initialized header data: accession, model,
662 molecule identifiers, chains and residues. This structure object is
663 then internally passed to the C{_parse_atoms} hook, responsible for
664 attachment of the atoms to the residues in the structure.
665
666 @param model: model ID to parse
667 @type model: str
668
669 @rtype: L{Structure}
670 """
671 pass
672
688
690 """
691 Parse the ATOMs from the specified C{model} and attach them to the
692 C{structure}.
693
694 @param structure: L{Structure} being constructed
695 @type structure:L{Structure}
696 @param model: model ID to parse
697 @type model: str
698 """
699
700 structure.model_id = None
701
702 chains = set()
703 total_chains = len([c for c in structure.items if c.length > 0])
704
705 residues = dict( (chain, []) for chain in structure.chains )
706 seen_residues = dict( (chain, {}) for chain in structure.chains )
707
708 in_ligands = False
709 in_atom = False
710 read_model = False
711
712 self._stream.seek(0)
713 while True:
714
715 try:
716 line = next(self._stream)
717 except StopIteration:
718 break
719
720 if line.startswith('MODEL'):
721 if read_model:
722 break
723 else:
724 self._parse_model_line(line, structure, model)
725 model = structure.model_id
726 read_model = True
727
728 elif line.startswith('ATOM') or \
729 (line.startswith('HETATM') and not in_ligands):
730 in_atom = True
731
732 info = self._parse_atom_line(line, structure)
733 chains.add(info.chain)
734
735 if info.id not in seen_residues[info.chain]:
736 residues[info.chain].append(info)
737 seen_residues[info.chain][info.id] = info
738 else:
739 atom = info.atoms[0]
740 seen_residues[info.chain][info.id].atoms.append(atom)
741
742 elif in_atom and line.startswith('TER'):
743 in_atom = False
744 if len(chains) == total_chains:
745 in_ligands = True
746
747 elif line.startswith('ENDMDL'):
748 break
749
750 elif line.startswith('END'):
751 break
752
753 if structure.model_id != model:
754 raise ValueError('No such model {0} in the structure.'.format(model))
755
756 self._map_residues(structure, residues)
757
759 """
760 Handle a new MODEL line. The default implementation will read the model
761 ID and attach it to the C{structure}.
762
763 @param line: raw string line to parse
764 @type line: str
765 @param structure: L{Structure} being constructed
766 @type structure:L{Structure}
767
768 @note: this method may have side effects: scrolls the current stream
769
770 @return: read model ID
771 @rtype: int
772 """
773 if model and model != self._read_model(line):
774 self._scroll_model(model, self._stream)
775 structure.model_id = model
776 else:
777 model = self._read_model(line)
778 structure.model_id = model
779
780 return model
781
822
824 """
825 @return: model identifier
826 @rtype: int
827 """
828 try:
829 return int(line[10:14])
830 except ValueError:
831 raise StructureFormatError("Invalid model ID: {0}".format(line))
832
834 """
835 @return: a new atom (serial_number, name, element, vector)
836 @rtype: L{Atom}
837 """
838 try:
839 serial_number = int(line[6:11])
840 name = line[12:16]
841 x, y, z = line[30:38], line[38:46], line[46:54]
842 vector = numpy.array([float(x), float(y), float(z)])
843 except ValueError as ve:
844 raise StructureFormatError("Invalid ATOM line: {0}".format(ve))
845
846 element = self._read_element(line)
847 return csb.bio.structure.Atom(serial_number, name, element, vector)
848
850 """
851 @return: PDB sequence number
852 @rtype: int
853 """
854 try:
855 return int(line[22:26])
856 except ValueError:
857 raise StructureFormatError("Invalid sequence number")
858
860 """
861 @return: PDB insertion code
862 @rtype: str or None
863 """
864 code = line[26].strip()
865
866 if code:
867 return code
868 else:
869 return None
870
872 """
873 @return: chain identifier
874 @rtype: str
875 """
876 return line[21].strip()
877
896
898 """
899 @rtype: str
900 """
901 return line[17:20].strip()
902
921
923 """
924 @return: alt identifier
925 @rtype: str or None
926 """
927 alternate = line[16].strip()
928
929 if not alternate:
930 return None
931 else:
932 return alternate
933
935 """
936 @return: occupancy
937 @rtype: float or None
938 """
939 try:
940 return float(line[54:60].strip() or 0)
941 except ValueError:
942 raise StructureFormatError("Malformed occupancy field")
943
945 """
946 @return: b-factor
947 @rtype: float or None
948 """
949 try:
950 return float(line[60:66].strip() or 0)
951 except ValueError:
952 raise StructureFormatError("Malformed bfactor field")
953
972
974 """
975 Attach each L{Atom} to its corresponding L{Residue}.
976
977 So far we have constructed a sparse (fragmented) chain given the information
978 we have read from the ATOM/HETATM records. That includes PDB sequence
979 identifiers and insertion codes, which cover only residues with XYZ coordinates
980 and often do not correspond to our L{Residue} ranks.
981 Our job is to find the right correspondence by matching this unreal sequence
982 to what we have got from the SEQRES fields (the complete, gap-free protein
983 sequence). Here we delegate this task to the current L{AbstractResidueMapper}
984 strategy, which does all the magic.
985
986 @param structure: L{Structure} being constructed
987 @type structure:L{Structure}
988 @param residues: all L{Atom}s which have been constructed so far.
989 This must be a map of the form:
990 <chainID: [L{ResidueInfo}1, L{ResidueInfo}2...]>
991 @type residues: dict of L{ResidueInfo}
992 """
993
994 if set(structure.chains) != set(residues.keys()):
995 raise PDBParseError("Corrupt PDB file")
996
997 for chain in structure.items:
998 if chain.length == 0 or len(residues[chain.id]) == 0:
999 continue
1000
1001 reference = SparseChainSequence.create(chain)
1002 sparse = SparseChainSequence(
1003 "atoms", "", residues[chain.id], chain.type)
1004
1005 aligned = self.mapper.map(sparse, reference)
1006 assert aligned.length == chain.length
1007
1008 for residue, mapped in zip(chain.residues, aligned.residues):
1009 if mapped.type != sparse.alphabet.GAP:
1010 residue.id = (mapped.sequence_number, mapped.insertion_code)
1011
1012 for atom in mapped.atoms:
1013 residue.atoms.append(atom)
1014
1016 """
1017 Parse and attach secondary structure data.
1018
1019 @bug: Currently the PDB helix types are ignored. Each HELIX line is treated
1020 as a regular SecStructures.Helix. This is due to incompatibility
1021 between DSSP and PDB helix types.
1022 @todo: Implement a proper workaround for the previous bug (e.g. skip all
1023 helices types not included in the DSSP enum)
1024
1025 @warning: In this implementation only the start/end positions of the SS
1026 elements are parsed. Additional data like H-bonding is ignored.
1027
1028 @bug: Currently structure.to_pdb() is not writing any SS data.
1029 """
1030 elements = {}
1031 self._stream.seek(0)
1032
1033 while True:
1034 try:
1035 line = next(self._stream)
1036 except StopIteration:
1037 break
1038
1039 if line.startswith('HELIX'):
1040
1041 chain = structure.chains[line[19].strip()]
1042 if chain.id not in elements:
1043 elements[chain.id] = []
1044 if chain.id != line[31].strip():
1045 if self._check_ss:
1046 raise SecStructureFormatError('Helix {0} spans multiple chains'.format(line[7:10]))
1047 else:
1048 continue
1049 try:
1050 startres = chain.find(line[21:25].strip(), line[25].strip())
1051 endres = chain.find(line[33:37].strip(), line[37].strip())
1052 except csb.core.ItemNotFoundError as ex:
1053 if self._check_ss:
1054 raise SecStructureFormatError(
1055 'Helix {0} refers to an undefined residue ID: {1}'.format(line[7:10], str(ex)))
1056 else:
1057 continue
1058 if not startres.rank <= endres.rank:
1059 if self._check_ss:
1060 raise SecStructureFormatError('Helix {0} is out of range'.format(line[7:10]))
1061 else:
1062 continue
1063 helix = csb.bio.structure.SecondaryStructureElement(startres.rank, endres.rank, SecStructures.Helix)
1064 elements[chain.id].append(helix)
1065
1066 if line.startswith('SHEET'):
1067
1068 chain = structure.chains[line[21].strip()]
1069 if chain.id not in elements:
1070 elements[chain.id] = []
1071 if chain.id != line[32].strip():
1072 if self._check_ss:
1073 raise SecStructureFormatError('Sheet {0} spans multiple chains'.format(line[7:10]))
1074 else:
1075 continue
1076 try:
1077 startres = chain.find(line[22:26].strip(), line[26].strip())
1078 endres = chain.find(line[33:37].strip(), line[37].strip())
1079 except csb.core.ItemNotFoundError as ex:
1080 if self._check_ss:
1081 raise SecStructureFormatError(
1082 'Sheet {0} refers to an undefined residue ID: {1}'.format(line[7:10], str(ex)))
1083 else:
1084 continue
1085 if not startres.rank <= endres.rank:
1086 if self._check_ss:
1087 raise SecStructureFormatError('Sheet {0} is out of range'.format(line[7:10]))
1088 else:
1089 continue
1090 strand = csb.bio.structure.SecondaryStructureElement(startres.rank, endres.rank, SecStructures.Strand)
1091 elements[chain.id].append(strand)
1092
1093 elif line.startswith('MODEL') or line.startswith('ATOM'):
1094 break
1095
1096 for chain_id in elements:
1097 ss = csb.bio.structure.SecondaryStructure()
1098 for e in elements[chain_id]:
1099 ss.append(e)
1100 structure.chains[chain_id].secondary_structure = ss
1101
1103 """
1104 Read REMARK lines from PDB file.
1105
1106 @return: dictionary with remark numbers as keys, and lists of lines as values.
1107 @rtype: dict
1108 """
1109 self._stream.seek(0)
1110
1111 remarks = {}
1112
1113 for line in self._stream:
1114 if line.startswith('REMARK'):
1115 num = int(line[7:10])
1116 lstring = line[11:]
1117 remarks.setdefault(num, []).append(lstring)
1118 elif line.startswith('DBREF') or line.startswith('ATOM'):
1119 break
1120
1121 return remarks
1122
1125 """
1126 This is the de facto PDB parser, which is designed to read SEQRES and ATOM
1127 sections separately, and them map them. Intentionally fails to parse
1128 malformed PDB files, e.g. a PDB file without a HEADER section.
1129 """
1130
1132 """
1133 Parse the HEADER section of a regular PDB file.
1134
1135 @return: a L{csb.bio.structure.Structure} instance with properly
1136 initialized residues from the SEQRES.
1137 @rtype: L{csb.bio.structure.Structure}
1138
1139 @raise PDBParseError: if the stream has no HEADER at byte 0
1140 """
1141
1142 self._stream.seek(0)
1143
1144 header = next(self._stream)
1145 if not header.startswith('HEADER'):
1146 raise PDBParseError('Does not look like a regular PDB file.')
1147
1148 structure = csb.bio.structure.Structure(header.split()[-1])
1149
1150 while True:
1151
1152 try:
1153 line = next(self._stream)
1154 except StopIteration:
1155 break
1156
1157 if line.startswith('COMPND'):
1158 if line[10:].lstrip().startswith('MOL_ID:'):
1159 mol_id = int(line[18:].replace(';', '').strip())
1160 chain_name = ''
1161 chains = ''
1162 while line.startswith('COMPND'):
1163 line = next(self._stream)
1164 if line.split()[2].startswith('MOLECULE:'):
1165 chain_name += line[20:].strip()
1166 while not chain_name.endswith(';'):
1167 line = next(self._stream)
1168 if not line.startswith('COMPND'):
1169 break
1170 chain_name += ' ' + line[11:].strip()
1171 else:
1172 while not line.split()[2].startswith('CHAIN:'):
1173 line = next(self._stream)
1174 if not line.startswith('COMPND'):
1175 raise HeaderFormatError('Missing chain identifier in COMPND section')
1176 chains = line[17:].strip()
1177 while not chains.endswith(';'):
1178 line = next(self._stream)
1179 if not line.startswith('COMPND'):
1180 break
1181 chains += ', ' + line[11:].strip()
1182 break
1183
1184 chain_ids = chains.replace(';', ' ').replace(',', ' ').split() or ['']
1185 self._add_chains(structure, chain_name, mol_id, *chain_ids)
1186
1187 elif line.startswith('REMARK 2 RESOLUTION'):
1188 structure.resolution = self._read_resolution(line)
1189
1190 elif line.startswith('SEQRES'):
1191 chain_id, residues = self._parse_seqres_line(line, structure)
1192 chain = structure.chains[chain_id]
1193
1194 for residue in residues:
1195 chain.residues.append(residue)
1196 if chain.residues.last_index != residue.rank:
1197 raise HeaderFormatError("Malformed SEQRES")
1198
1199 elif line.startswith('MODEL') or line.startswith('ATOM'):
1200 break
1201
1202 return structure
1203
1204 - def _add_chains(self, structure, name, mol_id, *chain_ids):
1216
1218 """
1219 @return: resolution
1220 @rtype: float or None
1221 """
1222 res = re.search("(\d+(?:\.\d+)?)\s+ANGSTROM", line)
1223
1224 if res and res.groups():
1225 return float(res.group(1))
1226 else:
1227 return None
1228
1261
1264 """
1265 Ultra fast PDB HEADER parser. Does not read any structural data.
1266 """
1267
1270
1273
1276
1279 """
1280 This is a customized PDB parser, which is designed to read both sequence and
1281 atom data from the ATOM section. This is especially useful when parsing PDB
1282 files without a header.
1283 """
1284
1286 """
1287 Initialize a structure with residues from the ATOMs section.
1288
1289 @param model: model identifier (e.g. if multiple models exist)
1290 @type model: str
1291
1292 @return: a L{csb.bio.structure.Structure} instance with properly
1293 initialized residues from ATOMs under the specified C{model}.
1294 @rtype: L{csb.bio.structure.Structure}
1295 """
1296
1297 self._stream.seek(0)
1298 in_atom = False
1299 has_atoms = False
1300 has_model = False
1301 chains = csb.core.OrderedDict()
1302
1303 header = next(self._stream)
1304 if header.startswith('HEADER'):
1305 structure = csb.bio.structure.Structure(header.split()[-1])
1306 else:
1307 self._stream.seek(0)
1308 structure = csb.bio.structure.Structure('NONE')
1309
1310 structure.model_id = None
1311
1312 while True:
1313
1314 try:
1315 line = next(self._stream)
1316 except StopIteration:
1317 break
1318
1319 if line.startswith('MODEL'):
1320 if has_model:
1321 break
1322 else:
1323 self._parse_model_line(line, structure, model)
1324 model = structure.model_id
1325 has_model = True
1326
1327 elif line.startswith('ATOM') \
1328 or (in_atom and line.startswith('HETATM')):
1329 in_atom = True
1330 has_atoms = True
1331
1332 seq_number = self._read_sequence_number(line)
1333 ins_code = self._read_insertion_code(line)
1334 residue_id = (seq_number, ins_code)
1335 label = self._read_residue_raw(line)
1336 chain_id = self._read_chain_id(line)
1337
1338 if chain_id not in chains:
1339 chains[chain_id] = csb.core.OrderedDict()
1340 self._add_chain(structure, chain_id)
1341
1342 if residue_id not in chains[chain_id]:
1343 chains[chain_id][residue_id] = label
1344 chain = structure.chains[chain_id]
1345 if chain.type == SequenceTypes.Unknown:
1346 self._fix_chain(chain, label)
1347
1348 elif in_atom and line.startswith('TER'):
1349 in_atom = False
1350 elif line.startswith('ENDMDL'):
1351 break
1352 elif line.startswith('END'):
1353 break
1354
1355 if not has_atoms:
1356 raise HeaderFormatError("Can't parse legacy structure: no ATOMs found")
1357
1358 for chain in structure.items:
1359 self._build_chain(chain, chains[chain.id])
1360
1361 return structure
1362
1370
1381
1388
1400
1401
1402 StructureParser = AbstractStructureParser.create_parser
1403 """
1404 Alias for L{AbstractStructureParser.create_parser}.
1405 """
1409 """
1410 High-performance struct, which functions as a container for unmapped
1411 L{Atom}s.
1412
1413 @note: This object must implement the L{csb.bio.sequence.ResidueInfo}
1414 interface. This is not enforced through inheritance solely
1415 to save some CPU (by exposing public fields and no properties).
1416 However, on an abstract level this object is_a ResidueInfo
1417 and is used to build L{AbstractSequence}s.
1418 """
1419 __slots__ = ['chain', 'rank', 'id' , 'sequence_number', 'insertion_code', 'type', 'label', 'atoms']
1420
1421 - def __init__(self, chain, rank, id, seq_number, ins_code, type, label):
1431
1432 @property
1439
1441 """
1442 Sequence view for reference (SEQRES) or sparse (ATOM) PDB chains.
1443 The residue instances passed to the constructor must be
1444 L{csb.bio.structure.Residue} or L{csb.bio.io.wwpdb.ResidueInfo} objects.
1445
1446 See L{csb.bio.sequence.AbstractSequence} for details.
1447 """
1448
1449 - def _add(self, residue):
1455
1456 - def _get(self, rank):
1457 return self._residues[rank - 1]
1458
1459 @staticmethod
1469
1472 """
1473 Defines the base interface of all residue mappers, used to align PDB ATOM
1474 records to the real (SEQRES) sequence of a chain.
1475 """
1476
1477 __metaclass__ = ABCMeta
1478
1479 @abstractmethod
1480 - def map(self, sparse, reference):
1481 """
1482 Map L{sparse}'s residues to L{reference}. Return all C{sparse} residues,
1483 aligned over C{reference}, with artificial gap residues inserted at
1484 relevant positions. The resulting sequence of sparse residues will
1485 always have the same length as the C{reference} sequence.
1486
1487 @note: C{sparse}'s ranks won't be touched because the C{rank} property
1488 of the underlying L{ResidueInfo} implementation is not necessarily r/w.
1489
1490 @param sparse: sparse sequence (e.g. derived from ATOMS records)
1491 @type sparse: L{SparseChainSequence}
1492 @param reference: reference, complete sequence
1493 (e.g. derived from SEQRES records)
1494 @type reference: L{SparseChainSequence}
1495
1496 @return: all C{sparse} residues, optimally aligned over C{reference}
1497 (with gaps)
1498 @rtype: L{SparseChainSequence}
1499
1500 @raise ResidueMappingError: if the specified sequences are not alignable
1501 """
1502 pass
1503
1505 """
1506 Create and return a new gap residue.
1507
1508 @param alphabet: sequence alphabet; a member of L{SequenceAlphabets}
1509 which has GAP item
1510 @type alphabet: L{enum}
1511
1512 @rtype: L{ResidueInfo}
1513 """
1514 return ResidueInfo(None, -1, None, None, None, alphabet.GAP, "-")
1515
1516 - def _build(self, sparse, aligned):
1520
1522 """
1523 RegExp-based residue mapper. Fails on heavily malformed input (i.e. it cannot
1524 insert gaps in the C{reference}), but it is very fast (linear) and memory
1525 efficient.
1526 """
1527
1528 MAX_FRAGMENTS = 20
1529
1530 MIN_UNICODE_CHAR = 300
1531 FORBIDDEN_CHARS = set('^.*?()-')
1532
1533 CODEC = "utf-8"
1534 DELIMITER = ").*?(".encode(CODEC).decode(CODEC)
1535 PATTERN = "^.*?({0}).*?$".encode(CODEC).decode(CODEC)
1536
1540
1541 - def map(self, sparse, reference):
1580
1614
1616 """
1617 Return a unique single-letter representation of C{r.type}.
1618 """
1619 if not r.is_modified:
1620 return str(r.type)
1621 else:
1622 return self._register_label(r.label)
1623
1626
1628 """
1629 Assign a new unicode character to C{label} and cache it.
1630
1631 @return: cached single-letter representation of label.
1632 @rtype: unicode char
1633 """
1634
1635 if label not in self._cache:
1636 if set(label).intersection(FastResidueMapper.FORBIDDEN_CHARS):
1637 raise ResidueMappingError("Invalid residue label")
1638
1639 self._charcode += 1
1640 code = self._charcode
1641 self._cache[label] = csb.io.unichr(code)
1642
1643 return self._cache[label]
1644
1647 """
1648 Exhaustive residue mapper, which uses Needleman-Wunsch global alignment.
1649 Much slower (quadratic), but fail-proof even with incompatible sequences
1650 (can insert gaps in both the C{sparse} and the C{reference} sequence).
1651
1652 @param match: score for a match
1653 @type match: float
1654 @param mismatch: score for a mismatch (by default mismatches are heavily
1655 penalized, while gaps are allowed)
1656 @type mismatch: float
1657 @param gap: gap penalty
1658 @type gap: float
1659 """
1660
1665
1666
1667 - def __init__(self, match=1, mismatch=-10, gap=0):
1673
1674 - def map(self, sparse, reference):
1692
1694 """
1695 The best of both worlds: attempts to map the residues using
1696 L{FastResidueMapper}, but upon failure secures success by switching to
1697 L{RobustResidueMapper}.
1698 """
1699 FAST = FastResidueMapper()
1700 ROBUST = RobustResidueMapper()
1701
1702 - def map(self, sparse, reference):
1708
1711 """
1712 Base abstract files for all structure file formatters.
1713 Defines a common step-wise interface according to the Builder pattern.
1714
1715 @param output: output stream (this is where the product is constructed)
1716 @type output: stream
1717 """
1718
1719 __metaclass__ = ABCMeta
1720
1722
1723 if not hasattr(output, 'write'):
1724 raise TypeError(output)
1725
1726 def isnull(this, that, null=None):
1727 if this is null:
1728 return that
1729 else:
1730 return this
1731
1732 self._out = output
1733 self._isnull = isnull
1734
1735 @property
1737 """
1738 Destination stream
1739 @rtype: stream
1740 """
1741 return self._out
1742
1743 @property
1745 """
1746 ISNULL(X, Y) function
1747 @rtype: callable
1748 """
1749 return self._isnull
1750
1752 """
1753 Write a chunk of text
1754 """
1755 self._out.write(text)
1756
1758 """
1759 Write a chunk of text and append a new line terminator
1760 """
1761 self._out.write(text)
1762 self._out.write('\n')
1763
1764 @abstractmethod
1767
1768 @abstractmethod
1771
1774
1776 """
1777 PDB file format builder.
1778 """
1779
1782
1784 """
1785 Write the HEADER of the file using C{master}
1786
1787 @type master: L{Structure}
1788 """
1789
1790 isnull = self.isnull
1791
1792 header = 'HEADER {0:40}{1:%d-%b-%y} {2:4}'
1793 self.writeline(header.format('.', datetime.datetime.now(), master.accession.upper()))
1794
1795 molecules = { }
1796
1797 for chain_id in master.chains:
1798 chain = master.chains[chain_id]
1799 if chain.molecule_id not in molecules:
1800 molecules[chain.molecule_id] = [ ]
1801 molecules[chain.molecule_id].append(chain_id)
1802
1803 k = 0
1804 for mol_id in sorted(molecules):
1805
1806 chains = molecules[mol_id]
1807 first_chain = master.chains[ chains[0] ]
1808
1809 self.writeline('COMPND {0:3} MOL_ID: {1};'.format(k + 1, isnull(mol_id, '0')))
1810 self.writeline('COMPND {0:3} MOLECULE: {1};'.format(k + 2, isnull(first_chain.name, '')))
1811 self.writeline('COMPND {0:3} CHAIN: {1};'.format(k + 3, ', '.join(chains)))
1812 k += 3
1813
1814 for chain_id in master.chains:
1815
1816 chain = master.chains[chain_id]
1817 res = [ r.label for r in chain.residues ]
1818
1819 rn = 0
1820 for j in range(0, chain.length, 13):
1821 rn += 1
1822 residues = [ '{0:>3}'.format(r) for r in res[j : j + 13] ]
1823 self.writeline('SEQRES {0:>3} {1} {2:>4} {3}'.format(
1824 rn, chain.id, chain.length, ' '.join(residues) ))
1825
1827 """
1828 Append a new model to the file
1829
1830 @type structure: L{Structure}
1831 """
1832
1833 isnull = self.isnull
1834
1835 for chain_id in structure.chains:
1836
1837 chain = structure.chains[chain_id]
1838 for residue in chain.residues:
1839
1840 atoms = [ ]
1841 for an in residue.atoms:
1842 atom = residue.atoms[an]
1843 if isinstance(atom, csb.bio.structure.DisorderedAtom):
1844 for dis_atom in atom: atoms.append(dis_atom)
1845 else:
1846 atoms.append(atom)
1847 atoms.sort()
1848
1849 for atom in atoms:
1850
1851 alt = atom.alternate
1852 if alt is True:
1853 alt = 'A'
1854 elif alt is False:
1855 alt = ' '
1856
1857 if atom.element:
1858 element = repr(atom.element)
1859 else:
1860 element = ' '
1861 self.writeline('ATOM {0:>5} {1:>4}{2}{3:>3} {4}{5:>4}{6} {7:>8.3f}{8:>8.3f}{9:>8.3f}{10:>6.2f}{11:>6.2f}{12:>12}{13:2}'.format(
1862 atom.serial_number, atom._full_name, isnull(alt, ' '),
1863 residue.label, chain.id,
1864 isnull(residue.sequence_number, residue.rank), isnull(residue.insertion_code, ' '),
1865 atom.vector[0], atom.vector[1], atom.vector[2], isnull(atom.occupancy, 0.0), isnull(atom.bfactor, 0.0),
1866 element, isnull(atom.charge, ' ') ))
1867
1868 self.writeline('TER')
1869
1871 """
1872 Add the END marker
1873 """
1874 self.writeline('END')
1875 self._out.flush()
1876
1878 """
1879 Supports serialization of NMR ensembles.
1880
1881 Functions as a simple decorator, which wraps C{add_structure} with
1882 MODEL/ENDMDL records.
1883 """
1884
1892
1895 """
1896 Base class for all PDB data source providers.
1897
1898 Concrete classes need to implement the C{find} method, which abstracts the
1899 retrieval of a PDB structure file by a structure identifier. This is a hook
1900 method called internally by C{get}, but subclasses can safely override both
1901 C{find} and {get} to in order to achieve completely custom behavior.
1902 """
1903
1904 __metaclass__ = ABCMeta
1905
1908
1909 @abstractmethod
1910 - def find(self, id):
1911 """
1912 Attempt to discover a PDB file, given a specific PDB C{id}.
1913
1914 @param id: structure identifier (e.g. 1x80)
1915 @type id: str
1916 @return: path and file name on success, None otherwise
1917 @rtype: str or None
1918 """
1919 pass
1920
1921 - def get(self, id, model=None):
1922 """
1923 Discover, parse and return the PDB structure, corresponding to the
1924 specified C{id}.
1925
1926 @param id: structure identifier (e.g. 1x80)
1927 @type id: str
1928 @param model: optional model identifier
1929 @type model: str
1930
1931 @rtype: L{csb.bio.Structure}
1932
1933 @raise StructureNotFoundError: when C{id} could not be found
1934 """
1935 pdb = self.find(id)
1936
1937 if pdb is None:
1938 raise StructureNotFoundError(id)
1939 else:
1940 return StructureParser(pdb).parse_structure(model=model)
1941
1943 """
1944 Simple file system based PDB data source. Scans a list of local directories
1945 using pre-defined file name templates.
1946
1947 @param paths: a list of paths
1948 @type paths: iterable or str
1949 """
1950
1962
1963 @property
1965 """
1966 Current search paths
1967 @rtype: tuple
1968 """
1969 return tuple(self._paths)
1970
1971 @property
1973 """
1974 Current file name match templates
1975 @rtype: tuple
1976 """
1977 return tuple(self._templates)
1978
1979 - def add(self, path):
1980 """
1981 Register a new local C{path}.
1982
1983 @param path: directory name
1984 @type path: str
1985
1986 @raise IOError: if C{path} is not a valid directory
1987 """
1988 if os.path.isdir(path):
1989 self._paths[path] = path
1990 else:
1991 raise IOError(path)
1992
1994 """
1995 Register a custom file name name C{template}. The template must contain
1996 an E{lb}idE{rb} macro, e.g. pdbE{lb}idE{rb}.ent
1997
1998 @param template: pattern
1999 @type template: str
2000 """
2001
2002 if '{id}' not in template:
2003 raise ValueError('Template does not contain an "{id}" macro')
2004
2005 if template not in self._templates:
2006 self._templates.append(template)
2007
2009 """
2010 Unregister an existing local C{path}.
2011
2012 @param path: directory name
2013 @type path: str
2014
2015 @raise ValueError: if C{path} had not been registered
2016 """
2017 if path not in self._paths:
2018 raise ValueError('path not found: {0}'.format(path))
2019
2020 del self._paths[path]
2021
2022 - def find(self, id):
2031
2033 """
2034 Retrieves PDB structures from a specified remote URL.
2035 The URL requested from remote server takes the form: <prefix>/<ID><suffix>
2036
2037 @param prefix: URL prefix, including protocol
2038 @type prefix: str
2039 @param suffix: optional URL suffix (.ent by default)
2040 @type suffix: str
2041 """
2042
2043 - def __init__(self, prefix='https://files.rcsb.org/download/', suffix='.pdb'):
2050
2051 @property
2053 """
2054 Current URL prefix
2055 @rtype: str
2056 """
2057 return self._prefix
2058 @prefix.setter
2060 self._prefix = value
2061
2062 @property
2064 """
2065 Current URL suffix
2066 @rtype: str
2067 """
2068 return self._suffix
2069 @suffix.setter
2071 self._suffix = value
2072
2079
2080 - def find(self, id):
2094
2095 - def get(self, id, model=None):
2106
2108 """
2109 A custom PDB data source. Functions as a user-defined map of structure
2110 identifiers and their corresponding local file names.
2111
2112 @param files: initialization dictionary of id:file pairs
2113 @type files: dict-like
2114 """
2115
2117
2118 self._files = {}
2119 for id in files:
2120 self.add(id, files[id])
2121
2122 @property
2124 """
2125 List of currently registered file names
2126 @rtype: tuple
2127 """
2128 return tuple(self._files.values())
2129
2130 @property
2132 """
2133 List of currently registered structure identifiers
2134 @rtype: tuple
2135 """
2136 return tuple(self._files)
2137
2138 - def add(self, id, path):
2139 """
2140 Register a new local C{id}:C{path} pair.
2141
2142 @param id: structure identifier
2143 @type id: str
2144 @param path: path and file name
2145 @type path: str
2146
2147 @raise IOError: if C{path} is not a valid file name
2148 """
2149 if os.path.isfile(path):
2150 self._files[id] = path
2151 else:
2152 raise IOError(path)
2153
2155 """
2156 Unregister an existing structure C{id}.
2157
2158 @param id: structure identifier
2159 @type id: str
2160
2161 @raise ValueError: if C{id} had not been registered
2162 """
2163 if id not in self._files:
2164 raise ValueError(id)
2165 else:
2166 del self._files[id]
2167
2168 - def find(self, id):
2169
2170 if id in self._files:
2171 return self._files[id]
2172 else:
2173 return None
2174
2175 -def get(accession, model=None, prefix='https://files.rcsb.org/download/'):
2176 """
2177 Download and parse a PDB entry.
2178
2179 @param accession: accession number of the entry
2180 @type accession: str
2181 @param model: model identifier
2182 @type model: str
2183 @param prefix: download URL prefix
2184 @type prefix: str
2185
2186 @return: object representation of the selected model
2187 @rtype: L{Structure}
2188 """
2189 return RemoteStructureProvider(prefix).get(accession, model=model)
2190
2191 -def find(id, paths):
2192 """
2193 Try to discover a PDB file for PDB C{id} in C{paths}.
2194
2195 @param id: PDB ID of the entry
2196 @type id: str
2197 @param paths: a list of directories to scan
2198 @type paths: list of str
2199
2200 @return: path and file name on success, None otherwise
2201 @rtype: str
2202 """
2203 return FileSystemStructureProvider(paths).find(id)
2204
2207
2208 - def __init__(self, result, exception):
2209
2210 self.result = result
2211 self.exception = exception
2212
2214 return '<AsyncParseResult: result={0.result}, error={0.exception.__class__.__name__}>'.format(self)
2215
2220
2223 """
2224 Wraps StructureParser in an asynchronous call. Since a new process is
2225 started by Python internally (as opposed to only starting a new thread),
2226 this makes the parser slower, but provides a way to set a parse timeout
2227 limit.
2228
2229 If initialized with more than one worker, supports parallel parsing
2230 through the C{self.parse_async} method.
2231
2232 @param workers: number of worker threads (1 by default)
2233 @type workers: int
2234 """
2235
2237
2238 self._pool = None
2239 self._workers = 1
2240
2241 if int(workers) > 0:
2242 self._workers = int(workers)
2243 else:
2244 raise ValueError(workers)
2245
2246 self._recycle()
2247
2249
2250 if self._pool:
2251 self._pool.terminate()
2252
2253 self._pool = multiprocessing.Pool(processes=self._workers)
2254
2257 """
2258 Call StructureParser.parse_structure() in a separate process and return
2259 the output. Raise TimeoutError if the parser does not respond within
2260 C{timeout} seconds.
2261
2262 @param structure_file: structure file to parse
2263 @type structure_file: str
2264 @param timeout: raise multiprocessing.TimeoutError if C{timeout} seconds
2265 elapse before the parser completes its job
2266 @type timeout: int
2267 @param parser: any implementing L{AbstractStructureParser} class
2268 @type parser: type
2269
2270 @return: parsed structure
2271 @rtype: L{csb.structure.Structure}
2272 """
2273
2274 r = self.parse_async([structure_file], timeout, model, parser)
2275 if len(r) > 0:
2276 if r[0].exception is not None:
2277 raise r[0].exception
2278 else:
2279 return r[0].result
2280 return None
2281
2284 """
2285 Call C{self.parse_structure} for a list of structure files
2286 simultaneously. The actual degree of parallelism will depend on the
2287 number of workers specified while constructing the parser object.
2288
2289 @note: Don't be tempted to pass a large list of structures to this
2290 method. Every time a C{TimeoutError} is encountered, the
2291 corresponding worker process in the pool will hang until the
2292 process terminates on its own. During that time, this worker is
2293 unusable. If a sufficiently high number of timeouts occur, the
2294 whole pool of workers will be unsable. At the end of the method
2295 however a pool cleanup is performed and any unusable workers
2296 are 'reactivated'. However, that only happens at B{the end} of
2297 C{parse_async}.
2298
2299 @param structure_files: a list of structure files
2300 @type structure_files: tuple of str
2301 @param timeout: raise multiprocessing.TimeoutError if C{timeout} seconds
2302 elapse before the parser completes its job
2303 @type timeout: int
2304 @param parser: any implementing L{AbstractStructureParser} class
2305 @type parser: type
2306
2307 @return: a list of L{AsyncParseResult} objects
2308 @rtype: list
2309 """
2310
2311 pool = self._pool
2312 workers = []
2313 results = []
2314
2315 for file in list(structure_files):
2316 result = pool.apply_async(_parse_async, [parser, file, model])
2317 workers.append(result)
2318
2319 hanging = False
2320 for w in workers:
2321 result = AsyncParseResult(None, None)
2322 try:
2323 result.result = w.get(timeout=timeout)
2324 except KeyboardInterrupt as ki:
2325 pool.terminate()
2326 raise ki
2327 except Exception as ex:
2328 result.exception = ex
2329 if isinstance(ex, multiprocessing.TimeoutError):
2330 hanging = True
2331 results.append(result)
2332
2333 if hanging:
2334 self._recycle()
2335
2336 return results
2337