Package csb :: Package bio :: Package io :: Module wwpdb
[frames] | no frames]

Source Code for Module csb.bio.io.wwpdb

   1  """ 
   2  PDB structure parsers, format builders and database providers. 
   3   
   4  The most basic usage is: 
   5   
   6      >>> parser = StructureParser('structure.pdb') 
   7      >>> parser.parse_structure() 
   8      <Structure>     # a Structure object (model) 
   9   
  10  or if this is an NMR ensemble: 
  11       
  12      >>> parser.parse_models() 
  13      <Ensemble>      # an Ensemble object (collection of alternative Structure-s) 
  14       
  15  This module introduces a family of PDB file parsers. The common interface of all 
  16  parsers is defined in L{AbstractStructureParser}. This class has several 
  17  implementations: 
  18   
  19      - L{RegularStructureParser} - handles normal PDB files with SEQRES fields 
  20      - L{LegacyStructureParser} - reads structures from legacy or malformed PDB files, 
  21        which are lacking SEQRES records (initializes all residues from the ATOMs instead) 
  22      - L{PDBHeaderParser} - reads only the headers of the PDB files and produces structures 
  23        without coordinates. Useful for reading metadata (e.g. accession numbers or just 
  24        plain SEQRES sequences) with minimum overhead 
  25         
  26  Unless you have a special reason, you should use the L{StructureParser} factory, 
  27  which returns a proper L{AbstractStructureParser} implementation, depending on the 
  28  input PDB file. If the input file looks like a regular PDB file, the factory 
  29  returns a L{RegularStructureParser}, otherwise it instantiates L{LegacyStructureParser}. 
  30  L{StructureParser} is in fact an alias for L{AbstractStructureParser.create_parser}. 
  31   
  32  Writing your own, customized PDB parser is easy. Suppose that you are trying to 
  33  parse a PDB-like file which misuses the charge column to store custom info. This 
  34  will certainly crash L{RegularStructureParser} (for good), but you can create your 
  35  own parser as a workaround. All you need to to is to override the virtual 
  36  C{_read_charge} hook method:: 
  37   
  38      class CustomParser(RegularStructureParser): 
  39       
  40          def _read_charge(self, line): 
  41              try: 
  42                  return super(CustomParser, self)._read_charge(line) 
  43              except StructureFormatError: 
  44                  return None 
  45   
  46  Another important abstraction in this module is L{StructureProvider}. It has several 
  47  implementations which can be used to retrieve PDB L{Structure}s from various sources: 
  48  file system directories, remote URLs, etc. You can easily create your own provider as 
  49  well. See L{StructureProvider} for details. 
  50   
  51  Finally, this module gives you some L{FileBuilder}s, used for text serialization  
  52  of L{Structure}s and L{Ensemble}s: 
  53   
  54      >>> builder = PDBFileBuilder(stream) 
  55      >>> builder.add_header(structure) 
  56      >>> builder.add_structure(structure) 
  57   
  58  where stream is any Python stream, e.g. an open file or sys.stdout. 
  59   
  60  See L{Ensemble} and L{Structure} from L{csb.bio.structure} for details on these 
  61  objects. 
  62  """ 
  63   
  64  import re 
  65  import os 
  66  import numpy 
  67  import datetime 
  68  import multiprocessing 
  69   
  70  import csb.bio.structure 
  71  import csb.bio.sequence 
  72  import csb.bio.sequence.alignment as alignment 
  73  import csb.core 
  74  import csb.io 
  75   
  76  from abc import ABCMeta, abstractmethod 
  77  from csb.bio.sequence import SequenceTypes, SequenceAlphabets 
  78  from csb.bio.structure import ChemElements, SecStructures 
  79   
  80   
  81  PDB_AMINOACIDS = { 
  82      'PAQ': 'TYR', 'AGM': 'ARG', 'ILE': 'ILE', 'PR3': 'CYS', 'GLN': 'GLN', 
  83      'DVA': 'VAL', 'CCS': 'CYS', 'ACL': 'ARG', 'GLX': 'GLX', 'GLY': 'GLY', 
  84      'GLZ': 'GLY', 'DTH': 'THR', 'OAS': 'SER', 'C6C': 'CYS', 'NEM': 'HIS', 
  85      'DLY': 'LYS', 'MIS': 'SER', 'SMC': 'CYS', 'GLU': 'GLU', 'NEP': 'HIS', 
  86      'BCS': 'CYS', 'ASQ': 'ASP', 'ASP': 'ASP', 'SCY': 'CYS', 'SER': 'SER', 
  87      'LYS': 'LYS', 'SAC': 'SER', 'PRO': 'PRO', 'ASX': 'ASX', 'DGN': 'GLN', 
  88      'DGL': 'GLU', 'MHS': 'HIS', 'ASB': 'ASP', 'ASA': 'ASP', 'NLE': 'LEU', 
  89      'DCY': 'CYS', 'ASK': 'ASP', 'GGL': 'GLU', 'STY': 'TYR', 'SEL': 'SER', 
  90      'CGU': 'GLU', 'ASN': 'ASN', 'ASL': 'ASP', 'LTR': 'TRP', 'DAR': 'ARG', 
  91      'VAL': 'VAL', 'CHG': 'ALA', 'TPO': 'THR', 'CLE': 'LEU', 'GMA': 'GLU', 
  92      'HAC': 'ALA', 'AYA': 'ALA', 'THR': 'THR', 'TIH': 'ALA', 'SVA': 'SER', 
  93      'MVA': 'VAL', 'SAR': 'GLY', 'LYZ': 'LYS', 'BNN': 'ALA', '5HP': 'GLU', 
  94      'IIL': 'ILE', 'SHR': 'LYS', 'HAR': 'ARG', 'FME': 'MET', 'ALO': 'THR', 
  95      'PHI': 'PHE', 'ALM': 'ALA', 'PHL': 'PHE', 'MEN': 'ASN', 'TPQ': 'ALA', 
  96      'GSC': 'GLY', 'PHE': 'PHE', 'ALA': 'ALA', 'MAA': 'ALA', 'MET': 'MET', 
  97      'UNK': 'UNK', 'LEU': 'LEU', 'ALY': 'LYS', 'SET': 'SER', 'GL3': 'GLY', 
  98      'TRG': 'LYS', 'CXM': 'MET', 'TYR': 'TYR', 'SCS': 'CYS', 'DIL': 'ILE', 
  99      'TYQ': 'TYR', '3AH': 'HIS', 'DPR': 'PRO', 'PRR': 'ALA', 'CME': 'CYS', 
 100      'IYR': 'TYR', 'CY1': 'CYS', 'TYY': 'TYR', 'HYP': 'PRO', 'DTY': 'TYR', 
 101      '2AS': 'ASP', 'DTR': 'TRP', 'FLA': 'ALA', 'DPN': 'PHE', 'DIV': 'VAL', 
 102      'PCA': 'GLU', 'MSE': 'MET', 'MSA': 'GLY', 'AIB': 'ALA', 'CYS': 'CYS', 
 103      'NLP': 'LEU', 'CYQ': 'CYS', 'HIS': 'HIS', 'DLE': 'LEU', 'CEA': 'CYS', 
 104      'DAL': 'ALA', 'LLP': 'LYS', 'DAH': 'PHE', 'HMR': 'ARG', 'TRO': 'TRP', 
 105      'HIC': 'HIS', 'CYG': 'CYS', 'BMT': 'THR', 'DAS': 'ASP', 'TYB': 'TYR', 
 106      'BUC': 'CYS', 'PEC': 'CYS', 'BUG': 'LEU', 'CYM': 'CYS', 'NLN': 'LEU', 
 107      'CY3': 'CYS', 'HIP': 'HIS', 'CSO': 'CYS', 'TPL': 'TRP', 'LYM': 'LYS', 
 108      'DHI': 'HIS', 'MLE': 'LEU', 'CSD': 'ALA', 'HPQ': 'PHE', 'MPQ': 'GLY', 
 109      'LLY': 'LYS', 'DHA': 'ALA', 'DSN': 'SER', 'SOC': 'CYS', 'CSX': 'CYS', 
 110      'OMT': 'MET', 'DSP': 'ASP', 'PTR': 'TYR', 'TRP': 'TRP', 'CSW': 'CYS', 
 111      'EFC': 'CYS', 'CSP': 'CYS', 'CSS': 'CYS', 'SCH': 'CYS', 'OCS': 'CYS', 
 112      'NMC': 'GLY', 'SEP': 'SER', 'BHD': 'ASP', 'KCX': 'LYS', 'SHC': 'CYS', 
 113      'C5C': 'CYS', 'HTR': 'TRP', 'ARG': 'ARG', 'TYS': 'TYR', 'ARM': 'ARG', 
 114      'DNP': 'ALA' 
 115      } 
 116  """ 
 117  Dictionary of non-standard amino acids, which could be found in PDB. 
 118  """ 
 119   
 120   
 121  PDB_NUCLEOTIDES = { 
 122      'DA': 'Adenine', 'DG': 'Guanine', 'DC': 'Cytosine', 'DT': 'Thymine', 
 123       'A': 'Adenine', 'G': 'Guanine', 'C': 'Cytosine', 'T': 'Thymine', 
 124       'U': 'Uracil', 'DOC': 'Cytosine', 'R': 'Purine', 'Y': 'Pyrimidine', 
 125       'K': 'Ketone', '  M': 'Amino', 'S': 'Strong', 'W': 'Weak', 
 126       'B': 'NotA', 'D'  : 'NotC', 'H': 'NotG', 'V': 'NotT', 
 127       'N': 'Any', 'X'  : 'Masked' 
 128      } 
 129  """ 
 130  Dictionary of non-standard nucleotides, which could be found in PDB. 
 131  """ 
132 133 -class PDBParseError(ValueError):
134 pass
135
136 -class HeaderFormatError(PDBParseError):
137 pass
138
139 -class SecStructureFormatError(PDBParseError):
140 pass
141
142 -class StructureFormatError(PDBParseError):
143 pass
144
145 -class UnknownPDBResidueError(PDBParseError):
146 pass
147
148 -class StructureNotFoundError(KeyError):
149 pass
150
151 -class InvalidEntryIDError(StructureFormatError):
152 pass
153
154 -class ResidueMappingError(StructureFormatError):
155 pass
156
157 158 -class EntryID(object):
159 """ 160 Represents a PDB Chain identifier. Implementing classes must define 161 how the original ID is split into accession number and chain ID. 162 163 @param id: identifier 164 @type id: str 165 """ 166 __metaclass__ = ABCMeta 167
168 - def __init__(self, id):
169 170 self._accession = '' 171 self._chain = '' 172 173 id = id.strip() 174 self._accession, self._chain = self.parse(id)
175 176 @staticmethod
177 - def create(id):
178 """ 179 Guess the format of C{id} and parse it. 180 181 @return: a new PDB ID of the appropriate type 182 @rtype: L{EntryID} 183 """ 184 185 if len(id) in (4, 5): 186 return StandardID(id) 187 elif len(id) == 6 and id[4] == '_': 188 return SeqResID(id) 189 else: 190 return DegenerateID(id)
191 192 @abstractmethod
193 - def parse(self, id):
194 """ 195 Split C{id} into accession number and chain ID. 196 197 @param id: PDB identifier 198 @type id: str 199 @return: (accession, chain) 200 @rtype: tuple of str 201 202 @raise InvalidEntryIDError: when C{id} is in an inappropriate format 203 """ 204 pass
205
206 - def format(self):
207 """ 208 @return: the identifier in its original format 209 @rtype: str 210 """ 211 return self.entry_id
212 213 @property
214 - def accession(self):
215 """ 216 Accession number part of the Entry ID 217 @rtype: str 218 """ 219 return self._accession
220 221 @property
222 - def chain(self):
223 """ 224 Chain ID part of the Entry ID 225 @rtype: str 226 """ 227 return self._chain
228 229 @property
230 - def entry_id(self):
231 """ 232 Accession number + Chain ID 233 @rtype: str 234 """ 235 return "{0.accession}{0.chain}".format(self)
236
237 - def __str__(self):
238 return self.entry_id
239
240 -class StandardID(EntryID):
241 """ 242 Standard PDB ID in the following form: xxxxY, where xxxx is the accession 243 number (lower case) and Y is an optional chain identifier. 244 """
245 - def parse(self, id):
246 247 if len(id) not in (4, 5): 248 raise InvalidEntryIDError(id) 249 250 return (id[:4].lower(), id[4:])
251
252 -class DegenerateID(EntryID):
253 """ 254 Looks like a L{StandardID}, except that the accession number may have 255 arbitrary length. 256 """
257 - def parse(self, id):
258 259 if len(id) < 2: 260 raise InvalidEntryIDError(id) 261 262 return (id[:-1].lower(), id[-1])
263
264 -class SeqResID(EntryID):
265 """ 266 Same as a L{StandardID}, but contains an additional underscore between 267 te accession number and the chain identifier. 268 """
269 - def parse(self, id):
270 271 if not (len(id) == 6 and id[4] == '_'): 272 raise InvalidEntryIDError(id) 273 274 return (id[:4].lower(), id[5:])
275
276 - def format(self):
277 return "{0.accession}_{0.chain}".format(self)
278
279 280 -class AbstractStructureParser(object):
281 """ 282 A base PDB structure format-aware parser. Subclasses must implement the 283 internal abstract method C{_parse_header} in order to complete the 284 implementation. 285 286 @param structure_file: the input PD file to parse 287 @type structure_file: str 288 @param check_ss: if True, secondary structure errors in the file will cause 289 L{SecStructureFormatError} exceptions 290 @type check_ss: bool 291 @param mapper: residue mapper, used to align ATOM records to SEQRES. 292 If None, use the default (L{CombinedResidueMapper}) 293 @type mapper: L{AbstractResidueMapper} 294 295 @raise IOError: when the input file cannot be found 296 """ 297 298 __metaclass__ = ABCMeta 299 300 @staticmethod
301 - def create_parser(structure_file, check_ss=False, mapper=None):
302 """ 303 A StructureParser factory, which instantiates and returns the proper parser 304 object based on the contents of the PDB file. 305 306 If the file contains a SEQRES section, L{RegularStructureParser} is returned, 307 otherwise L{LegacyStructureParser} is instantiated. In the latter case 308 LegacyStructureParser will read the sequence data directly from the ATOMs. 309 310 @param structure_file: the PDB file to parse 311 @type structure_file: str 312 @param check_ss: if True, secondary structure errors in the file will cause 313 L{SecStructureFormatError} exceptions 314 @type check_ss: bool 315 @param mapper: residue mapper, used to align ATOM records to SEQRES. 316 If None, use the default (L{CombinedResidueMapper}) 317 @type mapper: L{AbstractResidueMapper} 318 319 @rtype: L{AbstractStructureParser} 320 """ 321 has_seqres = False 322 323 for line in open(structure_file): 324 if line.startswith('SEQRES'): 325 has_seqres = True 326 break 327 328 if has_seqres: 329 return RegularStructureParser(structure_file, check_ss, mapper) 330 else: 331 return LegacyStructureParser(structure_file, check_ss, mapper)
332
333 - def __init__(self, structure_file, check_ss=False, mapper=None):
334 335 self._file = None 336 self._stream = None 337 self._mapper = CombinedResidueMapper() 338 self._check_ss = bool(check_ss) 339 340 self.filename = structure_file 341 if mapper is not None: 342 self.mapper = mapper
343
344 - def __del__(self):
345 try: 346 self._stream.close() 347 except: 348 pass
349 350 @property
351 - def mapper(self):
352 """ 353 Current residue mapping strategy 354 @rtype: L{AbstractResidueMapper} 355 """ 356 return self._mapper
357 @mapper.setter
358 - def mapper(self, value):
359 if not isinstance(value, AbstractResidueMapper): 360 raise TypeError(value) 361 self._mapper = value
362 363 @property
364 - def filename(self):
365 """ 366 Current input PDB file name 367 @rtype: str 368 """ 369 return self._file
370 @filename.setter
371 - def filename(self, name):
372 try: 373 stream = open(name) 374 except IOError: 375 raise IOError('File not found: {0}'.format(name)) 376 377 if self._stream: 378 try: 379 self._stream.close() 380 except: 381 pass 382 self._stream = stream 383 self._file = name
384
385 - def models(self):
386 """ 387 Find all available model identifiers in the structure. 388 389 @return: a list of model IDs 390 @rtype: list 391 """ 392 models = [] 393 check = set() 394 395 with open(self._file, 'r') as f: 396 for line in f: 397 if line.startswith('MODEL'): 398 model_id = int(line[10:14]) 399 if model_id in check: 400 raise StructureFormatError('Duplicate model identifier: {0}'.format(model_id)) 401 models.append(model_id) 402 check.add(model_id) 403 404 if len(models) > 0: 405 if not(min(check) == 1 and max(check) == len(models)): 406 raise StructureFormatError('Non-consecutive model identifier(s) encountered') 407 return models 408 else: 409 return []
410
411 - def guess_chain_type(self, residue_labels):
412 """ 413 Try to guess what is the sequence type of a chunk of PDB 414 C{residue_label}s. The list of labels is probed starting from the middle 415 first, because PDB chains often contain modified / unknown residues at 416 the termini. If none of the probed residues can be used to determine 417 chain's type, just give up and return L{SequenceTypes.Unknown}. 418 419 @param residue_labels: an iterable of PDB residue labels 420 @type residue_labels: iterable 421 422 @return: a L{SequenceTypes} enum member 423 @rtype: L{csb.core.EnumItem} 424 """ 425 426 labels = list(residue_labels) 427 middle = int(len(labels) / 2) 428 429 reordered = labels[middle:] + list(reversed(labels[:middle])) 430 431 for label in reordered: 432 try: 433 type = self.guess_sequence_type(label) 434 if type != SequenceTypes.Unknown: 435 return type 436 437 except UnknownPDBResidueError: 438 continue 439 440 return SequenceTypes.Unknown
441
442 - def guess_sequence_type(self, residue_label):
443 """ 444 Try to guess what is the sequence type of a PDB C{residue_label}. 445 446 @param residue_label: a PDB-conforming name of a residue 447 @type residue_label: str 448 449 @return: a L{SequenceTypes} enum member 450 @rtype: L{csb.core.EnumItem} 451 452 @raise UnknownPDBResidueError: when there is no such PDB residue name 453 in the catalog tables 454 """ 455 if residue_label in PDB_AMINOACIDS: 456 return SequenceTypes.Protein 457 elif residue_label in PDB_NUCLEOTIDES: 458 return SequenceTypes.NucleicAcid 459 else: 460 raise UnknownPDBResidueError(residue_label)
461
462 - def parse_residue(self, residue_label, as_type=None):
463 """ 464 Try to parse a PDB C{residue_label} and return its closest 'normal' 465 string representation. If a sequence type (C{as_type}) is defined, 466 guess the alphabet based on that information, otherwise try first to 467 parse it as a protein residue. 468 469 @param residue_label: a PDB-conforming name of a residue 470 @type residue_label: str 471 @param as_type: suggest a sequence type (L{SequenceTypes} member) 472 @type L{scb.core.EnumItem} 473 474 @return: a normalized residue name 475 @rtype: str 476 477 @raise UnknownPDBResidueError: when there is no such PDB residue name 478 in the catalog table(s) 479 """ 480 if as_type is None: 481 as_type = self.guess_sequence_type(residue_label) 482 483 try: 484 if as_type == SequenceTypes.Protein: 485 return PDB_AMINOACIDS[residue_label] 486 elif as_type == SequenceTypes.NucleicAcid: 487 return PDB_NUCLEOTIDES[residue_label] 488 else: 489 raise UnknownPDBResidueError(residue_label) 490 except KeyError: 491 raise UnknownPDBResidueError(residue_label)
492
493 - def parse_residue_safe(self, residue_label, as_type):
494 """ 495 Same as C{parse_residue}, but returns UNK/Any instead of raising 496 UnknownPDBResidueError. 497 498 @param residue_label: a PDB-conforming name of a residue 499 @type residue_label: str 500 @param as_type: suggest a sequence type (L{SequenceTypes} member) 501 @type L{scb.core.EnumItem} 502 503 @return: a normalized residue name 504 @rtype: str 505 """ 506 try: 507 return self.parse_residue(residue_label, as_type) 508 except UnknownPDBResidueError: 509 if as_type == SequenceTypes.Protein: 510 return repr(SequenceAlphabets.Protein.UNK) 511 elif as_type == SequenceTypes.NucleicAcid: 512 return repr(SequenceAlphabets.Nucleic.Any) 513 else: 514 return repr(SequenceAlphabets.Unknown.UNK)
515
516 - def parse(self, filename=None, model=None):
517 518 if filename: 519 self.filename = filename 520 return self.parse_structure(model)
521
522 - def parse_structure(self, model=None):
523 """ 524 Parse and return the L{Structure} with the specified model identifier. 525 If no explicit model is specified, parse the first model in the 526 structure. 527 528 @param model: parse exactly the model with this ID 529 @type model: str 530 531 @return: object representation of the selected model 532 @rtype: L{Structure} 533 534 @raise ValueError: When an invalid model ID is specified 535 @raise PDBParseError: When the input PDB file suffers from unrecoverable 536 corruption. More specialized exceptions will be 537 raised depending on the context (see L{PDBParseError}'s 538 subclasses). 539 """ 540 if model is not None: 541 model = int(model) 542 543 try: 544 structure = self._parse_header(model) 545 except PDBParseError: 546 raise 547 except ValueError as ex: 548 raise HeaderFormatError("Malformed header: {0}".format(ex)) 549 550 self._parse_atoms(structure, model) 551 self._parse_ss(structure) 552 553 return structure
554
555 - def parse_models(self, models=()):
556 """ 557 Parse the specified models in the file and build an L{Ensemble}. 558 559 @param models: an iterable object providing model identifiers. 560 If not specified, all models will be parsed. 561 @type models: tuple 562 563 @return: an ensemble with all parsed models 564 @rtype: L{Ensemble} 565 """ 566 567 if not models: 568 models = self.models() 569 else: 570 models = list(map(int, models)) 571 572 ensemble = csb.bio.structure.Ensemble() 573 574 if len(models) > 0: 575 for model_id in models: 576 model = self.parse_structure(model_id) 577 ensemble.models.append(model) 578 else: 579 model = self.parse_structure() 580 model.model_id = 1 581 ensemble.models.append(model) 582 583 return ensemble
584
585 - def parse_biomolecule(self, number=1, single=False):
586 """ 587 Parse and return the L{Structure} of the biological unit (quaternary 588 structure) as annotated by the REMARK 350 BIOMOLECULE record. 589 590 @param number: biomolecule number 591 @type number: int 592 593 @param single: if True, assign new single-letter chain 594 identifiers. If False, assign multi-letter chain identifiers whith a 595 number appended to the original identifier, like "A1", "A2", ... 596 @type single: bool 597 598 @return: structure of biological unit 599 @rtype: L{Structure} 600 """ 601 remarks = self._parse_remarks() 602 if 350 not in remarks: 603 raise PDBParseError('There is no REMARK 350') 604 605 current = 1 606 biomt = {current: {}} 607 chains = tuple() 608 609 def split(line): 610 return [c.strip() for c in line.split(',') if c.strip() != '']
611 612 for line in remarks[350]: 613 if line.startswith('BIOMOLECULE:'): 614 current = int(line[12:]) 615 biomt[current] = {} 616 elif line.startswith('APPLY THE FOLLOWING TO CHAINS:'): 617 chains = tuple(split(line[30:])) 618 elif line.startswith(' AND CHAINS:'): 619 chains += tuple(split(line[30:])) 620 elif line.startswith(' BIOMT'): 621 num = int(line[8:12]) 622 vec = line[12:].split() 623 vec = list(map(float, vec)) 624 biomt[current].setdefault(chains, dict()).setdefault(num, []).extend(vec) 625 626 if number not in biomt or len(biomt[number]) == 0: 627 raise KeyError('no BIOMOLECULE number {0}'.format(number)) 628 629 asu = self.parse_structure() 630 structure = csb.bio.structure.Structure('{0}_{1}'.format(asu.accession, number)) 631 632 newchainiditer = iter('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789') 633 634 for chains, matrices in biomt[number].items(): 635 for num in matrices: 636 mat = numpy.array(matrices[num][0:12]).reshape((3,4)) 637 R, t = mat[:3,:3], mat[:3,3] 638 639 for chain in chains: 640 if chain not in asu: 641 raise PDBParseError('chain {0} missing'.format(chain)) 642 copy = asu[chain].clone() 643 copy.transform(R, t) 644 if single: 645 if len(structure.chains) == 62: 646 raise ValueError('too many chains for single=True') 647 copy.id = next(newchainiditer) 648 else: 649 copy.id = '{0}{1}'.format(chain, num) 650 structure.chains.append(copy) 651 652 return structure
653 654 @abstractmethod
655 - def _parse_header(self, model):
656 """ 657 An abstract method, which subclasses must use to customize the way the 658 PDB header (or the absence of a such) is handled. The implementation 659 must rely on reading character data from the current internal 660 self._stream and must return a new L{csb.bio.structure.Structure} 661 instance with properly initialized header data: accession, model, 662 molecule identifiers, chains and residues. This structure object is 663 then internally passed to the C{_parse_atoms} hook, responsible for 664 attachment of the atoms to the residues in the structure. 665 666 @param model: model ID to parse 667 @type model: str 668 669 @rtype: L{Structure} 670 """ 671 pass
672
673 - def _scroll_model(self, model, stream):
674 """ 675 Scroll the C{stream} to the specified C{model}. 676 """ 677 678 while True: 679 try: 680 line = next(stream) 681 except StopIteration: 682 raise ValueError('No such model {0} in the structure.'.format(model)) 683 684 if line.startswith('MODEL'): 685 model_id = self._read_model(line) 686 if model == model_id: 687 return model_id
688
689 - def _parse_atoms(self, structure, model):
690 """ 691 Parse the ATOMs from the specified C{model} and attach them to the 692 C{structure}. 693 694 @param structure: L{Structure} being constructed 695 @type structure:L{Structure} 696 @param model: model ID to parse 697 @type model: str 698 """ 699 700 structure.model_id = None 701 702 chains = set() 703 total_chains = len([c for c in structure.items if c.length > 0]) 704 705 residues = dict( (chain, []) for chain in structure.chains ) 706 seen_residues = dict( (chain, {}) for chain in structure.chains ) 707 708 in_ligands = False 709 in_atom = False 710 read_model = False 711 712 self._stream.seek(0) 713 while True: 714 715 try: 716 line = next(self._stream) 717 except StopIteration: 718 break 719 720 if line.startswith('MODEL'): 721 if read_model: 722 break 723 else: 724 self._parse_model_line(line, structure, model) 725 model = structure.model_id 726 read_model = True 727 728 elif line.startswith('ATOM') or \ 729 (line.startswith('HETATM') and not in_ligands): 730 in_atom = True 731 732 info = self._parse_atom_line(line, structure) 733 chains.add(info.chain) 734 735 if info.id not in seen_residues[info.chain]: 736 residues[info.chain].append(info) 737 seen_residues[info.chain][info.id] = info 738 else: 739 atom = info.atoms[0] 740 seen_residues[info.chain][info.id].atoms.append(atom) 741 742 elif in_atom and line.startswith('TER'): 743 in_atom = False 744 if len(chains) == total_chains: 745 in_ligands = True 746 747 elif line.startswith('ENDMDL'): 748 break 749 750 elif line.startswith('END'): 751 break 752 753 if structure.model_id != model: 754 raise ValueError('No such model {0} in the structure.'.format(model)) 755 756 self._map_residues(structure, residues)
757
758 - def _parse_model_line(self, line, structure, model):
759 """ 760 Handle a new MODEL line. The default implementation will read the model 761 ID and attach it to the C{structure}. 762 763 @param line: raw string line to parse 764 @type line: str 765 @param structure: L{Structure} being constructed 766 @type structure:L{Structure} 767 768 @note: this method may have side effects: scrolls the current stream 769 770 @return: read model ID 771 @rtype: int 772 """ 773 if model and model != self._read_model(line): 774 self._scroll_model(model, self._stream) 775 structure.model_id = model 776 else: 777 model = self._read_model(line) 778 structure.model_id = model 779 780 return model
781
782 - def _parse_atom_line(self, line, structure):
783 """ 784 Handle a new ATOM or HETATM line. The default implementation will read 785 all data fields and create a new L{Atom}. 786 787 @param line: raw string line to parse 788 @type line: str 789 @param structure: L{Structure} being constructed 790 @type structure:L{Structure} 791 792 @return: newly constructed atom 793 @rtype: L{ResidueInfo} 794 """ 795 796 atom = self._read_atom(line) 797 798 rank = self._read_sequence_number(line) 799 sequence_number = rank 800 insertion_code = self._read_insertion_code(line) 801 802 id = str(sequence_number) 803 if insertion_code: 804 id += insertion_code 805 806 chain = self._read_chain_id(line) 807 if chain not in structure.chains: 808 raise StructureFormatError("Chain {0} is undefined".format(chain)) 809 810 type = self._read_residue(line, structure.chains[chain]) 811 label = self._read_residue_raw(line) 812 813 atom.alternate = self._read_alternate(line) 814 atom.occupancy = self._read_occupancy(line) 815 atom.bfactor = self._read_bfactor(line) 816 atom.charge = self._read_charge(line) 817 818 info = ResidueInfo(chain, rank, id, sequence_number, insertion_code, type, label) 819 info.atoms = [atom] 820 821 return info
822
823 - def _read_model(self, line):
824 """ 825 @return: model identifier 826 @rtype: int 827 """ 828 try: 829 return int(line[10:14]) 830 except ValueError: 831 raise StructureFormatError("Invalid model ID: {0}".format(line))
832
833 - def _read_atom(self, line):
834 """ 835 @return: a new atom (serial_number, name, element, vector) 836 @rtype: L{Atom} 837 """ 838 try: 839 serial_number = int(line[6:11]) 840 name = line[12:16] 841 x, y, z = line[30:38], line[38:46], line[46:54] 842 vector = numpy.array([float(x), float(y), float(z)]) 843 except ValueError as ve: 844 raise StructureFormatError("Invalid ATOM line: {0}".format(ve)) 845 846 element = self._read_element(line) 847 return csb.bio.structure.Atom(serial_number, name, element, vector)
848
849 - def _read_sequence_number(self, line):
850 """ 851 @return: PDB sequence number 852 @rtype: int 853 """ 854 try: 855 return int(line[22:26]) 856 except ValueError: 857 raise StructureFormatError("Invalid sequence number")
858
859 - def _read_insertion_code(self, line):
860 """ 861 @return: PDB insertion code 862 @rtype: str or None 863 """ 864 code = line[26].strip() 865 866 if code: 867 return code 868 else: 869 return None
870
871 - def _read_chain_id(self, line):
872 """ 873 @return: chain identifier 874 @rtype: str 875 """ 876 return line[21].strip()
877
878 - def _read_residue(self, line, chain):
879 """ 880 @param chain: owning L{Chain} object 881 @type chain: L{Chain} 882 883 @return: a member of any alphabet (e.g. L{SequenceAlphabets.Protein}) 884 @rtype: L{EnumItem} 885 """ 886 raw = self._read_residue_raw(line) 887 residue = self.parse_residue_safe(raw, as_type=chain.type) 888 889 try: 890 if chain.type == SequenceTypes.NucleicAcid: 891 return csb.core.Enum.parsename(SequenceAlphabets.Nucleic, residue) 892 else: 893 return csb.core.Enum.parsename(SequenceAlphabets.Protein, residue) 894 except csb.core.EnumMemberError: 895 raise StructureFormatError("{0} is not a valid {1} residue".format(raw, chain.type))
896
897 - def _read_residue_raw(self, line):
898 """ 899 @rtype: str 900 """ 901 return line[17:20].strip()
902
903 - def _read_element(self, line):
904 """ 905 @return: a member of L{ChemElements} 906 @rtype: L{EnumItem} or None 907 """ 908 element = line[76:78].strip() 909 if element: 910 try: 911 element = csb.core.Enum.parsename(ChemElements, element) 912 except csb.core.EnumMemberError: 913 if element in ('D', 'X'): 914 element = ChemElements.x 915 else: 916 raise StructureFormatError('Unknown chemical element: {0}'.format(element)) 917 else: 918 element = None 919 920 return element
921
922 - def _read_alternate(self, line):
923 """ 924 @return: alt identifier 925 @rtype: str or None 926 """ 927 alternate = line[16].strip() 928 929 if not alternate: 930 return None 931 else: 932 return alternate
933
934 - def _read_occupancy(self, line):
935 """ 936 @return: occupancy 937 @rtype: float or None 938 """ 939 try: 940 return float(line[54:60].strip() or 0) 941 except ValueError: 942 raise StructureFormatError("Malformed occupancy field")
943
944 - def _read_bfactor(self, line):
945 """ 946 @return: b-factor 947 @rtype: float or None 948 """ 949 try: 950 return float(line[60:66].strip() or 0) 951 except ValueError: 952 raise StructureFormatError("Malformed bfactor field")
953
954 - def _read_charge(self, line):
955 """ 956 @return: charge 957 @rtype: int or None 958 """ 959 charge = line[78:80].strip() 960 961 if charge: 962 if charge in ('+', '-'): 963 charge += '1' 964 if charge[-1] in ('+', '-'): 965 charge = charge[::-1] 966 try: 967 return int(charge) 968 except ValueError: 969 raise StructureFormatError("Malformed charge field") 970 else: 971 return None
972
973 - def _map_residues(self, structure, residues):
974 """ 975 Attach each L{Atom} to its corresponding L{Residue}. 976 977 So far we have constructed a sparse (fragmented) chain given the information 978 we have read from the ATOM/HETATM records. That includes PDB sequence 979 identifiers and insertion codes, which cover only residues with XYZ coordinates 980 and often do not correspond to our L{Residue} ranks. 981 Our job is to find the right correspondence by matching this unreal sequence 982 to what we have got from the SEQRES fields (the complete, gap-free protein 983 sequence). Here we delegate this task to the current L{AbstractResidueMapper} 984 strategy, which does all the magic. 985 986 @param structure: L{Structure} being constructed 987 @type structure:L{Structure} 988 @param residues: all L{Atom}s which have been constructed so far. 989 This must be a map of the form: 990 <chainID: [L{ResidueInfo}1, L{ResidueInfo}2...]> 991 @type residues: dict of L{ResidueInfo} 992 """ 993 994 if set(structure.chains) != set(residues.keys()): 995 raise PDBParseError("Corrupt PDB file") 996 997 for chain in structure.items: 998 if chain.length == 0 or len(residues[chain.id]) == 0: 999 continue 1000 1001 reference = SparseChainSequence.create(chain) 1002 sparse = SparseChainSequence( 1003 "atoms", "", residues[chain.id], chain.type) 1004 1005 aligned = self.mapper.map(sparse, reference) 1006 assert aligned.length == chain.length 1007 1008 for residue, mapped in zip(chain.residues, aligned.residues): 1009 if mapped.type != sparse.alphabet.GAP: 1010 residue.id = (mapped.sequence_number, mapped.insertion_code) 1011 1012 for atom in mapped.atoms: 1013 residue.atoms.append(atom)
1014
1015 - def _parse_ss(self, structure):
1016 """ 1017 Parse and attach secondary structure data. 1018 1019 @bug: Currently the PDB helix types are ignored. Each HELIX line is treated 1020 as a regular SecStructures.Helix. This is due to incompatibility 1021 between DSSP and PDB helix types. 1022 @todo: Implement a proper workaround for the previous bug (e.g. skip all 1023 helices types not included in the DSSP enum) 1024 1025 @warning: In this implementation only the start/end positions of the SS 1026 elements are parsed. Additional data like H-bonding is ignored. 1027 1028 @bug: Currently structure.to_pdb() is not writing any SS data. 1029 """ 1030 elements = {} 1031 self._stream.seek(0) 1032 1033 while True: 1034 try: 1035 line = next(self._stream) 1036 except StopIteration: 1037 break 1038 1039 if line.startswith('HELIX'): 1040 1041 chain = structure.chains[line[19].strip()] 1042 if chain.id not in elements: 1043 elements[chain.id] = [] 1044 if chain.id != line[31].strip(): 1045 if self._check_ss: 1046 raise SecStructureFormatError('Helix {0} spans multiple chains'.format(line[7:10])) 1047 else: 1048 continue 1049 try: 1050 startres = chain.find(line[21:25].strip(), line[25].strip()) 1051 endres = chain.find(line[33:37].strip(), line[37].strip()) 1052 except csb.core.ItemNotFoundError as ex: 1053 if self._check_ss: 1054 raise SecStructureFormatError( 1055 'Helix {0} refers to an undefined residue ID: {1}'.format(line[7:10], str(ex))) 1056 else: 1057 continue 1058 if not startres.rank <= endres.rank: 1059 if self._check_ss: 1060 raise SecStructureFormatError('Helix {0} is out of range'.format(line[7:10])) 1061 else: 1062 continue 1063 helix = csb.bio.structure.SecondaryStructureElement(startres.rank, endres.rank, SecStructures.Helix) 1064 elements[chain.id].append(helix) 1065 1066 if line.startswith('SHEET'): 1067 1068 chain = structure.chains[line[21].strip()] 1069 if chain.id not in elements: 1070 elements[chain.id] = [] 1071 if chain.id != line[32].strip(): 1072 if self._check_ss: 1073 raise SecStructureFormatError('Sheet {0} spans multiple chains'.format(line[7:10])) 1074 else: 1075 continue 1076 try: 1077 startres = chain.find(line[22:26].strip(), line[26].strip()) 1078 endres = chain.find(line[33:37].strip(), line[37].strip()) 1079 except csb.core.ItemNotFoundError as ex: 1080 if self._check_ss: 1081 raise SecStructureFormatError( 1082 'Sheet {0} refers to an undefined residue ID: {1}'.format(line[7:10], str(ex))) 1083 else: 1084 continue 1085 if not startres.rank <= endres.rank: 1086 if self._check_ss: 1087 raise SecStructureFormatError('Sheet {0} is out of range'.format(line[7:10])) 1088 else: 1089 continue 1090 strand = csb.bio.structure.SecondaryStructureElement(startres.rank, endres.rank, SecStructures.Strand) 1091 elements[chain.id].append(strand) 1092 1093 elif line.startswith('MODEL') or line.startswith('ATOM'): 1094 break 1095 1096 for chain_id in elements: 1097 ss = csb.bio.structure.SecondaryStructure() 1098 for e in elements[chain_id]: 1099 ss.append(e) 1100 structure.chains[chain_id].secondary_structure = ss
1101
1102 - def _parse_remarks(self):
1103 """ 1104 Read REMARK lines from PDB file. 1105 1106 @return: dictionary with remark numbers as keys, and lists of lines as values. 1107 @rtype: dict 1108 """ 1109 self._stream.seek(0) 1110 1111 remarks = {} 1112 1113 for line in self._stream: 1114 if line.startswith('REMARK'): 1115 num = int(line[7:10]) 1116 lstring = line[11:] 1117 remarks.setdefault(num, []).append(lstring) 1118 elif line.startswith('DBREF') or line.startswith('ATOM'): 1119 break 1120 1121 return remarks
1122
1123 1124 -class RegularStructureParser(AbstractStructureParser):
1125 """ 1126 This is the de facto PDB parser, which is designed to read SEQRES and ATOM 1127 sections separately, and them map them. Intentionally fails to parse 1128 malformed PDB files, e.g. a PDB file without a HEADER section. 1129 """ 1130
1131 - def _parse_header(self, model):
1132 """ 1133 Parse the HEADER section of a regular PDB file. 1134 1135 @return: a L{csb.bio.structure.Structure} instance with properly 1136 initialized residues from the SEQRES. 1137 @rtype: L{csb.bio.structure.Structure} 1138 1139 @raise PDBParseError: if the stream has no HEADER at byte 0 1140 """ 1141 1142 self._stream.seek(0) 1143 1144 header = next(self._stream) 1145 if not header.startswith('HEADER'): 1146 raise PDBParseError('Does not look like a regular PDB file.') 1147 1148 structure = csb.bio.structure.Structure(header.split()[-1]) 1149 1150 while True: 1151 1152 try: 1153 line = next(self._stream) 1154 except StopIteration: 1155 break 1156 1157 if line.startswith('COMPND'): 1158 if line[10:].lstrip().startswith('MOL_ID:'): 1159 mol_id = int(line[18:].replace(';', '').strip()) 1160 chain_name = '' 1161 chains = '' 1162 while line.startswith('COMPND'): 1163 line = next(self._stream) 1164 if line.split()[2].startswith('MOLECULE:'): 1165 chain_name += line[20:].strip() 1166 while not chain_name.endswith(';'): 1167 line = next(self._stream) 1168 if not line.startswith('COMPND'): 1169 break 1170 chain_name += ' ' + line[11:].strip() 1171 else: 1172 while not line.split()[2].startswith('CHAIN:'): 1173 line = next(self._stream) 1174 if not line.startswith('COMPND'): 1175 raise HeaderFormatError('Missing chain identifier in COMPND section') 1176 chains = line[17:].strip() 1177 while not chains.endswith(';'): 1178 line = next(self._stream) 1179 if not line.startswith('COMPND'): 1180 break 1181 chains += ', ' + line[11:].strip() 1182 break 1183 1184 chain_ids = chains.replace(';', ' ').replace(',', ' ').split() or [''] # the second part deals with an empty chain id 1185 self._add_chains(structure, chain_name, mol_id, *chain_ids) 1186 1187 elif line.startswith('REMARK 2 RESOLUTION'): 1188 structure.resolution = self._read_resolution(line) 1189 1190 elif line.startswith('SEQRES'): 1191 chain_id, residues = self._parse_seqres_line(line, structure) 1192 chain = structure.chains[chain_id] 1193 1194 for residue in residues: 1195 chain.residues.append(residue) 1196 if chain.residues.last_index != residue.rank: 1197 raise HeaderFormatError("Malformed SEQRES") 1198 1199 elif line.startswith('MODEL') or line.startswith('ATOM'): 1200 break 1201 1202 return structure
1203
1204 - def _add_chains(self, structure, name, mol_id, *chain_ids):
1205 1206 name = name.strip().rstrip(";") 1207 1208 for chain in chain_ids: 1209 new_chain = csb.bio.structure.Chain(chain, type=SequenceTypes.Unknown, 1210 name=name, accession=structure.accession) 1211 new_chain.molecule_id = mol_id 1212 try: 1213 structure.chains.append(new_chain) 1214 except csb.bio.structure.DuplicateChainIDError: 1215 raise HeaderFormatError('Chain {0} is already defined.'.format(new_chain.id))
1216
1217 - def _read_resolution(self, line):
1218 """ 1219 @return: resolution 1220 @rtype: float or None 1221 """ 1222 res = re.search("(\d+(?:\.\d+)?)\s+ANGSTROM", line) 1223 1224 if res and res.groups(): 1225 return float(res.group(1)) 1226 else: 1227 return None
1228
1229 - def _parse_seqres_line(self, line, structure):
1230 """ 1231 Parse a SEQRES line, build and return newly constructed residues. 1232 If the current sequence type of the chain is unknown, try to guess it 1233 before parsing the residues. 1234 1235 @return: parsed chain_id and L{Residue}s 1236 @rtype: 2-tuple: (str, iterable of L{Residue}) 1237 """ 1238 residues = [] 1239 1240 rownum = int(line[7:10]) 1241 chain_id = line[11].strip() 1242 labels = line[18:].split() 1243 1244 if chain_id not in structure.chains: 1245 raise HeaderFormatError('Chain {0} is undefined'.format(chain_id)) 1246 1247 chain = structure.chains[chain_id] 1248 1249 if chain.type == SequenceTypes.Unknown: 1250 chain.type = self.guess_chain_type(labels) 1251 1252 for rn, label in enumerate(labels): 1253 rank = rownum * 13 - (13 - (rn + 1)) 1254 rtype = self.parse_residue_safe(label, as_type=chain.type) 1255 1256 residue = csb.bio.structure.Residue.create(chain.type, rank=rank, type=rtype) 1257 residue.label = label 1258 residues.append(residue) 1259 1260 return chain_id, residues
1261
1262 1263 -class PDBHeaderParser(RegularStructureParser):
1264 """ 1265 Ultra fast PDB HEADER parser. Does not read any structural data. 1266 """ 1267
1268 - def _parse_atoms(self, structure, model):
1269 pass
1270
1271 - def _parse_ss(self, structure):
1272 pass
1273
1274 - def _parse_header(self, model):
1275 return super(PDBHeaderParser, self)._parse_header(model)
1276
1277 1278 -class LegacyStructureParser(AbstractStructureParser):
1279 """ 1280 This is a customized PDB parser, which is designed to read both sequence and 1281 atom data from the ATOM section. This is especially useful when parsing PDB 1282 files without a header. 1283 """ 1284
1285 - def _parse_header(self, model):
1286 """ 1287 Initialize a structure with residues from the ATOMs section. 1288 1289 @param model: model identifier (e.g. if multiple models exist) 1290 @type model: str 1291 1292 @return: a L{csb.bio.structure.Structure} instance with properly 1293 initialized residues from ATOMs under the specified C{model}. 1294 @rtype: L{csb.bio.structure.Structure} 1295 """ 1296 1297 self._stream.seek(0) 1298 in_atom = False 1299 has_atoms = False 1300 has_model = False 1301 chains = csb.core.OrderedDict() 1302 1303 header = next(self._stream) 1304 if header.startswith('HEADER'): 1305 structure = csb.bio.structure.Structure(header.split()[-1]) 1306 else: 1307 self._stream.seek(0) 1308 structure = csb.bio.structure.Structure('NONE') 1309 1310 structure.model_id = None 1311 1312 while True: 1313 1314 try: 1315 line = next(self._stream) 1316 except StopIteration: 1317 break 1318 1319 if line.startswith('MODEL'): 1320 if has_model: 1321 break 1322 else: 1323 self._parse_model_line(line, structure, model) 1324 model = structure.model_id 1325 has_model = True 1326 1327 elif line.startswith('ATOM') \ 1328 or (in_atom and line.startswith('HETATM')): 1329 in_atom = True 1330 has_atoms = True 1331 1332 seq_number = self._read_sequence_number(line) 1333 ins_code = self._read_insertion_code(line) 1334 residue_id = (seq_number, ins_code) 1335 label = self._read_residue_raw(line) 1336 chain_id = self._read_chain_id(line) 1337 1338 if chain_id not in chains: 1339 chains[chain_id] = csb.core.OrderedDict() 1340 self._add_chain(structure, chain_id) 1341 1342 if residue_id not in chains[chain_id]: 1343 chains[chain_id][residue_id] = label 1344 chain = structure.chains[chain_id] 1345 if chain.type == SequenceTypes.Unknown: 1346 self._fix_chain(chain, label) 1347 1348 elif in_atom and line.startswith('TER'): 1349 in_atom = False 1350 elif line.startswith('ENDMDL'): 1351 break 1352 elif line.startswith('END'): 1353 break 1354 1355 if not has_atoms: 1356 raise HeaderFormatError("Can't parse legacy structure: no ATOMs found") 1357 1358 for chain in structure.items: 1359 self._build_chain(chain, chains[chain.id]) 1360 1361 return structure
1362
1363 - def _add_chain(self, structure, chain_id):
1364 1365 new_chain = csb.bio.structure.Chain(chain_id, 1366 type=SequenceTypes.Unknown, 1367 accession=structure.accession) 1368 new_chain.molecule_id = '1' 1369 structure.chains.append(new_chain)
1370
1371 - def _build_chain(self, chain, residues):
1372 1373 for residue_id, label in residues.items(): 1374 rank = (chain.residues.last_index or 0) + 1 1375 1376 rname = self.parse_residue_safe(label, as_type=chain.type) 1377 residue = csb.bio.structure.Residue.create(chain.type, rank=rank, type=rname) 1378 residue.label = label 1379 residue.id = residue_id 1380 chain.residues.append(residue)
1381
1382 - def _fix_chain(self, chain, probe):
1383 1384 try: 1385 chain.type = self.guess_sequence_type(probe) 1386 except UnknownPDBResidueError: 1387 pass
1388
1389 - def _map_residues(self, structure, residues):
1390 1391 for chain in structure.items: 1392 for residue_info in residues[chain.id]: 1393 try: 1394 residue = chain.find(residue_info.sequence_number, residue_info.insertion_code) 1395 for atom in residue_info.atoms: 1396 residue.atoms.append(atom) 1397 1398 except csb.bio.structure.EntityNotFoundError: 1399 pass
1400 1401 1402 StructureParser = AbstractStructureParser.create_parser 1403 """ 1404 Alias for L{AbstractStructureParser.create_parser}. 1405 """
1406 1407 1408 -class ResidueInfo(object):
1409 """ 1410 High-performance struct, which functions as a container for unmapped 1411 L{Atom}s. 1412 1413 @note: This object must implement the L{csb.bio.sequence.ResidueInfo} 1414 interface. This is not enforced through inheritance solely 1415 to save some CPU (by exposing public fields and no properties). 1416 However, on an abstract level this object is_a ResidueInfo 1417 and is used to build L{AbstractSequence}s. 1418 """ 1419 __slots__ = ['chain', 'rank', 'id' , 'sequence_number', 'insertion_code', 'type', 'label', 'atoms'] 1420
1421 - def __init__(self, chain, rank, id, seq_number, ins_code, type, label):
1422 1423 self.chain = chain 1424 self.rank = rank 1425 self.id = id 1426 self.sequence_number = seq_number 1427 self.insertion_code = ins_code 1428 self.type = type 1429 self.label = label 1430 self.atoms = []
1431 1432 @property
1433 - def is_modified(self):
1434 1435 if self.type.enum is SequenceAlphabets.Nucleic: 1436 return self.label != str(self.type) 1437 else: 1438 return self.label != repr(self.type)
1439
1440 -class SparseChainSequence(csb.bio.sequence.ChainSequence):
1441 """ 1442 Sequence view for reference (SEQRES) or sparse (ATOM) PDB chains. 1443 The residue instances passed to the constructor must be 1444 L{csb.bio.structure.Residue} or L{csb.bio.io.wwpdb.ResidueInfo} objects. 1445 1446 See L{csb.bio.sequence.AbstractSequence} for details. 1447 """ 1448
1449 - def _add(self, residue):
1450 1451 if not isinstance(residue, (csb.bio.structure.Residue, ResidueInfo)): 1452 raise TypeError(residue) 1453 else: 1454 self._residues.append(residue)
1455
1456 - def _get(self, rank):
1457 return self._residues[rank - 1]
1458 1459 @staticmethod
1460 - def create(chain):
1461 """ 1462 Create a new L{SparseChainSequence} from existing L{Chain}. 1463 1464 @type chain: L{csb.bio.structure.Chain} 1465 @rtype: L{SparseChainSequence} 1466 """ 1467 return SparseChainSequence( 1468 chain.entry_id, chain.header, chain.residues, chain.type)
1469
1470 1471 -class AbstractResidueMapper(object):
1472 """ 1473 Defines the base interface of all residue mappers, used to align PDB ATOM 1474 records to the real (SEQRES) sequence of a chain. 1475 """ 1476 1477 __metaclass__ = ABCMeta 1478 1479 @abstractmethod
1480 - def map(self, sparse, reference):
1481 """ 1482 Map L{sparse}'s residues to L{reference}. Return all C{sparse} residues, 1483 aligned over C{reference}, with artificial gap residues inserted at 1484 relevant positions. The resulting sequence of sparse residues will 1485 always have the same length as the C{reference} sequence. 1486 1487 @note: C{sparse}'s ranks won't be touched because the C{rank} property 1488 of the underlying L{ResidueInfo} implementation is not necessarily r/w. 1489 1490 @param sparse: sparse sequence (e.g. derived from ATOMS records) 1491 @type sparse: L{SparseChainSequence} 1492 @param reference: reference, complete sequence 1493 (e.g. derived from SEQRES records) 1494 @type reference: L{SparseChainSequence} 1495 1496 @return: all C{sparse} residues, optimally aligned over C{reference} 1497 (with gaps) 1498 @rtype: L{SparseChainSequence} 1499 1500 @raise ResidueMappingError: if the specified sequences are not alignable 1501 """ 1502 pass
1503
1504 - def create_gap(self, alphabet=SequenceAlphabets.Protein):
1505 """ 1506 Create and return a new gap residue. 1507 1508 @param alphabet: sequence alphabet; a member of L{SequenceAlphabets} 1509 which has GAP item 1510 @type alphabet: L{enum} 1511 1512 @rtype: L{ResidueInfo} 1513 """ 1514 return ResidueInfo(None, -1, None, None, None, alphabet.GAP, "-")
1515
1516 - def _build(self, sparse, aligned):
1517 1518 return SparseChainSequence( 1519 sparse.id, sparse.header, aligned, sparse.type)
1520
1521 -class FastResidueMapper(AbstractResidueMapper):
1522 """ 1523 RegExp-based residue mapper. Fails on heavily malformed input (i.e. it cannot 1524 insert gaps in the C{reference}), but it is very fast (linear) and memory 1525 efficient. 1526 """ 1527 1528 MAX_FRAGMENTS = 20 1529 1530 MIN_UNICODE_CHAR = 300 1531 FORBIDDEN_CHARS = set('^.*?()-') 1532 1533 CODEC = "utf-8" 1534 DELIMITER = ").*?(".encode(CODEC).decode(CODEC) 1535 PATTERN = "^.*?({0}).*?$".encode(CODEC).decode(CODEC) 1536
1537 - def __init__(self):
1538 self._charcode = FastResidueMapper.MIN_UNICODE_CHAR 1539 self._cache = {}
1540
1541 - def map(self, sparse, reference):
1542 1543 aligned = [] 1544 mapping = {} 1545 1546 residues = list(sparse.residues) 1547 1548 pattern = self._build_pattern(residues) 1549 seqres = self._encode_sequence(reference) 1550 1551 matches = re.match(pattern, seqres) 1552 1553 if matches: 1554 unmapped_item = -1 1555 1556 for fn, fragment in enumerate(matches.groups(), start=1): 1557 assert fragment != '' 1558 1559 for offset in range(1, len(fragment) + 1): 1560 unmapped_item += 1 1561 rank = matches.start(fn) + offset 1562 1563 mapped_residue = residues[unmapped_item] 1564 real_residue = reference.residues[rank] 1565 1566 assert real_residue.type == mapped_residue.type 1567 mapping[real_residue] = mapped_residue 1568 1569 else: 1570 raise ResidueMappingError("Can't map ATOM records") 1571 1572 for rank, residue in enumerate(reference.residues, start=1): 1573 if residue in mapping: 1574 aligned.append(mapping[residue]) 1575 else: 1576 aligned.append(self.create_gap(sparse.alphabet)) 1577 1578 assert len(aligned) == reference.length 1579 return self._build(sparse, aligned)
1580
1581 - def _build_pattern(self, residues):
1582 """ 1583 Build and return a sparse regular rexpression for C{residues}. 1584 """ 1585 1586 fragments = [] 1587 1588 for rn, r in enumerate(residues): 1589 res_name = self._encode(r) 1590 1591 if rn == 0: 1592 # First residue, start a new fragment: 1593 fragments.append([res_name]) 1594 elif r.insertion_code: # and not residues[rn - 1].insertion_code: 1595 # If residue i has an insertion code, initiate a new fragment: 1596 fragments.append([res_name]) 1597 elif r.sequence_number - residues[rn - 1].sequence_number in (0, 1, -1): 1598 # If the seq numbers of residues [i-1, i] are consecutive, extend the last fragment: 1599 fragments[-1].append(res_name) 1600 else: 1601 # They are not consecutive, so we better start a new fragment: 1602 fragments.append([res_name]) 1603 1604 for i, frag in enumerate(fragments): 1605 fragments[i] = ''.join(frag) 1606 if len(fragments) > FastResidueMapper.MAX_FRAGMENTS: 1607 # Wow, that's a lot of fragments. Better use a different mapper 1608 raise ResidueMappingError("Can't map chain with large number of fragments") 1609 1610 blocks = FastResidueMapper.DELIMITER.join(fragments) 1611 pattern = FastResidueMapper.PATTERN.format(blocks) 1612 1613 return pattern
1614
1615 - def _encode(self, r):
1616 """ 1617 Return a unique single-letter representation of C{r.type}. 1618 """ 1619 if not r.is_modified: 1620 return str(r.type) 1621 else: 1622 return self._register_label(r.label)
1623
1624 - def _encode_sequence(self, s):
1625 return ''.join(map(self._encode, s.residues))
1626
1627 - def _register_label(self, label):
1628 """ 1629 Assign a new unicode character to C{label} and cache it. 1630 1631 @return: cached single-letter representation of label. 1632 @rtype: unicode char 1633 """ 1634 1635 if label not in self._cache: 1636 if set(label).intersection(FastResidueMapper.FORBIDDEN_CHARS): 1637 raise ResidueMappingError("Invalid residue label") 1638 1639 self._charcode += 1 1640 code = self._charcode 1641 self._cache[label] = csb.io.unichr(code) 1642 1643 return self._cache[label]
1644
1645 1646 -class RobustResidueMapper(AbstractResidueMapper):
1647 """ 1648 Exhaustive residue mapper, which uses Needleman-Wunsch global alignment. 1649 Much slower (quadratic), but fail-proof even with incompatible sequences 1650 (can insert gaps in both the C{sparse} and the C{reference} sequence). 1651 1652 @param match: score for a match 1653 @type match: float 1654 @param mismatch: score for a mismatch (by default mismatches are heavily 1655 penalized, while gaps are allowed) 1656 @type mismatch: float 1657 @param gap: gap penalty 1658 @type gap: float 1659 """ 1660
1661 - class GlobalAligner(alignment.GlobalAlignmentAlgorithm):
1662
1663 - def _sequence(self, s):
1664 return [r.label for r in s.residues]
1665 1666
1667 - def __init__(self, match=1, mismatch=-10, gap=0):
1668 1669 scoring = alignment.IdentityMatrix(match=match, mismatch=mismatch) 1670 aligner = RobustResidueMapper.GlobalAligner(scoring=scoring, gap=gap) 1671 1672 self._aligner = aligner
1673
1674 - def map(self, sparse, reference):
1675 1676 aligned = [] 1677 ali = self._aligner.align(sparse, reference) 1678 1679 if ali.is_empty: 1680 raise ResidueMappingError("Global alignment failed") 1681 1682 for mapped, residue in zip(ali.query, ali.subject): 1683 1684 if residue.type == reference.alphabet.GAP: 1685 continue 1686 elif mapped.type == sparse.alphabet.GAP: 1687 aligned.append(self.create_gap(sparse.alphabet)) 1688 else: 1689 aligned.append(mapped) 1690 1691 return self._build(sparse, aligned)
1692
1693 -class CombinedResidueMapper(AbstractResidueMapper):
1694 """ 1695 The best of both worlds: attempts to map the residues using 1696 L{FastResidueMapper}, but upon failure secures success by switching to 1697 L{RobustResidueMapper}. 1698 """ 1699 FAST = FastResidueMapper() 1700 ROBUST = RobustResidueMapper() 1701
1702 - def map(self, sparse, reference):
1703 1704 try: 1705 return CombinedResidueMapper.FAST.map(sparse, reference) 1706 except ResidueMappingError: 1707 return CombinedResidueMapper.ROBUST.map(sparse, reference)
1708
1709 1710 -class FileBuilder(object):
1711 """ 1712 Base abstract files for all structure file formatters. 1713 Defines a common step-wise interface according to the Builder pattern. 1714 1715 @param output: output stream (this is where the product is constructed) 1716 @type output: stream 1717 """ 1718 1719 __metaclass__ = ABCMeta 1720
1721 - def __init__(self, output):
1722 1723 if not hasattr(output, 'write'): 1724 raise TypeError(output) 1725 1726 def isnull(this, that, null=None): 1727 if this is null: 1728 return that 1729 else: 1730 return this
1731 1732 self._out = output 1733 self._isnull = isnull
1734 1735 @property
1736 - def output(self):
1737 """ 1738 Destination stream 1739 @rtype: stream 1740 """ 1741 return self._out
1742 1743 @property
1744 - def isnull(self):
1745 """ 1746 ISNULL(X, Y) function 1747 @rtype: callable 1748 """ 1749 return self._isnull
1750
1751 - def write(self, text):
1752 """ 1753 Write a chunk of text 1754 """ 1755 self._out.write(text)
1756
1757 - def writeline(self, text):
1758 """ 1759 Write a chunk of text and append a new line terminator 1760 """ 1761 self._out.write(text) 1762 self._out.write('\n')
1763 1764 @abstractmethod
1765 - def add_header(self, master_structure):
1766 pass
1767 1768 @abstractmethod
1769 - def add_structure(self, structure):
1770 pass
1771
1772 - def finalize(self):
1773 pass
1774
1775 -class PDBFileBuilder(FileBuilder):
1776 """ 1777 PDB file format builder. 1778 """ 1779
1780 - def writeline(self, text):
1781 self.write('{0:80}\n'.format(text))
1782
1783 - def add_header(self, master):
1784 """ 1785 Write the HEADER of the file using C{master} 1786 1787 @type master: L{Structure} 1788 """ 1789 1790 isnull = self.isnull 1791 1792 header = 'HEADER {0:40}{1:%d-%b-%y} {2:4}' 1793 self.writeline(header.format('.', datetime.datetime.now(), master.accession.upper())) 1794 1795 molecules = { } 1796 1797 for chain_id in master.chains: 1798 chain = master.chains[chain_id] 1799 if chain.molecule_id not in molecules: 1800 molecules[chain.molecule_id] = [ ] 1801 molecules[chain.molecule_id].append(chain_id) 1802 1803 k = 0 1804 for mol_id in sorted(molecules): 1805 1806 chains = molecules[mol_id] 1807 first_chain = master.chains[ chains[0] ] 1808 1809 self.writeline('COMPND {0:3} MOL_ID: {1};'.format(k + 1, isnull(mol_id, '0'))) 1810 self.writeline('COMPND {0:3} MOLECULE: {1};'.format(k + 2, isnull(first_chain.name, ''))) 1811 self.writeline('COMPND {0:3} CHAIN: {1};'.format(k + 3, ', '.join(chains))) 1812 k += 3 1813 1814 for chain_id in master.chains: 1815 1816 chain = master.chains[chain_id] 1817 res = [ r.label for r in chain.residues ] 1818 1819 rn = 0 1820 for j in range(0, chain.length, 13): 1821 rn += 1 1822 residues = [ '{0:>3}'.format(r) for r in res[j : j + 13] ] 1823 self.writeline('SEQRES {0:>3} {1} {2:>4} {3}'.format( 1824 rn, chain.id, chain.length, ' '.join(residues) ))
1825
1826 - def add_structure(self, structure):
1827 """ 1828 Append a new model to the file 1829 1830 @type structure: L{Structure} 1831 """ 1832 1833 isnull = self.isnull 1834 1835 for chain_id in structure.chains: 1836 1837 chain = structure.chains[chain_id] 1838 for residue in chain.residues: 1839 1840 atoms = [ ] 1841 for an in residue.atoms: 1842 atom = residue.atoms[an] 1843 if isinstance(atom, csb.bio.structure.DisorderedAtom): 1844 for dis_atom in atom: atoms.append(dis_atom) 1845 else: 1846 atoms.append(atom) 1847 atoms.sort() 1848 1849 for atom in atoms: 1850 1851 alt = atom.alternate 1852 if alt is True: 1853 alt = 'A' 1854 elif alt is False: 1855 alt = ' ' 1856 1857 if atom.element: 1858 element = repr(atom.element) 1859 else: 1860 element = ' ' 1861 self.writeline('ATOM {0:>5} {1:>4}{2}{3:>3} {4}{5:>4}{6} {7:>8.3f}{8:>8.3f}{9:>8.3f}{10:>6.2f}{11:>6.2f}{12:>12}{13:2}'.format( 1862 atom.serial_number, atom._full_name, isnull(alt, ' '), 1863 residue.label, chain.id, 1864 isnull(residue.sequence_number, residue.rank), isnull(residue.insertion_code, ' '), 1865 atom.vector[0], atom.vector[1], atom.vector[2], isnull(atom.occupancy, 0.0), isnull(atom.bfactor, 0.0), 1866 element, isnull(atom.charge, ' ') )) 1867 1868 self.writeline('TER')
1869
1870 - def finalize(self):
1871 """ 1872 Add the END marker 1873 """ 1874 self.writeline('END') 1875 self._out.flush()
1876
1877 -class PDBEnsembleFileBuilder(PDBFileBuilder):
1878 """ 1879 Supports serialization of NMR ensembles. 1880 1881 Functions as a simple decorator, which wraps C{add_structure} with 1882 MODEL/ENDMDL records. 1883 """ 1884
1885 - def add_structure(self, structure):
1886 1887 model_id = self.isnull(structure.model_id, 1) 1888 1889 self.writeline('MODEL {0:>4}'.format(model_id)) 1890 super(PDBEnsembleFileBuilder, self).add_structure(structure) 1891 self.writeline('ENDMDL')
1892
1893 1894 -class StructureProvider(object):
1895 """ 1896 Base class for all PDB data source providers. 1897 1898 Concrete classes need to implement the C{find} method, which abstracts the 1899 retrieval of a PDB structure file by a structure identifier. This is a hook 1900 method called internally by C{get}, but subclasses can safely override both 1901 C{find} and {get} to in order to achieve completely custom behavior. 1902 """ 1903 1904 __metaclass__ = ABCMeta 1905
1906 - def __getitem__(self, id):
1907 return self.get(id)
1908 1909 @abstractmethod
1910 - def find(self, id):
1911 """ 1912 Attempt to discover a PDB file, given a specific PDB C{id}. 1913 1914 @param id: structure identifier (e.g. 1x80) 1915 @type id: str 1916 @return: path and file name on success, None otherwise 1917 @rtype: str or None 1918 """ 1919 pass
1920
1921 - def get(self, id, model=None):
1922 """ 1923 Discover, parse and return the PDB structure, corresponding to the 1924 specified C{id}. 1925 1926 @param id: structure identifier (e.g. 1x80) 1927 @type id: str 1928 @param model: optional model identifier 1929 @type model: str 1930 1931 @rtype: L{csb.bio.Structure} 1932 1933 @raise StructureNotFoundError: when C{id} could not be found 1934 """ 1935 pdb = self.find(id) 1936 1937 if pdb is None: 1938 raise StructureNotFoundError(id) 1939 else: 1940 return StructureParser(pdb).parse_structure(model=model)
1941
1942 -class FileSystemStructureProvider(StructureProvider):
1943 """ 1944 Simple file system based PDB data source. Scans a list of local directories 1945 using pre-defined file name templates. 1946 1947 @param paths: a list of paths 1948 @type paths: iterable or str 1949 """ 1950
1951 - def __init__(self, paths=None):
1952 1953 self._templates = ['pdb{id}.ent', 'pdb{id}.pdb', '{id}.pdb', '{id}.ent'] 1954 self._paths = csb.core.OrderedDict() 1955 1956 if paths is not None: 1957 if isinstance(paths, csb.core.string): 1958 paths = [paths] 1959 1960 for path in paths: 1961 self.add(path)
1962 1963 @property
1964 - def paths(self):
1965 """ 1966 Current search paths 1967 @rtype: tuple 1968 """ 1969 return tuple(self._paths)
1970 1971 @property
1972 - def templates(self):
1973 """ 1974 Current file name match templates 1975 @rtype: tuple 1976 """ 1977 return tuple(self._templates)
1978
1979 - def add(self, path):
1980 """ 1981 Register a new local C{path}. 1982 1983 @param path: directory name 1984 @type path: str 1985 1986 @raise IOError: if C{path} is not a valid directory 1987 """ 1988 if os.path.isdir(path): 1989 self._paths[path] = path 1990 else: 1991 raise IOError(path)
1992
1993 - def add_template(self, template):
1994 """ 1995 Register a custom file name name C{template}. The template must contain 1996 an E{lb}idE{rb} macro, e.g. pdbE{lb}idE{rb}.ent 1997 1998 @param template: pattern 1999 @type template: str 2000 """ 2001 2002 if '{id}' not in template: 2003 raise ValueError('Template does not contain an "{id}" macro') 2004 2005 if template not in self._templates: 2006 self._templates.append(template)
2007
2008 - def remove(self, path):
2009 """ 2010 Unregister an existing local C{path}. 2011 2012 @param path: directory name 2013 @type path: str 2014 2015 @raise ValueError: if C{path} had not been registered 2016 """ 2017 if path not in self._paths: 2018 raise ValueError('path not found: {0}'.format(path)) 2019 2020 del self._paths[path]
2021
2022 - def find(self, id):
2023 2024 for path in self._paths: 2025 for token in self.templates: 2026 fn = os.path.join(path, token.format(id=id)) 2027 if os.path.exists(fn): 2028 return fn 2029 2030 return None
2031
2032 -class RemoteStructureProvider(StructureProvider):
2033 """ 2034 Retrieves PDB structures from a specified remote URL. 2035 The URL requested from remote server takes the form: <prefix>/<ID><suffix> 2036 2037 @param prefix: URL prefix, including protocol 2038 @type prefix: str 2039 @param suffix: optional URL suffix (.ent by default) 2040 @type suffix: str 2041 """ 2042
2043 - def __init__(self, prefix='https://files.rcsb.org/download/', suffix='.pdb'):
2044 2045 self._prefix = None 2046 self._suffix = None 2047 2048 self.prefix = prefix 2049 self.suffix = suffix
2050 2051 @property
2052 - def prefix(self):
2053 """ 2054 Current URL prefix 2055 @rtype: str 2056 """ 2057 return self._prefix
2058 @prefix.setter
2059 - def prefix(self, value):
2060 self._prefix = value
2061 2062 @property
2063 - def suffix(self):
2064 """ 2065 Current URL suffix 2066 @rtype: str 2067 """ 2068 return self._suffix
2069 @suffix.setter
2070 - def suffix(self, value):
2071 self._suffix = value
2072
2073 - def _find(self, id):
2074 2075 try: 2076 return csb.io.urllib.urlopen(self.prefix + id + self.suffix) 2077 except Exception as e: 2078 raise StructureNotFoundError(id + ": " + str(e))
2079
2080 - def find(self, id):
2081 2082 stream = self._find(id) 2083 2084 try: 2085 tmp = csb.io.TempFile(dispose=False) 2086 tmp.write(stream.read().decode('utf-8')) 2087 tmp.flush() 2088 return tmp.name 2089 2090 except StructureNotFoundError: 2091 return None 2092 finally: 2093 stream.close()
2094
2095 - def get(self, id, model=None):
2096 2097 stream = self._find(id) 2098 2099 try: 2100 with csb.io.TempFile() as tmp: 2101 tmp.write(stream.read().decode('utf-8')) 2102 tmp.flush() 2103 return StructureParser(tmp.name).parse_structure(model=model) 2104 finally: 2105 stream.close()
2106
2107 -class CustomStructureProvider(StructureProvider):
2108 """ 2109 A custom PDB data source. Functions as a user-defined map of structure 2110 identifiers and their corresponding local file names. 2111 2112 @param files: initialization dictionary of id:file pairs 2113 @type files: dict-like 2114 """ 2115
2116 - def __init__(self, files={}):
2117 2118 self._files = {} 2119 for id in files: 2120 self.add(id, files[id])
2121 2122 @property
2123 - def paths(self):
2124 """ 2125 List of currently registered file names 2126 @rtype: tuple 2127 """ 2128 return tuple(self._files.values())
2129 2130 @property
2131 - def identifiers(self):
2132 """ 2133 List of currently registered structure identifiers 2134 @rtype: tuple 2135 """ 2136 return tuple(self._files)
2137
2138 - def add(self, id, path):
2139 """ 2140 Register a new local C{id}:C{path} pair. 2141 2142 @param id: structure identifier 2143 @type id: str 2144 @param path: path and file name 2145 @type path: str 2146 2147 @raise IOError: if C{path} is not a valid file name 2148 """ 2149 if os.path.isfile(path): 2150 self._files[id] = path 2151 else: 2152 raise IOError(path)
2153
2154 - def remove(self, id):
2155 """ 2156 Unregister an existing structure C{id}. 2157 2158 @param id: structure identifier 2159 @type id: str 2160 2161 @raise ValueError: if C{id} had not been registered 2162 """ 2163 if id not in self._files: 2164 raise ValueError(id) 2165 else: 2166 del self._files[id]
2167
2168 - def find(self, id):
2169 2170 if id in self._files: 2171 return self._files[id] 2172 else: 2173 return None
2174
2175 -def get(accession, model=None, prefix='https://files.rcsb.org/download/'):
2176 """ 2177 Download and parse a PDB entry. 2178 2179 @param accession: accession number of the entry 2180 @type accession: str 2181 @param model: model identifier 2182 @type model: str 2183 @param prefix: download URL prefix 2184 @type prefix: str 2185 2186 @return: object representation of the selected model 2187 @rtype: L{Structure} 2188 """ 2189 return RemoteStructureProvider(prefix).get(accession, model=model)
2190
2191 -def find(id, paths):
2192 """ 2193 Try to discover a PDB file for PDB C{id} in C{paths}. 2194 2195 @param id: PDB ID of the entry 2196 @type id: str 2197 @param paths: a list of directories to scan 2198 @type paths: list of str 2199 2200 @return: path and file name on success, None otherwise 2201 @rtype: str 2202 """ 2203 return FileSystemStructureProvider(paths).find(id)
2204
2205 2206 -class AsyncParseResult(object):
2207
2208 - def __init__(self, result, exception):
2209 2210 self.result = result 2211 self.exception = exception
2212
2213 - def __repr__(self):
2214 return '<AsyncParseResult: result={0.result}, error={0.exception.__class__.__name__}>'.format(self)
2215
2216 2217 -def _parse_async(parser, file, model):
2218 p = parser(file) 2219 return p.parse_structure(model)
2220
2221 2222 -class AsyncStructureParser(object):
2223 """ 2224 Wraps StructureParser in an asynchronous call. Since a new process is 2225 started by Python internally (as opposed to only starting a new thread), 2226 this makes the parser slower, but provides a way to set a parse timeout 2227 limit. 2228 2229 If initialized with more than one worker, supports parallel parsing 2230 through the C{self.parse_async} method. 2231 2232 @param workers: number of worker threads (1 by default) 2233 @type workers: int 2234 """ 2235
2236 - def __init__(self, workers=1):
2237 2238 self._pool = None 2239 self._workers = 1 2240 2241 if int(workers) > 0: 2242 self._workers = int(workers) 2243 else: 2244 raise ValueError(workers) 2245 2246 self._recycle()
2247
2248 - def _recycle(self):
2249 2250 if self._pool: 2251 self._pool.terminate() 2252 2253 self._pool = multiprocessing.Pool(processes=self._workers)
2254
2255 - def parse_structure(self, structure_file, timeout, model=None, 2256 parser=RegularStructureParser):
2257 """ 2258 Call StructureParser.parse_structure() in a separate process and return 2259 the output. Raise TimeoutError if the parser does not respond within 2260 C{timeout} seconds. 2261 2262 @param structure_file: structure file to parse 2263 @type structure_file: str 2264 @param timeout: raise multiprocessing.TimeoutError if C{timeout} seconds 2265 elapse before the parser completes its job 2266 @type timeout: int 2267 @param parser: any implementing L{AbstractStructureParser} class 2268 @type parser: type 2269 2270 @return: parsed structure 2271 @rtype: L{csb.structure.Structure} 2272 """ 2273 2274 r = self.parse_async([structure_file], timeout, model, parser) 2275 if len(r) > 0: 2276 if r[0].exception is not None: 2277 raise r[0].exception 2278 else: 2279 return r[0].result 2280 return None
2281
2282 - def parse_async(self, structure_files, timeout, model=None, 2283 parser=RegularStructureParser):
2284 """ 2285 Call C{self.parse_structure} for a list of structure files 2286 simultaneously. The actual degree of parallelism will depend on the 2287 number of workers specified while constructing the parser object. 2288 2289 @note: Don't be tempted to pass a large list of structures to this 2290 method. Every time a C{TimeoutError} is encountered, the 2291 corresponding worker process in the pool will hang until the 2292 process terminates on its own. During that time, this worker is 2293 unusable. If a sufficiently high number of timeouts occur, the 2294 whole pool of workers will be unsable. At the end of the method 2295 however a pool cleanup is performed and any unusable workers 2296 are 'reactivated'. However, that only happens at B{the end} of 2297 C{parse_async}. 2298 2299 @param structure_files: a list of structure files 2300 @type structure_files: tuple of str 2301 @param timeout: raise multiprocessing.TimeoutError if C{timeout} seconds 2302 elapse before the parser completes its job 2303 @type timeout: int 2304 @param parser: any implementing L{AbstractStructureParser} class 2305 @type parser: type 2306 2307 @return: a list of L{AsyncParseResult} objects 2308 @rtype: list 2309 """ 2310 2311 pool = self._pool 2312 workers = [] 2313 results = [] 2314 2315 for file in list(structure_files): 2316 result = pool.apply_async(_parse_async, [parser, file, model]) 2317 workers.append(result) 2318 2319 hanging = False 2320 for w in workers: 2321 result = AsyncParseResult(None, None) 2322 try: 2323 result.result = w.get(timeout=timeout) 2324 except KeyboardInterrupt as ki: 2325 pool.terminate() 2326 raise ki 2327 except Exception as ex: 2328 result.exception = ex 2329 if isinstance(ex, multiprocessing.TimeoutError): 2330 hanging = True 2331 results.append(result) 2332 2333 if hanging: 2334 self._recycle() 2335 2336 return results
2337