
Source Code for Module csb.bio.io.hhpred

  1  """ 
  2  HHpred-related format parsers. 
  3   
  4  This module defines two HHpred format parsers: L{HHProfileParser} and 
  5  L{HHOutputParser}. The first one is used to read HMM (*.hhm) files: 
  6   
  7      >>> HHProfileParser("profile.hhm", "profile.pdb").parse() 
  8      <ProfileHMM>            # ProfileHMM object 
  9       
 10  while the latter parses HHsearch results files (*.hhr): 
 11   
 12      >>> HHOutputParser().parse_file("hits.hhr"): 
 13      <HHpredHitList>        # collection of HHpredHit-s 
 14       
 15  See L{ProfileHMM}, L{HHpredHitList} and L{HHpredHit} from L{csb.bio.hmm} 
 16  for details. For text serialization of HMM profiles, see L{HHMFileBuilder} 
 17  in this module. 
 18  """ 
 19   
 20  import os 
 21  import re 
 22   
 23  import csb.io 
 24  import csb.bio.io 
 25  import csb.bio.structure as structure 
 26   
 27  from csb.core import Enum, CollectionIndexError, ItemNotFoundError 
 28  from csb.bio.sequence import Sequence, ProteinAlphabet, A3MAlignment 
 29   
 30  from csb.bio.hmm import State, Transition, ProfileHMM, HMMLayer, ProfileLength 
 31  from csb.bio.hmm import HHpredHitList, HHpredHit, ScoreUnits, States, EVDParameters 
 32  from csb.bio.hmm import StateNotFoundError, TransitionNotFoundError 


class HHProfileFormatError(ValueError):
    pass

class StructureFormatError(HHProfileFormatError):
    pass

class HHOutputFormatError(HHProfileFormatError):
    pass


class HHProfileParser(object):
    """
    A class that understands the HHpred HMM format.
    Produces a L{ProfileHMM} object representation of a given HHpred profile
    HMM.

    @param hhm_file: *.hhm file name to parse
    @type hhm_file: str
    @param pdb_file: an optional HHpred-formatted structure file containing
                     the structural data associated with the HMM
    @type pdb_file: str
    @note: this is NOT a regular PDB file! It must be specifically formatted
           for use with HHpred.

    @raise IOError: if any of the files does not exist
    """

    def __init__(self, hhm_file, pdb_file=None):

        if not os.path.exists(hhm_file):
            raise IOError("Could not read HHM file {0}".format(hhm_file))

        if pdb_file:
            if not os.path.exists(pdb_file):
                raise IOError("Could not read structure file {0}".format(pdb_file))

        self._file = hhm_file
        self._pdb = pdb_file
        self._properties = None
        self._sequences = None
        self._profile = None

        self._chopped = False
        self._chop()

    def format_structure(self, input_pdb, chain_id, output_file):
        """
        Format input_pdb as an HHpred pre-parsed structure file for the current
        HMM. Formatting means: only chain_id is kept in the PDB file, and the
        residue sequence numbers correspond strictly to the HMM states' ranks.

        @param input_pdb: source PDB file name
        @type input_pdb: str
        @param chain_id: source chain ID (which chain to pick from the
                         structure)
        @type chain_id: str
        @param output_file: save the output PDB file here
        @type output_file: str

        @raise StructureFormatError: when the specified PDB chain does not
                                     correspond to the HMM.
        """

        hmm = self.parse()
        s = csb.bio.io.StructureParser(input_pdb).parse_structure()
        chain = s.chains[chain_id]

        if chain.length != hmm.layers.length:
            raise StructureFormatError(
                "{0}: Incorrect number of residues".format(chain.entry_id))

        for layer, residue in zip(hmm.layers, chain.residues):

            if residue.type == ProteinAlphabet.UNK:
                residue.type = layer.residue.type

            if residue.type != layer.residue.type:
                msg = "Expected residue of type {0} at position {1}"
                raise StructureFormatError(msg.format(layer.residue.type, layer.rank))

            residue.id = str(residue.rank), '$'

        for residue in chain.residues:
            residue.id = str(residue.rank), None

        new_structure = structure.Structure(hmm.id)
        new_structure.chains.append(chain.clone())

        new_structure.to_pdb(output_file)

    def parse(self, units=ScoreUnits.LogScales):
        """
        Parse the HMM profile.

        @param units: also convert the profile score units to the specified
                      L{csb.bio.hmm.ScoreUnits} kind
        @type units: L{csb.core.EnumItem}

        @return: a L{ProfileHMM} instance
        @rtype: L{ProfileHMM}

        @raise HHProfileFormatError: when the hhm file is invalid/corrupt
        """
        assert self._chopped

        hmm = ProfileHMM(units=units, scale=-1000.0, logbase=2)
        if self._profile:
            self._parse_profile(hmm, units)
        else:
            raise HHProfileFormatError('Missing HMM profile table.')

        if self._properties:
            self._parse_properties(hmm)
        else:
            raise HHProfileFormatError('No profile properties found.')

        if self._sequences:
            self._parse_sequences(hmm)
        else:
            raise HHProfileFormatError('No profile MSA and secondary structure found.')

        if hmm.dssp:
            hmm._assign_secstructure()

        if self._pdb:
            self._parse_structure(hmm)

        return hmm

    def _chop(self):
        """
        Chop the HHM file into pieces - HMM properties, secondary structure +
        MSA, HMM.
        """

        try:
            with open(self._file, 'r') as f:
                pr = csb.io.EntryReader(f, 'HHsearch', 'SEQ')
                self._properties = pr.readall()[0].splitlines()

            with open(self._file, 'r') as f:
                sr = csb.io.EntryReader(f, '>', '#')
                self._sequences = sr.readall()

            with open(self._file, 'r') as f:
                hr = csb.io.EntryReader(f, '#', '//')
                self._profile = hr.readall()[0].splitlines()

            self._chopped = True

        except IndexError:
            raise HHProfileFormatError('Corrupt HHM file.')

    def _parse_properties(self, hmm):
        """
        Parse the profile properties.

        @param hmm: the hmm object being constructed
        @type hmm: L{ProfileHMM}
        @return: the updated hmm
        @rtype: hmm

        @raise NotImplementedError: if an unexpected data field is encountered
        """
        assert self._chopped

        for line in self._properties:

            if line.startswith('NAME '):
                hmm.name = line[6:].strip()

            elif line.startswith('FAM '):
                hmm.family = line[6:].strip()

            elif line.startswith('LENG '):
                m = re.search('(\d+)\D+(\d+)', line).groups()
                m = tuple(map(int, m))
                hmm.length = ProfileLength(m[0], m[1])

            elif line.startswith('FILE '):
                hmm.id = line[6:].strip()

            elif line.startswith('HHsearch'):
                hmm.version = float(line[9:])

            elif line.startswith('NEFF '):
                hmm.effective_matches = float(line[6:])

            elif line.startswith('PCT '):
                if line[6:].strip().lower() == 'true':
                    hmm.pseudocounts = True

            elif line.startswith('EVD '):
                lamda, mu = map(float, line[6:].split())
                hmm.evd = EVDParameters(lamda, mu)

            elif line[:5] in ('COM  ', 'DATE ', 'FILT '):
                pass
            else:
                raise NotImplementedError("Unexpected line: {0}.".format(line[0:5]))

        if not hmm.id and hmm.name:
            hmm.id = hmm.name.split()[0]
        return hmm

    def _parse_sequences(self, hmm):
        """
        Parse secondary structure and multiple alignment sections.

        @param hmm: the hmm object being constructed
        @type hmm: L{ProfileHMM}
        @return: the updated hmm
        @rtype: hmm
        """
        assert self._chopped

        psipred = None
        msa_entries = []

        for entry in self._sequences:

            header_token = entry[:8]
            if header_token in ['>ss_dssp', '>sa_dssp', '>ss_pred', '>ss_conf', '>Consens']:

                lines = entry.strip().splitlines()
                seq = re.sub('\s+', '', ''.join(lines[1:]))

                if header_token == '>ss_dssp':
                    hmm.dssp = structure.SecondaryStructure(seq)
                elif header_token == '>sa_dssp':
                    hmm.dssp_solvent = seq
                elif header_token == '>ss_pred':
                    psipred = seq
                elif header_token == '>ss_conf':
                    conf = seq
                    hmm.psipred = structure.SecondaryStructure(psipred, conf)
                elif header_token == '>Consens':
                    hmm.consensus = Sequence('Consensus', 'Consensus', seq)
            else:
                msa_entries.append(entry)

        if msa_entries:
            msa = '\n'.join(msa_entries)
            hmm.alignment = A3MAlignment.parse(msa, strict=False)

        return hmm

    def _parse_profile(self, hmm, units=ScoreUnits.LogScales):
        """
        Parse the HMM profile.

        @param hmm: the hmm object being constructed
        @type hmm: L{ProfileHMM}
        @return: the updated hmm
        @rtype: L{ProfileHMM}

        @raise NotImplementedError: when an unknown transition string is
                                    encountered
        """
        assert self._chopped

        # 0. Prepare start and end states
        hmm.start = State(States.Start)
        hmm.end = State(States.End)

        residues = None
        background = {}
        tran_types = None
        tran_lines = []
        start_probs = None

        lines = iter(self._profile)
        pattern = re.compile('^[A-Z\-]\s[0-9]+\s+')

        if units == ScoreUnits.LogScales:

            def parse_probability(v):
                if v.strip() == '*':
                    return None
                else:
                    return float(v)
        else:

            def parse_probability(v):
                if v.strip() == '*':
                    return None
                else:
                    return hmm._convert(units, float(v),
                                        hmm.scale, hmm.logbase)

        # 1. Create all layers (profile columns), create and attach their match states

        while True:
            try:
                line = next(lines)
            except StopIteration:
                break

            if line.startswith('NULL'):
                try:
                    backprobs = tuple(map(parse_probability, line.split()[1:]))

                    line = next(lines)
                    residues = line.split()[1:]
                    residues = [Enum.parse(ProteinAlphabet, aa) for aa in residues]

                    for pos, aa in enumerate(residues):
                        background[aa] = backprobs[pos]

                    line = next(lines)
                    tran_types = line.split()

                    line = next(lines)
                    start_probs = list(map(parse_probability, line.split()))
                except StopIteration:
                    break

            elif re.match(pattern, line):
                emrow = line
                try:
                    tran_lines.append(next(lines))
                    #junkrow = next(lines)
                except StopIteration:
                    break

                emprobs = emrow.split()
                if len(emprobs) != 23:
                    raise HHProfileFormatError(
                        "Unexpected number of data fields: {0}".format(len(emprobs)))

                rank = int(emprobs[1])
                residue = structure.ProteinResidue(
                    rank=rank, type=emprobs[0], sequence_number=rank, insertion_code=None)
                if residue.type == ProteinAlphabet.GAP:
                    raise HHProfileFormatError("Layer {0} can't be represented by a gap".format(rank))

                new_layer = hmm.layers.append(HMMLayer(rank, residue))
                if new_layer != rank:
                    raise HHProfileFormatError('Layer {0} defined as {1}'.format(new_layer, rank))

                match = State(States.Match, emit=Enum.members(ProteinAlphabet))

                match.rank = rank
                match.background.set(background)

                for col, aa in enumerate(residues):
                    prob = parse_probability(emprobs[col + 2])
                    match.emission.append(aa, prob)

                hmm.layers[new_layer].append(match)
                assert hmm.layers.last_index == match.rank

        # 2. Append starting transitions: S -> M[1] and optionally S -> D[1] and S -> I[0].
        #    States D[1] and I[0] will be created if needed.
        #    Note that [0] is not a real layer; I[0] is simply an insertion at the level of Start
        if len(hmm.layers) > 0:

            first_match = hmm.layers[hmm.layers.start_index]

            if start_probs[0] is None:
                raise HHProfileFormatError("Transition Start > Match[1] is undefined")

            start_tran = Transition(hmm.start, first_match[States.Match], start_probs[0])
            hmm.start.transitions.append(start_tran)

            if start_probs[1] is not None and start_probs[3] is not None:  # Start -> I[0] -> M[1]
                start_ins = State(States.Insertion, emit=Enum.members(ProteinAlphabet))
                start_ins.rank = 0
                start_ins.background.set(background)
                start_ins.emission = start_ins.background

                hmm.start_insertion = start_ins
                # Start -> I[0]
                hmm.start.transitions.append(
                    Transition(hmm.start, hmm.start_insertion, start_probs[1]))
                # I[0] -> M[1]
                hmm.start_insertion.transitions.append(
                    Transition(hmm.start_insertion, first_match[States.Match], start_probs[3]))
                # I[0] -> I[0]
                if start_probs[4]:
                    hmm.start_insertion.transitions.append(
                        Transition(hmm.start_insertion, hmm.start_insertion, start_probs[4]))

            if start_probs[2] is None and start_probs[6] is not None:
                # M->D is corrupt (*) at the Start layer, using D->D instead
                start_probs[2] = start_probs[6]

            if start_probs[2] is not None:  # Start -> D[1]
                start_del = State(States.Deletion)
                start_del.rank = 1
                hmm.layers[1].append(start_del)
                start_tran = Transition(hmm.start, first_match[States.Deletion], start_probs[2])
                hmm.start.transitions.append(start_tran)
        else:
            start_tran = Transition(hmm.start, hmm.end, start_probs[0])
            hmm.start.transitions.append(start_tran)

        # 3. Append remaining transitions. I and D states will be created on demand.

        for rank, fields in enumerate(tran_lines, start=hmm.layers.start_index):
            assert hmm.layers[rank][States.Match].rank == rank

            ofields = fields.split()
            fields = tuple(map(parse_probability, ofields))

            # 3a. Parse all Neff values and create I[i] and D[i] states if NeffX[i] is not None
            for col, neff in enumerate(tran_types[7:10], start=7):

                if fields[col] is not None:
                    neff_value = float(ofields[col]) / abs(hmm.scale)

                    if neff == 'Neff':
                        hmm.layers[rank].effective_matches = neff_value

                    elif neff == 'Neff_I':
                        hmm.layers[rank].effective_insertions = neff_value

                        if States.Insertion not in hmm.layers[rank]:
                            insertion = State(States.Insertion, emit=Enum.members(ProteinAlphabet))
                            insertion.background.set(background)
                            insertion.emission.set(background)
                            insertion.rank = rank
                            hmm.layers[rank].append(insertion)

                    elif neff == 'Neff_D':
                        hmm.layers[rank].effective_deletions = neff_value

                        if States.Deletion not in hmm.layers[rank] and neff_value > 0:
                            deletion = State(States.Deletion)
                            deletion.rank = rank
                            hmm.layers[rank].append(deletion)

            # 3b. Starting from the first layer, parse all transitions and build the HMM graph stepwise
            for col, tran in enumerate(tran_types):
                probability = fields[col]

                if probability is not None:
                    try:
                        self._add_transition(hmm, rank, tran, probability)
                    except (CollectionIndexError, ItemNotFoundError) as ex:
                        msg = "Can't add transition {0} at {1}: {2.__class__.__name__}, {2!s}"
                        raise HHProfileFormatError(msg.format(tran, rank, ex))

        return hmm

    def _add_transition(self, hmm, rank, tran, probability):

        match = hmm.layers[rank][States.Match]
        nextmatch = None
        if rank < hmm.layers.last_index:
            nextmatch = hmm.layers[rank + 1][States.Match]
        else:
            nextmatch = hmm.end

        if tran == 'M->M':
            transition = Transition(match, nextmatch, probability)
            match.transitions.append(transition)

        elif tran == 'M->I':
            insertion = hmm.layers[rank][States.Insertion]
            transition = Transition(match, insertion, probability)
            match.transitions.append(transition)

        elif tran == 'M->D':
            deletion = State(States.Deletion)
            deletion.rank = rank + 1
            hmm.layers[rank + 1].append(deletion)
            transition = Transition(match, deletion, probability)
            match.transitions.append(transition)

        elif tran == 'I->M':
            insertion = hmm.layers[rank][States.Insertion]
            transition = Transition(insertion, nextmatch, probability)
            insertion.transitions.append(transition)

        elif tran == 'I->I':
            insertion = hmm.layers[rank][States.Insertion]
            selfloop = Transition(insertion, insertion, probability)
            insertion.transitions.append(selfloop)

        elif tran == 'D->M':
            deletion = hmm.layers[rank][States.Deletion]
            transition = Transition(deletion, nextmatch, probability)
            deletion.transitions.append(transition)

        elif tran == 'D->D':
            deletion = hmm.layers[rank][States.Deletion]

            if States.Deletion not in hmm.layers[rank + 1]:
                nextdeletion = State(States.Deletion)
                nextdeletion.rank = rank + 1
                hmm.layers[rank + 1].append(nextdeletion)

            else:
                nextdeletion = hmm.layers[rank + 1][States.Deletion]
                assert match.transitions[States.Deletion].successor == nextdeletion

            transition = Transition(deletion, nextdeletion, probability)
            deletion.transitions.append(transition)

        else:
            if not tran.startswith('Neff'):
                raise NotImplementedError('Unknown transition: {0}'.format(tran))

    def _parse_structure(self, hmm):
        """
        Add structural information to an existing HMM.

        @param hmm: the hmm object being constructed
        @type hmm: L{ProfileHMM}
        @return: the updated hmm
        @rtype: L{ProfileHMM}

        @raise ValueError: if the structure file does not refer to the HMM
        @raise HHProfileFormatError: when a residue cannot be assigned to an HMM layer
        @raise NotImplementedError: when an unknown data field is encountered
                                    in the PDB file
        """

        assert self._pdb

        s = csb.bio.io.StructureParser(self._pdb).parse_structure()

        if s.chains.length != 1:
            raise StructureFormatError("Must contain exactly one chain")
        if s.first_chain.length != hmm.layers.length:
            raise StructureFormatError("Incorrect number of residues")

        chain = s.first_chain
        chain.compute_torsion()

        for layer, residue in zip(hmm.layers, chain.residues):

            if residue.type != layer.residue.type:
                msg = "Expected residue of type {0} at position {1}"
                raise StructureFormatError(msg.format(layer.residue.type, layer.rank))

            layer.residue.torsion = residue.torsion.copy()

            for atom in residue.items:
                atom._residue = None  # atom.clone() is much, much slower
                layer.residue.atoms.append(atom)

        return hmm

HHpredProfileParser = HHProfileParser
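
The sketch below is not part of the module; it illustrates how a parsed L{ProfileHMM} might be inspected, assuming "profile.hhm" (the placeholder from the module docstring) names a readable HHM file. The helper name summarize_profile is made up for illustration; the layer, residue and emission attributes are the ones used by the parser and builder code in this module.

# Illustrative sketch, not part of csb.bio.io.hhpred. "profile.hhm" is a
# placeholder file name; summarize_profile is a hypothetical helper.
from csb.bio.io.hhpred import HHProfileParser
from csb.bio.hmm import States

def summarize_profile(hhm_path):
    """Print basic per-layer information from a parsed HHpred profile."""
    hmm = HHProfileParser(hhm_path).parse()      # log-scale scores by default
    print("{0} {1} {2}".format(hmm.id, hmm.name, hmm.layers.length))
    for layer in hmm.layers:
        match = layer[States.Match]              # match state of this column
        scores = [match.emission[aa] for aa in sorted(match.emission)]
        print("{0} {1} {2}".format(layer.rank, layer.residue.type, scores))

# summarize_profile("profile.hhm")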


class HHMFileBuilder(object):
    """
    Builder for HHpred's hhm files.

    @param output: destination stream
    @type output: file
    """

    def __init__(self, output):

        if not hasattr(output, 'write'):
            raise TypeError(output)

        self._out = output

    @property
    def output(self):
        """
        Destination stream
        @rtype: stream
        """
        return self._out

    def write(self, data):
        self._out.write(data)

    def writeline(self, data):
        self.write(data)
        self.write('\n')

    def add_hmm(self, hmm):
        """
        Append a new HMM to the destination stream.

        @param hmm: profile HMM to serialize
        @type hmm: L{ProfileHMM}
        """

        if hmm.score_units != ScoreUnits.LogScales:
            raise ValueError('Scores must be converted to LogScales first.')

        self.writeline('''HHsearch {0.version}
NAME {0.name}
FAM {0.family}
LENG {0.length.matches} match states, {0.length.layers} columns in multiple alignment
NEFF {0.effective_matches}
PCT {0.pseudocounts}'''.format(hmm))
        if hmm.evd:
            self.writeline('EVD {0.lamda} {0.mu}'.format(hmm.evd))

        self.writeline('SEQ')
        if hmm.dssp:
            self.writeline('>ss_dssp')
            self.writeline(hmm.dssp.to_string())
            if hmm.dssp_solvent:
                self.writeline('>sa_dssp')
                self.writeline(hmm.dssp_solvent)
        if hmm.psipred:
            self.writeline('>ss_pred')
            self.writeline(hmm.psipred.to_string())
            self.writeline('>ss_conf')
            confidence = [''.join(map(str, m.score)) for m in hmm.psipred]
            self.writeline(''.join(confidence))

        if hmm.alignment:
            if hmm.consensus:
                self.writeline(str(hmm.consensus))
            self.writeline(hmm.alignment.format().rstrip('\r\n'))

        self.writeline('#')

        first_match = hmm.layers[1][States.Match]
        null = [int(first_match.background[aa])
                for aa in sorted(map(str, first_match.background))]
        self.writeline('NULL {0}'.format('\t'.join(map(str, null))))
        self.writeline('HMM {0}'.format(
            '\t'.join(sorted(map(str, first_match.emission)))))

        tran_types = 'M->M M->I M->D I->M I->I D->M D->D'.split()
        self.writeline(' {0}'.format(
            '\t'.join(tran_types + 'Neff Neff_I Neff_D'.split())))

        self.write(" ")
        for tran_type in tran_types:
            source_statekind = Enum.parse(States, tran_type[0])
            target_statekind = Enum.parse(States, tran_type[3])
            if source_statekind == States.Match:
                try:
                    self.write("{0:<7}\t".format(
                        int(hmm.start.transitions[target_statekind].probability)))
                except TransitionNotFoundError:
                    self.write("*\t")
            else:
                self.write("*\t")
        self.writeline('*\t' * 3)

        for layer in hmm.layers:

            self.write("{0} {1:<5}".format(layer.residue.type, layer.rank))
            for aa in sorted(layer[States.Match].emission):
                emission = layer[States.Match].emission[aa]
                if emission is None:
                    emission = '*'
                else:
                    emission = int(emission)
                self.write("{0:<7}\t".format(emission))
            self.writeline("{0}".format(layer.rank))

            self.write(" ")
            for tran_type in tran_types:
                source_statekind = Enum.parse(States, tran_type[0])
                target_statekind = Enum.parse(States, tran_type[3])

                if target_statekind == States.Match and layer.rank == hmm.layers.last_index:
                    target_statekind = States.End

                try:
                    state = layer[source_statekind]
                    self.write("{0:<7}\t".format(
                        int(state.transitions[target_statekind].probability)))
                except StateNotFoundError:
                    self.write("*\t")

            for data in (layer.effective_matches, layer.effective_insertions,
                         layer.effective_deletions):
                if data is None:
                    data = '*'
                else:
                    data = int(data * abs(hmm.scale))
                self.write("{0:<7}\t".format(data))

            self.writeline("\n")

        self.writeline('//')

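A minimal round-trip sketch (not from the module) may help to show how L{HHMFileBuilder} is meant to be combined with L{HHProfileParser}: parse an existing HHM file and serialize the profile to another file. The file names and the helper name copy_profile are placeholders; add_hmm() requires log-scale scores, which is what parse() returns by default.

# Illustrative sketch, not part of csb.bio.io.hhpred. File names are
# placeholders; copy_profile is a hypothetical helper.
from csb.bio.io.hhpred import HHProfileParser, HHMFileBuilder

def copy_profile(source_hhm, target_hhm):
    """Parse source_hhm and write it back out in HHM format."""
    hmm = HHProfileParser(source_hhm).parse()    # ScoreUnits.LogScales by default
    with open(target_hhm, 'w') as out:
        HHMFileBuilder(out).add_hmm(hmm)         # appends one HMM record, terminated by '//'

# copy_profile("profile.hhm", "profile.copy.hhm")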


class HHOutputParser(object):
    """
    Parser for HHsearch result (*.hhr) files.

    @param alignments: if set to False, the parser will skip the
                       alignments section of the file
    @type alignments: bool
    """

    def __init__(self, alignments=True):

        self._alignments = True
        self.alignments = alignments

    def __repr__(self):
        return "<HHsearch Result Parser>"

    @property
    def alignments(self):
        """
        True if hit alignments will be parsed
        @rtype: bool
        """
        return self._alignments
    @alignments.setter
    def alignments(self, value):
        self._alignments = bool(value)

    def parse_file(self, hhr_file, header_only=False):
        """
        Parse all hits from this HHpred result file.

        @param hhr_file: input HHR file name
        @type hhr_file: str

        @return: parsed hits
        @rtype: HHpredHitList

        @raise HHOutputFormatError: if the hhr file is corrupt
        """

        with open(os.path.expanduser(hhr_file)) as stream:
            return self._parse(stream, header_only)

    def parse_string(self, output, header_only=False):
        """
        Get all hits from an C{output} string.

        @param output: HHpred standard output
        @type output: str

        @return: parsed hits
        @rtype: HHpredHitList

        @raise HHOutputFormatError: if the output is corrupt
        """
        stream = csb.io.MemoryStream()
        stream.write(output)
        stream.seek(0)

        return self._parse(stream, header_only)

    def _parse(self, stream, header_only):

        qlen = None
        in_hits = False
        in_alis = False
        has_alis = False
        c_rank = 0
        header = {}
        hits = {}
        alis = {}

        for line in stream:

            if not in_hits and not in_alis:

                if line.replace(' ', '').startswith('NoHitProbE-value'):
                    in_hits = True
                    continue
                elif line.strip() == '':
                    continue
                else:  # parse header data (stuff above the hits table)
                    columns = line.strip().split(None, 1)
                    if len(columns) == 2:

                        identifier, data = columns
                        if identifier in ('Query', 'Command'):
                            data = data.strip()
                        elif identifier == 'Neff':
                            data = float(data)
                        elif identifier in ('Searched_HMMs', 'Match_columns'):
                            data = int(data)

                        header[identifier] = data

                        if identifier == 'Match_columns':
                            qlen = data

            if in_hits and not header_only:
                if not line.strip():  # suboptimal way to handle block switch
                    in_hits = False
                    in_alis = True
                    if self.alignments:
                        continue
                    else:
                        break
                elif line.strip() == 'Done':
                    in_hits = False
                    in_alis = False
                    break

                description = line[:34].split()
                rank = int(description[0])
                id = description[1]

                pos = line[85:94].strip()
                start, end = map(int, pos.split('-'))

                qpos = line[75:84].strip()
                qstart, qend = map(int, qpos.split('-'))

                probability = float(line[35:40]) / 100.0

                hit = HHpredHit(rank, id, start, end, qstart, qend, probability, qlen)

                hit.evalue = float(line[41:48])
                hit.pvalue = float(line[49:56])
                hit.score = float(line[57:63])
                hit.ss_score = float(line[64:69])

                hit.slength = int(line[94:].replace('(', '').replace(')', ''))

                hits[hit.rank] = hit
                alis[hit.rank] = {'q': [], 's': []}

            elif in_alis and not header_only:
                if line.startswith('Done'):
                    in_alis = False
                    break

                elif line.startswith('No '):
                    c_rank = int(line[3:])
                    if c_rank not in hits:
                        raise HHOutputFormatError('Alignment {0}. refers to a non-existing hit'.format(c_rank))

                elif line.startswith('>'):
                    hits[c_rank].name = line[1:].strip()

                elif line.startswith('Probab='):
                    for pair in line.split():
                        key, value = pair.split('=')
                        if key == 'Identities':
                            hits[c_rank].identity = float(
                                value.replace('%', ''))
                        elif key == 'Similarity':
                            hits[c_rank].similarity = float(value)
                        elif key == 'Sum_probs':
                            hits[c_rank].prob_sum = float(value)

                elif line.startswith('Q ') and not line[:11].rstrip() in ('Q Consensus', 'Q ss_pred', 'Q ss_conf', 'Q ss_dssp'):
                    for residue in line[22:]:
                        if residue.isspace() or residue.isdigit():
                            break
                        else:
                            alis[c_rank]['q'].append(residue)
                            has_alis = True

                elif line.startswith('T ') and not line[:11].rstrip() in ('T Consensus', 'T ss_pred', 'T ss_conf', 'T ss_dssp'):
                    for residue in line[22:]:
                        if residue.isspace() or residue.isdigit():
                            break
                        else:
                            alis[c_rank]['s'].append(residue)

        if self.alignments and has_alis:
            for rank in alis:
                try:
                    hits[rank].add_alignment(alis[rank]['q'], alis[rank]['s'])

                except (KeyError, ValueError) as er:
                    raise HHOutputFormatError('Corrupt alignment at hit No {0}.\n {1}'.format(rank, er))

        del alis

        hits = HHpredHitList(hits.values())

        hits.sort()

        # add data obtained from the header to the HHpredHitList
        for identifier, data in header.items():
            if identifier == 'Query':
                hits.query_name = data
            elif identifier == 'Match_columns':
                hits.match_columns = data
            elif identifier == 'No_of_seqs':
                hits.no_of_seqs = data
            elif identifier == 'Neff':
                hits.neff = data
            elif identifier == 'Searched_HMMs':
                hits.searched_hmms = data
            elif identifier == 'Date':
                hits.date = data
            elif identifier == 'Command':
                hits.command = data

        return hits

HHpredOutputParser = HHOutputParser
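
To close, a hedged usage sketch (not part of the module) for L{HHOutputParser}: parse an HHsearch result file and filter the hits by probability. "hits.hhr" is the placeholder from the module docstring; the hit attributes read here (rank, id, probability, evalue) are the ones assigned in _parse() above, and L{HHpredHitList} is assumed to be iterable, as the docstring's "collection of HHpredHit-s" suggests. The helper name top_hits is hypothetical.

# Illustrative sketch, not part of csb.bio.io.hhpred. "hits.hhr" is a
# placeholder file name; top_hits is a hypothetical helper.
from csb.bio.io.hhpred import HHOutputParser

def top_hits(hhr_file, min_probability=0.9):
    """Return all hits whose probability is at least min_probability."""
    hits = HHOutputParser(alignments=False).parse_file(hhr_file)
    return [hit for hit in hits if hit.probability >= min_probability]

# for hit in top_hits("hits.hhr"):
#     print("{0} {1} {2} {3}".format(hit.rank, hit.id, hit.probability, hit.evalue))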