1  """ 
   2  HHpred and Hidden Markov Model APIs. 
   3   
   4  This package defines the abstractions for working with HHpred's HMMs and 
   5  hit lists. L{ProfileHMM} is the most important object of this module. 
   6  It describes a sequence profile hidden Markov model in the way HHpred 
   7  sees this concept: 
   8   
   9      - a profile is composed of a list of L{HMMLayer}s, which contain a 
  10        number of L{State}s 
  11      - these L{States} can be of different types: Match, Insertion Deletion, etc. 
  12      - a profile contains a multiple alignment, from which it is derived 
  13      - this multiple alignment is an A3M (condensed) Alignment, where the  
  14        first sequence is a master sequence 
  15      - the match states in all layers correspond to the residues of the master 
  16        sequence 
  17   
  18  L{ProfileHMM} objects provide list-like access to their layers: 
  19   
  20      >>> hmm.layers[1] 
  21      <HMMLayer>    # first layer: layer at master residue=1 
  22       
  23  Every layer provides dictionary-like access to its states: 
  24   
  25      >>> layer[States.Match] 
  26      <Match State> 
  27       
  28  and every state provides dictionary-like access to its transitions to other 
  29  states: 
  30   
  31      >>> state = hmm.layers[1][States.match] 
  32      >>> state.transitions[States.Insertion] 
  33      <Transition>       # Match > Insertion 
  34      >>> transition.predecessor 
  35      <Match State>      # source state 
  36      >>> transition.successor 
  37      <Insertion State>  # target state 
  38   
  39  Whether this transition points to a state at the same (i) or the next layer 
  40  (i+1) depends on the semantics of the source and the target states. 
  41   
  42  Building HMMs from scratch is supported through a number of C{append} methods 
  43  at various places: 
  44   
  45      >>> layer = HMMLayer(...) 
  46      >>> layer.append(State(...)) 
  47      >>> hmm.layers.append(layer) 
  48   
  49  See L{HMMLayersCollection}, L{HMMLayer}, L{EmissionTable} and L{TransitionTable} 
  50  for details. 
  51  """ 
  52   
  53  import sys 
  54  import math 
  55   
  56  import csb.core 
  57  import csb.io 
  58  import csb.bio.structure as structure 
  59  import csb.bio.sequence as sequence 
  60   
  61  from csb.core import Enum 
  66   
  69   
  72   
  75   
  78   
  81   
  84   
  87   
  88   
  89 -class States(csb.core.enum): 
   94   
  96      """ 
  97      Enumeration of HMM emission and transition score units 
  98      """ 
  99      LogScales='LogScales'; Probability='Probability' 
  100       
 101  BACKGROUND = [ 0.076627178753322270, 0.018866884241976509, 0.053996136712517316, 
 102                 0.059788009880742142, 0.034939432842683173, 0.075415244982547675,  
 103                 0.036829356494115069, 0.050485048600600511, 0.059581159080509941,  
 104                 0.099925728794059046, 0.021959667190729986, 0.040107059298840765,  
 105                 0.045310838527464106, 0.032644867589507229, 0.051296350550656143,  
 106                 0.046617000834108295, 0.071051060827250878, 0.072644631719882335,  
 107                 0.012473412286822654, 0.039418044025976547 ] 
 108  """ 
 109  Background amino acid probabilities 
 110  """ 
 111   
 112  RELATIVE_SA = { 'A': 0.02, 'B': 0.14, 'C': 0.33, 'D': 0.55, 'E': 1.00 } 
 113  """ 
 114  Relative solvent accessibility codes (upper bounds) 
 115  """ 
 119      """ 
 120      Describes a protein profile Hidden Markov Model. 
 121      Optional parameters: 
 122   
 123      @param units: defines the units of the transition and emission scores 
 124      @type units: L{ScoreUnits} 
 125      @param scale: the scaling factor used to convert emission/transition 
 126                    probabilities 
 127      @type scale: float 
 128      @param logbase: the base of the logarithm used for scaling the emission and 
 129                      transition probabilities 
 130      @type logbase: float 
 131      """ 
 132   
 134   
 135          self._name = None 
 136          self._id = None 
 137          self._family = None 
 138          self._length = ProfileLength(0, 0) 
 139          self._alignment = None 
 140          self._consensus = None 
 141          self._dssp = None 
 142          self._dssp_solvent = None 
 143          self._psipred = None 
 144          self._effective_matches = None 
 145          self._evd = EVDParameters(None, None)  
 146          self._version = None 
 147          self._pseudocounts = False 
 148          self._emission_pseudocounts = False 
 149          self._transition_pseudocounts = False 
 150          self._layers = HMMLayersCollection() 
 151          self._start = State(States.Start) 
 152          self._start_insertion = None 
 153          self._end = State(States.End) 
 154          self._scale = scale 
 155          self._logbase = logbase 
 156          if units is None: 
 157              self._score_units = ScoreUnits.LogScales 
 158          else: 
 159              self._score_units = units         
  160   
 161      @property 
 163          """ 
 164          Profile name (NAME) 
 165          @rtype: str 
 166          """ 
 167          return self._name 
  168      @name.setter 
 169 -    def name(self, value): 
  171       
 172      @property 
 174          """ 
 175          Profile entry ID (FILE) 
 176          @rtype: str         
 177          """         
 178          return self._id 
  179      @id.setter 
 180 -    def id(self, value): 
  181          self._id = str(value) 
  182       
 183      @property 
 185          """ 
 186          Alternative entry ID (FAM) 
 187          @rtype: str            
 188          """         
 189          return self._family 
  190      @family.setter 
 192          self._family = str(value) 
  193       
 194      @property 
 196          """ 
 197          Profile length 
 198          @rtype: L{ProfileLength} 
 199          """ 
 200          return self._length 
  201      @length.setter 
 206       
 207      @property 
 209          """ 
 210          Source multiple alignment 
 211          @rtype: L{A3MAlignment} 
 212          """ 
 213          return self._alignment 
  214      @alignment.setter 
 219       
 220      @property 
 222          """ 
 223          Consensus sequence 
 224          @rtype: L{AbstractSequence} 
 225          """ 
 226          return self._consensus 
  227      @consensus.setter 
 232       
 233      @property 
 235          """ 
 236          DSSP (calculated) secondary structure 
 237          @rtype: L{SecondaryStructure} 
 238          """ 
 239          return self._dssp 
  240      @dssp.setter 
 241 -    def dssp(self, value): 
  245       
 246      @property 
 248          """ 
 249          Solvent accessibility values 
 250          @rtype: str         
 251          """ 
 252          return self._dssp_solvent 
  253      @dssp_solvent.setter 
 255          self._dssp_solvent = str(value) 
  256       
 257      @property 
 259          """ 
 260          PSIPRED (predicted) secondary structure 
 261          @rtype: L{SecondaryStructure}         
 262          """ 
 263          return self._psipred 
  264      @psipred.setter 
 269       
 270      @property 
 272          """ 
 273          Number of effective matches (NEFF) 
 274          """ 
 275          return self._effective_matches 
  276      @effective_matches.setter 
 278          self._effective_matches = value 
  279       
 280      @property 
 282          """ 
 283          Extreme-value distribution parameters (EVD) 
 284          @rtype: L{EVDParameters} 
 285          """ 
 286          return self._evd 
  287      @evd.setter 
 288 -    def evd(self, value): 
  292       
 293      @property 
 295          """ 
 296          Format version number (HHsearch) 
 297          @rtype: str         
 298          """ 
 299          return self._version 
  300      @version.setter 
 302          self._version = str(value) 
  303       
 304      @property 
 306          """ 
 307          @rtype: bool 
 308          """ 
 309          return self._pseudocounts 
  310      @pseudocounts.setter 
 312          self._pseudocounts = bool(value) 
  313       
 314      @property 
 316          """ 
 317          @rtype: bool 
 318          """         
 319          return self._emission_pseudocounts 
  320      @emission_pseudocounts.setter 
 322          self._emission_pseudocounts = bool(value) 
  323       
 324      @property 
 326          """ 
 327          @rtype: bool 
 328          """         
 329          return self._transition_pseudocounts 
  330      @transition_pseudocounts.setter 
 332          self._transition_pseudocounts = bool(value) 
  333       
 334      @property 
 336          """ 
 337          List-like access to the HMM's layers 
 338          @rtype: L{HMMLayersCollection} 
 339          """         
 340          return self._layers 
  341       
 342      @property 
 344          """ 
 345          Start state (at the start layer) 
 346          @rtype: L{State} 
 347          """            
 348          return self._start 
  349      @start.setter 
 355       
 356      @property 
 358          """ 
 359          Insertion state at the start layer 
 360          @rtype: L{State} 
 361          """ 
 362          return self._start_insertion 
  363      @start_insertion.setter 
 369       
 370      @property 
 372          """ 
 373          Final state (at the end layer) 
 374          @rtype: L{State} 
 375          """              
 376          return self._end 
  377      @end.setter 
 378 -    def end(self, value): 
  383       
 384      @property 
 386          """ 
 387          Score scaling factor  
 388          @rtype: float         
 389          """ 
 390          return self._scale 
  391       
 392      @property 
 394          """ 
 395          Base of the logarithm used for score scaling  
 396          @rtype: float         
 397          """         
 398          return self._logbase 
  399       
 400      @property 
 402          """ 
 403          Current score units 
 404          @rtype: L{ScoreUnits} member 
 405          """ 
 406          return self._score_units 
  407       
 408      @property 
 417   
 418      @property 
 420          """ 
 421          A list of layers including start and start_insertion 
 422          @rtype: list of L{HMMLayer} 
 423          """ 
 424          complete_layers = [] 
 425          first_layer = HMMLayer(rank=0, residue=None) 
 426          first_layer.append(self.start) 
 427          if self.start_insertion: 
 428              first_layer.append(self.start_insertion) 
 429          complete_layers.append(first_layer) 
 430          for layer in self.layers: 
 431              complete_layers.append(layer) 
 432           
 433          return complete_layers 
  434   
 435      @property 
 437          """ 
 438          True if this profile contains structural data 
 439          @rtype: bool 
 440          """ 
 441          has = False 
 442          for layer in self.layers: 
 443              if layer.residue.has_structure: 
 444                  return True 
 445          return has 
  446   
 448          """ 
 449          Serialize this HMM to a file. 
 450   
 451          @param file_name: target file name 
 452          @type file_name: str 
 453          """ 
 454          rec = sys.getrecursionlimit()         
 455          sys.setrecursionlimit(10000) 
 456          csb.io.Pickle.dump(self, open(file_name, 'wb')) 
 457          sys.setrecursionlimit(rec) 
  458   
 459      @staticmethod 
 461          """ 
 462          De-serialize an HMM from a file. 
 463   
 464          @param file_name: source file name (pickle) 
 465          @type file_name: str 
 466          """ 
 467          rec = sys.getrecursionlimit() 
 468          sys.setrecursionlimit(10000) 
 469          try: 
 470              return csb.io.Pickle.load(open(file_name, 'rb')) 
 471          finally: 
 472              sys.setrecursionlimit(rec) 
  473   
 474 -    def _convert(self, units, score, scale, logbase): 
  485   
 486 -    def to_hmm(self, output_file=None, convert_scores=False): 
  515   
 517          """ 
 518          Extract a sub-segment of the profile. 
 519   
 520          @param start: start layer of the segment (rank) 
 521          @type start: int 
 522          @param end: end layer of the segment (rank) 
 523          @type end: int 
 524   
 525          @return: a deepcopy of the extracted HMM segment 
 526          @rtype: L{ProfileHMMSegment} 
 527          """ 
 528          return ProfileHMMSegment(self, start, end) 
  529       
 533   
 540   
 547                                     
 548 -    def structure(self, chain_id=None, accession=None): 
  549          """ 
 550          Extract the structural information from the HMM. 
 551   
 552          @param accession: defines the accession number of the structure 
 553          @type accession: str 
 554          @param chain_id: defines explicitly the chain identifier 
 555          @type chain_id: str 
 556   
 557          @return: a shallow L{Structure} wrapper around the residues in the HMM. 
 558          @rtype: L{Structure} 
 559          """ 
 560          struct = structure.Structure(accession or self.id) 
 561          chain = self.chain(chain_id) 
 562          struct.chains.append(chain) 
 563   
 564          return struct 
  565   
 566 -    def chain(self, chain_id=None): 
  586   
 588          """ 
 589          Extract the emission scores of all match states in the profile. 
 590          The metric of the emission scores returned depends on the current 
 591          hmm.score_units setting - you may need to call hmm.convert_scores() 
 592          to adjust the hmm to your particular needs. 
 593   
 594          @return: a list of dictionaries; each dict key is a single amino acid 
 595          @rtype: list 
 596          """ 
 597          profile = [] 
 598   
 599          for layer in self.layers: 
 600              emission = {} 
 601   
 602              for aa in layer[States.Match].emission: 
 603                  emission[str(aa)] = layer[States.Match].emission[aa] or 0.0 
 604              profile.append(emission) 
 605   
 606          return profile 
  607   
 609          """ 
 610          Convert emission and transition scores to the specified units. 
 611   
 612          @param units: the target units for the conversion (a member of 
 613                        L{ScoreUnits}). 
 614          @type units: L{csb.core.EnumItem} 
 615          @param method: if defined, implements the exact mathematical 
 616                         transformation that will be applied. It must be a 
 617                         function or lambda expression with the following 
 618                         signature:: 
 619                               def (target_units, score, scale, logbase) 
 620   
 621                         and it has to return the score converted to 
 622                         C{target_units}. If method performs a conversion from 
 623                         probabilities to scaled logs, you should also update 
 624                         C{hmm.scale} and C{hmm.logbase}. 
 625          @type method: function, lambda 
 626          """ 
 627   
 628          if self._score_units == units: 
 629              return 
 630   
 631          if method is not None: 
 632              convert = method 
 633          else: 
 634              convert = self._convert 
 635   
 636          for layer in self.layers: 
 637              for state_kind in layer: 
 638                  state = layer[state_kind] 
 639                  if not state.silent: 
 640                      for residue in state.emission: 
 641                          if state.emission[residue] is not None: 
 642                              state.emission.update(residue, convert( 
 643                              units, state.emission[residue], 
 644                              self.scale, self.logbase)) 
 645                      for residue in state.background: 
 646                          if state.background[residue] is not None: 
 647                              state.background.update(residue, convert( 
 648                              units, state.background[residue], 
 649                              self.scale, self.logbase)) 
 650                  for tran_kind in state.transitions: 
 651                      transition = state.transitions[tran_kind] 
 652                      transition.probability = convert(units, transition.probability, 
 653                                                       self.scale, self.logbase) 
 654               
 655               
 656   
 657          if self.start_insertion: 
 658              for t_it in self.start_insertion.transitions: 
 659                  transition = self.start_insertion.transitions[t_it] 
 660                  transition.probability = convert(units, transition.probability, 
 661                                                   self.scale, self.logbase) 
 662   
 663              for residue in self.start_insertion.emission: 
 664                  state = self.start_insertion 
 665                  if state.emission[residue] is not None: 
 666                      state.emission.update(residue, 
 667                                            convert(units, state.emission[residue], self.scale, self.logbase)) 
 668                      state.background.update(residue, 
 669                                              convert(units, state.background[residue], self.scale, self.logbase)) 
 670   
 671          for tran_kind in self.start.transitions: 
 672              transition = self.start.transitions[tran_kind] 
 673              transition.probability = convert(units, 
 674                                               transition.probability, self.scale, self.logbase) 
 675   
 676   
 677          self._score_units = units 
  678   
 680          """ 
 681          Compute the Log-sum-of-odds score between the emission tables of self 
 682          and other (Soeding 2004). If no observable Match state is found at a 
 683          given layer, the Insertion state is used instead. 
 684   
 685          @note: This is not a full implementation of the formula since only 
 686          emission vectors are involved in the computation and any transition 
 687          probabilities are ignored. 
 688   
 689          @param other: the subject HMM 
 690          @type other: L{ProfileHMM} 
 691   
 692          @return: emission log-sum-of-odds similarity between C{self} and 
 693                   C{other} 
 694          @rtype: float 
 695   
 696          @raise ValueError: when self and other differ in their length, when the 
 697                             score_units are not Probability, or when no 
 698                             observable states are present 
 699          """ 
 700          score = 1 
 701   
 702          if self.layers.length != other.layers.length or self.layers.length < 1: 
 703              raise ValueError('Both HMMs must have the same nonzero number of layers') 
 704          if self.score_units != ScoreUnits.Probability or \ 
 705                 other.score_units != ScoreUnits.Probability: 
 706              raise ValueError('Scores must be converted to probabilities first.') 
 707   
 708          for q_layer, s_layer in zip(self.layers, other.layers): 
 709   
 710              try: 
 711                  if States.Match in q_layer and not q_layer[States.Match].silent: 
 712                      q_state = q_layer[States.Match] 
 713                  else: 
 714                      q_state = q_layer[States.Insertion] 
 715   
 716                  if States.Match in s_layer and not s_layer[States.Match].silent: 
 717                      s_state = s_layer[States.Match] 
 718                  else: 
 719                      s_state = s_layer[States.Insertion] 
 720              except csb.core.ItemNotFoundError: 
 721                  raise ValueError('Query and subject must contain observable states ' 
 722                                   'at each layer') 
 723   
 724              emission_dotproduct = 0 
 725   
 726              for aa in q_state.emission: 
 727   
 728                  q_emission = q_state.emission[aa] or sys.float_info.min 
 729                  s_emission = s_state.emission[aa] or sys.float_info.min 
 730                  emission_dotproduct += (q_emission * s_emission / 
 731                                          q_state.background[aa]) 
 732   
 733              score *= emission_dotproduct 
 734   
 735          return math.log(score) 
  736   
 738          """ 
 739          Attach references from each profile layer to the relevant DSSP secondary  
 740          structure element. 
 741          """         
 742          assert self.dssp is not None 
 743           
 744          for motif in self.dssp: 
 745              for i in range(motif.start, motif.end + 1): 
 746                  self.layers[i].residue.secondary_structure = motif     
   747   
 750      """ 
 751      Represents a segment (fragment) of a ProfileHMM. 
 752       
 753      @param hmm: source HMM 
 754      @type hmm: ProfileHMM 
 755      @param start: start layer of the segment (rank) 
 756      @type start: int 
 757      @param end: end layer of the segment (rank) 
 758      @type end: int    
 759       
 760      @raise ValueError: when start or end positions are out of range 
 761      """ 
 762       
 764   
 765          if start < hmm.layers.start_index or start > hmm.layers.last_index: 
 766              raise IndexError('Start position {0} is out of range'.format(start)) 
 767          if end < hmm.layers.start_index or end > hmm.layers.last_index: 
 768              raise IndexError('End position {0} is out of range'.format(end)) 
 769           
 770           
 771                   
 772          super(ProfileHMMSegment, self).__init__(units=hmm.score_units, 
 773                                                  scale=hmm.scale, logbase=hmm.logbase) 
 774          self.id = hmm.id 
 775          self.family = hmm.family  
 776          self.name = hmm.name 
 777          self.pseudocounts = hmm.pseudocounts 
 778          self.evd = hmm.evd 
 779          self.version = hmm.version        
 780          self.source = hmm.id 
 781          self._source_start = start 
 782          self._source_end = end 
 783           
 784          if hmm.alignment: 
 785              self.alignment = hmm.alignment.hmm_subregion(start, end) 
 786              self.consensus = hmm.consensus.subregion(start, end) 
 787   
 788          layers = csb.core.deepcopy(hmm.layers[start : end + 1]) 
 789          max_score = 1.0 
 790          if hmm.score_units != ScoreUnits.Probability: 
 791              max_score = hmm._convert(hmm.score_units, 
 792                                       max_score, hmm.scale, hmm.logbase) 
 793          self._build_graph(layers, max_score) 
 794                           
 795          if hmm.dssp: 
 796              self.dssp = hmm.dssp.subregion(start, end) 
 797              self._assign_secstructure() 
 798          if hmm.psipred: 
 799              self.psipred = hmm.psipred.subregion(start, end)             
 800               
 801          self.length.layers = self.layers.length 
 802          self.length.matches = self.layers.length 
 803          self.effective_matches = sum([(l.effective_matches or 0.0) for l in self.layers]) / self.layers.length    
  804   
 805      @property 
 807          """ 
 808          Start position of this segment in its source HMM 
 809          @rtype: int 
 810          """ 
 811          return self._source_start 
  812   
 813      @property 
 815          """ 
 816          End position of this segment in its source HMM 
 817          @rtype: int 
 818          """         
 819          return self._source_end 
  820               
  845   
 848      """ 
 849      Represents a segment of the Match state emission probabilities of a L{ProfileHMM}. 
 850      Contains only Match states, connected with equal transition probabilities of 100%.    
 851      """       
 852       
 854           
 855          factory = StateFactory() 
 856           
 857          for rank, source_layer in enumerate(source_layers, start=1): 
 858               
 859              emission = source_layer[States.Match].emission          
 860              background = source_layer[States.Match].background 
 861               
 862              match = factory.create_match(emission, background) 
 863              match.rank = rank 
 864               
 865              layer = HMMLayer(rank, source_layer.residue)             
 866              layer.append(match) 
 867              self.layers.append(layer) 
 868                           
 869              if rank == 1: 
 870                  self.start.transitions.append(Transition(self.start, match, 1.0)) 
 871              elif rank < len(source_layers): 
 872                  prev_match = self.layers[rank - 1][States.Match] 
 873                  prev_match.transitions.append(Transition(prev_match, match, 1.0)) 
 874              elif rank == len(source_layers): 
 875                  match.transitions.append(Transition(match, self.end, 1.0)) 
 876              else: 
 877                  assert False 
   878   
 910   
 911      @property 
 913          """ 
 914          Start position of this segment in its source HMM 
 915          @rtype: int 
 916          """         
 917          return self._source_start 
  918       
 919      @property 
 921          """ 
 922          End position of this segment in its source HMM 
 923          @rtype: int 
 924          """             
 925          return self._source_end 
  926   
 933   
 936       
 938          self.lamda = lamda 
 939          self.mu = mu 
  940       
 943           
 945          return (self.lamda is not None or self.mu is not None) 
   946   
 949      """  
 950      Represents a lookup table of emission probabilities. Provides dictionary-like 
 951      access: 
 952       
 953          >>> state.emission[ProteinAlphabet.ALA] 
 954          emission probability for ALA 
 955       
 956      @param emission: an initialization dictionary of emission probabilities 
 957      @type emission: dict 
 958      @param restrict: a list of residue types allowed for this emission table.  
 959                       Defaults to the members of L{csb.bio.sequence.ProteinAlphabet} 
 960      @type restrict: list 
 961      """ 
 962   
 965               
 966 -    def append(self, residue, probability): 
  967          """ 
 968          Append a new emission probability to the table. 
 969           
 970          @param residue: residue name (type) - a member of 
 971                          L{csb.bio.sequence.ProteinAlphabet} 
 972          @type residue: L{csb.core.EnumItem} 
 973          @param probability: emission score 
 974          @type probability: float 
 975           
 976          @raise EmissionExistsError: if residue is already defined 
 977          """ 
 978          if residue in self: 
 979              raise EmissionExistsError('Residue {0} is already defined.'.format(residue)) 
 980   
 981          super(EmissionTable, self).append(residue, probability) 
  982       
 983 -    def set(self, table): 
  984          """  
 985          Set the emission table using the dictionary provided in the argument. 
 986           
 987          @param table: the new emission table 
 988          @type table: dict 
 989          """           
 990          super(EmissionTable, self)._set(table) 
  991           
 992 -    def update(self, residue, probability): 
  993          """  
 994          Update the emission C{probability} of a given emission C{residue}. 
 995           
 996          @param residue: name (type) of the residue to be updated 
 997          @type residue: L{csb.core.EnumItem} 
 998          @param probability: new emission score 
 999          @type probability: float 
1000          """                 
1001          super(EmissionTable, self)._update({residue: probability}) 
 1002   
1005      """  
1006      Represents a lookup table of transitions that are possible from within a given state.  
1007       
1008      Provides dictionary-like access, where dictionary keys are target states. 
1009      These are members of the L{States} enumeration, e.g.: 
1010       
1011          >>> state.transitions[States.Match] 
1012          transition info regarding transition from the current state to a Match state 
1013          >>> state.transitions[States.Match].predecessor 
1014          state 
1015          >>> state.transitions[States.Match].successor 
1016          the next match state 
1017        
1018      @param transitions: an initialization dictionary of target L{State}:L{Transition} pairs 
1019      @type transitions: dict 
1020      @param restrict: a list of target states allowed for this transition table.  
1021                       Defaults to the L{States} enum members 
1022      @type restrict: list 
1023      """ 
1024       
1026          super(TransitionTable, self).__init__(transitions, restrict) 
1027           
1028      @property 
1031       
1032 -    def append(self, transition): 
 1033          """ 
1034          Append a new C{transition} to the table. 
1035   
1036          @param transition: transition info 
1037          @type transition: L{Transition} 
1038           
1039          @raise TransitionExistsError: when a transition to the same target state 
1040                                        already exists for the current state 
1041          """ 
1042           
1043          if transition.successor.type in self: 
1044              msg = 'Transition to a {0} state is already defined.' 
1045              raise TransitionExistsError(msg.format(transition.successor.type)) 
1046           
1047          super(TransitionTable, self).append(transition.successor.type, transition) 
 1048       
1049 -    def set(self, table): 
 1050          """  
1051          Set the transition table using the dictionary provided in the argument. 
1052           
1053          @param table: the new transition table 
1054          @type table: dict 
1055          """           
1056          super(TransitionTable, self)._set(table) 
 1057           
1058 -    def update(self, target_statekind, transition): 
 1059          """  
1060          Update the information of a transition, which points to a target  
1061          state of the specified L{States} kind. 
1062           
1063          @param target_statekind: the key of the transition to be updated 
1064          @type target_statekind: L{csb.core.EnumItem} 
1065          @param transition: new transition info object 
1066          @type transition: L{Transition} 
1067           
1068          @raise ValueError: if I{transition.successor.type} differs from 
1069                             C{target_statekind} 
1070          """ 
1071          if transition.successor.type != target_statekind: 
1072              raise ValueError("Successor's type differs from the specified target state.") 
1073                   
1074          super(TransitionTable, self)._update({target_statekind: transition})   
 1075   
1078      """ 
1079      Provides consecutive, 1-based access to all of the layers in the profile. 
1080      Each profile layer contains a catalog of available states at that index, e.g.: 
1081   
1082          >>> profile.layers[i] 
1083          the catalog at profile layer i 
1084          >>> profile.layers[i][States.Deletion] 
1085          the deletion state at index i 
1086           
1087      @param layers: initialization list of L{HMMLayer}s 
1088      @type layers: list     
1089      """         
1092           
1093      @property 
 1096   
1097 -class HMMLayer(csb.core.DictionaryContainer): 
 1098      """ 
1099      Provides a dictionary-like catalog of the available states at this layer. 
1100      Lookup keys are members of the L{States} enumeration, e.g.: 
1101       
1102          >>> profile.layers[i][States.Deletion] 
1103          the deletion state at layer number i   
1104       
1105      @param rank: layer's number 
1106      @type rank: int 
1107      @param residue: a representative L{ProteinResidue} that is associated with 
1108                      this layer 
1109      @type residue: L{ProteinResidue}         
1110      @param states: initialization dictionary of L{States}.Item:L{State} pairs 
1111      @type states: dict  
1112      """         
1113 -    def __init__(self, rank, residue, states=None): 
 1124           
1125      @property 
1128   
1129      @property 
1131          """ 
1132          Layer's position 
1133          @rtype: int 
1134          """ 
1135          return self._rank 
 1136      @rank.setter 
1137 -    def rank(self, value): 
 1138          self._rank = int(value) 
 1139   
1140      @property 
1142          """ 
1143          Representative residue 
1144          @rtype: L{Residue} 
1145          """         
1146          return self._residue 
 1147      @residue.setter 
1152           
1153      @property 
1155          """ 
1156          Number of effective matches at this layer 
1157          @rtype: int 
1158          """ 
1159          return self._effective_matches 
 1160      @effective_matches.setter 
1162          self._effective_matches = value 
 1163       
1164      @property 
1166          """ 
1167          Number of effective insertions at this layer 
1168          @rtype: int 
1169          """         
1170          return self._effective_insertions 
 1171      @effective_insertions.setter 
1173          self._effective_insertions = value 
 1174       
1175      @property 
1177          """ 
1178          Number of effective deletions at this layer 
1179          @rtype: int 
1180          """         
1181          return self._effective_deletions 
 1182      @effective_deletions.setter 
1184          self._effective_deletions = value 
 1185       
1187          """ 
1188          Append a new C{state} to the catalog. 
1189           
1190          @param state: the new state 
1191          @type state: L{State} 
1192           
1193          @raise StateExistsError: when a state of the same type is already defined 
1194          """ 
1195          if state.type in self: 
1196              raise StateExistsError( 
1197                      'State {0} is already defined at this position.'.format(state.type)) 
1198   
1199          super(HMMLayer, self).append(state.type, state) 
 1200           
1201 -    def update(self, state_kind, state): 
 1202          """  
1203          Update the sate of the specified kind under the current layer. 
1204           
1205          @param state_kind: state type (key) - a member of L{States} 
1206          @type state_kind: L{csb.core.EnumItem} 
1207          @param state: the new state info 
1208          @type state: L{State} 
1209           
1210          @raise ValueError: if state.type differs from state_kind 
1211          """ 
1212          if state.type != state_kind: 
1213              raise ValueError("State's type differs from the specified state_kind") 
1214                   
1215          super(HMMLayer, self)._update({state_kind: state})         
  1216   
1217      
1218 -class State(object): 
 1219      """  
1220      Describes a Hidden Markov Model state. 
1221       
1222      @param type: one of the L{States} enumeration values, e.g. States.Match 
1223      @type type: L{csb.core.EnumItem} 
1224      @param emit: a collection of emittable state names allowed for the state,  
1225                   e.g. the members of I{SequenceAlphabets.Protein}. If not defined,  
1226                   the state will be created as a silent (unobservable). 
1227      @type emit: list 
1228       
1229      @raise ValueError: if type is not a member of the States enum 
1230      """ 
1231       
1245       
1247          return "<HMM {0.type!r} State>".format(self) 
 1248   
1249      @property 
1251          """ 
1252          State type: one of the L{States} 
1253          @rtype: enum item 
1254          """ 
1255          return self._type 
 1256      @type.setter 
1257 -    def type(self, value): 
 1261       
1262      @property 
1264          return self._rank 
 1265      @rank.setter 
1266 -    def rank(self, value): 
 1267          self._rank = int(value) 
 1268       
1269      @property 
1271          """ 
1272          Lookup table with available transitions to other states 
1273          @rtype: L{TransitionTable} 
1274          """         
1275          return self._transitions 
 1276       
1277      @property 
1279          """ 
1280          Lookup table with available emission probabilities 
1281          @rtype: L{EmissionTable} 
1282          """          
1283          if self._emission is None: 
1284              raise UnobservableStateError('Silent {0!r} state'.format(self.type)) 
1285          return self._emission 
 1286       
1287      @property 
1289          """ 
1290          Lookup table with background probabilities 
1291          @rtype: L{EmissionTable} 
1292          """          
1293          return self._background 
 1294           
1295      @property 
1297          """ 
1298          Whether this state can emit something 
1299          @rtype: bool 
1300          """ 
1301          try: 
1302              return self.emission is None 
1303          except UnobservableStateError: 
1304              return True      
 1305                       
1307      """ 
1308      Simplifies the construction of protein profile HMM states. 
1309      """ 
1310               
1313       
1320   
1327       
1329          return State(States.Deletion) 
 1330                   
1333   
1335          self.source_state = source.type 
1336          self.target_state = target.type      
 1337           
1339          return '{0}->{1}'.format(self.source_state, self.target_state)    
  1340               
1342      """ 
1343      Describes a Hidden Markov Model transition between two states. 
1344       
1345      @param predecessor: source state 
1346      @type predecessor: L{State} 
1347      @param successor: target state 
1348      @type successor: L{State} 
1349      @param probability: transition score 
1350      @type probability: float 
1351      """ 
1352       
1353 -    def __init__(self, predecessor, successor, probability): 
 1364           
1366          return '<HMM Transition: {0.type} {0.probability}>'.format(self) 
 1367       
1368      @property 
1370          """ 
1371          Transition source state 
1372          @rtype: L{State} 
1373          """ 
1374          return self._predecessor 
 1375       
1376      @property 
1378          """ 
1379          Transition target state 
1380          @rtype: L{State} 
1381          """         
1382          return self._successor 
 1383       
1384      @property 
1386          """ 
1387          Transition score 
1388          @rtype: float 
1389          """ 
1390          return self._probability 
 1391      @probability.setter 
1393          if not (value >=0): 
1394              raise ValueError('Transition probability must be a positive number.')         
1395          self._probability = float(value) 
 1396   
1397      @property 
1399          """ 
1400          Struct, containing information about the source and target state types 
1401          @rtype: L{TransitionType} 
1402          """ 
1403          return self._type 
  1404       
1407      """ 
1408      Represents a query-template alignment in an HHpred result. 
1409       
1410      @param hit: relevant hit object 
1411      @type param: L{HHpredHit} 
1412      @param query: the query sequence in the alignment region, with gaps 
1413      @type query: str 
1414      @param subject: the subject sequence in the alignment region, with gaps 
1415      @type subject: str 
1416      """ 
1417   
1418      GAP = sequence.ProteinAlphabet.GAP 
1419       
1420 -    def __init__(self, hit, query, subject): 
 1431   
1432      @property 
1434          """ 
1435          Query sequence (with gaps) 
1436          @rtype: str 
1437          """ 
1438          return self.rows[1].sequence 
 1439   
1440      @property 
1442          """ 
1443          Subject sequence (with gaps) 
1444          @rtype: str 
1445          """         
1446          return self.rows[2].sequence 
 1447       
1448      @property 
1450          """ 
1451          Find all ungapped query-subject segments in the alignment. 
1452          Return a generator over all ungapped alignment segments, represented 
1453          by L{HHpredHit} objects 
1454           
1455          @rtype: generator 
1456          """ 
1457           
1458          def make_segment(sstart, send, qstart, qend): 
1459               
1460              seg = HHpredHit(self._hit.rank, self._hit.id, sstart, send,  
1461                              qstart, qend, self._hit.probability, self._hit.qlength) 
1462   
1463              seg.slength = self._hit.slength 
1464              seg.evalue = self._hit.evalue 
1465              seg.pvalue = self._hit.pvalue 
1466              seg.score = self._hit.score 
1467              seg.ss_score = self._hit.ss_score 
1468              seg.identity = self._hit.identity 
1469              seg.similarity = self._hit.similarity 
1470              seg.prob_sum = self._hit.prob_sum 
1471               
1472              return seg 
 1473           
1474          in_segment = False 
1475          qs = self._hit.qstart - 1 
1476          ss = self._hit.start - 1  
1477          qi, si = qs, ss 
1478          qe, se = qs, ss 
1479           
1480          for q, s in zip(self.query, self.subject): 
1481   
1482              if q != HHpredHitAlignment.GAP: 
1483                  qi += 1 
1484              if s != HHpredHitAlignment.GAP: 
1485                  si += 1 
1486                   
1487              if HHpredHitAlignment.GAP in (q, s): 
1488                  if in_segment: 
1489                      yield make_segment(ss, se, qs, qe) 
1490                      in_segment = False 
1491                      qs, ss = 0, 0 
1492                      qe, se = 0, 0 
1493              else:  
1494                  if not in_segment: 
1495                      in_segment = True 
1496                      qs, ss = qi, si 
1497               
1498              qe, se = qi, si                         
1499           
1500          if in_segment: 
1501              yield make_segment(ss, se, qs, qe) 
 1502                       
1510   
1512      """ 
1513      Represents a single HHsearch hit. 
1514   
1515      @param rank: rank of the hit 
1516      @type rank: int 
1517      @param id: id of the hit 
1518      @type id: str 
1519      @param start: subject start 
1520      @type start: int 
1521      @param end: subject end 
1522      @type end: int 
1523      @param qstart: query start 
1524      @type qstart: int 
1525      @param qend: query end 
1526      @type qend: int 
1527      @param probability: probability of the hit 
1528      @type probability: float 
1529      @param qlength: length of the query 
1530      @type qlength: int 
1531      """ 
1532   
1533 -    def __init__(self, rank, id, start, end, qstart, qend, probability, 
1534                   qlength): 
 1535   
1536          self._rank = None 
1537          self._id = None 
1538          self._start = None 
1539          self._end = None 
1540          self._qstart = None 
1541          self._qend = None 
1542          self._probability = None 
1543          self._qlength = None 
1544          self._alignment = None 
1545   
1546          self._slength = None 
1547          self._evalue = None 
1548          self._pvalue = None 
1549          self._score = None 
1550          self._ss_score = None 
1551          self._identity = None 
1552          self._similarity = None 
1553          self._prob_sum = None         
1554   
1555           
1556          self.rank = rank 
1557          self.id = id 
1558          self.start = start 
1559          self.end = end 
1560          self.qstart = qstart 
1561          self.qend = qend 
1562          self.probability = probability 
1563          self.qlength = qlength 
 1564   
1566          return "{0.id} {0.probability} {0.start}-{0.end}".format(self) 
 1567       
1569          return "<HHpredHit: {0!s}>".format(self) 
 1570       
1573   
1575          """ 
1576          Return True if C{self} is completely identical to C{other} (same id, same start 
1577          and end positions). 
1578           
1579          @param other: right-hand-term 
1580          @type other: HHpredHit 
1581           
1582          @rtype: bool         
1583          """ 
1584          return (self.id == other.id and self.start == other.start and self.end == other.end) 
 1585           
1587          """ 
1588          Return True if C{self} is a superior to C{other} in terms of length  
1589          and probability. These criteria are applied in the following order: 
1590           
1591              1. Length (the longer hit is better) 
1592              2. Probability (if they have the same length, the one with the higher 
1593                 probability is better) 
1594              3. Address (if they have the same length and probability, the one with 
1595                 higher memory ID wins; for purely practical reasons)  
1596           
1597          @param other: right-hand-term 
1598          @type other: HHpredHit 
1599           
1600          @rtype: bool         
1601          """ 
1602          if self.length > other.length: 
1603              return True 
1604          elif self.length == other.length: 
1605              if self.probability > other.probability: 
1606                  return True       
1607              elif self.probability == other.probability: 
1608                  if id(self) > id(other): 
1609                      return True 
1610          return False 
 1611               
1612 -    def includes(self, other, tolerance=1): 
 1613          """ 
1614          Return True if C{other} overlaps with C{self}, that means C{other} 
1615          is fully or partially included in C{self} when aligned over the query. 
1616           
1617          @param other: right-hand-term 
1618          @type other: HHpredHit 
1619          @param tolerance: allow partial overlaps for that number of residues at 
1620                            either end 
1621          @type tolerance: int 
1622           
1623          @rtype: bool  
1624          """ 
1625          if self.id == other.id: 
1626              if other.start >= self.start: 
1627                  if (other.end - self.end) <= tolerance: 
1628                      return True 
1629              elif other.end <= self.end: 
1630                  if (self.start - other.start) <= tolerance: 
1631                      return True 
1632           
1633          return False 
 1634       
1636          """ 
1637          Add query/subject alignment to the hit. 
1638   
1639          @param query: the query sequence within the alignment region, with gaps 
1640          @type query: str 
1641          @param subject: the subject sequence within the alignment region, with 
1642                          gaps 
1643          @type subject: str 
1644          """ 
1645          self._alignment = HHpredHitAlignment(self, query, subject) 
 1646   
1647      @property 
1650      @rank.setter 
1651 -    def rank(self, value): 
 1657   
1658      @property 
1661      @id.setter 
1662 -    def id(self, value): 
 1668   
1669      @property 
1672      @start.setter 
1673 -    def start(self, value): 
 1679   
1680      @property 
1683      @end.setter 
1684 -    def end(self, value): 
 1690   
1691      @property 
1694      @qstart.setter 
1701   
1702      @property 
1705      @qend.setter 
1706 -    def qend(self, value): 
 1712   
1713      @property 
1715          return self._qlength 
 1716      @qlength.setter 
1723   
1724      @property 
1726          return self._probability 
 1727      @probability.setter 
1729          try: 
1730              value = float(value) 
1731          except: 
1732              raise TypeError('probability must be float, not {0}'.format(type(value))) 
1733          self._probability = value 
 1734   
1735      @property 
1737          return self._alignment 
 1738   
1739      @property 
1741          try: 
1742              return self.end - self.start + 1 
1743          except: 
1744              return 0 
 1745           
1746      @property 
1748          return self._slength 
 1749      @slength.setter 
1751          self._slength = value 
 1752   
1753      @property 
1756      @evalue.setter 
1758          self._evalue = value 
 1759   
1760      @property 
1763      @pvalue.setter 
1765          self._pvalue = value 
 1766   
1767      @property 
1770      @score.setter 
1771 -    def score(self, value): 
 1773   
1774      @property 
1776          return self._ss_score 
 1777      @ss_score.setter 
1779          self._ss_score = value 
 1780   
1781      @property 
1783          return self._identity 
 1784      @identity.setter 
1786          self._identity = value 
 1787   
1788      @property 
1790          return self._similarity 
 1791      @similarity.setter 
1793          self._similarity = value 
 1794   
1795      @property 
1797          return self._prob_sum 
 1798      @prob_sum.setter 
1800          self._prob_sum = value         
  1801   
1804      """ 
1805      Represents a collection of L{HHpredHit}s. 
1806      """ 
1807   
1808 -    def __init__(self, hits, query_name='', match_columns=-1, no_of_seqs='', 
1809                   neff=-1., searched_hmms=-1, date='', command=''): 
 1827           
1828      @property 
1830          return self._query_name 
 1831      @query_name.setter 
1833          self._query_name = value 
 1834   
1835      @property 
1837          return self._match_columns 
 1838      @match_columns.setter 
1840          self._match_columns = value 
 1841   
1842      @property 
1844          return self._no_of_seqs 
 1845      @no_of_seqs.setter 
1847          self._no_of_seqs = value 
 1848   
1849      @property 
1852      @neff.setter 
1853 -    def neff(self, value): 
 1855   
1856      @property 
1858          return self._searched_hmms 
 1859      @searched_hmms.setter 
1861          self._searched_hmms = value 
 1862   
1863      @property 
1866      @date.setter 
1867 -    def date(self, value): 
 1869   
1870      @property 
1872          return self._command 
 1873      @command.setter 
1875          self._command = value         
 1876   
1878          return "HHpredHitList\n\tquery={0.query_name}\n\tmatch_columns={0.match_columns}\n\tno_of_seqs={0.no_of_seqs}\n\tneff={0.neff}\n\tsearched_hmms={0.searched_hmms}\n\tdate={0.date}\n\tcommand={0.command}".format(self) 
 1879   
1881          return "<HHpredHitList: {0} hits>".format(len(self)) 
 1882   
1884          return self._hits[index] 
 1885   
1887          return iter(self._hits) 
 1888   
1890          return len(self._hits) 
 1891   
1893          self._hits.sort(key=lambda i: i.rank) 
  1894