
Source Code for Module csb.bio.hmm.pseudocounts

import sys
import csb.core
import csb.bio.sequence as sequence

from csb.bio.hmm import States, ScoreUnits, Transition, State


GONNET = [10227, 3430, 2875, 3869, 1625, 2393, 4590, 6500, 2352, 3225, 5819, 4172, 1435,
          1579, 3728, 4610, 6264, 418, 1824, 5709, 3430, 7780, 2209, 2589, 584, 2369,
          3368, 3080, 2173, 1493, 3093, 5701, 763, 859, 1893, 2287, 3487, 444, 1338, 2356,
          2875, 2209, 3868, 3601, 501, 1541, 2956, 3325, 1951, 1065, 2012, 2879, 532, 688,
          1480, 2304, 3204, 219, 1148, 1759, 3869, 2589, 3601, 8618, 488, 2172, 6021, 4176,
          2184, 1139, 2151, 3616, 595, 670, 2086, 2828, 3843, 204, 1119, 2015, 1625, 584,
          501, 488, 5034, 355, 566, 900, 516, 741, 1336, 591, 337, 549, 419, 901,
          1197, 187, 664, 1373, 2393, 2369, 1541, 2172, 355, 1987, 2891, 1959, 1587, 1066,
          2260, 2751, 570, 628, 1415, 1595, 2323, 219, 871, 1682, 4590, 3368, 2956, 6021,
          566, 2891, 8201, 3758, 2418, 1624, 3140, 4704, 830, 852, 2418, 2923, 4159, 278, 1268, 2809,
          6500, 3080, 3325, 4176, 900, 1959, 3758, 26066, 2016, 1354, 2741, 3496, 741, 797, 2369,
          3863, 4169, 375, 1186, 2569, 2352, 2173, 1951, 2184, 516, 1587, 2418, 2016, 5409,
          1123, 2380, 2524, 600, 1259, 1298, 1642, 2446, 383, 876, 1691, 3225, 1493, 1065,
          1139, 741, 1066, 1624, 1354, 1123, 6417, 9630, 1858, 1975, 2225, 1260, 1558, 3131,
          417, 1697, 7504, 5819, 3093, 2012, 2151, 1336, 2260, 3140, 2741, 2380, 9630, 25113,
          3677, 4187, 5540, 2670, 2876, 5272, 1063, 3945, 11005, 4172, 5701, 2879, 3616, 591,
          2751, 4704, 3496, 2524, 1858, 3677, 7430, 949, 975, 2355, 2847, 4340, 333, 1451, 2932,
          1435, 763, 532, 595, 337, 570, 830, 741, 600, 1975, 4187, 949, 1300, 1111, 573,
          743, 1361, 218, 828, 2310, 1579, 859, 688, 670, 549, 628, 852, 797, 1259, 2225,
          5540, 975, 1111, 6126, 661, 856, 1498, 1000, 4464, 2602, 3728, 1893, 1480, 2086, 419,
          1415, 2418, 2369, 1298, 1260, 2670, 2355, 573, 661, 11834, 2320, 3300, 179, 876, 2179,
          4610, 2287, 2304, 2828, 901, 1595, 2923, 3863, 1642, 1558, 2876, 2847, 743, 856, 2320,
          3611, 4686, 272, 1188, 2695, 6264, 3487, 3204, 3843, 1197, 2323, 4159, 4169, 2446, 3131,
          5272, 4340, 1361, 1498, 3300, 4686, 8995, 397, 1812, 5172, 418, 444, 219, 204, 187,
          219, 278, 375, 383, 417, 1063, 333, 218, 1000, 179, 272, 397, 4101, 1266, 499,
          1824, 1338, 1148, 1119, 664, 871, 1268, 1186, 876, 1697, 3945, 1451, 828, 4464, 876,
          1188, 1812, 1266, 9380, 2227, 5709, 2356, 1759, 2015, 1373, 1682, 2809, 2569, 1691, 7504,
          11005, 2932, 2310, 2602, 2179, 2695, 5172, 499, 2227.0, 11569.0]
"""
Gonnet matrix frequencies taken from HHpred
"""
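
# The 400 values above form a 20 x 20 matrix of amino acid substitution
# frequencies; add_emission_pseudocounts() below normalizes it and reshapes
# it over the standard protein alphabet.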


class PseudocountBuilder(object):
    """
    Constructs profile HMMs with pseudocounts.
    """

    def __init__(self, hmm):
        self._hmm = hmm

    @property
    def hmm(self):
        return self._hmm

    def add_emission_pseudocounts(self, tau=0.1, pca=2.5, pcb=0.5, pcc=1.0):
        """
        Port from HHpred; uses the conditional background probabilities
        inferred from the Gonnet matrix.

        @param tau: admixture weight, i.e. how much of the final score is
                    determined by the background probabilities
                    (0.0 = no pseudocounts).
        @type tau: float
        """
        from numpy import array, dot, transpose, clip

        if self.hmm.pseudocounts or self.hmm.emission_pseudocounts:
            return
        if abs(tau) < 1e-6:
            return

        # Make sure we are working with probabilities
        if self.hmm.score_units != ScoreUnits.Probability:
            self.hmm.convert_scores(units=ScoreUnits.Probability)

        alphabet = csb.core.Enum.values(sequence.StdProteinAlphabet)

        ## S = SubstitutionMatrix(substitution_matrix)
        s_mat = array(GONNET)
        # Normalize
        s_mat /= s_mat.sum()
        s_mat = s_mat.reshape((len(alphabet), len(alphabet)))
        # Marginalize the matrix and derive conditional probabilities
        s_marginal = s_mat.sum(-1)
        s_conditional = s_mat / s_marginal
        # Get data and info from the hmm
        em = array([[layer[States.Match].emission[aa] or 0.0 for aa in alphabet]
                    for layer in self.hmm.layers])

        em = clip(em, sys.float_info.min, 1.)

        neff_m = array([l.effective_matches for l in self.hmm.layers])

        g = dot(em, transpose(s_conditional))

        if neff_m is not None:
            tau = clip(pca / (1. + (neff_m / pcb) ** pcc), 0.0, pcc)
            e = transpose((1. - tau) * transpose(em) + tau * transpose(g))
        else:
            e = (1. - tau) * em + tau * g

        # Renormalize e
        e = transpose(transpose(e) / e.sum(-1))

        for i, layer in enumerate(self.hmm.layers):
            layer[States.Match].emission.set(dict(zip(alphabet, e[i])))

        self.hmm.emission_pseudocounts = True
        return
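
    # A compact summary of the admixture computed above: for layer i with
    # Neff_i effective matches and the default parameters,
    #     tau_i = min(pca / (1 + (Neff_i / pcb) ** pcc), pcc)
    # and the new match emissions are the renormalized mixture
    #     e_i = (1 - tau_i) * p_i + tau_i * dot(p_i, s_conditional.T)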

    def add_transition_pseudocounts(self, gapb=1., gapd=0.15, gape=1.0,
                                    gapf=0.6, gapg=0.6, gapi=0.6):
        """
        Add pseudocounts to the transitions. A port from hhsearch with the
        default parameters
        -gapb 1.0 -gapd 0.15 -gape 1.0 -gapf 0.6 -gapg 0.6 -gapi 0.6
        """

        from numpy import array

        if self.hmm.score_units != ScoreUnits.Probability:
            self.hmm.convert_scores(units=ScoreUnits.Probability)

        if self.hmm.pseudocounts or self.hmm.transition_pseudocounts:
            return

        # We need a fully populated HMM, so first add all missing states
        states = [States.Match, States.Insertion, States.Deletion]
        background = self.hmm.layers[1][States.Match].background
        for layer in self.hmm.layers:
            rank = layer.rank
            for state in states:
                if state not in layer:

                    if state is States.Deletion:
                        # Add a new deletion state
                        deletion = State(States.Deletion)
                        deletion.rank = rank
                        layer.append(deletion)

                    elif state is States.Insertion:
                        # Add a new insertion state
                        insertion = State(States.Insertion,
                                          emit=csb.core.Enum.members(
                                              sequence.SequenceAlphabets.Protein))
                        insertion.background.set(background)
                        insertion.emission.set(background)
                        insertion.rank = rank
                        layer.append(insertion)

        if not self.hmm.start_insertion:
            insertion = State(States.Insertion,
                              emit=csb.core.Enum.members(
                                  sequence.SequenceAlphabets.Protein))
            insertion.background.set(background)
            insertion.emission.set(background)
            insertion.rank = 0
            self.hmm.start_insertion = insertion

        # Make the HMM completely connected
        for i in range(1, self.hmm.layers.length):
            layer = self.hmm.layers[i]
            # Start with the match state
            state = layer[States.Match]
            if States.Insertion not in state.transitions:
                state.transitions.append(
                    Transition(state, self.hmm.layers[i][States.Insertion], 0.0))
            if States.Deletion not in state.transitions:
                state.transitions.append(
                    Transition(state, self.hmm.layers[i + 1][States.Deletion], 0.0))
            state = layer[States.Insertion]
            if States.Insertion not in state.transitions:
                state.transitions.append(
                    Transition(state, self.hmm.layers[i][States.Insertion], 0.0))
            if States.Match not in state.transitions:
                state.transitions.append(
                    Transition(state, self.hmm.layers[i + 1][States.Match], 0.0))
            state = layer[States.Deletion]
            if States.Deletion not in state.transitions:
                state.transitions.append(
                    Transition(state, self.hmm.layers[i + 1][States.Deletion], 0.0))
            if States.Match not in state.transitions:
                state.transitions.append(
                    Transition(state, self.hmm.layers[i + 1][States.Match], 0.0))

        # Start layer
        state = self.hmm.start
        if States.Insertion not in self.hmm.start.transitions:
            state.transitions.append(
                Transition(self.hmm.start, self.hmm.start_insertion, 0.0))
        if States.Deletion not in self.hmm.start.transitions:
            state.transitions.append(
                Transition(self.hmm.start, self.hmm.layers[1][States.Deletion], 0.0))

        state = self.hmm.start_insertion
        if States.Insertion not in self.hmm.start_insertion.transitions:
            state.transitions.append(
                Transition(self.hmm.start_insertion, self.hmm.start_insertion, 0.0))
        if States.Match not in self.hmm.start_insertion.transitions:
            state.transitions.append(
                Transition(self.hmm.start_insertion, self.hmm.layers[1][States.Match], 0.0))

        # Last layer
        state = self.hmm.layers[-1][States.Match]
        if States.Insertion not in state.transitions:
            state.transitions.append(
                Transition(state, self.hmm.layers[-1][States.Insertion], 0.0))
        state = self.hmm.layers[-1][States.Insertion]
        if States.Insertion not in state.transitions:
            state.transitions.append(
                Transition(state, self.hmm.layers[-1][States.Insertion], 0.0))
        if States.End not in state.transitions:
            state.transitions.append(
                Transition(state, self.hmm.end, 0.0))
        state = self.hmm.layers[-1][States.Deletion]
        if States.End not in state.transitions:
            state.transitions.append(
                Transition(state, self.hmm.end, 0.0))

        # Now we have a fully connected HMM; next, calculate and add the
        # pseudocounts. These constants are taken from hhsearch; their
        # derivation is not documented there.
        pc_MD = pc_MI = 0.0286 * gapd
        pc_MM = 1. - 2 * pc_MD
        pc_DD = pc_II = gape / (gape - 1 + 1 / 0.75)
        pc_DM = pc_IM = 1. - pc_II

        # Get the current transition probabilities of the start state
        t_mm = self.hmm.start.transitions[States.Match].probability
        t_mi = self.hmm.start.transitions[States.Insertion].probability
        t_md = self.hmm.start.transitions[States.Deletion].probability

        # Transitions from the match state
        n_eff = self.hmm.effective_matches

        t = array([(n_eff - 1) * t_mm + gapb * pc_MM,
                   (n_eff - 1) * t_mi + gapb * pc_MI,
                   (n_eff - 1) * t_md + gapb * pc_MD])
        # Normalize to one
        t /= t.sum()
        # Set
        self.hmm.start.transitions[States.Match].probability = t[0]
        self.hmm.start.transitions[States.Insertion].probability = t[1]
        self.hmm.start.transitions[States.Deletion].probability = t[2]

        # Rinse and repeat for the start insertion state
        t_im = self.hmm.start_insertion.transitions[States.Match].probability
        t_ii = self.hmm.start_insertion.transitions[States.Insertion].probability

        t = array([t_im + gapb * pc_IM, t_ii + gapb * pc_II])
        t /= t.sum()

        self.hmm.start_insertion.transitions[States.Match].probability = t[0]
        self.hmm.start_insertion.transitions[States.Insertion].probability = t[1]

        # And now for all layers
        for layer in self.hmm.layers[:-1]:
            # Get the current transition probabilities
            t_mm = layer[States.Match].transitions[States.Match].probability
            t_mi = layer[States.Match].transitions[States.Insertion].probability
            t_md = layer[States.Match].transitions[States.Deletion].probability
            n_eff = layer.effective_matches
            t = array([(n_eff - 1) * t_mm + gapb * pc_MM,
                       (n_eff - 1) * t_mi + gapb * pc_MI,
                       (n_eff - 1) * t_md + gapb * pc_MD])
            # Normalize to one
            t /= t.sum()
            layer[States.Match].transitions[States.Match].probability = t[0]
            layer[States.Match].transitions[States.Insertion].probability = t[1]
            layer[States.Match].transitions[States.Deletion].probability = t[2]

            # Transitions from the insertion state
            t_im = layer[States.Insertion].transitions[States.Match].probability
            t_ii = layer[States.Insertion].transitions[States.Insertion].probability
            n_eff = layer.effective_insertions
            t = array([t_im * n_eff + gapb * pc_IM,
                       t_ii * n_eff + gapb * pc_II])
            # Normalize to one
            t /= t.sum()
            layer[States.Insertion].transitions[States.Match].probability = t[0]
            layer[States.Insertion].transitions[States.Insertion].probability = t[1]

            # Transitions from the deletion state
            t_dm = layer[States.Deletion].transitions[States.Match].probability
            t_dd = layer[States.Deletion].transitions[States.Deletion].probability
            n_eff = layer.effective_deletions
            t = array([t_dm * n_eff + gapb * pc_DM,
                       t_dd * n_eff + gapb * pc_DD])
            # Normalize to one
            t /= t.sum()
            layer[States.Deletion].transitions[States.Match].probability = t[0]
            layer[States.Deletion].transitions[States.Deletion].probability = t[1]

        # Last layer
        layer = self.hmm.layers[-1]
        t_mm = layer[States.Match].transitions[States.End].probability
        t_mi = layer[States.Match].transitions[States.Insertion].probability
        n_eff = layer.effective_matches
        # No deletion
        t = array([(n_eff - 1) * t_mm + gapb * pc_MM,
                   (n_eff - 1) * t_mi + gapb * pc_MI])
        # Normalize to one
        t /= t.sum()
        layer[States.Match].transitions[States.End].probability = t[0]
        layer[States.Match].transitions[States.Insertion].probability = t[1]

        # Transitions from the insertion state
        t_im = layer[States.Insertion].transitions[States.End].probability
        t_ii = layer[States.Insertion].transitions[States.Insertion].probability
        n_eff = layer.effective_insertions
        t = array([t_im * n_eff + gapb * pc_IM,
                   t_ii * n_eff + gapb * pc_II])
        # Normalize to one
        t /= t.sum()
        layer[States.Insertion].transitions[States.End].probability = t[0]
        layer[States.Insertion].transitions[States.Insertion].probability = t[1]

        layer[States.Deletion].transitions[States.End].probability = 1.

        self.hmm.transition_pseudocounts = True
        return
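
    # A compact summary of the transition admixture above: transitions leaving
    # a match state are re-estimated from
    #     t_new proportional to (Neff_M - 1) * t_old + gapb * pc
    # and transitions leaving insertion or deletion states from
    #     t_new proportional to Neff_I * t_old + gapb * pc   (Neff_D for deletions),
    # with each set of outgoing transitions renormalized to sum to one.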
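
A minimal usage sketch, assuming a profile HMM parsed from an HHpred .hhm
file with csb.bio.io.hhpred.HHProfileParser (the parser call and the file
name are illustrative assumptions; only PseudocountBuilder and its two
methods are defined in this module):

    from csb.bio.io.hhpred import HHProfileParser
    from csb.bio.hmm.pseudocounts import PseudocountBuilder

    # The input file is a placeholder; any parsed ProfileHMM instance works
    hmm = HHProfileParser("query.hhm").parse()

    builder = PseudocountBuilder(hmm)
    builder.add_emission_pseudocounts()       # smooth the match emissions
    builder.add_transition_pseudocounts()     # smooth the transitions

    assert hmm.emission_pseudocounts and hmm.transition_pseudocounts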