1 import sys
2 import csb.core
3 import csb.bio.sequence as sequence
4
5 from csb.bio.hmm import States, ScoreUnits, Transition, State
6
7
8 GONNET = [10227, 3430, 2875, 3869, 1625, 2393, 4590, 6500, 2352, 3225, 5819, 4172, 1435,
9 1579, 3728, 4610, 6264, 418, 1824, 5709, 3430, 7780, 2209, 2589, 584, 2369,
10 3368, 3080, 2173, 1493, 3093, 5701, 763, 859, 1893, 2287, 3487, 444, 1338, 2356,
11 2875, 2209, 3868, 3601, 501, 1541, 2956, 3325, 1951, 1065, 2012, 2879, 532, 688,
12 1480, 2304, 3204, 219, 1148, 1759, 3869, 2589, 3601, 8618, 488, 2172, 6021, 4176,
13 2184, 1139, 2151, 3616, 595, 670, 2086, 2828, 3843, 204, 1119, 2015, 1625, 584,
14 501, 488, 5034, 355, 566, 900, 516, 741, 1336, 591, 337, 549, 419, 901,
15 1197, 187, 664, 1373, 2393, 2369, 1541, 2172, 355, 1987, 2891, 1959, 1587, 1066,
16 2260, 2751, 570, 628, 1415, 1595, 2323, 219, 871, 1682, 4590, 3368, 2956, 6021,
17 566, 2891, 8201, 3758, 2418, 1624, 3140, 4704, 830, 852, 2418, 2923, 4159, 278, 1268, 2809,
18 6500, 3080, 3325, 4176, 900, 1959, 3758, 26066, 2016, 1354, 2741, 3496, 741, 797, 2369,
19 3863, 4169, 375, 1186, 2569, 2352, 2173, 1951, 2184, 516, 1587, 2418, 2016, 5409,
20 1123, 2380, 2524, 600, 1259, 1298, 1642, 2446, 383, 876, 1691, 3225, 1493, 1065,
21 1139, 741, 1066, 1624, 1354, 1123, 6417, 9630, 1858, 1975, 2225, 1260, 1558, 3131,
22 417, 1697, 7504, 5819, 3093, 2012, 2151, 1336, 2260, 3140, 2741, 2380, 9630, 25113,
23 3677, 4187, 5540, 2670, 2876, 5272, 1063, 3945, 11005, 4172, 5701, 2879, 3616, 591,
24 2751, 4704, 3496, 2524, 1858, 3677, 7430, 949, 975, 2355, 2847, 4340, 333, 1451, 2932,
25 1435, 763, 532, 595, 337, 570, 830, 741, 600, 1975, 4187, 949, 1300, 1111, 573,
26 743, 1361, 218, 828, 2310, 1579, 859, 688, 670, 549, 628, 852, 797, 1259, 2225,
27 5540, 975, 1111, 6126, 661, 856, 1498, 1000, 4464, 2602, 3728, 1893, 1480, 2086, 419,
28 1415, 2418, 2369, 1298, 1260, 2670, 2355, 573, 661, 11834, 2320, 3300, 179, 876, 2179,
29 4610, 2287, 2304, 2828, 901, 1595, 2923, 3863, 1642, 1558, 2876, 2847, 743, 856, 2320,
30 3611, 4686, 272, 1188, 2695, 6264, 3487, 3204, 3843, 1197, 2323, 4159, 4169, 2446, 3131,
31 5272, 4340, 1361, 1498, 3300, 4686, 8995, 397, 1812, 5172, 418, 444, 219, 204, 187,
32 219, 278, 375, 383, 417, 1063, 333, 218, 1000, 179, 272, 397, 4101, 1266, 499,
33 1824, 1338, 1148, 1119, 664, 871, 1268, 1186, 876, 1697, 3945, 1451, 828, 4464, 876,
34 1188, 1812, 1266, 9380, 2227, 5709, 2356, 1759, 2015, 1373, 1682, 2809, 2569, 1691, 7504,
35 11005, 2932, 2310, 2602, 2179, 2695, 5172, 499, 2227.0, 11569.0]
36 """
37 Gonnet matrix frequencies taken from HHpred
38 """
42 """
43 Constructs profile HMMs with pseudocounts.
44 """
45
48
49 @property
52
54 """
55 Port from HHpred, it uses the conditional background probabilities,
56 inferred from the Gonnet matrix.
57
58 @param tau: admission weight, i.e how much of the final score is
59 determined by the background probabilities.
60 0.0=no pseudocounts.
61 @type tau: float
62 """
63 from numpy import array, dot, transpose, clip
64
65 if self.hmm.pseudocounts or self.hmm.emission_pseudocounts:
66 return
67 if abs(tau) < 1e-6:
68 return
69
70
71 if not self.hmm.score_units == ScoreUnits.Probability:
72 self.hmm.convert_scores(units=ScoreUnits.Probability)
73
74 alphabet = csb.core.Enum.values(sequence.StdProteinAlphabet)
75
76
77 s_mat = array(GONNET)
78
79 s_mat /= s_mat.sum()
80 s_mat = s_mat.reshape((len(alphabet), len(alphabet)))
81
82 s_marginal = s_mat.sum(-1)
83 s_conditional = s_mat / s_marginal
84
85 em = array([ [layer[States.Match].emission[aa] or 0.0 for aa in alphabet]
86 for layer in self.hmm.layers])
87
88 em = clip(em, sys.float_info.min, 1.)
89
90 neff_m = array([l.effective_matches for l in self.hmm.layers])
91
92 g = dot(em, transpose(s_conditional))
93
94 if neff_m is not None:
95 tau = clip(pca / (1. + (neff_m / pcb) ** pcc), 0.0, pcc)
96 e = transpose((1. - tau) * transpose(em) + tau * transpose(g))
97 else:
98 e = (1. - tau) * em + tau * g
99
100
101 e = transpose(transpose(e) / e.sum(-1))
102
103 for i, layer in enumerate(self.hmm.layers):
104 layer[States.Match].emission.set(dict(zip(alphabet, e[i])))
105
106 self.hmm.emission_pseudocounts = True
107 return
108
109
110
112 """
113 Add pseudocounts to the transitions. A port from hhsearch
114 -gapb 1.0 -gapd 0.15 -gape 1.0 -gapf 0.6 -gapg 0.6 -gapi 0.6
115 """
116
117 from numpy import array
118
119 if not self.hmm._score_units == ScoreUnits.Probability:
120 self.hmm.convert_scores(units=ScoreUnits.Probability)
121
122 if self.hmm.pseudocounts or self.hmm.transition_pseudocounts:
123 return
124
125
126 states = [States.Match, States.Insertion, States.Deletion]
127 background = self.hmm.layers[1][States.Match].background
128 for layer in self.hmm.layers:
129 rank = layer.rank
130 for state in states:
131 if state not in layer:
132
133 if state is States.Deletion:
134
135 deletion = State(States.Deletion)
136 deletion.rank = rank
137 layer.append(deletion)
138
139 elif state is States.Insertion:
140
141 insertion = State(States.Insertion,
142 emit=csb.core.Enum.members(
143 sequence.SequenceAlphabets.Protein))
144 insertion.background.set(background)
145 insertion.emission.set(background)
146 insertion.rank = rank
147 layer.append(insertion)
148
149 if not self.hmm.start_insertion:
150 insertion = State(States.Insertion,
151 emit=csb.core.Enum.members(
152 sequence.SequenceAlphabets.Protein))
153 insertion.background.set(background)
154 insertion.emission.set(background)
155 insertion.rank = 0
156 self.hmm.start_insertion = insertion
157
158
159 for i in range(1, self.hmm.layers.length):
160 layer = self.hmm.layers[i]
161
162 state = layer[States.Match]
163 if not States.Insertion in state.transitions:
164 state.transitions.append(Transition(state,
165 self.hmm.layers[i][States.Insertion],
166 0.0))
167 if not States.Deletion in state.transitions:
168 state.transitions.append(Transition(state,
169 self.hmm.layers[i + 1][States.Deletion],
170 0.0))
171 state = layer[States.Insertion]
172 if not States.Insertion in state.transitions:
173 state.transitions.append(Transition(state,
174 self.hmm.layers[i][States.Insertion],
175 0.0))
176 if not States.Match in state.transitions:
177 state.transitions.append(Transition(state,
178 self.hmm.layers[i + 1][States.Match],
179 0.0))
180 state = layer[States.Deletion]
181 if not States.Deletion in state.transitions:
182 state.transitions.append(Transition(state,
183 self.hmm.layers[i + 1][States.Deletion],
184 0.0))
185 if not States.Match in state.transitions:
186 state.transitions.append(Transition(state,
187 self.hmm.layers[i + 1][States.Match],
188 0.0))
189
190 state = self.hmm.start
191 if not States.Insertion in self.hmm.start.transitions:
192 state.transitions.append(Transition(self.hmm.start,
193 self.hmm.start_insertion,
194 0.0))
195 if not States.Deletion in self.hmm.start.transitions:
196 state.transitions.append(Transition(self.hmm.start,
197 self.hmm.layers[1][States.Deletion],
198 0.0))
199
200 state = self.hmm.start_insertion
201 if not States.Insertion in self.hmm.start_insertion.transitions:
202 state.transitions.append(Transition(self.hmm.start_insertion,
203 self.hmm.start_insertion,
204 0.0))
205 if not States.Match in self.hmm.start_insertion.transitions:
206 state.transitions.append(Transition(self.hmm.start_insertion,
207 self.hmm.layers[1][States.Match],
208 0.0))
209
210
211 state = self.hmm.layers[-1][States.Match]
212 if not States.Insertion in state.transitions:
213 state.transitions.append(Transition(state,
214 self.hmm.layers[-1][States.Insertion],
215 0.0))
216 state = self.hmm.layers[-1][States.Insertion]
217 if not States.Insertion in state.transitions:
218 state.transitions.append(Transition(state,
219 self.hmm.layers[-1][States.Insertion],
220 0.0))
221
222 if not States.End in state.transitions:
223 state.transitions.append(Transition(state,
224 self.hmm.end,
225 0.0))
226 state = self.hmm.layers[-1][States.Deletion]
227 if not States.End in state.transitions:
228 state.transitions.append(Transition(state,
229 self.hmm.end,
230 0.0))
231
232
233
234
235
236
237
238
239 pc_MD = pc_MI = 0.0286 * gapd
240 pc_MM = 1. - 2 * pc_MD
241 pc_DD = pc_II = gape / (gape - 1 + 1 / 0.75)
242 pc_DM = pc_IM = 1. - pc_II
243
244
245
246 t_mm = self.hmm.start.transitions[States.Match].probability
247 t_mi = self.hmm.start.transitions[States.Insertion].probability
248 t_md = self.hmm.start.transitions[States.Deletion].probability
249
250
251 n_eff = self.hmm.effective_matches
252
253 t = array([(n_eff - 1) * t_mm + gapb * pc_MM,
254 (n_eff - 1) * t_mi + gapb * pc_MI,
255 (n_eff - 1) * t_md + gapb * pc_MD])
256
257 t /= t.sum()
258
259 self.hmm.start.transitions[States.Match].probability = t[0]
260 self.hmm.start.transitions[States.Insertion].probability = t[1]
261 self.hmm.start.transitions[States.Deletion].probability = t[2]
262
263
264 t_im = self.hmm.start_insertion.transitions[States.Match].probability
265 t_ii = self.hmm.start_insertion.transitions[States.Insertion].probability
266
267 t = array([t_im + gapb * pc_IM, t_ii + gapb * pc_II])
268 t /= t.sum()
269
270 self.hmm.start_insertion.transitions[States.Match].probability = t[0]
271 t_ii = self.hmm.start_insertion.transitions[States.Insertion].probability = t[1]
272
273
274 for layer in self.hmm.layers[:-1]:
275
276 t_mm = layer[States.Match].transitions[States.Match].probability
277 t_mi = layer[States.Match].transitions[States.Insertion].probability
278 t_md = layer[States.Match].transitions[States.Deletion].probability
279 n_eff = layer.effective_matches
280 t = array([(n_eff - 1) * t_mm + gapb * pc_MM,
281 (n_eff - 1) * t_mi + gapb * pc_MI,
282 (n_eff - 1) * t_md + gapb * pc_MD])
283
284 t /= t.sum()
285 layer[States.Match].transitions[States.Match].probability = t[0]
286 layer[States.Match].transitions[States.Insertion].probability = t[1]
287 layer[States.Match].transitions[States.Deletion].probability = t[2]
288
289
290 t_im = layer[States.Insertion].transitions[States.Match].probability
291 t_ii = layer[States.Insertion].transitions[States.Insertion].probability
292 n_eff = layer.effective_insertions
293 t = array([t_im * n_eff + gapb * pc_IM,
294 t_im * n_eff + gapb * pc_II])
295
296 t /= t.sum()
297 layer[States.Insertion].transitions[States.Match].probability = t[0]
298 layer[States.Insertion].transitions[States.Insertion].probability = t[1]
299
300
301 t_dm = layer[States.Deletion].transitions[States.Match].probability
302 t_dd = layer[States.Deletion].transitions[States.Deletion].probability
303 n_eff = layer.effective_deletions
304 t = array([t_dm * n_eff + gapb * pc_DM,
305 t_dd * n_eff + gapb * pc_DD])
306
307 t /= t.sum()
308 layer[States.Deletion].transitions[States.Match].probability = t[0]
309 layer[States.Deletion].transitions[States.Deletion].probability = t[1]
310
311
312
313 layer = self.hmm.layers[-1]
314 t_mm = layer[States.Match].transitions[States.End].probability
315 t_mi = layer[States.Match].transitions[States.Insertion].probability
316 n_eff = layer.effective_matches
317
318 t = array([(n_eff - 1) * t_mm + gapb * pc_MM,
319 (n_eff - 1) * t_mi + gapb * pc_MI])
320
321 t /= t.sum()
322 layer[States.Match].transitions[States.End].probability = t[0]
323 layer[States.Match].transitions[States.Insertion].probability = t[1]
324
325
326 t_im = layer[States.Insertion].transitions[States.End].probability
327 t_ii = layer[States.Insertion].transitions[States.Insertion].probability
328 n_eff = layer.effective_insertions
329 t = array([t_im * n_eff + gapb * pc_IM,
330 t_im * n_eff + gapb * pc_II])
331
332 t /= t.sum()
333 layer[States.Insertion].transitions[States.End].probability = t[0]
334 layer[States.Insertion].transitions[States.Insertion].probability = t[1]
335
336 layer[States.Deletion].transitions[States.End].probability = 1.
337
338 self.hmm.transition_pseudocounts = True
339 return
340