1 """
2 HHfrag: build a dynamic variable-length fragment library for protein structure
3 prediction with Rosetta AbInitio.
4 """
5
6 import os
7 import multiprocessing
8
9 import csb.apps
10 import csb.apps.hhsearch as hhsearch
11
12 import csb.bio.io.hhpred
13 import csb.bio.fragments
14 import csb.bio.fragments.rosetta as rosetta
15 import csb.bio.structure
16
17 import csb.io.tsv
18 import csb.core
27
30
31 @property
34
36
37 cmd = csb.apps.ArgHandler(self.program, __doc__)
38 cpu = multiprocessing.cpu_count()
39
40 cmd.add_scalar_option('hhsearch', 'H', str, 'path to the HHsearch executable', default='hhsearch')
41 cmd.add_scalar_option('database', 'd', str, 'database directory (containing PDBS25.hhm)', required=True)
42
43 cmd.add_scalar_option('min', 'm', int, 'minimum query segment length', default=6)
44 cmd.add_scalar_option('max', 'M', int, 'maximum query segment length', default=21)
45 cmd.add_scalar_option('step', 's', int, 'query segmentation step', default=3)
46 cmd.add_scalar_option('cpu', 'c', int, 'maximum degree of parallelism', default=cpu)
47
48 cmd.add_scalar_option('gap-filling', 'g', str, 'path to a fragment file (e.g. CSfrag or Rosetta NNmake), which will be used '
49 'to complement low-confidence regions (when specified, a hybrid fragment library be produced)')
50 cmd.add_scalar_option('filtered-filling', 'F', str, 'path to a filtered fragment file (e.g. filtered CSfrag-ments), which will '
51 'be mixed with the HHfrag-set and then filtered, resulting in a double-filtered library')
52 cmd.add_boolean_option('filtered-map', 'f', 'make an additional filtered fragment map of centroids and predict torsion angles', default=False)
53 cmd.add_boolean_option('c-alpha', None, 'include also C-alpha vectors in the output', default=False)
54 cmd.add_scalar_option('confidence-threshold', 't', float, 'confidence threshold for gap filling', default=0.7)
55
56 cmd.add_scalar_option('verbosity', 'v', int, 'verbosity level', default=2)
57 cmd.add_scalar_option('output', 'o', str, 'output directory', default='.')
58
59 cmd.add_positional_argument('QUERY', str, 'query profile HMM (e.g. created with csb.apps.buildhmm)')
60
61 return cmd
62
65
67 if not os.path.isdir(self.args.output):
68 HHfragApp.exit('Output directory does not exist', code=ExitCodes.INVALID_DATA, usage=True)
69
70 if self.args.c_alpha:
71 builder = rosetta.ExtendedOutputBuilder
72 else:
73 builder = rosetta.OutputBuilder
74
75 try:
76 hhf = HHfrag(self.args.QUERY, self.args.hhsearch, self.args.database, logger=self)
77 output = os.path.join(self.args.output, hhf.query.id)
78
79 hhf.slice_query(self.args.min, self.args.max, self.args.step, self.args.cpu)
80 frags = hhf.extract_fragments()
81
82 if len(frags) == 0:
83 HHfragApp.exit('No fragments found!', code=ExitCodes.NO_OUTPUT)
84
85 fragmap = hhf.build_fragment_map()
86 fragmap.dump(output + '.hhfrags.09', builder)
87
88 if self.args.filtered_map:
89 fragmap, events = hhf.build_filtered_map()
90 fragmap.dump(output + '.filtered.09', builder)
91 tsv = PredictionBuilder.create(events).product
92 tsv.dump(output + '.centroids.tsv')
93
94 if self.args.filtered_filling:
95 fragmap, events = hhf.build_hybrid_filtered_map(self.args.filtered_filling)
96 fragmap.dump(output + '.hybrid.filtered.09', builder)
97 tsv = PredictionBuilder.create(events).product
98 tsv.dump(output + '.hybrid.centroids.tsv')
99
100 if self.args.gap_filling:
101 fragmap = hhf.build_combined_map(self.args.gap_filling, self.args.confidence_threshold)
102 fragmap.dump(output + '.complemented.09', builder)
103
104
105 self.log('\nDONE.')
106
107 except ArgumentIOError as ae:
108 HHfragApp.exit(str(ae), code=ExitCodes.IO_ERROR)
109
110 except ArgumentError as ae:
111 HHfragApp.exit(str(ae), code=ExitCodes.INVALID_DATA)
112
113 except csb.io.InvalidCommandError as ose:
114 msg = '{0!s}: {0.program}'.format(ose)
115 HHfragApp.exit(msg, ExitCodes.IO_ERROR)
116
117 except csb.bio.io.hhpred.HHProfileFormatError as hfe:
118 msg = 'Corrupt HMM: {0!s}'.format(hfe)
119 HHfragApp.exit(msg, code=ExitCodes.INVALID_DATA)
120
121 except csb.io.ProcessError as pe:
122 message = 'Bad exit code from HHsearch: #{0.code}.\nSTDERR: {0.stderr}\nSTDOUT: {0.stdout}'.format(pe.context)
123 HHfragApp.exit(message, ExitCodes.HHSEARCH_FAILURE)
124
125
126
127 - def log(self, message, ending='\n', level=1):
131
134
137
140
143 """
144 The HHfrag dynamic fragment detection protocol.
145
146 @param query: query HMM path and file name
147 @type query: str
148 @param binary: the HHsearch binary
149 @type binary: str
150 @param database: path to the PDBS25 directory
151 @type database: str
152 @param logger: logging client (needs to have a C{log} method)
153 @type logger: L{Application}
154 """
155
156 PDBS = 'pdbs25.hhm'
157
158 - def __init__(self, query, binary, database, logger=None):
177
178 @property
181
182 @property
185
186 @property
188 return self._database
189 @database.setter
197
198 @property
201 @aligner.setter
203 if hasattr(value, 'run') and hasattr(value, 'runmany'):
204 self._aligner = value
205 else:
206 raise TypeError(value)
207
208 - def log(self, *a, **ka):
209
210 if self._app:
211 self._app.log(*a, **ka)
212
213 - def slice_query(self, min=6, max=21, step=3, cpu=None):
214 """
215 Run the query slicer and collect the optimal query segments.
216
217 @param min: min segment length
218 @type min: int
219 @param max: max segment length
220 @type max: int
221 @param step: slicing step
222 @type step: int
223 @param cpu: degree of parallelism
224 @type cpu: int
225
226 @rtype: tuple of L{SliceContext}
227 """
228
229 if not 0 < min <= max:
230 raise ArgumentError('min and max must be positive numbers, with max >= min')
231 if not 0 < step:
232 raise ArgumentError('step must be positive number')
233
234 self.log('\n# Processing profile HMM "{0}"...'.format(self.query.id))
235 self.log('', level=2)
236 qp = self.query
237 hsqs = []
238
239 if not cpu:
240 cpu = max - min + 1
241
242 for start in range(1, qp.layers.length - min + 1 + 1, step):
243
244 self.log('{0:3}. '.format(start), ending='', level=1)
245 probes = []
246
247 for end in range(start + min - 1, start + max):
248 if end > qp.layers.length:
249 break
250 context = SliceContext(qp.segment(start, end), start, end)
251 probes.append(context)
252
253 probes = self.aligner.runmany(probes, workers=cpu)
254 probes.sort()
255
256 if len(probes) > 0:
257 rep = probes[-1]
258 hsqs.append(rep)
259 self.log('{0.start:3} {0.end:3} ({0.length:2} aa) {0.recurrence:3} hits'.format(rep), level=1)
260 else:
261 self.log(' no hits', level=1)
262
263 self._hsqs = hsqs
264 return tuple(hsqs)
265
267 """
268 Extract all matching fragment instances, given the list of optimal
269 query slices, generated during the first stage.
270
271 @rtype: tuple of L{Assignment}s
272 """
273
274 if self._hsqs is None:
275 self.slice_query()
276
277 self.log('\n# Extracting fragments...')
278 fragments = []
279
280 for si in self._hsqs:
281 self.log('\nSEGMENT: {0.start:3} {0.end:3} ({0.recurrence})'.format(si), level=2)
282
283 for hit in si.hits:
284 cn = 0
285 for chunk in hit.alignment.segments:
286 chunk.qstart = chunk.qstart + si.start - 1
287 chunk.qend = chunk.qend + si.start - 1
288 cn += 1
289 self.log(' {0.id:5} L{0.qstart:3} {0.qend:3} {0.length:2}aa P:{0.probability:5.3f}'.format(chunk), ending='', level=2)
290
291 sourcefile = os.path.join(self.database, hit.id + '.pdb')
292 if not os.path.isfile(sourcefile):
293 self.log(' missing', level=2)
294 continue
295 source = csb.bio.io.StructureParser(sourcefile).parse().first_chain
296 assert hit.id[-1] == source.id
297
298 source.compute_torsion()
299 try:
300 fragment = csb.bio.fragments.Assignment(source, chunk.start, chunk.end,
301 chunk.qstart, chunk.qend,
302 probability=chunk.probability)
303 fragments.append(fragment)
304 if cn > 1:
305 self.log(' (chunk #{0})'.format(cn), level=2)
306 else:
307 self.log('', level=2)
308
309 except csb.bio.structure.Broken3DStructureError:
310 self.log(' corrupt', level=2)
311 continue
312
313 self._matches = fragments
314 return tuple(fragments)
315
317
318 self.log('\n {0} ungapped assignments'.format(len(self._matches)))
319 self.log('', level=2)
320
321 histogram = {}
322 for f in self._matches:
323 histogram[f.length] = histogram.get(f.length, 0) + 1
324
325 for length in sorted(histogram):
326
327 percent = histogram[length] * 100.0 / len(self._matches)
328 bar = '{0:3} |{1} {2:5.1f}%'.format(length, 'o' * int(percent), percent)
329 self.log(bar, level=2)
330
348
350
351 if ri.gap is True or ri.confident is False:
352 self.log('{0.rank:3}. {0.confidence:5.3f} {0.count:3}'.format(ri), level=2)
353
354 else:
355 phi = PredictionBuilder.format_angle(ri.torsion.phi)
356 psi = PredictionBuilder.format_angle(ri.torsion.psi)
357 omega = PredictionBuilder.format_angle(ri.torsion.omega)
358
359 pred = "{0.source_id:5} {0.start:3} {0.end:3} {1} {2} {3}".format(ri.rep, phi, psi, omega)
360 self.log('{0.rank:3}. {0.confidence:5.3f} {0.count:3} {1}'.format(ri, pred), level=2)
361
363 """
364 Build a filtered fragset of centroids.
365 @return: filtered fragset and a list of residue-wise predictions
366 (centroid and torsion angles)
367 @rtype: L{RosettaFragmentMap}, list of L{ResidueEventInfo}
368 """
369
370 if self._matches is None:
371 self.extract_fragments()
372
373 self.log('\n# Building filtered map...')
374 self.log('\n Confidence Recurrence Representative Phi Psi Omega', level=2)
375
376 events = []
377 def logger(ri):
378 events.append(ri)
379 self._filter_event_handler(ri)
380
381 target = csb.bio.fragments.Target.from_profile(self.query)
382 target.assignall(self._matches)
383
384 factory = csb.bio.fragments.RosettaFragsetFactory()
385 fragset = factory.make_filtered(target, extend=True, callback=logger)
386
387 return fragset, events
388
390
391 marked = ""
392
393 if ri.gap is True or ri.confident is False:
394 marked = "*"
395
396 self.log('{0.rank:3}. {0.confidence:5.3f} {0.count:3} {1:>3}'.format(ri, marked), level=2)
397
399 """
400 Build a hybrid map, where low-confidence regions are complemented
401 with the specified filling.
402
403 @param threshold: confidence threshold
404 @type threshold: float
405 @param fragfile: filling fragset (Rosetta fragment file)
406 @type fragfile: str
407
408 @return: filtered fragset and a list of residue-wise predictions
409 (centroid and torsion angles)
410 @rtype: L{RosettaFragmentMap}, list of L{ResidueEventInfo}
411 """
412
413 if self._matches is None:
414 self.extract_fragments()
415
416 self.log('\n# Building complemented map...')
417
418 try:
419 filling = rosetta.RosettaFragmentMap.read(fragfile, top=top)
420 except IOError as io:
421 raise ArgumentIOError(str(io))
422
423 self.log('\n {0} supplementary fragments loaded'.format(filling.size))
424 self.log(' Confidence Recurrence Fill?', level=2)
425
426 target = csb.bio.fragments.Target.from_profile(self.query)
427 target.assignall(self._matches)
428
429 factory = csb.bio.fragments.RosettaFragsetFactory()
430 return factory.make_combined(target, filling, threshold=threshold,
431 callback=self._merge_event_handler)
432
433
435 """
436 Mix the fragset with the specified (filtered)filling and then filter
437 the mixture. If the filling is a filtered CSfrag library, this will
438 produce a double-filtered map.
439
440 @param fragfile: filtered filling (filtered CSfrag fragment file)
441 @type fragfile: str
442
443 @rtype: L{RosettaFragmentMap}
444 """
445
446 if self._matches is None:
447 self.extract_fragments()
448
449 self.log('\n# Building hybrid filtered map...')
450
451 filling = []
452 events = []
453
454 def logger(ri):
455 events.append(ri)
456 self._filter_event_handler(ri)
457
458 try:
459 db = csb.bio.io.wwpdb.FileSystemStructureProvider(self.database)
460
461 for f in rosetta.RosettaFragmentMap.read(fragfile):
462 filling.append(csb.bio.fragments.Assignment.from_fragment(f, db))
463
464 except IOError as io:
465 raise ArgumentIOError(str(io))
466 except csb.bio.io.wwpdb.StructureNotFoundError as sne:
467 msg = "{0} is not a PDBS25-derived fragset (template {1} not found)"
468 raise ArgumentIOError(msg.format(fragfile, str(sne)))
469
470 self.log('\n {0} supplementary fragments loaded'.format(len(filling)))
471 self.log('\n Confidence Recurrence Representative Phi Psi Omega', level=2)
472
473 if len(filling) > self.query.layers.length:
474 msg = "{0} does not look like a filtered fragset (too many centroids)"
475 raise ArgumentError(msg.format(fragfile))
476
477 target = csb.bio.fragments.Target.from_profile(self.query)
478 target.assignall(self._matches)
479 target.assignall(filling)
480
481 factory = csb.bio.fragments.RosettaFragsetFactory()
482 fragset = factory.make_filtered(target, extend=False, callback=logger)
483
484 return fragset, events
485
486 -class SliceContext(hhsearch.Context):
487
488 - def __init__(self, segment, start, end):
489
490 self.start = start
491 self.end = end
492
493 if not isinstance(segment, csb.core.string):
494 segment = segment.to_hmm(convert_scores=True)
495
496 super(SliceContext, self).__init__(segment)
497
498 @property
500 return self.end - self.start + 1
501
502 @property
505
506 @property
507 - def recurrence(self):
508 return len(self.result)
509
510 - def __lt__(self, other):
511
512 if self.recurrence == other.recurrence:
513 return self.length < other.length
514 else:
515 return self.recurrence < other.recurrence
516
519
520 HEADER = "rank:int residue:str confidence:float centroid:str phi:float psi:float omega:float"
521
522 @staticmethod
533
534 @staticmethod
543
546
547 @property
549 """
550 @rtype: L{Table}
551 """
552 return self._tsv
553
572
574 """
575 @param ri: all predictions
576 @type ri: list of L{ResidueEventInfo}
577 """
578
579 ri = list(ri)
580 ri.sort(key=lambda i: i.rank)
581
582 for i in ri:
583 self.add(i)
584
588
589
590 if __name__ == '__main__':
591 main()
592