1 """
2 CSfrag: build a dynamic library of analogous fragments, given a list
3 of assigned chemical shifts.
4 """
5
6 import os
7 import numpy
8 import multiprocessing
9
10 import csb.io
11 import csb.apps
12
13 from csb.bio.io.wwpdb import FileSystemStructureProvider, StructureNotFoundError, PDBParseError
14 from csb.bio.nmr import RandomCoil, ChemShiftScoringModel
15 from csb.bio.structure import Chain, Broken3DStructureError
16 from csb.bio.fragments import ChemShiftTarget, ChemShiftAssignment, RosettaFragsetFactory
17 from csb.bio.io.cs import ChemShiftReader, ChemShiftFormatError
18 from csb.bio.io.fasta import SequenceParser, SequenceFormatError
26
28
29 @property
32
34
35 cmd = csb.apps.ArgHandler(self.program, __doc__)
36 cpu = multiprocessing.cpu_count()
37
38 cmd.add_scalar_option('database', 'd', str, 'PDBS25 database directory (containing PDBS25cs.scs)', required=True)
39 cmd.add_scalar_option('shifts', 's', str, 'assigned chemical shifts table (NMR STAR file fragment)', required=True)
40
41 cmd.add_scalar_option('window', 'w', int, 'sliding window size', default=8)
42 cmd.add_scalar_option('top', 't', int, 'maximum number per starting position', default=25)
43 cmd.add_scalar_option('cpu', 'c', int, 'maximum degree of parallelism', default=cpu)
44
45 cmd.add_scalar_option('verbosity', 'v', int, 'verbosity level', default=1)
46 cmd.add_scalar_option('output', 'o', str, 'output directory', default='.')
47 cmd.add_boolean_option('filtered-map', 'f', 'make an additional filtered fragment map of centroids', default=False)
48
49 cmd.add_positional_argument('QUERY', str, 'query sequence (FASTA file)')
50
51 return cmd
52
54
56 if not os.path.isdir(self.args.output):
57 CSfragApp.exit('Output directory does not exist', code=ExitCodes.INVALID_DATA, usage=True)
58
59 try:
60 csf = CSfrag(self.args.QUERY, self.args.shifts, self.args.database, self.args.window, logger=self)
61 output = os.path.join(self.args.output, csf.query.accession)
62
63 frags = csf.extract_fragments(self.args.top, self.args.cpu)
64
65 if len(frags) == 0:
66 CSfragApp.exit('No fragments found!', code=ExitCodes.NO_OUTPUT)
67
68 fragmap = csf.build_fragment_map()
69 fragmap.dump(output + '.csfrags.08')
70
71 if self.args.filtered_map:
72 fragmap = csf.build_filtered_map()
73 fragmap.dump(output + '.filtered.08')
74
75 self.log('\nDONE.')
76
77 except ArgumentIOError as ae:
78 CSfragApp.exit(str(ae), code=ExitCodes.IO_ERROR)
79
80 except ArgumentError as ae:
81 CSfragApp.exit(str(ae), code=ExitCodes.INVALID_DATA, usage=True)
82
83 except ChemShiftFormatError as ce:
84 msg = "Can't parse input chemical shifts: " + str(ce)
85 CSfragApp.exit(msg, code=ExitCodes.INVALID_DATA)
86
87
88 - def log(self, message, ending='\n', level=1):
92
95 """
96 Helper, which reads assigned shifts from NMR STAR files and calculates
97 corrected secondary shifts.
98 """
99
125
127 """
128 Reads secondary shifts from files in CSfrag format.
129 """
130
131 DB = 'pdbs25cs.scs'
132
134 """
135 Read secondary shifts.
136 @param string: complete secondary shift block
137 @type string: str
138
139 @return: dictionary of the form: [rank: [nucleus: sec shift]]
140 @rtype: dict
141 """
142
143 shifts = {}
144
145 for l in string.splitlines():
146
147 if l.startswith('#') or not l.strip():
148 continue
149
150 l = l.split('\t')
151 rank = int(l[0])
152
153 for n, cs in zip(ChemShiftScoringModel.NUCLEI, l[1:]):
154 if cs != '':
155 shifts.setdefault(rank, {})[n] = float(cs)
156
157 return shifts
158
160 """
161 Read the entire PDBS25CS database.
162
163 @return: dictionary of the form: [entry ID: [rank: [nucleus: sec shift]]]
164 @rtype: dict
165 """
166
167 db = {}
168 file = os.path.join(path, file)
169
170 with open(file) as stream:
171 er = csb.io.EntryReader(stream, '#', None)
172
173 for e in er.entries():
174 entry = e[10:15]
175 db[entry] = self.read_shifts(e)
176
177 return db
178
180
185
186 @property
189
190 - def score(self, qcs, scs, qstart, qend, sstart, send):
228
229 - def offsets(self, start, end, window=6):
240
244
247
250
253 """
254 @param query: query FASTA sequence path and file name
255 @type query: str
256 @param cstable: file, containing the table of assigned experimental chemical shifts
257 @type cstable: str
258 @param database: path to the PDBS25 directory
259 @type database: str
260 @param logger: logging client (needs to have a C{log} method)
261 @type logger: L{Application}
262 """
263
264 - def __init__(self, query, cstable, database, window=8, logger=None):
298
299 @property
302
303 @property
305 return self._database
306 @database.setter
314
315 @property
318 @window.setter
324
325 - def log(self, *a, **ka):
326 if self._app:
327 self._app.log(*a, **ka)
328
330 """
331 Extract fragments with matching chemical shifts using a sliding window.
332
333 @param top: L{MatchTable} capacity per starting position
334 @type top: int
335 @param cpu: degree of parallelism
336 @type cpu: int
337
338 @rtype: tuple of L{ChemShiftAssignment}s
339 """
340 self.log("# Reading chemical shifts...", level=1)
341 db = SecondaryShiftReader().load_database(self.database)
342 matches = MatchTable(self.query.length, capacity=top)
343
344 slices = []
345 fragments = []
346
347 for qs in range(1, self.query.length + 1):
348 qe = qs + self.window - 1
349 if qe > self.query.length:
350 break
351
352 slices.append((qs, qe))
353
354 self.log("\n# Processing target {0}...".format(self.query.accession), level=1)
355 pool = multiprocessing.Pool(cpu)
356
357 try:
358 for subject in db:
359 tasks = []
360
361 for qs, qe in slices:
362 task = pool.apply_async(_task, [self._helper, subject, qs, qe, self._qcs, db[subject]])
363 tasks.append(task)
364
365 for task in tasks:
366 for result in task.get():
367 if result.score > ChemShiftAssignment.BIT_SCORE_THRESHOLD * self.window:
368 matches.add(result)
369
370 except KeyboardInterrupt:
371 pass
372 finally:
373 pool.terminate()
374
375 for rank in matches:
376 msg = '{0:3} {1:3} ({2:2} aa) {3:3} fragments'
377 self.log(msg.format(rank, rank + self.window - 1, self.window, len(matches[rank])),
378 level=1)
379
380
381 self.log("\n# Extracting fragments...")
382
383 for group in matches.by_source:
384 try:
385 source_id = group[0].entry_id
386 source = self._pdb.get(source_id).first_chain
387 source.compute_torsion()
388
389 for match in group:
390 try:
391 row = ' {0.entry_id:5} L{0.qs:3} {0.qe:3} {1}aa S:{0.score:5.1f}'
392 self.log(row.format(match, self.window), ending='', level=2)
393
394 fragment = ChemShiftAssignment(source=source, start=match.ss, end=match.se,
395 qstart=match.qs, qend=match.qe,
396 window=self.window, rmsd=None, score=match.score)
397 fragments.append(fragment)
398 self.log('', level=2)
399
400 except Broken3DStructureError:
401 self.log(' corrupt', level=2)
402 continue
403 except PDBParseError:
404 continue
405 except StructureNotFoundError:
406 self.log(" Warning: Template {0} is missing!".format(source_id))
407
408 self._matches = fragments
409 return tuple(fragments)
410
427
444
446
447 - def __init__(self, entry_id, qs, qe, ss, se, score):
455
457 return '{0.qs:4} {0.qe:4} {0.ss:4} {0.se:4} {0.score:10.3f}'.format(self)
458
461
463
464 - def __init__(self, length, capacity=25):
465
466 if capacity < 1:
467 capacity = 1
468
469 self._capacity = capacity
470 self._length = length
471 self._t = {}
472
473 for i in range(1, length + 1):
474 self._t[i] = []
475
490
492 return tuple(self._t[rank])
493
496
497 @property
511
512 -def _task(helper, subject, qs, qe, qcs, scs):
513
514 try:
515 results = []
516 slength = max(scs or [0])
517
518 for ss in range(1, slength + 1, 3):
519 se = ss + helper.window - 1
520
521 if se > slength:
522 break
523
524 score = helper.score(qcs, scs, qs, qe, ss, se)
525
526 if score is not None:
527 info = MatchInfo(subject, qs, qe, ss, se, score)
528 results.append(info)
529
530 return results
531
532 except KeyboardInterrupt:
533 return []
534
538
539
540 if __name__ == '__main__':
541 main()
542