Package csb :: Package apps :: Module precision
[frames] | no frames]

Source Code for Module csb.apps.precision

  1  """ 
  2  Measure the precision and coverage of a fragment library stored in Rosetta 
  3  NNmake format. 
  4  """ 
  5   
  6  import os 
  7  import multiprocessing 
  8  import matplotlib 
  9   
 10  import csb.apps 
 11   
 12  import csb.bio.io.wwpdb  as wwpdb 
 13  import csb.bio.structure as structure 
 14  import csb.bio.fragments 
 15  import csb.bio.fragments.rosetta as rosetta 
 16  import csb.core 
 17  import csb.io.plots 
18 19 20 -class ExitCodes(csb.apps.ExitCodes):
21 22 IO_ERROR = 2 23 INVALID_DATA = 3
24
25 -class AppRunner(csb.apps.AppRunner):
26 27 @property
28 - def target(self):
29 return PrecisionApp
30
31 - def command_line(self):
32 33 cpu = multiprocessing.cpu_count() 34 cmd = csb.apps.ArgHandler(self.program, __doc__) 35 36 cmd.add_scalar_option('pdb', 'p', str, 'the PDB database (a directory containing all PDB files)', required=True) 37 cmd.add_scalar_option('native', 'n', str, 'native structure of the target (PDB file)', required=True) 38 cmd.add_scalar_option('chain', 'c', str, 'chain identifier (if not specified, the first chain)', default=None) 39 40 cmd.add_scalar_option('top', 't', int, 'read top N fragments per position', default=25) 41 cmd.add_scalar_option('cpu', 'C', int, 'maximum degree of parallelism', default=cpu) 42 cmd.add_scalar_option('rmsd', 'r', float, 'RMSD cutoff for precision and coverage', default=1.5) 43 cmd.add_scalar_option('output', 'o', str, 'output directory', default='.') 44 45 cmd.add_boolean_option('save-structures', 's', 'create a PDB file for each fragment, superimposed over the native', default=False) 46 47 cmd.add_positional_argument('library', str, 'Fragment library file in Rosetta NNmake format') 48 49 return cmd
50
51 -class PrecisionApp(csb.apps.Application):
52
53 - def main(self):
54 55 if not os.path.isdir(self.args.output): 56 PrecisionApp.exit('Output directory does not exist', code=ExitCodes.INVALID_DATA, usage=True) 57 58 for file in [self.args.native, self.args.library]: 59 if not os.path.isfile(file): 60 PrecisionApp.exit('File not found: {0}'.format(file), code=ExitCodes.INVALID_DATA, usage=True) 61 62 self.log('\nLoading {0.library} (top {0.top} per position)... '.format(self.args), ending='') 63 64 try: 65 library = rosetta.RosettaFragmentMap.read(self.args.library, top=self.args.top) 66 self.log('{0} fragments'.format(len(library))) 67 68 try: 69 native = wwpdb.RegularStructureParser(self.args.native).parse_structure() 70 native.accession = native.accession[:4] 71 if self.args.chain: 72 native = native.chains[self.args.chain] 73 else: 74 native = native.first_chain 75 except (structure.Broken3DStructureError, wwpdb.PDBParseError) as se: 76 raise ArgumentError("Can't parse native structure: " + str(se)) 77 78 except (csb.core.ItemNotFoundError) as ce: 79 raise ArgumentError("Chain {0!s} not found in native structure".format(ce)) 80 81 self.log('Superimposing fragments...\n') 82 si = LibrarySuperimposer(native, library, self.args.pdb, self.args.output, 83 save=self.args.save_structures, cutoff=self.args.rmsd) 84 matches = si.run(self.args.cpu) 85 86 info = si.plot(matches) 87 self.log(' RMSD Cutoff {0:>6.2f} A'.format(self.args.rmsd)) 88 self.log(' AVG Precision {0.precision:>6.2f} %'.format(info)) 89 self.log(' Coverage {0.coverage:>6.2f} %'.format(info)) 90 self.log(' RW Precision {0.figure}'.format(si)) 91 92 93 94 except ArgumentIOError as ioe: 95 PrecisionApp.exit(str(ioe), code=ExitCodes.INVALID_DATA) 96 97 except ArgumentError as ae: 98 PrecisionApp.exit(str(ae), code=ExitCodes.INVALID_DATA) 99 100 self.log('\nDone.')
101
102 -class ArgumentError(ValueError):
103 pass
104
105 -class ArgumentIOError(ArgumentError):
106 pass
107
108 -class GlobalInfo(object):
109
110 - def __init__(self, precision, coverage):
111 self.precision = precision 112 self.coverage = coverage
113
114 - def __str__(self):
115 return '{0.precision:6.2f} {0.coverage:6.2f}'.format(self)
116
117 - def __sub__(self, rhs):
118 return GlobalInfo(self.precision - rhs.precision, self.coverage - rhs.coverage)
119
120 -class LibrarySuperimposer(object):
121
122 - def __init__(self, native, library, pdb, output, save=False, cutoff=1.5):
123 124 if not isinstance(native, structure.Chain): 125 raise TypeError(native) 126 elif native.length < 1: 127 raise ArgumentError('The native chain has no residues') 128 129 if not isinstance(library, rosetta.RosettaFragmentMap): 130 raise TypeError(library) 131 132 for i in [pdb, output]: 133 if not os.path.isdir(i): 134 raise ArgumentIOError("{0} is not a valid directory".format(i)) 135 136 self._pdb = pdb 137 self._native = native 138 self._library = library 139 self._output = os.path.abspath(output) 140 self._tab = os.path.join(self._output, native.entry_id + '.fragments.tab') 141 self._figure = os.path.join(self._output, native.entry_id + '.precision.pdf') 142 self._out = open(self._tab, 'w') 143 self._save = bool(save) 144 self._cutoff = float(cutoff)
145
146 - def __del__(self):
147 try: 148 if self._out and not self._out.closed: 149 self._out.close() 150 except: 151 pass
152 153 @property
154 - def table(self):
155 return self._tab
156 157 @property
158 - def figure(self):
159 return self._figure
160
161 - def run(self, cpu=1):
162 163 tasks = [] 164 matches = [] 165 save = None 166 if self._save: 167 save = self._output 168 169 pool = multiprocessing.Pool(cpu) 170 171 for source in self._library.sources: 172 fragments = self._library.fromsource(source) 173 task = pool.apply_async(rmsd, [self._native, source, fragments, self._pdb, save]) 174 tasks.append(task) 175 176 for task in tasks: 177 for match in task.get(): 178 if isinstance(match, csb.core.string): # error 179 self._out.write(' [E] ') 180 self._out.write(match.rstrip()) 181 self._out.write('\n') 182 else: 183 line = '{0.id}\t{0.qstart}\t{0.qend}\t{0.length}\t{0.rmsd}\n'.format(match) 184 self._out.write(line) 185 matches.append(match) 186 187 pool.terminate() 188 return matches
189
190 - def plot(self, matches):
191 192 residues = range(1, self._native.length + 1) 193 precision = [] 194 precision2 = [] 195 background = [] 196 covered = set() 197 198 all = {} 199 positive = {} 200 201 for rank in residues: 202 all[rank] = 0 203 positive[rank] = 0 204 205 for match in matches: 206 for rank in range(match.qstart, match.qend + 1): 207 all[rank] += 1 208 209 if match.rmsd <= self._cutoff: 210 positive[rank] += 1 211 covered.add(rank) 212 213 assert sorted(all.keys()) == sorted(positive.keys()) == residues 214 215 for rank in residues: 216 if all[rank] == 0: 217 precision2.append(0) 218 background.append(0) 219 else: 220 p = positive[rank] * 100.0 / all[rank] 221 precision.append(p) 222 precision2.append(p) 223 background.append(100) 224 225 coverage = len(covered) * 100.0 / len(residues) 226 if len(precision) > 0: 227 avg_precision = sum(precision) / len(precision) 228 else: 229 avg_precision = 0 230 231 with csb.io.plots.Chart() as chart: 232 233 chart.plot.bar(residues, background, color='#f5f5f5', linewidth=None, edgecolor='#f5f5f5') 234 chart.plot.bar(residues, precision2, color='#5ba9da', linewidth=None, edgecolor='#5ba9da') 235 236 chart.plot.set_title(self._native.entry_id) 237 chart.plot.set_xlabel('Residue') 238 chart.plot.set_xlim(1, len(residues)) 239 chart.plot.set_ylabel('Precision, %') 240 chart.plot.set_ylim(0, 100) 241 242 xaxis = chart.plot.axes.xaxis 243 yaxis = chart.plot.axes.yaxis 244 #xaxis.set_minor_locator(matplotlib.ticker.IndexLocator(1, 0)) 245 xaxis.set_major_locator(matplotlib.ticker.IndexLocator(10, 0)) 246 xaxis.tick_bottom() 247 yaxis.tick_left() 248 for t in xaxis.get_major_ticks(): 249 t.tick1On = False 250 t.tick2On = False 251 for t in xaxis.get_ticklabels(): 252 t.set_fontsize(16) 253 for t in yaxis.get_ticklabels(): 254 t.set_fontsize(16) 255 256 chart.plot.spines["right"].set_visible(False) 257 chart.plot.spines["top"].set_visible(False) 258 259 try: 260 chart.width = 15 261 chart.height = 5.5 262 263 chart.save(self._figure, chart.formats.PDF) 264 265 except IOError as io: 266 raise ArgumentIOError("Can't save figure: " + str(io)) 267 268 return GlobalInfo(avg_precision, coverage)
269
270 -def rmsd(target, source, fragments, pdb, save=None):
271 272 matches = [] 273 274 try: 275 src_file = wwpdb.find(source, [pdb]) 276 if src_file is None: 277 raise IOError("Can't find structure {0} in {1}".format(source, pdb)) 278 279 src_structure = wwpdb.RegularStructureParser(src_file).parse_structure() 280 281 except (IOError, structure.Broken3DStructureError, wwpdb.PDBParseError) as ex: 282 matches.append('Error parsing {0:5}: {1!s}'.format(source, ex)) 283 return matches 284 285 for fn, fragment in enumerate(fragments): 286 try: 287 if fragment.chain not in ('_', '', None): 288 src_chain = src_structure.chains[fragment.chain] 289 else: 290 src_chain = src_structure.first_chain 291 292 try: 293 query = target.subregion(fragment.qstart, fragment.qend) 294 subject = src_chain.subregion(fragment.start, fragment.end, clone=True) 295 296 except IndexError: 297 matches.append('Fragment {1.source_id:>5} {0.start:>4}-{0.end:>4} is out of range'.format(fragment)) 298 continue 299 300 si = query.align(subject) 301 match = csb.bio.fragments.FragmentMatch(fragment.id, qstart=fragment.qstart, qend=fragment.qend, 302 probability=None, rmsd=si.rmsd, tm_score=None, qlength=target.length) 303 matches.append(match) 304 305 if save: 306 dummy = structure.Structure(subject.entry_id) 307 dummy.chains.append(subject) 308 filename = '{0.qstart:}-{0.qend}-{0.source_id}.{1.entry_id}{2}.frag'.format(fragment, query, fn or '') 309 dummy.to_pdb(os.path.join(save, filename)) 310 311 except (structure.Broken3DStructureError, IOError) as ex: 312 matches.append("Can't superimpose fragment {0}: {1!s}".format(fragment.id, ex)) 313 continue 314 315 return matches
316
317 318 -def main():
319 AppRunner().run()
320 321 322 if __name__ == '__main__': 323 main() 324