Package csb :: Package apps :: Module buildhmm
[frames] | no frames]

Source Code for Module csb.apps.buildhmm

  1  """ 
  2  Build an HMM from a FASTA sequence. This program is a proxy to hhblits/addss.pl 
  3  and hhmake from the HHpred package. 
  4   
  5  @note: assuming you have the full HHpred package installed and configured. 
  6  """ 
  7   
  8   
  9  import os 
 10  import abc 
 11   
 12  import csb.apps 
 13  import csb.core 
 14  import csb.io 
 15   
 16  from csb.bio.io.wwpdb import StructureParser 
 17  from csb.bio.io.hhpred import HHProfileParser 
 18  from csb.bio.io.fasta import FASTAOutputBuilder 
 19  from csb.bio.sequence import ChainSequence 
20 21 22 23 -class ExitCodes(csb.apps.ExitCodes):
24 25 IO_ERROR = 2 26 INVALID_DATA = 3 27 EXT_TOOL_FAILURE = 4
28
29 -class AppRunner(csb.apps.AppRunner):
30 31 @property
32 - def target(self):
33 return BuildProfileApp
34
35 - def command_line(self):
36 37 cmd = csb.apps.ArgHandler(self.program, __doc__) 38 39 cmd.add_scalar_option('query-id', 'q', str, 'ID of the query, in PDB-like format (accessionCHAIN).' 40 'Used for naming the output files. Also, if the input is a PDB file with ' 41 'multiple chains, CHAIN is used to pull the required chain from the file.', 42 required=True) 43 cmd.add_scalar_option('tk-root', 't', str, 'path to the ToolkitRoot folder in your HHsuite setup', default='/ebio/abt1_toolkit/share/wye') 44 cmd.add_scalar_option('database', 'd', str, 'custom HHblits database; if not defined, toolkit\'s unirpto20 will be used', default=None) 45 cmd.add_scalar_option('tk-config', 'c', str, 'path to a folder containing custom HHsuite configs (e.g. HHPaths.pm)', default='.') 46 cmd.add_scalar_option('cpu', None, int, 'maximum degree of parallelism', default=1) 47 48 cmd.add_boolean_option('no-ss', None, 'do not include secondary structure', default=False) 49 cmd.add_boolean_option('no-pseudo', None, 'do not add emission and transition pseudocounts', default=False) 50 cmd.add_boolean_option('no-calibration', None, 'do not calibrate the profile', default=False) 51 52 cmd.add_positional_argument('query', str, 'input sequence (FASTA or PDB file)') 53 54 return cmd
55
56 57 -class BuildProfileApp(csb.apps.Application):
58
59 - def main(self):
60 61 if os.path.isfile(self.args.query_id + '.hhm'): 62 BuildProfileApp.exit('# Profile "{0}" already exists, skipping'.format(self.args.query_id), 63 ExitCodes.CLEAN) 64 65 try: 66 self.log('# Building profile HMM for {0}...'.format(self.args.query)) 67 pb = ProfileBuilder.create(self.args.query, self.args.query_id, self.args.database, self.args.tk_root, self.args.tk_config, 68 pseudo=not self.args.no_pseudo, ss=not self.args.no_ss, cpu=self.args.cpu) 69 70 pb.build_alignment() 71 pb.make_hmm() 72 73 if not self.args.no_calibration: 74 pb.calibrate_hmm() 75 76 except BuildArgError as ae: 77 BuildProfileApp.exit(str(ae), ExitCodes.INVALID_DATA) 78 79 except BuildIOError as ioe: 80 BuildProfileApp.exit(str(ioe), ExitCodes.IO_ERROR) 81 82 except csb.io.InvalidCommandError as ose: 83 msg = '{0!s}: {0.cmd}'.format(ose) 84 BuildProfileApp.exit(msg, ExitCodes.IO_ERROR) 85 86 except NoOutputError as noe: 87 msg = 'Expected file {0} not produced by: {1.cmd}.\nSTDERR: {1.stderr}\nSTDOUT: {1.stdout}'.format(noe.expected, noe.context) 88 BuildProfileApp.exit(msg, ExitCodes.EXT_TOOL_FAILURE) 89 90 except csb.io.ProcessError as pe: 91 msg = 'Bad exit code #{0.code} from: {0.cmd}.\nSTDERR: {0.stderr}\nSTDOUT: {0.stdout}'.format(pe.context) 92 BuildProfileApp.exit(msg, ExitCodes.EXT_TOOL_FAILURE) 93 94 self.log(' successfully created profile "{0}"'.format(self.args.query_id))
95
96 97 -class BuildError(Exception):
98 pass
99
100 -class BuildIOError(BuildError):
101 pass
102
103 -class BuildArgError(BuildError):
104 pass
105
106 -class NoOutputError(BuildError):
107
108 - def __init__(self, expected, context, *args):
109 110 self.expected = expected 111 self.context = context 112 super(NoOutputError, self).__init__(*args)
113
114 115 -class ProfileBuilder(object):
116 117 __metaclass__ = abc.ABCMeta 118 119 EMISSION_PSEUDO = '-pcm 4 -pca 2.5 -pcb 0.5 -pcc 1.0' 120 TRANSITION_PSEUDO = '-gapb 1.0 -gapd 0.15 -gape 1.0 -gapf 0.6 -gapg 0.6 -gapi 0.6' 121 122 @staticmethod
123 - def create(query, target_id, database, tk_root, tk_config, pseudo=True, ss=True, cpu=1):
124 125 if database is None: 126 database = os.path.join(tk_root, "databases", "hhblits", "uniprot20") 127 128 if not os.path.isfile(query): 129 raise BuildIOError('File not found: ' + query) 130 131 for line in open(query): 132 133 if not line.strip(): 134 continue 135 136 if line.startswith('>'): 137 return FASTAProfileBuilder(query, target_id, database, tk_root, tk_config, pseudo, ss, cpu) 138 elif line.startswith('HEADER') or line.startswith('ATOM'): 139 return PDBProfileBuilder(query, target_id, database, tk_root, tk_config, pseudo, ss, cpu) 140 else: 141 raise BuildArgError('Unknown input file format')
142
143 - def __init__(self, query, target_id, database, tk_root, tk_config, pseudo=True, ss=True, cpu=1):
144 145 self.tk_root = tk_root 146 self.tk_config = tk_config 147 self.hhlib = os.path.join(tk_root, "bioprogs", "hhsuite") 148 149 if 'TK_ROOT' not in os.environ or not os.environ['TK_ROOT']: 150 os.putenv('TK_ROOT', self.tk_root) 151 if 'HHLIB' not in os.environ or not os.environ['HHLIB']: 152 os.putenv('HHLIB', self.hhlib) 153 os.environ["PATH"] += os.pathsep + os.path.join(self.hhlib, "bin") 154 155 self.query = query 156 self.accession = target_id[:-1] 157 self.chain = target_id[-1] 158 self.database = database 159 self.pseudo = bool(pseudo) 160 self.ss = bool(ss) 161 self.cpu = cpu 162 163 self._input = None 164 self._a3m = None 165 self._hhm = None 166 167 self.configure_input()
168
169 - def run(self):
170 171 self.build_alignment() 172 self.make_hmm() 173 self.calibrate_hmm()
174 175 @property
176 - def target_id(self):
177 return self.accession + self.chain
178 179 @abc.abstractmethod
180 - def configure_input(self):
181 pass
182
183 - def build_alignment(self):
184 assert self._input is not None 185 186 program = os.path.join(self.tk_root, 'bioprogs', 'hhsuite', 'bin', 'hhblits') 187 188 ali = self.target_id + '.a3m' 189 cmd = '{0} -cpu {1} -i {2} -d {3} -nodiff -oa3m {4}'.format( 190 program, self.cpu, self._input, self.database, ali) 191 bali = csb.io.Shell.run(cmd) 192 193 if bali.code != 0: 194 raise csb.io.ProcessError(bali) 195 if not os.path.isfile(ali): 196 raise NoOutputError(ali, bali) 197 198 if self.ss: 199 program2 = os.path.join(self.tk_root, 'bioprogs', 'hhsuite', 'scripts', 'addss.pl') 200 201 with csb.io.TempFile() as patch: 202 for l in open(program2): 203 if l.lstrip().startswith("use HHPaths"): 204 patch.write('use lib "{0}";\n'.format(self.tk_config)) 205 patch.write(l); 206 patch.flush() 207 208 cmd2 = "perl {0} {1}".format(patch.name, ali) 209 addss = csb.io.Shell.run(cmd2) 210 if addss.code != 0: 211 raise csb.io.ProcessError(addss) 212 213 self._ali = ali 214 return ali
215
216 - def make_hmm(self):
217 assert self._ali is not None 218 219 program = os.path.join(self.tk_root, 'bioprogs', 'hhpred', 'hhmake') 220 hhm = self.target_id + '.hhm' 221 cmd = '{0} -i {1} -o {2}'.format(program, self._ali, hhm) 222 223 if self.pseudo: 224 cmd = '{0} {1} {2}'.format(cmd, ProfileBuilder.EMISSION_PSEUDO, ProfileBuilder.TRANSITION_PSEUDO) 225 226 nnmake = csb.io.Shell.run(cmd) 227 if nnmake.code != 0: 228 raise csb.io.ProcessError(nnmake) 229 if not os.path.isfile(hhm): 230 raise NoOutputError(hhm, nnmake) 231 232 self._hhm = hhm 233 return hhm
234
235 - def calibrate_hmm(self):
236 assert self._hhm is not None 237 238 program = os.path.join(self.tk_root, 'bioprogs', 'hhpred', 'hhsearch') 239 caldb = os.path.join(self.tk_root, 'databases', 'hhpred', 'cal.hhm') 240 241 cmd = '{0} -i {1}.hhm -d {2} -cal -cpu {3}'.format(program, self.target_id, caldb, self.cpu) 242 csb.io.Shell.runstrict(cmd)
243
244 245 -class FASTAProfileBuilder(ProfileBuilder):
246
247 - def configure_input(self):
248 249 if not os.path.isfile(self.query): 250 raise BuildIOError('File not found: ' + self.query) 251 252 fasta = self.target_id + '.fa' 253 254 with csb.io.EntryWriter(fasta) as f: 255 with open(self.query) as q: 256 f.write(q.read()) 257 258 self._input = fasta 259 return fasta
260
261 262 -class PDBProfileBuilder(ProfileBuilder):
263
264 - def configure_input(self):
265 266 try: 267 s = StructureParser(self.query).parse() 268 chain = s.chains[self.chain] 269 except csb.core.ItemNotFoundError: 270 raise BuildArgError('Chain {0.chain} not found in {0.query}'.format(self)) 271 except IOError as ioe: 272 raise BuildIOError(str(ioe)) 273 274 fasta = self.target_id + '.fa' 275 276 with open(fasta, 'w') as f: 277 sequence = ChainSequence.create(chain) 278 FASTAOutputBuilder(f).add_sequence(sequence) 279 280 self._input = fasta 281 return fasta
282
283 - def make_hmm(self):
284 285 super(PDBProfileBuilder, self).make_hmm() 286 self.format_structure()
287
288 - def format_structure(self):
289 assert self._hhm is not None 290 291 pdb = self.target_id + '.pdb' 292 293 parser = HHProfileParser(self._hhm) 294 parser.format_structure(self.query, self.chain, pdb) 295 296 self._pdb = pdb 297 return pdb
298
299 300 -def main():
301 AppRunner().run()
302 303 304 if __name__ == '__main__': 305 main() 306