1  """ 
  2  Build an HMM from a FASTA sequence. This program is a proxy to hhblits/addss.pl 
  3  and hhmake from the HHpred package. 
  4   
  5  @note: assuming you have the full HHpred package installed and configured. 
  6  """ 
  7   
  8   
  9  import os 
 10  import abc 
 11   
 12  import csb.apps 
 13  import csb.core 
 14  import csb.io 
 15   
 16  from csb.bio.io.wwpdb import StructureParser 
 17  from csb.bio.io.hhpred import HHProfileParser 
 18  from csb.bio.io.fasta import FASTAOutputBuilder 
 19  from csb.bio.sequence import ChainSequence 
 28   
 30       
 31      @property 
 34       
 36           
 37          cmd = csb.apps.ArgHandler(self.program, __doc__) 
 38           
 39          cmd.add_scalar_option('query-id', 'q', str, 'ID of the query, in PDB-like format (accessionCHAIN).' 
 40                                'Used for naming the output files. Also, if the input is a PDB file with ' 
 41                                'multiple chains, CHAIN is used to pull the required chain from the file.', 
 42                                required=True) 
 43          cmd.add_scalar_option('tk-root', 't', str, 'path to the ToolkitRoot folder in your HHsuite setup', default='/ebio/abt1_toolkit/share/wye') 
 44          cmd.add_scalar_option('database', 'd', str, 'custom HHblits database; if not defined, toolkit\'s unirpto20 will be used', default=None)         
 45          cmd.add_scalar_option('tk-config', 'c', str, 'path to a folder containing custom HHsuite configs (e.g. HHPaths.pm)', default='.') 
 46          cmd.add_scalar_option('cpu', None, int, 'maximum degree of parallelism', default=1)         
 47   
 48          cmd.add_boolean_option('no-ss', None, 'do not include secondary structure', default=False)         
 49          cmd.add_boolean_option('no-pseudo', None, 'do not add emission and transition pseudocounts', default=False) 
 50          cmd.add_boolean_option('no-calibration', None, 'do not calibrate the profile', default=False) 
 51   
 52          cmd.add_positional_argument('query', str, 'input sequence (FASTA or PDB file)')         
 53                   
 54          return cmd 
   55       
 58       
 60           
 61          if os.path.isfile(self.args.query_id + '.hhm'): 
 62              BuildProfileApp.exit('# Profile "{0}" already exists, skipping'.format(self.args.query_id), 
 63                                   ExitCodes.CLEAN)   
 64           
 65          try: 
 66              self.log('# Building profile HMM for {0}...'.format(self.args.query)) 
 67              pb = ProfileBuilder.create(self.args.query, self.args.query_id, self.args.database, self.args.tk_root, self.args.tk_config, 
 68                                         pseudo=not self.args.no_pseudo, ss=not self.args.no_ss, cpu=self.args.cpu) 
 69               
 70              pb.build_alignment() 
 71              pb.make_hmm() 
 72               
 73              if not self.args.no_calibration:                 
 74                  pb.calibrate_hmm() 
 75   
 76          except BuildArgError as ae: 
 77              BuildProfileApp.exit(str(ae), ExitCodes.INVALID_DATA) 
 78                           
 79          except BuildIOError as ioe: 
 80              BuildProfileApp.exit(str(ioe), ExitCodes.IO_ERROR) 
 81               
 82          except csb.io.InvalidCommandError as ose: 
 83              msg = '{0!s}: {0.cmd}'.format(ose) 
 84              BuildProfileApp.exit(msg, ExitCodes.IO_ERROR)             
 85   
 86          except NoOutputError as noe: 
 87              msg = 'Expected file {0} not produced by: {1.cmd}.\nSTDERR: {1.stderr}\nSTDOUT: {1.stdout}'.format(noe.expected, noe.context) 
 88              BuildProfileApp.exit(msg, ExitCodes.EXT_TOOL_FAILURE)   
 89               
 90          except csb.io.ProcessError as pe: 
 91              msg = 'Bad exit code #{0.code} from: {0.cmd}.\nSTDERR: {0.stderr}\nSTDOUT: {0.stdout}'.format(pe.context) 
 92              BuildProfileApp.exit(msg, ExitCodes.EXT_TOOL_FAILURE)               
 93   
 94          self.log('  successfully created profile "{0}"'.format(self.args.query_id))             
  95   
 99   
102   
105   
107   
108 -    def __init__(self, expected, context, *args): 
  113   
116       
117      __metaclass__ = abc.ABCMeta 
118       
119      EMISSION_PSEUDO = '-pcm 4 -pca 2.5 -pcb 0.5 -pcc 1.0' 
120      TRANSITION_PSEUDO = '-gapb 1.0 -gapd 0.15 -gape 1.0 -gapf 0.6 -gapg 0.6 -gapi 0.6' 
121       
122      @staticmethod 
123 -    def create(query, target_id, database, tk_root, tk_config, pseudo=True, ss=True, cpu=1): 
 124           
125          if database is None: 
126              database = os.path.join(tk_root, "databases", "hhblits", "uniprot20")         
127   
128          if not os.path.isfile(query): 
129              raise BuildIOError('File not found: ' + query) 
130                   
131          for line in open(query): 
132               
133              if not line.strip(): 
134                  continue 
135               
136              if line.startswith('>'): 
137                  return FASTAProfileBuilder(query, target_id, database, tk_root, tk_config, pseudo, ss, cpu) 
138              elif line.startswith('HEADER') or line.startswith('ATOM'):  
139                  return PDBProfileBuilder(query, target_id, database, tk_root, tk_config, pseudo, ss, cpu) 
140              else: 
141                  raise BuildArgError('Unknown input file format') 
 142                   
143 -    def __init__(self, query, target_id, database, tk_root, tk_config, pseudo=True, ss=True, cpu=1): 
 144           
145          self.tk_root = tk_root 
146          self.tk_config = tk_config 
147          self.hhlib = os.path.join(tk_root, "bioprogs", "hhsuite") 
148           
149          if 'TK_ROOT' not in os.environ or not os.environ['TK_ROOT']: 
150              os.putenv('TK_ROOT', self.tk_root) 
151          if 'HHLIB' not in os.environ or not os.environ['HHLIB']: 
152              os.putenv('HHLIB', self.hhlib) 
153          os.environ["PATH"] += os.pathsep + os.path.join(self.hhlib, "bin") 
154                                   
155          self.query = query 
156          self.accession = target_id[:-1] 
157          self.chain = target_id[-1] 
158          self.database = database 
159          self.pseudo = bool(pseudo) 
160          self.ss = bool(ss) 
161          self.cpu = cpu 
162           
163          self._input = None 
164          self._a3m = None 
165          self._hhm = None 
166           
167          self.configure_input() 
 168               
174       
175      @property 
178       
179      @abc.abstractmethod 
182       
184          assert self._input is not None 
185           
186          program = os.path.join(self.tk_root, 'bioprogs', 'hhsuite', 'bin', 'hhblits') 
187               
188          ali = self.target_id + '.a3m'                     
189          cmd = '{0} -cpu {1} -i {2} -d {3} -nodiff -oa3m {4}'.format( 
190                                  program, self.cpu, self._input, self.database, ali) 
191          bali = csb.io.Shell.run(cmd) 
192           
193          if bali.code != 0: 
194              raise csb.io.ProcessError(bali) 
195          if not os.path.isfile(ali): 
196              raise NoOutputError(ali, bali) 
197           
198          if self.ss: 
199              program2 = os.path.join(self.tk_root, 'bioprogs', 'hhsuite', 'scripts', 'addss.pl') 
200               
201              with csb.io.TempFile() as patch: 
202                  for l in open(program2): 
203                      if l.lstrip().startswith("use HHPaths"): 
204                          patch.write('use lib "{0}";\n'.format(self.tk_config)) 
205                      patch.write(l); 
206                  patch.flush() 
207                   
208                  cmd2 = "perl {0} {1}".format(patch.name, ali) 
209                  addss = csb.io.Shell.run(cmd2)             
210                  if addss.code != 0: 
211                      raise csb.io.ProcessError(addss)        
212           
213          self._ali = ali 
214          return ali 
 215           
234       
 243   
260   
298   
302       
303       
304  if __name__ == '__main__': 
305      main() 
306