1 """
2 Build an HMM from a FASTA sequence. This program is a proxy to hhblits/addss.pl
3 and hhmake from the HHpred package.
4
5 @note: assuming you have the full HHpred package installed and configured.
6 """
7
8
9 import os
10 import abc
11
12 import csb.apps
13 import csb.core
14 import csb.io
15
16 from csb.bio.io.wwpdb import StructureParser
17 from csb.bio.io.hhpred import HHProfileParser
18 from csb.bio.io.fasta import FASTAOutputBuilder
19 from csb.bio.sequence import ChainSequence
28
30
31 @property
34
36
37 cmd = csb.apps.ArgHandler(self.program, __doc__)
38
39 cmd.add_scalar_option('query-id', 'q', str, 'ID of the query, in PDB-like format (accessionCHAIN).'
40 'Used for naming the output files. Also, if the input is a PDB file with '
41 'multiple chains, CHAIN is used to pull the required chain from the file.',
42 required=True)
43 cmd.add_scalar_option('tk-root', 't', str, 'path to the ToolkitRoot folder in your HHsuite setup', default='/ebio/abt1_toolkit/share/wye')
44 cmd.add_scalar_option('database', 'd', str, 'custom HHblits database; if not defined, toolkit\'s unirpto20 will be used', default=None)
45 cmd.add_scalar_option('tk-config', 'c', str, 'path to a folder containing custom HHsuite configs (e.g. HHPaths.pm)', default='.')
46 cmd.add_scalar_option('cpu', None, int, 'maximum degree of parallelism', default=1)
47
48 cmd.add_boolean_option('no-ss', None, 'do not include secondary structure', default=False)
49 cmd.add_boolean_option('no-pseudo', None, 'do not add emission and transition pseudocounts', default=False)
50 cmd.add_boolean_option('no-calibration', None, 'do not calibrate the profile', default=False)
51
52 cmd.add_positional_argument('query', str, 'input sequence (FASTA or PDB file)')
53
54 return cmd
55
58
60
61 if os.path.isfile(self.args.query_id + '.hhm'):
62 BuildProfileApp.exit('# Profile "{0}" already exists, skipping'.format(self.args.query_id),
63 ExitCodes.CLEAN)
64
65 try:
66 self.log('# Building profile HMM for {0}...'.format(self.args.query))
67 pb = ProfileBuilder.create(self.args.query, self.args.query_id, self.args.database, self.args.tk_root, self.args.tk_config,
68 pseudo=not self.args.no_pseudo, ss=not self.args.no_ss, cpu=self.args.cpu)
69
70 pb.build_alignment()
71 pb.make_hmm()
72
73 if not self.args.no_calibration:
74 pb.calibrate_hmm()
75
76 except BuildArgError as ae:
77 BuildProfileApp.exit(str(ae), ExitCodes.INVALID_DATA)
78
79 except BuildIOError as ioe:
80 BuildProfileApp.exit(str(ioe), ExitCodes.IO_ERROR)
81
82 except csb.io.InvalidCommandError as ose:
83 msg = '{0!s}: {0.cmd}'.format(ose)
84 BuildProfileApp.exit(msg, ExitCodes.IO_ERROR)
85
86 except NoOutputError as noe:
87 msg = 'Expected file {0} not produced by: {1.cmd}.\nSTDERR: {1.stderr}\nSTDOUT: {1.stdout}'.format(noe.expected, noe.context)
88 BuildProfileApp.exit(msg, ExitCodes.EXT_TOOL_FAILURE)
89
90 except csb.io.ProcessError as pe:
91 msg = 'Bad exit code #{0.code} from: {0.cmd}.\nSTDERR: {0.stderr}\nSTDOUT: {0.stdout}'.format(pe.context)
92 BuildProfileApp.exit(msg, ExitCodes.EXT_TOOL_FAILURE)
93
94 self.log(' successfully created profile "{0}"'.format(self.args.query_id))
95
99
102
105
107
108 - def __init__(self, expected, context, *args):
113
116
117 __metaclass__ = abc.ABCMeta
118
119 EMISSION_PSEUDO = '-pcm 4 -pca 2.5 -pcb 0.5 -pcc 1.0'
120 TRANSITION_PSEUDO = '-gapb 1.0 -gapd 0.15 -gape 1.0 -gapf 0.6 -gapg 0.6 -gapi 0.6'
121
122 @staticmethod
123 - def create(query, target_id, database, tk_root, tk_config, pseudo=True, ss=True, cpu=1):
124
125 if database is None:
126 database = os.path.join(tk_root, "databases", "hhblits", "uniprot20")
127
128 if not os.path.isfile(query):
129 raise BuildIOError('File not found: ' + query)
130
131 for line in open(query):
132
133 if not line.strip():
134 continue
135
136 if line.startswith('>'):
137 return FASTAProfileBuilder(query, target_id, database, tk_root, tk_config, pseudo, ss, cpu)
138 elif line.startswith('HEADER') or line.startswith('ATOM'):
139 return PDBProfileBuilder(query, target_id, database, tk_root, tk_config, pseudo, ss, cpu)
140 else:
141 raise BuildArgError('Unknown input file format')
142
143 - def __init__(self, query, target_id, database, tk_root, tk_config, pseudo=True, ss=True, cpu=1):
144
145 self.tk_root = tk_root
146 self.tk_config = tk_config
147 self.hhlib = os.path.join(tk_root, "bioprogs", "hhsuite")
148
149 if 'TK_ROOT' not in os.environ or not os.environ['TK_ROOT']:
150 os.putenv('TK_ROOT', self.tk_root)
151 if 'HHLIB' not in os.environ or not os.environ['HHLIB']:
152 os.putenv('HHLIB', self.hhlib)
153 os.environ["PATH"] += os.pathsep + os.path.join(self.hhlib, "bin")
154
155 self.query = query
156 self.accession = target_id[:-1]
157 self.chain = target_id[-1]
158 self.database = database
159 self.pseudo = bool(pseudo)
160 self.ss = bool(ss)
161 self.cpu = cpu
162
163 self._input = None
164 self._a3m = None
165 self._hhm = None
166
167 self.configure_input()
168
174
175 @property
178
179 @abc.abstractmethod
182
184 assert self._input is not None
185
186 program = os.path.join(self.tk_root, 'bioprogs', 'hhsuite', 'bin', 'hhblits')
187
188 ali = self.target_id + '.a3m'
189 cmd = '{0} -cpu {1} -i {2} -d {3} -nodiff -oa3m {4}'.format(
190 program, self.cpu, self._input, self.database, ali)
191 bali = csb.io.Shell.run(cmd)
192
193 if bali.code != 0:
194 raise csb.io.ProcessError(bali)
195 if not os.path.isfile(ali):
196 raise NoOutputError(ali, bali)
197
198 if self.ss:
199 program2 = os.path.join(self.tk_root, 'bioprogs', 'hhsuite', 'scripts', 'addss.pl')
200
201 with csb.io.TempFile() as patch:
202 for l in open(program2):
203 if l.lstrip().startswith("use HHPaths"):
204 patch.write('use lib "{0}";\n'.format(self.tk_config))
205 patch.write(l);
206 patch.flush()
207
208 cmd2 = "perl {0} {1}".format(patch.name, ali)
209 addss = csb.io.Shell.run(cmd2)
210 if addss.code != 0:
211 raise csb.io.ProcessError(addss)
212
213 self._ali = ali
214 return ali
215
234
243
260
298
302
303
304 if __name__ == '__main__':
305 main()
306