''' Author: Badri Adhikari, University of Missouri-St. Louis, 12-30-2019 File: Generate sequence features for a fasta input Outputs: A feature dictionary file (.pkl) with the following features: 1) Psipred - https://www.ncbi.nlm.nih.gov/pubmed/10869041 2) Psisolv - https://www.ncbi.nlm.nih.gov/pubmed/10869041 3) Shannon entropy sum - features from the .colstat file generated by the alnstat program in MetaPSICOV - https://www.ncbi.nlm.nih.gov/pubmed/25431331 4) CCMpred - https://github.com/soedinglab/CCMpred 5) FreeContact - https://rostlab.org/owiki/index.php/FreeContact 6) contact potential - features from the .pairstat file generated by the alnstat program in MetaPSICOV - https://www.ncbi.nlm.nih.gov/pubmed/25431331 7) Sequence profiles - position specific scoring matrix (PSSM) ''' from datetime import datetime import os import sys import numpy as np import subprocess import pickle import getopt scripts = os.path.dirname(os.path.abspath(sys.argv[0])) if sys.version_info < (3,0,0): print('Python 3 required!!!') sys.exit(1) def usage(): print('Usage:') print(sys.argv[0] + ' <-f fasta> <-j jobdir> <-p pklfile> [-a alnfile] [-m a3mfile]') try: opts, args = getopt.getopt(sys.argv[1:], "f:j:p:a:m:h") except getopt.GetoptError as err: print(err) usage() sys.exit(2) fasta = '' jobdir = '' pklfile = '' alnfile = '' a3mfile = '' for o, a in opts: if o in ("-h", "--help"): usage() sys.exit() elif o in ("-f"): fasta = os.path.abspath(a) elif o in ("-j"): jobdir = os.path.abspath(a) elif o in ("-p"): pklfile = os.path.abspath(a) elif o in ("-a"): alnfile = os.path.abspath(a) elif o in ("-m"): a3mfile = os.path.abspath(a) else: assert False, "Error!! unhandled option!!" if len(fasta) < 2: print('fasta undefined!') usage() sys.exit() if len(jobdir) < 2: print('jobdir undefined!') usage() sys.exit() if len(pklfile) < 2: print('pklfile undefined!') usage() sys.exit() start = datetime.now() if not os.path.exists(jobdir): os.system('mkdir -p ' + jobdir) if os.path.exists(pklfile): exit(pklfile + ' already there!.. Nothing to do..') id = os.path.splitext(os.path.basename(fasta))[0] os.chdir(jobdir) # Lewis server settings PSIPRED = '/group/prayog/tools-proteindistnet/psipred/BLAST+/runpsipredplus' # DB = uniclust30_2018_08 METAPSI = '/group/prayog/tools-proteindistnet/metapsicov/metapsicov-master/' CCMPRED = '/group/prayog/TOOLS/CCMpred-GPU/CCMpred/bin/ccmpred' FREECON = 'export LD_LIBRARY_PATH=/storage/hpc/group/prayog/TOOLS/freecontact-1.0.21/freecontact/lib; /storage/hpc/group/prayog/TOOLS/freecontact-1.0.21/freecontact/bin/freecontact' HHBLITS = '/group/prayog/tools-proteindistnet/hh-suite/build/bin/hhblits' HHBDB = '/group/prayog/tools-proteindistnet/uniclust30_2018_08/uniclust30_2018_08' ALN2PSSM = scripts + '/aln2pssm.py' # Prayog server settings myhost = os.uname()[1] if myhost == 'prayog02': PSIPRED = '/ssdA/common-tools/psipred/BLAST+/runpsipredplus' # DB = uniclust30_2018_08 METAPSI = '/ssdA/common-tools/metapsicov/metapsicov-master/' CCMPRED = '/ssdA/common-tools/ccmpred/CCMpred/bin/ccmpred' FREECON = '/usr/bin/freecontact' HHBLITS = '/ssdA/common-tools/hh-suite/build/bin/hhblits' HHBDB = '/ssdA/common-tools/uniclust30_2018_08_hhsuite/uniclust30_2018_08' if not os.path.exists(PSIPRED): sys.exit(PSIPRED + ' not found') if not os.path.exists(METAPSI): sys.exit(METAPSI + ' not found') if not os.path.exists(CCMPRED): sys.exit(CCMPRED + ' not found') if not os.path.exists(ALN2PSSM): sys.exit(ALN2PSSM + ' not found') if not os.path.exists(HHBLITS): sys.exit(HHBLITS + ' not found') print('Started ' + sys.argv[0] + ' ' + str(start)) print('Fasta: ' + fasta) print('ID: ' + id) print('Outdir: ' + jobdir) if len(alnfile) > 1: print('Alnfile: ' + alnfile) if len(a3mfile) > 1: print('A3mfile: ' + a3mfile) f = open(fasta, 'r') f.readline() seq = "".join([line.strip() for line in f.readlines()]) print('L: ' + str(len(seq))) print('Seq: ' + seq) sys.stdout.flush() print ('') print ('Predict secondary structures..') sys.stdout.flush() if os.path.exists(id + '.ss2'): print('SS already done!') else: print(PSIPRED + ' ' + fasta) result = os.system(PSIPRED + ' ' + fasta) if (result != 0): print(results) sys.exit() print ('') print ('Predict solvent accessibility..') sys.stdout.flush() if os.path.exists(id + '.solv'): print('SA already done!') else: if not os.path.exists(id + '.mtx'): sys.exit('Expected mtx is absent, please update the psipred run and keep the temp mtx file!') outfile = open(id + '.solv', 'w') response = subprocess.run([METAPSI + 'bin/solvpred', id + '.mtx', METAPSI + 'data/weights_solv.dat'], stdout = outfile) if (response.returncode != 0): sys.exit(str(response.returncode) + ' ' + str(response.stderr) + ' ' + str(response.args)) if len(alnfile) > 1: print('') print('Using existing alignment file..') if not os.path.exists(alnfile): print('ERROR! file not found!') print(alnfile) sys.exit() os.system('cp ' + alnfile + ' ' + './' + id + '.aln') fin = open(alnfile, 'r') lines = fin.readlines() fin.close() if seq != lines[0].strip(): print('ERROR! Fasta seq and aln seq dont match') print(seq) print(lines[0]) sys.exit() fout = open(id + '.a3m', 'w') for i in range(len(lines)): fout.write('>seq-' + str(i) + '\n') fout.write(lines[0].strip() + '\n') fout.close() elif len(a3mfile) > 1: print('') print('Using existing a3m alignment file..') os.system('cp ' + a3mfile + ' ' + './' + id + '.a3m') print ('') print ('Clean A3M..') fa3m = open(id + '.a3m', 'r') lines_a3m = fa3m.readlines() fa3m.close() fa3m = open(id + '.a3m', 'w') for line in lines_a3m: if line.startswith('>'): fa3m.write(line) else: fa3m.write(line.replace('O', '-')) fa3m.close() print ('') print ('Genrate aln from a3m..') sys.stdout.flush() if os.path.exists(id + '.aln'): print('ALN already done!') else: os.system("egrep -v \"^>\" " + id + ".a3m | sed 's/[a-z]//g' > " + id + ".aln") else: print ('') print ('Generate alignments..') sys.stdout.flush() if os.path.exists(id + '.a3m'): print('A3M already done!') else: outfile = open('hhblits.log', 'w') response = subprocess.run([HHBLITS, '-i', fasta, '-d', HHBDB, '-oa3m', id + '.a3m', '-maxmem', '32', '-cpu', '4', '-n', '3', '-maxfilt', '50000', '-diff', 'inf', '-e', '0.001', '-id', '99', '-cov', '40'], stdout = outfile, stderr = outfile) if (response.returncode != 0): sys.exit(str(response.returncode) + ' ' + str(response.stderr) + ' ' + str(response.args)) print ('') print ('Clean A3M..') fa3m = open(id + '.a3m', 'r') lines_a3m = fa3m.readlines() fa3m.close() fa3m = open(id + '.a3m', 'w') for line in lines_a3m: if line.startswith('>'): fa3m.write(line) else: fa3m.write(line.replace('O', '-')) fa3m.close() print ('') print ('Genrate aln from a3m..') sys.stdout.flush() if os.path.exists(id + '.aln'): print('ALN already done!') else: os.system("egrep -v \"^>\" " + id + ".a3m | sed 's/[a-z]//g' > " + id + ".aln") print('') os.system('wc -l ' + id + '.a3m') os.system('wc -l ' + id + '.aln') print ('') print ('Run alnstats..') sys.stdout.flush() if os.path.exists(id + '.colstats') and os.path.exists(id + '.pairstats'): print('alnstats already done!') else: response = subprocess.run([METAPSI + 'bin/alnstats', id + '.aln', id + '.colstats', id + '.pairstats']) if (response.returncode != 0): sys.exit(str(response.returncode) + ' ' + str(response.stderr) + ' ' + str(response.args)) print ('') print ('Run CCMpred..') sys.stdout.flush() if os.path.exists(id + '.ccmpred'): print('ALN already done!') else: outfile = open('ccmpred.log', 'w') response = subprocess.run([CCMPRED, id + '.aln', id + '.ccmpred'], stdout = outfile) if (response.returncode != 0): sys.exit(str(response.returncode) + ' ' + str(response.stderr) + ' ' + str(response.args)) print ('') print ('Run FreeContact..') sys.stdout.flush() if os.path.exists(id + '.freecontact.rr'): print('ALN already done!') else: if os.system(FREECON + ' < ' + id + '.aln' + ' > ' + id + '.freecontact.rr') != 0: faln = open(id + '.a3m', 'r') lines_aln = faln.readlines() faln.close() if len(lines_aln) > 5: sys.exit(FREECON + 'Failed!') else: print('WARNING! Aln is too small, freecontact failed!') os.system('touch ' + id + '.freecontact.rr') print ('') print ('Generate PSSM from alignment (write pssm)..') sys.stdout.flush() if os.path.exists(id + '.pssm.npy'): print('PSSM already done!') else: os.system('CUDA_VISIBLE_DEVICES="" python3 ' + ALN2PSSM + ' ' + id + '.aln' + ' ' + id + '.pssm.npy') # Check the presence of all files assert os.path.exists(id + '.ss2') assert os.path.exists(id + '.solv') assert os.path.exists(id + '.a3m') assert os.path.exists(id + '.aln') assert os.path.exists(id + '.colstats') assert os.path.exists(id + '.pairstats') assert os.path.exists(id + '.ccmpred') assert os.path.exists(id + '.freecontact.rr') assert os.path.exists(id + '.pssm.npy') print('') print('Write feature file..') # Add sequence features = {'seq': seq } # Add secondary structure ss = np.zeros((3, len(seq))) i = 0 f = open(id + '.ss2', 'r') f.readline() f.readline() for l in f.readlines(): c = l.strip().split() assert c[1] == seq[i] ss[0, i] = float(c[3]) ss[1, i] = float(c[4]) ss[2, i] = float(c[5]) i += 1 features['ss'] = ss.astype(np.float16) # Add solvent accessibility sa = np.zeros(len(seq)) i = 0 f = open(id + '.solv', 'r') for l in f.readlines(): c = l.strip().split() assert c[1] == seq[i] sa[i] = float(c[2]) i += 1 features['sa'] = sa.astype(np.float16) # Add CCMpred f = open(id + '.ccmpred', 'r') ccmlist = [[float(num) for num in line.strip().split()] for line in f ] ccmpred = np.array(ccmlist, dtype = np.float16) assert ccmpred.shape == (( len(seq), len(seq) )) features['ccmpred'] = ccmpred # Add FreeContact freecontact = np.zeros((len(seq), len(seq))) f = open(id + '.freecontact.rr', 'r') for l in f.readlines(): c = l.strip().split() #assert c[1] == seq[int(c[0]) - 1] #assert c[3] == seq[int(c[2]) - 1] freecontact[int(c[0]) - 1, int(c[2]) - 1] = float(c[5]) freecontact[int(c[2]) - 1, int(c[0]) - 1] = float(c[5]) features['freecon'] = freecontact.astype(np.float16) # Add Shanon entropy entropy = [] f = open(id + '.colstats', 'r') f.readline() f.readline() f.readline() f.readline() for l in f.readlines(): c = l.strip().split() entropy.append(c[21]) assert len(entropy) == len(seq) features['entropy'] = np.array(entropy, dtype = np.float16) # Add mean contact potential potential = np.zeros((len(seq), len(seq))) f = open(id + '.pairstats', 'r') for l in f.readlines(): c = l.strip().split() potential[int(c[0]) - 1, int(c[1]) - 1] = float(c[2]) potential[int(c[1]) - 1, int(c[0]) - 1] = float(c[2]) features['potential'] = potential.astype(np.float16) # Add PSSM mypssm = np.load(id + '.pssm.npy') assert mypssm.shape == (len(seq), 22) features['pssm'] = mypssm f = open(pklfile, 'wb') pickle.dump(features,f) f.close() print('done ' + sys.argv[0] + ' ' + str(datetime.now()))