'''
Author: Bikash Shrestha
Description: Takes a comma-separated list of target ids as the first
command-line argument and runs hhblits on each id to generate MSAs.
'''

import subprocess
import sys
import os


# Absolute path of the hhblits executable on the prayog10 server.
HHBLITS = '/ssdA/common-tools/hh-suite/build/bin/hhblits'
# Location of the uniclust protein database on the prayog10 server; this is
# the value handed to hhblits through its -d option.
HHBDB = '/ssdA/common-tools/uniclust30_2018_08_hhsuite/uniclust30_2018_08'


# hhblits parameter sets to run for every target. Each entry supplies the
# values for the -e, -cov and -n options (kept as strings because they are
# passed straight through as command-line arguments and used in directory
# names).
combination = [
    dict(e='0.0001', cov='70', n='3'),
    dict(e='0.001', cov='40', n='3'),
    dict(e='0.1', cov='50', n='3'),
    dict(e='10', cov='30', n='1'),
]

# Target ids to process. Filled from the first command-line argument, which
# is a comma-separated list; stays empty when no argument is given.
full_list = []

if len(sys.argv) > 1:
    # Strip surrounding whitespace and drop empty entries so that input such
    # as "a, b," does not produce ids like " b" or "" (which would turn into
    # bogus directory and file names further down).
    full_list = [
        name.strip() for name in sys.argv[1].split(',') if name.strip()
    ]

def generate_align(id, fasta, path, e='0.01', cov='40', n='3', maxfilt='50000'):
    """Run hhblits for one target, clean its A3M output and derive the ALN.

    Three files are written inside ``path``: ``hhblits.log`` (combined
    stdout/stderr of hhblits), ``<id>.a3m`` and ``<id>.aln``.

    Args:
        id: Target identifier, used as the base name of the output files.
        fasta: Path of the input FASTA file (hhblits ``-i``).
        path: Existing directory that receives the output files.
        e: Value for the hhblits ``-e`` option.
        cov: Value for the hhblits ``-cov`` option.
        n: Value for the hhblits ``-n`` option.
        maxfilt: Value for the hhblits ``-maxfilt`` option.

    Raises:
        SystemExit: If hhblits returns a non-zero exit status.
    """
    log_path = os.path.join(path, 'hhblits.log')
    a3m_path = os.path.join(path, id + '.a3m')
    aln_path = os.path.join(path, id + '.aln')

    command = [
        HHBLITS, '-i', fasta, '-d', HHBDB, '-oa3m', a3m_path,
        '-maxmem', '32', '-cpu', '4', '-n', n, '-maxfilt', maxfilt,
        '-diff', 'inf', '-e', e, '-id', '99', '-cov', cov,
    ]
    # The context manager guarantees the log is closed on every path.
    with open(log_path, 'w') as log_file:
        response = subprocess.run(
            command, stdout=log_file, stderr=subprocess.STDOUT)
    if response.returncode != 0:
        # stderr is redirected into the log file, so response.stderr is
        # always None here; point the user at the log instead.
        sys.exit('hhblits failed with exit code ' + str(response.returncode)
                 + ' (see ' + log_path + '): ' + str(response.args))

    print('')
    print('Clean A3M..')
    _clean_a3m(a3m_path)

    print('')
    print('Genrate aln from a3m..')
    sys.stdout.flush()
    if os.path.exists(aln_path):
        print('ALN already done!')
    else:
        _a3m_to_aln(a3m_path, aln_path)


def _clean_a3m(a3m_path):
    # Rewrite the A3M in place, replacing every 'O' in sequence rows with a
    # gap character '-'; header rows (starting with '>') are left untouched.
    with open(a3m_path, 'r') as handle:
        rows = handle.readlines()
    with open(a3m_path, 'w') as handle:
        for row in rows:
            handle.write(row if row.startswith('>') else row.replace('O', '-'))


def _a3m_to_aln(a3m_path, aln_path):
    # Pure-Python equivalent of `egrep -v "^>" a3m | sed 's/[a-z]//g' > aln`:
    # drop header rows and delete lowercase letters from the remaining rows.
    # Doing this in-process avoids building a shell command from unquoted
    # paths.
    with open(a3m_path, 'r') as src, open(aln_path, 'w') as dst:
        for row in src:
            if row.startswith('>'):
                continue
            kept = ''.join(ch for ch in row if not 'a' <= ch <= 'z')
            # grep always terminates its output lines; mirror that for a
            # final row that lacks a trailing newline.
            if not kept.endswith('\n'):
                kept += '\n'
            dst.write(kept)

def main_func(e, cov, n, maxfilt='50000'):
    """Generate alignments for every id in ``full_list`` with one parameter set.

    Output for a target goes to ``dataset/e_<e>_cov_<cov>_n_<n>/<id>/`` and
    its input is read from ``1353_fasta/<id>.fasta``. A target is skipped
    only when its ``<id>.aln`` file already exists, so a target whose
    directory was created by an earlier run that did not finish is processed
    again instead of being silently ignored.

    Args:
        e: Value forwarded to ``generate_align`` as ``e``; also part of the
            output directory name.
        cov: Value forwarded as ``cov``; also part of the directory name.
        n: Value forwarded as ``n``; also part of the directory name.
        maxfilt: Value forwarded as ``maxfilt``.

    Raises:
        OSError: If an output directory cannot be created.
    """
    dirname = 'dataset/e_' + e + '_cov_' + cov + '_n_' + n
    # Creates 'dataset/' as well when it is missing, and fails loudly
    # (unlike a shelled-out `mkdir`) if the directory cannot be created.
    os.makedirs(dirname, exist_ok=True)

    fasta_dir = '1353_fasta/'
    for item in full_list:
        if not item:
            # A blank id (e.g. from a trailing comma) has no FASTA file.
            continue
        files_dir = dirname + '/' + item
        target_file = files_dir + '/' + item + '.aln'
        if os.path.exists(target_file):
            continue
        os.makedirs(files_dir, exist_ok=True)
        fasta = fasta_dir + item + '.fasta'
        generate_align(item, fasta, files_dir, e, cov, n, maxfilt)


# Script entry point: process the requested ids once per parameter set.
for item in combination:
    main_func(e=item['e'], cov=item['cov'], n=item['n'])
