'''
Author: Badri Adhikari, University of Missouri-St. Louis, 1-26-2020
File: Contains the metrics to evaluate predicted real-valued distances, binned-distances and contact maps
'''

import os
import pickle
import numpy as np
import tensorflow as tf

epsilon = tf.keras.backend.epsilon()

from dataio import *
from plots import *
from generator import *

def calculate_mae(PRED, YTRUE, pdb_list, length_dict):
    mae_lr_d8_list   = np.zeros(len(PRED[:, 0, 0, 0]))
    mae_mlr_d8_list  = np.zeros(len(PRED[:, 0, 0, 0]))
    mae_lr_d12_list  = np.zeros(len(PRED[:, 0, 0, 0]))
    mae_mlr_d12_list = np.zeros(len(PRED[:, 0, 0, 0]))
    for i in range(0, len(PRED[:, 0, 0, 0])):
        L = length_dict[pdb_list[i]]
        PAVG = np.full((L, L), 100.0)
        # Average the predictions from both triangles
        for j in range(0, L):
            for k in range(j, L):
                PAVG[j, k] = (PRED[i, k, j, 0] + PRED[i, j, k, 0]) / 2.0
        # at distance 8 and separation 24
        Y = np.copy(YTRUE[i, 0:L, 0:L, 0])
        P = np.copy(PAVG)
        for p in range(len(Y)):
            for q in range(len(Y)):
                if q - p < 24:
                    Y[p, q] = np.nan
                    P[p, q] = np.nan
                    continue
                if Y[p, q] > 8:
                    Y[p, q] = np.nan
                    P[p, q] = np.nan
        mae_lr_d8 = np.nan
        if not np.isnan(np.abs(Y - P)).all():
            mae_lr_d8 = np.nanmean(np.abs(Y - P))
            #mae_lr_d8 = np.sqrt(np.nanmean(np.abs(Y - P) ** 2))
        # at distance 8 and separation 12
        Y = np.copy(YTRUE[i, 0:L, 0:L, 0])
        P = np.copy(PAVG)
        for p in range(len(Y)):
            for q in range(len(Y)):
                if q - p < 12:
                    Y[p, q] = np.nan
                    P[p, q] = np.nan
                    continue
                if Y[p, q] > 8:
                    Y[p, q] = np.nan
                    P[p, q] = np.nan
        mae_mlr_d8 = np.nan
        if not np.isnan(np.abs(Y - P)).all():
            mae_mlr_d8 = np.nanmean(np.abs(Y - P))
        # at distance 12 and separation 24
        Y = np.copy(YTRUE[i, 0:L, 0:L, 0])
        P = np.copy(PAVG)
        for p in range(len(Y)):
            for q in range(len(Y)):
                if q - p < 24:
                    Y[p, q] = np.nan
                    P[p, q] = np.nan
                    continue
                if Y[p, q] > 12:
                    Y[p, q] = np.nan
                    P[p, q] = np.nan
        mae_lr_d12 = np.nan
        if not np.isnan(np.abs(Y - P)).all():
            mae_lr_d12 = np.nanmean(np.abs(Y - P))
        # at distance 12 and separation 12
        Y = np.copy(YTRUE[i, 0:L, 0:L, 0])
        P = np.copy(PAVG)
        for p in range(len(Y)):
            for q in range(len(Y)):
                if q - p < 12:
                    Y[p, q] = np.nan
                    P[p, q] = np.nan
                    continue
                if Y[p, q] > 12:
                    Y[p, q] = np.nan
                    P[p, q] = np.nan
        mae_mlr_d12 = np.nan
        if not np.isnan(np.abs(Y - P)).all():
            mae_mlr_d12 = np.nanmean(np.abs(Y - P))
        # add to list
        mae_lr_d8_list[i]   = mae_lr_d8
        mae_mlr_d8_list[i]  = mae_mlr_d8
        mae_lr_d12_list[i]  = mae_lr_d12
        mae_mlr_d12_list[i] = mae_mlr_d12
        print('MAE for ' + str(i) + ' ' + str(pdb_list[i])
              + ' lr_d8 = %.2f mlr_d8 = %.2f lr_d12 = %.2f mlr_d12 = %.2f'
              % (mae_lr_d8, mae_mlr_d8, mae_lr_d12, mae_mlr_d12))
    print('Average MAE : lr<8A = %.4f mlr<8A = %.4f lr<12A = %.4f mlr<12A = %.4f'
          % (np.nanmean(mae_lr_d8_list), np.nanmean(mae_mlr_d8_list),
             np.nanmean(mae_lr_d12_list), np.nanmean(mae_mlr_d12_list)))

# Convert distances to contact probabilities
def distance_to_contacts(distance_matrix):
    P = 4.0 / distance_matrix
    P[P > 1.0] = 1.0
    return P

def calculate_contact_precision_in_distances(PRED, YTRUE, pdb_list, length_dict):
    Y = np.copy(YTRUE)
    Y[Y < 8.0]  = True
    Y[Y >= 8.0] = False
    calculate_contact_precision(distance_to_contacts(PRED), Y, pdb_list, length_dict)
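# --- Illustrative sketch only; not part of the original module ---
# A minimal example of how calculate_mae() might be exercised on synthetic data,
# assuming (as elsewhere in this file) that PRED and YTRUE are (N, L, L, 1) arrays of
# real-valued distances and that pdb_list/length_dict map target names to lengths.
# The target name 'example' below is made up.
def _example_calculate_mae():
    N, L = 1, 32
    rng = np.random.default_rng(0)
    YTRUE = rng.uniform(3.0, 20.0, size=(N, L, L, 1))
    YTRUE = (YTRUE + YTRUE.transpose(0, 2, 1, 3)) / 2.0    # symmetric "true" distance map
    PRED = YTRUE + rng.normal(0.0, 0.5, size=YTRUE.shape)  # noisy "predicted" distances
    calculate_mae(PRED, YTRUE, ['example'], {'example': L})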
def calculate_contact_precision(PRED, YTRUE, pdb_list, length_dict):
    avg_p_lr_topL5  = 0.0
    avg_p_lr_topL   = 0.0
    avg_p_lr_topNc  = 0.0
    avg_p_mlr_topL5 = 0.0
    avg_p_mlr_topL  = 0.0
    avg_p_mlr_topNc = 0.0
    count_pdbs_with_lr  = 0
    count_pdbs_with_mlr = 0
    for i in range(0, len(PRED[:, 0, 0, 0])):
        # Ideally L = length_dict[pdb_list[i]], but L should be the number of valid residues in the structure
        L = length_dict[pdb_list[i]]
        LPDB = len(np.where(~np.isnan(np.diagonal(YTRUE[i, :L, :L, 0])))[0])
        P = np.full((L, L), 0.0)
        # Average the predictions from both triangles
        for j in range(0, L):
            for k in range(j, L):
                P[j, k] = (PRED[i, k, j, 0] + PRED[i, j, k, 0]) / 2.0
        p_dict = {}
        for j in range(0, L):
            for k in range(j, L):
                if np.isnan(YTRUE[i, j, k, 0]):
                    continue
                p_dict[(j, k)] = P[j, k]
        p_lr_topNc  = float('nan')
        p_lr_topL   = float('nan')
        p_lr_topL5  = float('nan')
        p_mlr_topNc = float('nan')
        p_mlr_topL  = float('nan')
        p_mlr_topL5 = float('nan')
        # Check if there are 0 medium-long-range contacts
        total_true_mlr = 0
        for j in range(0, L):
            for k in range(j, L):
                if abs(j - k) < 12:
                    continue
                if np.isnan(YTRUE[i, k, j, 0]):
                    continue
                if YTRUE[i, k, j, 0]:
                    total_true_mlr += 1
        # Check if there are 0 long-range contacts
        total_true_lr = 0
        for j in range(0, L):
            for k in range(j, L):
                if abs(j - k) < 24:
                    continue
                if np.isnan(YTRUE[i, k, j, 0]):
                    continue
                if YTRUE[i, k, j, 0]:
                    total_true_lr += 1
        if total_true_mlr > 0:
            # Top Nc (all) medium- and long-range distances
            p_mlr_topNc = 0
            x = total_true_mlr
            total_predictions = 0
            for (p, q), prob in reversed(sorted(p_dict.items(), key=lambda kv: kv[1])):
                if abs(p - q) < 12:
                    continue
                if YTRUE[i, p, q, 0]:
                    p_mlr_topNc += 1
                total_predictions += 1
                x -= 1
                if x == 0:
                    break
            p_mlr_topNc /= total_predictions
            # Top L medium- and long-range distances
            p_mlr_topL = 0
            x = LPDB
            total_predictions = 0
            for (p, q), prob in reversed(sorted(p_dict.items(), key=lambda kv: kv[1])):
                if abs(p - q) < 12:
                    continue
                if YTRUE[i, p, q, 0]:
                    p_mlr_topL += 1
                total_predictions += 1
                x -= 1
                if x == 0:
                    break
            p_mlr_topL /= total_predictions
            # Top L/5 medium- and long-range distances
            p_mlr_topL5 = 0
            x = int(round(LPDB / 5))
            total_predictions = 0
            for (p, q), prob in reversed(sorted(p_dict.items(), key=lambda kv: kv[1])):
                if abs(p - q) < 12:
                    continue
                if YTRUE[i, p, q, 0]:
                    p_mlr_topL5 += 1
                total_predictions += 1
                x -= 1
                if x == 0:
                    break
            p_mlr_topL5 /= total_predictions
        if total_true_lr > 0:
            # Top Nc (all) long-range distances
            p_lr_topNc = 0
            x = total_true_lr
            total_predictions = 0
            for (p, q), prob in reversed(sorted(p_dict.items(), key=lambda kv: kv[1])):
                if abs(p - q) < 24:
                    continue
                if YTRUE[i, p, q, 0]:
                    p_lr_topNc += 1
                total_predictions += 1
                x -= 1
                if x == 0:
                    break
            p_lr_topNc /= total_predictions
            # Top L long-range distances
            p_lr_topL = 0
            x = LPDB
            total_predictions = 0
            for (p, q), prob in reversed(sorted(p_dict.items(), key=lambda kv: kv[1])):
                if abs(p - q) < 24:
                    continue
                if YTRUE[i, p, q, 0]:
                    p_lr_topL += 1
                total_predictions += 1
                x -= 1
                if x == 0:
                    break
            p_lr_topL /= total_predictions
            # Top L/5 long-range distances
            p_lr_topL5 = 0
            x = int(round(LPDB / 5))
            total_predictions = 0
            for (p, q), prob in reversed(sorted(p_dict.items(), key=lambda kv: kv[1])):
                if abs(p - q) < 24:
                    continue
                if YTRUE[i, p, q, 0]:
                    p_lr_topL5 += 1
                total_predictions += 1
                x -= 1
                if x == 0:
                    break
            p_lr_topL5 /= total_predictions
        # Average for all pdbs (accumulate per category so NaNs from targets without such contacts do not poison the averages)
        if total_true_lr > 0:
            avg_p_lr_topL5 += p_lr_topL5
            avg_p_lr_topL  += p_lr_topL
            avg_p_lr_topNc += p_lr_topNc
            count_pdbs_with_lr += 1
        if total_true_mlr > 0:
            avg_p_mlr_topL5 += p_mlr_topL5
            avg_p_mlr_topL  += p_mlr_topL
            avg_p_mlr_topNc += p_mlr_topNc
            count_pdbs_with_mlr += 1
        print('Precision for ' + str(i) + ' - ' + str(pdb_list[i])
              + ' top_L5_lr = %.4f top_L_lr = %.4f top_Nc_lr = %.4f top_L5_mlr = %.4f top_L_mlr = %.4f top_Nc_mlr = %.4f (total_true_lr = %d total_true_mlr = %d)'
              % (p_lr_topL5, p_lr_topL, p_lr_topNc, p_mlr_topL5, p_mlr_topL, p_mlr_topNc, total_true_lr, total_true_mlr))
    print('Average Precision: top_L5_lr = %.2f top_L_lr = %.2f top_Nc_lr = %.2f top_L5_mlr = %.2f top_L_mlr = %.2f top_Nc_mlr = %.2f'
          % (100.0 * avg_p_lr_topL5 / count_pdbs_with_lr,
             100.0 * avg_p_lr_topL / count_pdbs_with_lr,
             100.0 * avg_p_lr_topNc / count_pdbs_with_lr,
             100.0 * avg_p_mlr_topL5 / count_pdbs_with_mlr,
             100.0 * avg_p_mlr_topL / count_pdbs_with_mlr,
             100.0 * avg_p_mlr_topNc / count_pdbs_with_mlr))
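# --- Illustrative sketch only; not part of the original module ---
# A minimal example of calling calculate_contact_precision() directly, assuming YTRUE
# holds 1.0/0.0 contact labels (NaN where residues are missing) and PRED holds
# contact probabilities, both shaped (N, L, L, 1). All names below are made up.
def _example_contact_precision():
    N, L = 1, 48
    rng = np.random.default_rng(0)
    D = rng.uniform(3.0, 20.0, size=(N, L, L, 1))
    D = (D + D.transpose(0, 2, 1, 3)) / 2.0                # symmetric "true" distances
    YTRUE = (D < 8.0).astype(float)                        # binary contact map at 8 Angstroms
    noisy = np.clip(D + rng.normal(0.0, 1.0, size=D.shape), 1.0, None)
    PRED = distance_to_contacts(noisy)                     # probability-like predictions
    calculate_contact_precision(PRED, YTRUE, ['example'], {'example': L})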
def eval_distance_predictions(my_model, my_list, my_length_dict, my_dir_features, my_dir_distance,
                              pad_size, flag_plots, flag_save, LMAX, expected_n_channels):
    # Padded but full inputs/outputs
    my_generator = DistGenerator(my_list, my_dir_features, my_dir_distance, LMAX, pad_size, 1, expected_n_channels)
    P = my_model.predict_generator(my_generator, max_queue_size=10, verbose=1)
    P[P < 0.01] = 0.01
    P = 100.0 / (P + epsilon)
    # Remove padding, i.e. shift up and left by int(pad_size/2)
    P[:, :LMAX-pad_size, :LMAX-pad_size, :] = P[:, int(pad_size/2) : LMAX-int(pad_size/2), int(pad_size/2) : LMAX-int(pad_size/2), :]
    Y = get_bulk_output_dist_maps(my_list, my_dir_distance, LMAX)
    if flag_plots:
        plot_four_pair_maps(Y, P, my_list, my_length_dict)
    print('')
    calculate_mae(P, Y, my_list, my_length_dict)
    print('')
    calculate_contact_precision_in_distances(P, Y, my_list, my_length_dict)
    if flag_save:
        os.system('mkdir -p ./predictions')
        print('')
        print('Save predictions..')
        for i in range(len(my_list)):
            L = my_length_dict[my_list[i]]
            pred = P[i, :L, :L]
            save_contacts_rr(my_list[i], my_dir_features, distance_to_contacts(pred), './predictions/' + my_list[i] + '.4bydist.rr')

def eval_contact_predictions(my_model, my_list, my_length_dict, my_dir_features, my_dir_distance,
                             pad_size, flag_plots, flag_save, LMAX, expected_n_channels):
    # Padded but full inputs/outputs
    my_generator = ContactGenerator(my_list, my_dir_features, my_dir_distance, LMAX, pad_size, 1, expected_n_channels)
    P = my_model.predict_generator(my_generator, max_queue_size=10, verbose=1)
    # Remove padding, i.e. shift up and left by int(pad_size/2)
    P[:, :LMAX-pad_size, :LMAX-pad_size, :] = P[:, int(pad_size/2) : LMAX-int(pad_size/2), int(pad_size/2) : LMAX-int(pad_size/2), :]
    Y = get_bulk_output_contact_maps(my_list, my_dir_distance, LMAX)
    if flag_plots:
        plot_four_pair_maps(Y, P, my_list, my_length_dict)
    print('')
    calculate_contact_precision(P, Y, my_list, my_length_dict)
    if flag_save:
        os.system('mkdir -p ./predictions')
        print('')
        print('Save predictions..')
        for i in range(len(my_list)):
            L = my_length_dict[my_list[i]]
            pred = P[i, :L, :L]
            save_contacts_rr(my_list[i], my_dir_features, pred, './predictions/' + my_list[i] + '.contacts.rr')
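# --- Illustrative sketch only; not part of the original module ---
# A small, self-contained demo of the padding-removal slicing used in the eval
# functions above: predictions come back on an LMAX x LMAX grid padded by pad_size,
# and the valid (LMAX - pad_size)^2 block is shifted to the top-left corner by
# int(pad_size / 2) in each spatial dimension. The toy sizes below are made up.
def _example_remove_padding():
    LMAX, pad_size = 8, 4
    P = np.arange(LMAX * LMAX, dtype=float).reshape(1, LMAX, LMAX, 1)
    half = int(pad_size / 2)
    P[:, :LMAX - pad_size, :LMAX - pad_size, :] = P[:, half:LMAX - half, half:LMAX - half, :]
    return P[0, :LMAX - pad_size, :LMAX - pad_size, 0]     # the de-padded region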
def eval_binned_predictions(my_model, my_list, my_length_dict, my_dir_features, my_dir_distance,
                            pad_size, flag_plots, flag_save, LMAX, bins, expected_n_channels):
    # Padded but full inputs/outputs
    my_generator = BinnedDistGenerator(my_list, my_dir_features, my_dir_distance, bins, LMAX, pad_size, 1, expected_n_channels)
    P = my_model.predict_generator(my_generator, max_queue_size=10, verbose=1)
    # Remove padding, i.e. shift up and left by int(pad_size/2)
    P[:, :LMAX-pad_size, :LMAX-pad_size, :] = P[:, int(pad_size/2) : LMAX-int(pad_size/2), int(pad_size/2) : LMAX-int(pad_size/2), :]
    # Predicted distance is the mean distance of the most confident bin
    D = np.zeros((len(P), LMAX, LMAX, 1))
    for p in range(len(P)):
        for i in range(LMAX):
            for j in range(LMAX):
                index = np.argmax(P[p, i, j, :])
                min_max = [float(x) for x in bins[index].split()]
                D[p, i, j, 0] = (min_max[0] + min_max[1]) / 2.0
    # The last bin's upper edge is very large, so trim predicted distances at its lower bound
    bin_max = float(bins[len(bins) - 1].split()[0])
    D[D > bin_max] = bin_max
    Y = get_bulk_output_dist_maps(my_list, my_dir_distance, LMAX)
    print('')
    calculate_mae(D, Y, my_list, my_length_dict)
    # Identify the bins that fall under the 8.0A distance
    contact_bins = -1
    for k, v in bins.items():
        if bins[k].split()[0] == '8.0':
            contact_bins = k
    # Sum the probabilities of the bins that fall under 8.0A distance
    C = np.zeros((len(P), LMAX, LMAX, 1))
    for p in range(len(P)):
        for i in range(LMAX):
            for j in range(LMAX):
                C[p, i, j, 0] = np.sum(P[p, i, j, :contact_bins])
    Y = get_bulk_output_contact_maps(my_list, my_dir_distance, LMAX)
    print('')
    calculate_contact_precision(C, Y, my_list, my_length_dict)
    if flag_save:
        os.system('mkdir -p ./predictions')
        print('')
        print('Save predictions..')
        for i in range(len(my_list)):
            L = my_length_dict[my_list[i]]
            predictions = {}
            for b in range(len(bins)):
                predictions[bins[b]] = P[i, :L, :L, b].astype(np.float16)
            with open('./predictions/' + my_list[i] + '.bins.pkl', 'wb') as f:
                pickle.dump(predictions, f)
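# --- Illustrative sketch only; not part of the original module ---
# A minimal example of the bin bookkeeping assumed by eval_binned_predictions():
# `bins` maps an integer bin index to a "min max" string, the predicted distance for a
# residue pair is the midpoint of its most probable bin, and the contact probability is
# the summed mass of all bins below 8.0 Angstroms. The bin edges below are made up.
def _example_bins_to_distance_and_contact():
    bins = {0: '0.0 4.0', 1: '4.0 8.0', 2: '8.0 12.0', 3: '12.0 1000.0'}
    probs = np.array([0.1, 0.6, 0.2, 0.1])                 # per-bin probabilities for one pair
    min_max = [float(x) for x in bins[int(np.argmax(probs))].split()]
    distance = (min_max[0] + min_max[1]) / 2.0             # midpoint of the most confident bin
    contact_bins = [k for k in bins if bins[k].split()[0] == '8.0'][0]
    contact_probability = float(np.sum(probs[:contact_bins]))
    return distance, contact_probability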