Commit 060e6561 authored by Ngan Thi Dong

add topK evaluation

parent fe45a213
......@@ -5,6 +5,7 @@ from dbmda.autoencoder import *
from multiprocess import Process, Queue
import argparse
import pandas as pd
from sklearn import preprocessing
np.random.seed(1337)
sysdevice = t.device('cuda' if t.cuda.is_available() else 'cpu')
......@@ -80,11 +81,17 @@ def eval_fold(seq_sim, foldIdx, train_pair_list, train_pair_lbl, test_pair_list,
score_path += '_useAutoencoder'
if args.use_seq_sim:
score_path += '_useSeqsim'
if args.imbalanced:
score_path += '_imbalanced'
score_path += '.csv'
else:
score_path = None
auc_score, auprc_score = clf_eval_fold(train_data, np.array(train_pair_lbl), test_data, np.array(test_pair_lbl), score_save_path=score_path, classifier='rotation_forest')
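# scale features to [0, 1]: fit the MinMaxScaler on the training data only and apply the fitted transform to the test data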
min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(train_data)
X_test_minmax = min_max_scaler.transform(test_data)
auc_score, auprc_score = clf_eval_fold(X_train_minmax, np.array(train_pair_lbl), X_test_minmax, np.array(test_pair_lbl), score_save_path=score_path, classifier='rotation_forest')
print('foldIdx: ', foldIdx, 'auc_score: ', auc_score, 'auprc_score: ', auprc_score)
auc_queue.put(auc_score)
auprc_queue.put(auprc_score)
......@@ -108,6 +115,15 @@ def eval(args):
for train_pair_list, train_pair_lbl, test_pair_list, test_pair_lbl, miRNA_sim, disease_sim in folds_data:
# imbalanced setting: rebuild the training pairs so that every zero entry of the training adjacency is kept as a negative example instead of a sampled subset
if args.imbalanced:
n_disease = disease_sim.shape[0]
n_miRNA = miRNA_sim.shape[0]
adj, zero_index, one_index = assoc_list_to_adj(n_miRNA, n_disease, train_pair_list, train_pair_lbl, imbalanced=True)
new_list = one_index + zero_index
new_lbl_list = [1.0] * len(one_index) + [0.0] * len(zero_index)
train_pair_list = new_list
train_pair_lbl = new_lbl_list
foldIdx += 1
if args.faulty:
miRNA_sim = fault_miRNA_sim
......@@ -140,6 +156,8 @@ def eval(args):
save_path += '_useAutoencoder'
if args.use_seq_sim == True or args.use_seq_sim == 'True':
save_path += '_useSeqsim'
if args.imbalanced:
save_path += '_imbalanced'
save_path += '.csv'
print('save_path:', save_path, args.use_seq_sim, args.use_autoencoder)
......@@ -154,9 +172,9 @@ def main():
parser = argparse.ArgumentParser(description='Neural based matrix completion for virus-host PPI')
parser.add_argument('--epochs', type=int, default=1000, metavar='N',
help='number of epochs to train')
parser.add_argument('--data_dir', default='/home/dong/simplifying_mirna_disease/hmdd2/', help='dataset directory')
parser.add_argument('--fold_dir', default='/home/dong/simplifying_mirna_disease/hmdd2/folds/', help='dataset directory')
parser.add_argument('--result_dir', default='/home/dong/simplifying_mirna_disease/results_hmdd2/', help='saved result directory')
parser.add_argument('--data_dir', default='data/hmdd2/', help='dataset directory')
parser.add_argument('--fold_dir', default='data/hmdd2/folds/', help='directory of the pre-split cross-validation folds')
parser.add_argument('--result_dir', default='data/hmdd2/results/', help='saved result directory')
parser.add_argument('--save_score', default=False, help='whether to save the predicted score or not')
parser.add_argument('--sim_type', default='functional2', help='the miRNA and disease sim, pass in "functional2" for miRNA functional + disease semantic(with phenotype info added),'
'"functional1" for miRNA functional and disease semantic only,'
......@@ -169,6 +187,7 @@ def main():
parser.add_argument('--numFold', default=5, help='value of K for K-foldCV, default is 5')
parser.add_argument('--neg_rate', default=1.0, help='the negative sampling rate')
parser.add_argument('--faulty', default=False, help='Faulty calculation or not')
parser.add_argument('--imbalanced', default=False, help='whether to train on the imbalanced (all-negatives) training set or not')
args = parser.parse_args()
args.save_score = True if str(args.save_score) == 'True' else False
......@@ -177,7 +196,16 @@ def main():
args.use_seq_sim = True if str(args.use_seq_sim) == 'True' else False
args.faulty = True if str(args.faulty) == 'True' else False
eval(args)
bool_pool = [True, False]
randseeds = [123, 456, 789, 101, 112]
randseeds = [123]  # overrides the full seed list above: only evaluate seed 123
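# run the evaluation for every combination of autoencoder / sequence-similarity features for each seed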
for randseed in randseeds:
args.randseed = randseed
for use_auto in bool_pool:
for use_seq in bool_pool:
args.use_autoencoder = use_auto
args.use_seq_sim = use_seq
eval(args)
if __name__ == "__main__":
main()
......
......@@ -741,7 +741,7 @@ methods = ['MLP', 'linear']
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='EPMDA original evaluation method')
parser.add_argument('--data_dir', default='/home/dong/Desktop/miRNA-disease/mirna-disease/epmda/data/hmdd2_numpy/', help='dataset directory')
parser.add_argument('--data_dir', default='/epmda/data/hmdd2_numpy/', help='dataset directory')
parser.add_argument('--fold_dir', default='data/hmdd2/folds/', help='dataset directory')
parser.add_argument('--result_dir', default='data/hmdd2/results/', help='saved result directory')
parser.add_argument('--save_score', default=False, help='whether to save the predicted score or not')
......@@ -755,12 +755,12 @@ if __name__ == '__main__':
args = parser.parse_args()
args.save_score = True if str(args.save_score) == 'True' else False
args.faulty = True if str(args.faulty) == 'True' else False
# args.data_dir = standardize_dir(args.data_dir)
# args.result_dir = standardize_dir(args.result_dir)
args.data_dir = standardize_dir(args.data_dir)
args.result_dir = standardize_dir(args.result_dir)
args.randseed = int(args.randseed)
data_dirs = ['../simplifying_mirna_disease/epmda/data/hmdd2_numpy/']#,'../simplifying_mirna_disease/epmda/data/hmdd3_numpy/', '/home/dong/Desktop/miRNA-disease/simplifying_mirna_disease/epmda/data/faultyHmdd2_numpy/']
fold_dirs = ['folds_hmdd2/']#, 'folds/', 'folds_hmdd2/']
data_dirs = ['epmda/data/hmdd2_numpy/']#,'../simplifying_mirna_disease/epmda/data/hmdd3_numpy/', '/home/dong/Desktop/miRNA-disease/simplifying_mirna_disease/epmda/data/faultyHmdd2_numpy/']
fold_dirs = ['data/hmdd2/folds/']#, 'folds/', 'folds_hmdd2/']
result_dirs = ['data/hmdd2/results_epmda/']#, 'data/hmdd3/results/', 'data/hmdd2/results_epmda/']
args.save_score = True
randseeds = [123, 456, 789, 101, 112]
......@@ -794,6 +794,7 @@ if __name__ == '__main__':
sim_dir = args.fold_dir
interactions = dataSource.interactions
# imbalanced training data
write_path = write_path_pre + args.sim_type + '_imbalance_results.txt'
writer = open(write_path, 'w')
writer.write('Fold,AUC,AUPR\n')
......@@ -827,6 +828,7 @@ if __name__ == '__main__':
writer.close()
# Balanced training data
write_path = write_path_pre + args.sim_type + '_balance_results.txt'
writer = open(write_path, 'w')
writer.write('Fold,AUC,AUPR\n')
......
......@@ -7,7 +7,7 @@ import numpy as np
from utility.utils import *
from data.preparation.GIP import *
from data.preparation.miRNA_sim import *
from utility.one_class_clf_negative_sampling import *
# from utility.one_class_clf_negative_sampling import *
import argparse
......@@ -105,8 +105,8 @@ def gen_fold(data_dir, save_dir, numFold=5, negative_rate=1.0, randomseed=123):
# get the negative sampling set of training set
train_neg_list = list(neg_assoc)
train_neg_list.extend(test_pair_list)
train_pair_list, train_pair_lbl = negative_sampling(pos_train_pair_list, train_neg_list, train_miRNA_gip, train_disease_gip)
# train_pair_list, train_pair_lbl = negative_sampling(pos_train_pair_list, n_miRNA, n_disease, negative_rate)
# train_pair_list, train_pair_lbl = negative_sampling(pos_train_pair_list, train_neg_list, train_miRNA_gip, train_disease_gip)
train_pair_list, train_pair_lbl = negative_sampling(pos_train_pair_list, n_miRNA, n_disease, negative_rate)
save2File(train_pair_list, saving_prefix + 'train.csv')
save2File(train_pair_lbl, saving_prefix + 'train_lbl.csv')
save2File(test_pair_list, saving_prefix + 'test.csv')
......@@ -119,41 +119,41 @@ def save2File(input, out_path):
df.to_csv(out_path, header=False, index=False)
# def negative_sampling(pos_samples, n_miRNA, n_disease, negative_rate):
# size_of_batch = len(pos_samples)
# print(size_of_batch)
# if negative_rate > 0:
# num_to_generate = size_of_batch * (negative_rate)
# values1 = np.random.randint(n_miRNA, size=int(num_to_generate * 1.2))
# values2 = np.random.randint(n_disease, size=int(num_to_generate * 1.2))
# labels = [1] * len(pos_samples)
# neg_samples = list()
# #
# # if train_pair_list != None:
# # print(train_pair_list)
# if negative_rate > 0:
# for i in range(len(values1)):
# pair = [values1[i], values2[i]]
# if pair in neg_samples or pair in pos_samples:
# continue
# neg_samples.append(pair)
# labels.append(0)
#
# if len(labels) == size_of_batch + num_to_generate:
# break
# else:
# for i in range(n_miRNA):
# for j in range(n_disease):
# pair = [i,j]
# if pair not in pos_samples:
# neg_samples.append(pair)
# labels.append(0)
#
# new_pair_list = pos_samples
# new_pair_list.extend(neg_samples)
# print('len new_pair_list: ', len(new_pair_list))
# # print('len(labels): ', len(labels))
# return np.asarray(new_pair_list), np.asarray(labels)
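# negative_sampling: draw roughly negative_rate * len(pos_samples) random (miRNA, disease) pairs that are not already positive;
# with negative_rate <= 0 every non-positive pair is kept (imbalanced setting); returns the combined pair list and labels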
def negative_sampling(pos_samples, n_miRNA, n_disease, negative_rate):
size_of_batch = len(pos_samples)
print(size_of_batch)
if negative_rate > 0:
num_to_generate = size_of_batch * (negative_rate)
values1 = np.random.randint(n_miRNA, size=int(num_to_generate * 1.2))
values2 = np.random.randint(n_disease, size=int(num_to_generate * 1.2))
labels = [1] * len(pos_samples)
neg_samples = list()
#
# if train_pair_list != None:
# print(train_pair_list)
if negative_rate > 0:
for i in range(len(values1)):
pair = [values1[i], values2[i]]
if pair in neg_samples or pair in pos_samples:
continue
neg_samples.append(pair)
labels.append(0)
if len(labels) == size_of_batch + num_to_generate:
break
else:
for i in range(n_miRNA):
for j in range(n_disease):
pair = [i,j]
if pair not in pos_samples:
neg_samples.append(pair)
labels.append(0)
new_pair_list = pos_samples
new_pair_list.extend(neg_samples)
print('len new_pair_list: ', len(new_pair_list))
# print('len(labels): ', len(labels))
return np.asarray(new_pair_list), np.asarray(labels)
# random_seeds=[123,456,789,101,112]
# random_seeds = [123]
......@@ -167,7 +167,7 @@ def save2File(input, out_path):
parser = argparse.ArgumentParser(description='Neural based matrix completion for virus-host PPI')
parser.add_argument('--data_dir', default='data/hmdd2/', help='dataset directory')
parser.add_argument('--save_dir', default='data/hmdd2/folds_oneclf/', help='dataset directory')
parser.add_argument('--save_dir', default='data/hmdd2/folds/', help='dataset directory')
parser.add_argument('--randseed', default=123, help='the random seed')
parser.add_argument('--numFold', default=5, help='value of K for K-foldCV, default is 5')
parser.add_argument('--neg_rate', default=1.0, help='the negative sampling rate')
......
# import torch as t
# from torch import nn, optim
# from utility.utils import *
# from miRNA_sim import *
import numpy as np
# from multiprocess import Process, Queue
import pandas as pd
import os
# from os import path
import random
np.random.seed(1337)
random_seeds = [123, 456, 789, 101, 112]
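# get_top: read one fold's (y_pred, y_true) score file, keep only the rows in selected_index (one disease),
# and return the N pairs with the highest predicted score (ties shuffled) plus the number of positives that could at best appear in that top-N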
def get_top(N, csv_path, selected_index):
df = pd.read_csv(csv_path, header=None)
# print(csv_path, len(df.columns))
df.columns = ['y_pred', 'y_true']
# print(selected_index, len(df))
df = df.iloc[selected_index, :]
unique_y_pred = df['y_pred'].unique().tolist()
tmp_df = df[df['y_true'] == 1.0]
all_pos = len(tmp_df)
selected_list = list()
for item in sorted(unique_y_pred, reverse=True):
pairs = df[df['y_pred'] == item].values.tolist()
indexes = list(range(len(pairs)))
random.shuffle(indexes)
for iptmp in indexes:
if len(selected_list) == N:
break
selected_list.append(pairs[iptmp])
# selected_list.extend(pairs)
all_pos = all_pos if all_pos < len(selected_list) else len(selected_list)
return selected_list, all_pos
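# eval_top: for every random seed and fold result file matching `method`, collect the top-N predictions for the disease
# at disease_selected_index and return (true positives found, best possible positives, total selected predictions, number of runs)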
def eval_top(res_dir, fold_dir, disease_selected_index, method ='nimgcn300_variance3_gip', N = 100, epmda=False):
# print('Evaluate top ', N, 'results for method', method)
all_selected = list()
all_pos = 0
runs = 0
for randseed in random_seeds:
cur_folder = res_dir + str(randseed) + '/'
if not os.path.exists(cur_folder):
continue
# print(cur_folder)
for file in os.listdir(cur_folder):
if file.find(method) < 0 or file[0] not in ['1', '2', '3', '4', '5']:
continue
# print(file)
foldIdx = str(int(file[0])-1)
runs += 1
if not epmda:
test_csv_path = fold_dir + str(randseed) + '_' + foldIdx + '_test.csv'
else:
test_csv_path = cur_folder + file.replace('.csv', '_pair.csv')
test_df = pd.read_csv(test_csv_path, header=None)
test_df.columns = ['miRNA', 'disease']
selected_index = test_df[test_df['disease'] == disease_selected_index].index.tolist()
# print(type(selected_df), selected_df)
cur_selected, cur_pos = get_top(N, cur_folder + file, selected_index)
# print('cur_selected: ', len(cur_selected), 'cur_pos: ', cur_pos)
all_pos += cur_pos
all_selected.extend(cur_selected)
arr = np.array(all_selected)
if arr.shape[0] != 0:
# print(arr.shape)
lbls = arr[:,1].tolist()
freq = lbls.count(1.0)
# print('disease: ', disease_selected_index, 'Total positive: ', freq, 'over', len(lbls), 'selected top prediction', all_pos)
else:
return 0, all_pos, 0, 0
return freq, all_pos, len(lbls), runs
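# getTopRes: write per-disease top-k hit counts for each method and k to out_path, and the aggregated averages and percentages to avg_out_path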
def getTopRes(fold_dir, res_dir, methods, disease_indexes, out_path, avg_out_path, epmda=False):
writer = open(out_path, 'w')
writer.write('method,k')
avg_writer = open(avg_out_path, 'w')
avg_writer.write('method,k,avg,ab_percent,rel_percent\n')
for disease in disease_indexes:
writer.write(',' + str(disease))
writer.write('\n')
k_pool = [10,20,30,40,50]
for method in methods:
sum_tp, sum_all, sum_true = 0,0,0
for k in k_pool:
writer.write(method + ',' + str(k))
for idx in disease_indexes:
tp, all_pos, all, runs = eval_top(res_dir, fold_dir, idx, method=method, N=k, epmda=epmda)
avg_top = 0 if runs == 0 else round(tp * 1.0 / runs, 1)
writer.write(',' + str(avg_top))
sum_tp += tp
sum_all += all
sum_true += all_pos
percent = round(sum_tp * 100.0 / sum_all if sum_all != 0 else 0, 2)
rel_percent = round(sum_tp * 100.0 / sum_true if sum_true != 0 else 0, 2)
avg_writer.write(method + ',' + str(k) + ',' + str(sum_tp / runs / len(disease_indexes) if runs != 0 else 0) + ',' + str(percent) + ',' + str(rel_percent) + '\n')
writer.write('\n')
writer.close()
avg_writer.close()
methods = ['nimgcn_functional2.csv','nimgcn1_functional2.csv','nimgcn2_functional2.csv','nimgcn3_functional2.csv']
hmdd2_disease_index = [91,205,327,236,304,202]
hmdd2_res_dir = 'data/hmdd2/results/'
hmdd2_fold_dir = 'data/hmdd2/folds/'
out_path = hmdd2_res_dir + 'topK.csv'
avg_out_path = hmdd2_res_dir + 'avg_topK.csv'
methods = ['linear_epmda_gip_balance.csv']#, 'linear_epmda_gip_imbalance.csv','MLP_epmda_gip_balance.csv','MLP_epmda_gip_imbalance.csv'
hmdd2_res_dir = 'data/hmdd2/results_epmda/'
out_path = hmdd2_res_dir + 'topK.csv'
avg_out_path = hmdd2_res_dir + 'avg_topK.csv'
getTopRes(hmdd2_fold_dir,hmdd2_res_dir,methods,hmdd2_disease_index,out_path,avg_out_path, epmda=True)
\ No newline at end of file