Commit fd451819 authored by Ngan Thi Dong

update eval nimgcn models

parent 141cd474
......@@ -31,12 +31,14 @@ def assocList2adjMat(pos_assoc, n_miRNA, n_disease):
def gen_fold(data_dir, save_dir, numFold=5, negative_rate=1.0, randomseed=123):
# assoc_list, neglist
adj_path = data_dir + 'm-d.csv'
disease_sim_path = data_dir + 'disease_sim.csv'
disease_sim_path2 = data_dir + 'disease_sim2.csv'
onto_disease_sim_path = data_dir + 'disease_sim.csv'
onto_disease_sim_path2 = data_dir + 'disease_sim2.csv'
disease_not_found_path = data_dir + 'disease_not_found_list.txt'
mirna_func_path = 'mirna_func.csv'
mirna_func_path2 = 'mirna_func2.csv'
disease_sim_path = 'disease_sim.csv'
disease_sim_path2 = 'disease_sim2.csv'
mirna_gip_path = 'mirna_gip.csv'
disease_gip_path = 'disease_gip.csv'
......@@ -79,10 +81,12 @@ def gen_fold(data_dir, save_dir, numFold=5, negative_rate=1.0, randomseed=123):
train_adj = assocList2adjMat(pos_train_pair_list, n_miRNA, n_disease)
# miRNA functional with disease semantic only
train_miRNA_func_sim = cal_miRNA_func_sim(train_adj, disease_sim_path, disease_not_found_path, disease_sim_path)
train_disease_semantic_sim, train_miRNA_func_sim= cal_miRNA_func_sim(train_adj, onto_disease_sim_path, disease_not_found_path, onto_disease_sim_path)
save2File(train_disease_semantic_sim, saving_prefix + disease_sim_path)
save2File(train_miRNA_func_sim, saving_prefix + mirna_func_path)
train_miRNA_func_sim2 = cal_miRNA_func_sim(train_adj, disease_sim_path2, disease_not_found_path, disease_sim_path)
train_disease_semantic_sim2, train_miRNA_func_sim2 = cal_miRNA_func_sim(train_adj, onto_disease_sim_path2, disease_not_found_path, onto_disease_sim_path)
save2File(train_miRNA_func_sim2, saving_prefix + mirna_func_path2)
save2File(train_disease_semantic_sim2, saving_prefix + disease_sim_path2)
train_miRNA_gip, train_disease_gip = calculate_gip(train_adj)
save2File(train_miRNA_gip, saving_prefix + mirna_gip_path)
......@@ -142,10 +146,12 @@ def negative_sampling(pos_samples, n_miRNA, n_disease, negative_rate):
# print('len(labels): ', len(labels))
return np.asarray(new_pair_list), np.asarray(labels)
random_seeds=[123,456,789,101,112]
# random_seeds=[123,456,789,101,112]
random_seeds = [123]
hmdd2_dir = '../hmdd2/'
hmdd3_dir = '../hmdd3/'
hmdd2_savedir = hmdd2_dir + 'folds/'
hmdd3_savedir = hmdd3_dir + 'folds/'
gen_fold(hmdd2_dir, hmdd2_savedir)
gen_fold(hmdd3_dir, hmdd3_savedir)
for randseed in random_seeds:
gen_fold(hmdd2_dir, hmdd2_savedir, randomseed=randseed)
# gen_fold(hmdd3_dir, hmdd3_savedir, randomseed=randseed)
import torch as t
from torch import nn, optim
from utility.utils import *
from data.hmdd2.miRNA_sim import *
import numpy as np
from nimcgcn.code.model import *
from multiprocess import Process, Queue
import argparse
import os
import os.path as path
np.random.seed(1337)
sysdevice = t.device('cuda' if t.cuda.is_available() else 'cpu')
......@@ -40,14 +37,8 @@ class Myloss(nn.Module):
loss_sum = loss(input, target)
return (1-self.alpha)*loss_sum[one_index].sum()+self.alpha*loss_sum[zero_index].sum()
def eval_fold(foldIdx, n_miRNA, n_disease, train_pair_list, train_pair_lbl, test_pair_list, test_pair_lbl, sizes, epochs, disease_semantic_sim_path,
auc_queue,
auprc_queue, method):
print('Starting fold: ', foldIdx)
adj, zero_index, one_index = assoc_list_to_adj(n_disease, n_miRNA, train_pair_list, train_pair_lbl)
disease_sim, miRNA_sim = cal_sim(adj.T, disease_semantic_sim_path=disease_semantic_sim_path)# our current adj is miRNA: row, but the function take the row
# as disease, hence the transpose (adj.T) is passed in
def eval_fold(foldIdx, n_miRNA, n_disease, train_pair_list, train_pair_lbl, test_pair_list, test_pair_lbl, miRNA_sim, disease_sim,
sizes, auc_queue, aupr_queue, args):
dataset = dict()
dd_edge_index = get_edge_index(disease_sim)
......@@ -62,13 +53,15 @@ def eval_fold(foldIdx, n_miRNA, n_disease, train_pair_list, train_pair_lbl, test
miRNA_sim_tensor = miRNA_sim_tensor.cuda()
dataset['mm'] = {'data': miRNA_sim_tensor, 'edge_index': mm_edge_index}
if method == 'nimgcn1':
adj, zero_index, one_index = assoc_list_to_adj(n_disease, n_miRNA, train_pair_list, train_pair_lbl)
if args.method == 'nimgcn1':
model = SimpleModel1(sizes)
elif method == 'nimgcn2':
elif args.method == 'nimgcn2':
model = SimpleModel2(sizes)
elif method == 'nimgcn3':
elif args.method == 'nimgcn3':
model = SimpleModel3(sizes)
elif method == 'nimgcn':
elif args.method == 'nimgcn':
model = Model(sizes)
else:
print('Invalid method name, please input either nimgcn, nimgcn1, nimgcn2, nimgcn3')
......@@ -81,7 +74,7 @@ def eval_fold(foldIdx, n_miRNA, n_disease, train_pair_list, train_pair_lbl, test
model.train()
for epoch in range(0,epochs+1):
for epoch in range(0, args.epochs+1):
model.zero_grad()
score = model([dataset['dd'], dataset['mm']], device=sysdevice)
loss = regression_crit(one_index, zero_index, t.FloatTensor(adj), score)
......@@ -96,40 +89,41 @@ def eval_fold(foldIdx, n_miRNA, n_disease, train_pair_list, train_pair_lbl, test
auc_score, auprc_score = get_score(test_pair_lbl, test_pred_lbl)
print('foldIdx: ', foldIdx, 'auc_score: ', auc_score, 'auprc_score: ', auprc_score)
auc_queue.put(auc_score)
auprc_queue.put(auprc_score)
aupr_queue.put(auprc_score)
#save scores
if args.save_score:
score_save_dir = args.result_dir + str(args.randseed)
score_save_dir = standardize_dir(score_save_dir)
score_path = score_save_dir + str(foldIdx) + '_' + args.method + '_' + args.sim_type + '.csv'
save_scores(test_pred_lbl, test_pair_lbl, score_path)
# evaluate nimcgcn model
def eval(args, adj_path='./m-d.csv', disease_semantic_sim_path = './disease_sim2.csv', negrate=1.0, epochs=300, method='nimgcn'):
def eval(args):
data_dir = standardize_dir(args.data_dir)
fold_dir = standardize_dir(args.fold_dir)
result_dir = standardize_dir(args.result_dir)
adj_path = data_dir + 'm-d.csv'# the adjacency matrix
disease_sim_path = data_dir + 'disease_sim.csv' # pre-calculated disease_semantic
disease_sim_path2 = data_dir + 'disease_sim2.csv'# pre-calculated disease semantic+phenotype
disease_missing_list = data_dir + 'disease_not_found_list.txt'
miRNA_seq_sim_path = data_dir + ''
# load
n_miRNA, n_disease, one_edge_list, zero_edge_list = load_data(adj_path)
sizes = Sizes(n_miRNA, n_disease)
numFold = 5
folds_data = randomSplitTrainTestArray(one_edge_list, zero_edge_list, n_disease, n_miRNA, numFold, negrate, randomseed=args.randseed)
numFold = args.numFold # default is 5 for 5FoldCV
folds_data = load_fold_data(data_dir, fold_dir, args.randseed, args.sim_type)
foldIdx = 0
auc_queue = Queue(numFold)# for multiprocessing
aupr_queue = Queue(numFold)
processList = list()
for train_pair_list, train_pair_lbl, test_pair_list, test_pair_lbl in folds_data:
for train_pair_list, train_pair_lbl, test_pair_list, test_pair_lbl, miRNA_sim, disease_sim in folds_data:
foldIdx += 1
# calculate the similarity if needed
n_miRNA = miRNA_sim.shape[0]
n_disease = disease_sim.shape[0]
sizes = Sizes(n_miRNA, n_disease)
if t.cuda.is_available():
eval_fold(foldIdx, n_miRNA, n_disease, train_pair_list, train_pair_lbl, test_pair_list, test_pair_lbl,
sizes, epochs, disease_semantic_sim_path, auc_queue, aupr_queue, method, ori_semantic_path, disease_semantic_missing_list, save_result_path)
eval_fold(foldIdx, n_miRNA, n_disease, train_pair_list, train_pair_lbl, test_pair_list, test_pair_lbl, miRNA_sim, disease_sim,
sizes, auc_queue, aupr_queue, args)
else:
process = Process(target=eval_fold, args=(foldIdx, n_miRNA, n_disease, train_pair_list, train_pair_lbl, test_pair_list, test_pair_lbl,
sizes, epochs, disease_semantic_sim_path, auc_queue, aupr_queue, method, ori_semantic_path, disease_semantic_missing_list, save_result_path))
process = Process(target=eval_fold, args=(foldIdx, n_miRNA, n_disease, train_pair_list, train_pair_lbl, test_pair_list, test_pair_lbl, miRNA_sim, disease_sim,
sizes, auc_queue, aupr_queue, args))
processList.append(process)
process.start()
if not t.cuda.is_available():
......@@ -141,8 +135,8 @@ def eval(args, adj_path='./m-d.csv', disease_semantic_sim_path = './disease_sim2
avg_auc = sum(auc_list)/len(auc_list)
avg_auprc = sum(auprc_list)/len(auprc_list)
print('NIMCGCN average performance: auc:', avg_auc, 'auprc: ', avg_auprc)
with open(result_dir + str(args.randseed) + '_' + method + '_' + args.sim_type + '.csv', 'w') as f:
print(args.method, ' average performance: auc:', avg_auc, 'auprc: ', avg_auprc)
with open(result_dir + str(args.randseed) + '_' + args.method + '_' + args.sim_type + '.csv', 'w') as f:
f.write('Fold,auc,auprc\n')
for i in range(5):
f.write(',' + str(auc_list[i]) + ',' + str(auprc_list[i]) + '\n')
......@@ -153,8 +147,9 @@ def main():
parser = argparse.ArgumentParser(description='Neural based matrix completion for virus-host PPI')
parser.add_argument('--epochs', type=int, default=300, metavar='N',
help='number of epochs to train')
parser.add_argument('--data_dir', default='/home/dong/simplifying_mirna_disease/folds/', help='dataset directory')
parser.add_argument('--result_dir', default='/home/dong/simplifying_mirna_disease/results_hmdd3/', help='saved result directory')
parser.add_argument('--data_dir', default='/home/dong/simplifying_mirna_disease/hmdd2/', help='dataset directory')
parser.add_argument('--fold_dir', default='/home/dong/simplifying_mirna_disease/hmdd2/folds/', help='dataset directory')
parser.add_argument('--result_dir', default='/home/dong/simplifying_mirna_disease/results_hmdd2/', help='saved result directory')
parser.add_argument('--method', default='nimgcn', help='method should be one of nimgcn, nimgcn1, nimgcn2, nimgcn3')
parser.add_argument('--save_score', default=False, help='whether to save the predicted score or not')
parser.add_argument('--sim_type', default='functional2', help='the miRNA and disease sim, pass in "functional2" for miRNA functional + disease semantic(with phenotype info added),'
......@@ -162,14 +157,13 @@ def main():
'"gip" for miRNA and disease GIP kernel similarity,'
'"seq" for miRNA sequence and disease semantic')
parser.add_argument('--randseed', default=123, help='the random seed')
parser.add_argument('--numFold', default=5, help='value of K for K-foldCV, default is 5')
parser.add_argument('--neg_rate', default=1.0, help='the negative sampling rate')
args = parser.parse_args()
eval(args)
# methods = ['nimgcn_variance3'] # 'nimgcn', 'nimgcn_variance1', 'nimgcn_variance2',
# for method in methods:
# eval_nimcgcn(method=method)
# eval_nimcgcn(disease_semantic_sim_path='../../../../data/hmdd2/disease_sim.csv', method=method)
# correct way of calculation: AUC: 0.8688, AUPRC: 0.1667
if __name__ == "__main__":
main()
This diff is collapsed.
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment