Commit fd451819 authored by Ngan Thi Dong
Browse files

update eval nimgcn models

parent 141cd474
......@@ -31,12 +31,14 @@ def assocList2adjMat(pos_assoc, n_miRNA, n_disease):
def gen_fold(data_dir, save_dir, numFold=5, negative_rate=1.0, randomseed=123):
# assoc_list, neglist
adj_path = data_dir + 'm-d.csv'
disease_sim_path = data_dir + 'disease_sim.csv'
disease_sim_path2 = data_dir + 'disease_sim2.csv'
onto_disease_sim_path = data_dir + 'disease_sim.csv'
onto_disease_sim_path2 = data_dir + 'disease_sim2.csv'
disease_not_found_path = data_dir + 'disease_not_found_list.txt'
mirna_func_path = 'mirna_func.csv'
mirna_func_path2 = 'mirna_func2.csv'
disease_sim_path = 'disease_sim.csv'
disease_sim_path2 = 'disease_sim2.csv'
mirna_gip_path = 'mirna_gip.csv'
disease_gip_path = 'disease_gip.csv'
......@@ -79,10 +81,12 @@ def gen_fold(data_dir, save_dir, numFold=5, negative_rate=1.0, randomseed=123):
train_adj = assocList2adjMat(pos_train_pair_list, n_miRNA, n_disease)
# miRNA functional with disease semantic only
train_miRNA_func_sim = cal_miRNA_func_sim(train_adj, disease_sim_path, disease_not_found_path, disease_sim_path)
train_disease_semantic_sim, train_miRNA_func_sim= cal_miRNA_func_sim(train_adj, onto_disease_sim_path, disease_not_found_path, onto_disease_sim_path)
save2File(train_disease_semantic_sim, saving_prefix + disease_sim_path)
save2File(train_miRNA_func_sim, saving_prefix + mirna_func_path)
train_miRNA_func_sim2 = cal_miRNA_func_sim(train_adj, disease_sim_path2, disease_not_found_path, disease_sim_path)
train_disease_semantic_sim2, train_miRNA_func_sim2 = cal_miRNA_func_sim(train_adj, onto_disease_sim_path2, disease_not_found_path, onto_disease_sim_path)
save2File(train_miRNA_func_sim2, saving_prefix + mirna_func_path2)
save2File(train_disease_semantic_sim2, saving_prefix + disease_sim_path2)
train_miRNA_gip, train_disease_gip = calculate_gip(train_adj)
save2File(train_miRNA_gip, saving_prefix + mirna_gip_path)
......@@ -142,10 +146,12 @@ def negative_sampling(pos_samples, n_miRNA, n_disease, negative_rate):
# print('len(labels): ', len(labels))
return np.asarray(new_pair_list), np.asarray(labels)
random_seeds=[123,456,789,101,112]
# random_seeds=[123,456,789,101,112]
random_seeds = [123]
hmdd2_dir = '../hmdd2/'
hmdd3_dir = '../hmdd3/'
hmdd2_savedir = hmdd2_dir + 'folds/'
hmdd3_savedir = hmdd3_dir + 'folds/'
gen_fold(hmdd2_dir, hmdd2_savedir)
gen_fold(hmdd3_dir, hmdd3_savedir)
for randseed in random_seeds:
gen_fold(hmdd2_dir, hmdd2_savedir, randomseed=randseed)
# gen_fold(hmdd3_dir, hmdd3_savedir, randomseed=randseed)
import torch as t
from torch import nn, optim
from utility.utils import *
from data.hmdd2.miRNA_sim import *
import numpy as np
from nimcgcn.code.model import *
from multiprocess import Process, Queue
import argparse
import os
import os.path as path
np.random.seed(1337)
sysdevice = t.device('cuda' if t.cuda.is_available() else 'cpu')
......@@ -40,14 +37,8 @@ class Myloss(nn.Module):
loss_sum = loss(input, target)
return (1-self.alpha)*loss_sum[one_index].sum()+self.alpha*loss_sum[zero_index].sum()
def eval_fold(foldIdx, n_miRNA, n_disease, train_pair_list, train_pair_lbl, test_pair_list, test_pair_lbl, sizes, epochs, disease_semantic_sim_path,
auc_queue,
auprc_queue, method):
print('Starting fold: ', foldIdx)
adj, zero_index, one_index = assoc_list_to_adj(n_disease, n_miRNA, train_pair_list, train_pair_lbl)
disease_sim, miRNA_sim = cal_sim(adj.T, disease_semantic_sim_path=disease_semantic_sim_path)# our current adj is miRNA: row, but the function take the row
# as
# disease
def eval_fold(foldIdx, n_miRNA, n_disease, train_pair_list, train_pair_lbl, test_pair_list, test_pair_lbl, miRNA_sim, disease_sim,
sizes, auc_queue, aupr_queue, args):
dataset = dict()
dd_edge_index = get_edge_index(disease_sim)
......@@ -62,13 +53,15 @@ def eval_fold(foldIdx, n_miRNA, n_disease, train_pair_list, train_pair_lbl, test
miRNA_sim_tensor = miRNA_sim_tensor.cuda()
dataset['mm'] = {'data': miRNA_sim_tensor, 'edge_index': mm_edge_index}
if method == 'nimgcn1':
adj, zero_index, one_index = assoc_list_to_adj(n_disease, n_miRNA, train_pair_list, train_pair_lbl)
if args.method == 'nimgcn1':
model = SimpleModel1(sizes)
elif method == 'nimgcn2':
elif args.method == 'nimgcn2':
model = SimpleModel2(sizes)
elif method == 'nimgcn3':
elif args.method == 'nimgcn3':
model = SimpleModel3(sizes)
elif method == 'nimgcn':
elif args.method == 'nimgcn':
model = Model(sizes)
else:
print('Invalid method name, please input either nimgcn, nimgcn1, nimgcn2, nimgcn3')
......@@ -81,7 +74,7 @@ def eval_fold(foldIdx, n_miRNA, n_disease, train_pair_list, train_pair_lbl, test
model.train()
for epoch in range(0,epochs+1):
for epoch in range(0, args.epochs+1):
model.zero_grad()
score = model([dataset['dd'], dataset['mm']], device=sysdevice)
loss = regression_crit(one_index, zero_index, t.FloatTensor(adj), score)
......@@ -96,40 +89,41 @@ def eval_fold(foldIdx, n_miRNA, n_disease, train_pair_list, train_pair_lbl, test
auc_score, auprc_score = get_score(test_pair_lbl, test_pred_lbl)
print('foldIdx: ', foldIdx, 'auc_score: ', auc_score, 'auprc_score: ', auprc_score)
auc_queue.put(auc_score)
auprc_queue.put(auprc_score)
aupr_queue.put(auprc_score)
#save scores
if args.save_score:
score_save_dir = args.result_dir + str(args.randseed)
score_save_dir = standardize_dir(score_save_dir)
score_path = score_save_dir + str(foldIdx) + '_' + args.method + '_' + args.sim_type + '.csv'
save_scores(test_pred_lbl, test_pair_lbl, score_path)
# evaluate nimcgcn model
def eval(args, adj_path='./m-d.csv', disease_semantic_sim_path = './disease_sim2.csv', negrate=1.0, epochs=300, method='nimgcn'):
def eval(args):
data_dir = standardize_dir(args.data_dir)
fold_dir = standardize_dir(args.fold_dir)
result_dir = standardize_dir(args.result_dir)
adj_path = data_dir + 'm-d.csv'# the adjacency matrix
disease_sim_path = data_dir + 'disease_sim.csv' # pre-calculated disease_semantic
disease_sim_path2 = data_dir + 'disease_sim2.csv'# pre-calculated disease semantic+phenotype
disease_missing_list = data_dir + 'disease_not_found_list.txt'
miRNA_seq_sim_path = data_dir + ''
# load
n_miRNA, n_disease, one_edge_list, zero_edge_list = load_data(adj_path)
sizes = Sizes(n_miRNA, n_disease)
numFold = 5
folds_data = randomSplitTrainTestArray(one_edge_list, zero_edge_list, n_disease, n_miRNA, numFold, negrate, randomseed=args.randseed)
numFold = args.numFold # default is 5 for 5FoldCV
folds_data = load_fold_data(data_dir, fold_dir, args.randseed, args.sim_type)
foldIdx = 0
auc_queue = Queue(numFold)# for multiprocessing
aupr_queue = Queue(numFold)
processList = list()
for train_pair_list, train_pair_lbl, test_pair_list, test_pair_lbl in folds_data:
for train_pair_list, train_pair_lbl, test_pair_list, test_pair_lbl, miRNA_sim, disease_sim in folds_data:
foldIdx += 1
# calculate the similarity if needed
n_miRNA = miRNA_sim.shape[0]
n_disease = disease_sim.shape[0]
sizes = Sizes(n_miRNA, n_disease)
if t.cuda.is_available():
eval_fold(foldIdx, n_miRNA, n_disease, train_pair_list, train_pair_lbl, test_pair_list, test_pair_lbl,
sizes, epochs, disease_semantic_sim_path, auc_queue, aupr_queue, method, ori_semantic_path, disease_semantic_missing_list, save_result_path)
eval_fold(foldIdx, n_miRNA, n_disease, train_pair_list, train_pair_lbl, test_pair_list, test_pair_lbl, miRNA_sim, disease_sim,
sizes, auc_queue, aupr_queue, args)
else:
process = Process(target=eval_fold, args=(foldIdx, n_miRNA, n_disease, train_pair_list, train_pair_lbl, test_pair_list, test_pair_lbl,
sizes, epochs, disease_semantic_sim_path, auc_queue, aupr_queue, method, ori_semantic_path, disease_semantic_missing_list, save_result_path))
process = Process(target=eval_fold, args=(foldIdx, n_miRNA, n_disease, train_pair_list, train_pair_lbl, test_pair_list, test_pair_lbl, miRNA_sim, disease_sim,
sizes, auc_queue, aupr_queue, args))
processList.append(process)
process.start()
if not t.cuda.is_available():
......@@ -141,8 +135,8 @@ def eval(args, adj_path='./m-d.csv', disease_semantic_sim_path = './disease_sim2
avg_auc = sum(auc_list)/len(auc_list)
avg_auprc = sum(auprc_list)/len(auprc_list)
print('NIMCGCN average performance: auc:', avg_auc, 'auprc: ', avg_auprc)
with open(result_dir + str(args.randseed) + '_' + method + '_' + args.sim_type + '.csv', 'w') as f:
print(args.method, ' average performance: auc:', avg_auc, 'auprc: ', avg_auprc)
with open(result_dir + str(args.randseed) + '_' + args.method + '_' + args.sim_type + '.csv', 'w') as f:
f.write('Fold,auc,auprc\n')
for i in range(5):
f.write(',' + str(auc_list[i]) + ',' + str(auprc_list[i]) + '\n')
......@@ -153,8 +147,9 @@ def main():
parser = argparse.ArgumentParser(description='Neural based matrix completion for virus-host PPI')
parser.add_argument('--epochs', type=int, default=300, metavar='N',
help='number of epochs to train')
parser.add_argument('--data_dir', default='/home/dong/simplifying_mirna_disease/folds/', help='dataset directory')
parser.add_argument('--result_dir', default='/home/dong/simplifying_mirna_disease/results_hmdd3/', help='saved result directory')
parser.add_argument('--data_dir', default='/home/dong/simplifying_mirna_disease/hmdd2/', help='dataset directory')
parser.add_argument('--fold_dir', default='/home/dong/simplifying_mirna_disease/hmdd2/folds/', help='dataset directory')
parser.add_argument('--result_dir', default='/home/dong/simplifying_mirna_disease/results_hmdd2/', help='saved result directory')
parser.add_argument('--method', default='nimgcn', help='method should be one of nimgcn, nimgcn1, nimgcn2, nimgcn3')
parser.add_argument('--save_score', default=False, help='whether to save the predicted score or not')
parser.add_argument('--sim_type', default='functional2', help='the miRNA and disease sim, pass in "functional2" for miRNA functional + disease semantic(with phenotype info added),'
......@@ -162,14 +157,13 @@ def main():
'"gip" for miRNA and disease GIP kernel similarity,'
'"seq" for miRNA sequence and disease semantic')
parser.add_argument('--randseed', default=123, help='the random seed')
parser.add_argument('--numFold', default=5, help='value of K for K-foldCV, default is 5')
parser.add_argument('--neg_rate', default=1.0, help='the negative sampling rate')
args = parser.parse_args()
eval(args)
# methods = ['nimgcn_variance3'] # 'nimgcn', 'nimgcn_variance1', 'nimgcn_variance2',
# for method in methods:
# eval_nimcgcn(method=method)
# eval_nimcgcn(disease_semantic_sim_path='../../../../data/hmdd2/disease_sim.csv', method=method)
# correct way of calculation: AUC: 0.8688, AUPRC: 0.1667
if __name__ == "__main__":
main()
......@@ -7,174 +7,47 @@ import numpy as np
import random
from rotation_forest import RotationForestClassifier
from sklearn import metrics
np.random.seed(1337)
import os
NEGATIVE_SAMPLING_RATE = 1
DATA_DIR = '/home/ngandong/Desktop/miRNA-disease/mirna-disease/data/mirna_disease/'
def load_name_dict(path):
    """Map each non-blank line of a text file to its zero-based line index.

    Every line of the file at ``path`` is stripped of surrounding whitespace.
    Blank lines are not stored, but they still occupy an index, so the value
    kept for a name is its position in the file. If a name appears more than
    once, the index of its last occurrence is kept.
    """
    with open(path, 'r') as handle:
        stripped_lines = [raw.strip() for raw in handle.readlines()]
    return {name: position
            for position, name in enumerate(stripped_lines)
            if name != ''}
def load_data(adj_path):
    """Read a header-less adjacency CSV and split its cells into two lists.

    Returns a 4-tuple ``(n_miRNA, n_disease, edgelist, neglist)`` where
    ``n_miRNA`` and ``n_disease`` are the row and column counts of the matrix,
    ``edgelist`` holds the ``[row, col]`` index of every cell whose integer
    value is 1, and ``neglist`` holds the index of every other cell. Both
    lists are in row-major order.
    """
    adj = pd.read_csv(adj_path, header=None).values
    n_miRNA, n_disease = adj.shape
    positives, negatives = [], []
    for row_idx, row in enumerate(adj.tolist()):
        for col_idx, cell in enumerate(row):
            # Route the index pair to whichever list matches the cell value.
            target = positives if int(cell) == 1 else negatives
            target.append([row_idx, col_idx])
    return n_miRNA, n_disease, positives, negatives
np.random.seed(1337)
import os
def saveFoldData(train_pair_list, train_pair_lbl, test_pair_list, test_pair_lbl, savepath):
    """Write one fold's pairs and labels to four files derived from ``savepath``.

    The output names are obtained by replacing ``'.csv'`` in ``savepath`` with
    ``'_train.csv'``, ``'_test.csv'``, ``'_train_lbl.csv'`` and
    ``'_test_lbl.csv'``. Pair files are written without header and without
    index column; label files contain one label per line.

    Fix over the previous version: ``header``/``index`` were passed to the
    ``DataFrame`` constructor (which raises ``TypeError``) instead of to
    ``to_csv``, so the training pairs could never be written. Both pair files
    are now written the same way.
    """
    pair_outputs = (
        (train_pair_list, '_train.csv'),
        (test_pair_list, '_test.csv'),
    )
    for pairs, suffix in pair_outputs:
        frame = pd.DataFrame(np.array(pairs))
        frame.to_csv(savepath.replace('.csv', suffix), header=False, index=False)

    label_outputs = (
        (train_pair_lbl, '_train_lbl.csv'),
        (test_pair_lbl, '_test_lbl.csv'),
    )
    for labels, suffix in label_outputs:
        with open(savepath.replace('.csv', suffix), 'w') as f:
            f.writelines(str(lbl) + '\n' for lbl in labels)
def randomSplitTrainTestArray(assoc_list, neglist, n_disease, n_miRNA, numFold, negative_rate, randomseed=1):
    """Split the known associations into ``numFold`` train/test folds.

    The indices of ``assoc_list`` are shuffled with ``random`` seeded by
    ``randomseed`` and cut into ``numFold`` consecutive slices; the last slice
    takes whatever remains. For each fold:

    * associations whose index is in the slice form the positive test set,
      the rest form the positive training set (original order is kept);
    * the training set is passed to ``negative_sampling`` together with
      ``n_miRNA``, ``n_disease`` and ``negative_rate``;
    * the test set is the positive test pairs followed by all of ``neglist``,
      labelled 1 and 0 respectively;
    * the fold is saved through ``saveFoldData`` under ``'../folds/'`` (the
      directory is created if missing).

    Returns a list with one ``[train_pair_list, train_pair_lbl,
    test_pair_list, test_pair_lbl]`` entry per fold.
    """
    N = len(assoc_list)
    randomIndex = list(range(N))
    print('Number of known association: ', N)
    random.seed(randomseed)
    random.shuffle(randomIndex)
    neglist_lbl = [0] * len(neglist)
    folds = list()
    save_dir = '../folds/'
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    for i in range(numFold):
        # Slice of shuffled indices that makes up this fold's test set.
        start = int(N / numFold * i)
        if i < numFold - 1:
            chosen = randomIndex[start: int(N / numFold * (i + 1))]
        else:
            chosen = randomIndex[start:]
        # A set gives O(1) membership; the previous list lookup made the
        # split quadratic in the number of associations.
        test_index = set(chosen)

        pos_train_pair_list = list()
        pos_test_pair_list = list()
        for idx, edge in enumerate(assoc_list):
            if idx in test_index:
                pos_test_pair_list.append(edge)
            else:
                pos_train_pair_list.append(edge)

        # get the negative sampling set of training set
        train_pair_list, train_pair_lbl = negative_sampling(pos_train_pair_list, n_miRNA, n_disease, negative_rate)

        test_pair_lbl = [1] * len(pos_test_pair_list) + neglist_lbl
        test_pair_list = pos_test_pair_list
        test_pair_list.extend(neglist)

        folds.append([train_pair_list, train_pair_lbl, test_pair_list, test_pair_lbl])
        save_path = save_dir + str(randomseed) + '_Fold' + str(i) + '.csv'
        saveFoldData(train_pair_list, train_pair_lbl, test_pair_list, test_pair_lbl, save_path)
    return folds
def load_fold_data(dir, ranseed, disease_semantic_sim_path='./disease_sim2.csv'):
def load_fold_data(data_dir, fold_dir, ranseed, sim_type, numFold=5):
folds_data = list()
for i in range(5):
train_pair_path = dir + str(ranseed) + '_Fold' + str(i) + '_train.csv'
train_lbl_path = train_pair_path.replace('_train.csv', '_train_lbl.csv')
test_lbl_path = train_pair_path.replace('_train.csv', '_test_lbl.csv')
test_pair_path = train_pair_path.replace('_train.csv', '_test.csv')
train_pair_list = pd.read_csv(train_pair_path, header=None).values.tolist()
test_pair_list = pd.read_csv(test_pair_path, header=None).values.tolist()
data_dir = standardize_dir(data_dir)
data_prefix = fold_dir + str(ranseed) + '_'
for iFold in range(numFold):
fold_prefix = data_prefix + str(iFold) + '_'
train_assoc_path = fold_prefix + 'train.csv'
test_assoc_path = fold_prefix + 'test.csv'
test_lbl_path = fold_prefix + 'test_lbl.csv'
train_lbl_path = fold_prefix + 'train_lbl.csv'
train_assoc_list = pd.read_csv(train_assoc_path, header=None).values.tolist()
test_assoc_list = pd.read_csv(test_assoc_path, header=None).values.tolist()
test_pair_lbl = pd.read_csv(test_lbl_path, header=None).values.tolist()
train_pair_lbl = pd.read_csv(train_lbl_path, header=None).values.tolist()
test_pair_lbl = [lbl[0] for lbl in test_pair_lbl]
train_pair_lbl = [lbl[0] for lbl in train_pair_lbl]
disease_sim_path = dir + str(ranseed) + '_Fold' + str(i) + '_hmdd3_' + disease_semantic_sim_path[disease_semantic_sim_path.rfind('/') + 1:]
miRNA_sim_path = disease_sim_path.replace('disease_sim', 'miRNA_sim')
disease_sim = pd.read_csv(disease_sim_path, header=None).values
miRNA_sim = pd.read_csv(miRNA_sim_path, header=None).values
folds_data.append([train_pair_list, train_pair_lbl, test_pair_list, test_pair_lbl, disease_sim, miRNA_sim])
return folds_data
def load_fold_data2(dir, ranseed, disease_semantic_sim_path='./disease_sim2.csv'):
folds_data = list()
for i in range(5):
train_pair_path = dir + str(ranseed) + '_Fold' + str(i) + '_train.csv'
train_lbl_path = train_pair_path.replace('_train.csv', '_train_lbl.csv')
test_lbl_path = train_pair_path.replace('_train.csv', '_test_lbl.csv')
test_pair_path = train_pair_path.replace('_train.csv', '_test.csv')
train_pair_list = pd.read_csv(train_pair_path, header=None).values.tolist()
test_pair_list = pd.read_csv(test_pair_path, header=None).values.tolist()
test_pair_lbl = pd.read_csv(test_lbl_path, header=None).values.tolist()
train_pair_lbl = pd.read_csv(train_lbl_path, header=None).values.tolist()
test_pair_lbl = [lbl[0] for lbl in test_pair_lbl]
train_pair_lbl = [lbl[0] for lbl in train_pair_lbl]
disease_sim_path = dir + str(ranseed) + '_Fold' + str(i) + '_' + disease_semantic_sim_path[disease_semantic_sim_path.rfind('/') + 1:]
miRNA_sim_path = disease_sim_path.replace('disease_sim', 'miRNA_sim')
if sim_type == 'gip':
miRNA_sim_path = fold_prefix + 'mirna_gip.csv'
disease_sim_path = fold_prefix + 'disease_gip.csv'
elif sim_type == 'seq':
miRNA_sim_path = data_dir + 'mirna_seq.csv'
disease_sim_path = fold_prefix + 'disease_sim2.csv'
elif sim_type == 'functional2':
miRNA_sim_path = fold_prefix + 'mirna_func2.csv'
disease_sim_path = fold_prefix + 'disease_sim2.csv'
elif sim_type == 'functional1':
miRNA_sim_path = fold_prefix + 'mirna_func.csv'
disease_sim_path = fold_prefix + 'disease_sim.csv'
disease_sim = pd.read_csv(disease_sim_path, header=None).values
miRNA_sim = pd.read_csv(miRNA_sim_path, header=None).values
folds_data.append([train_pair_list, train_pair_lbl, test_pair_list, test_pair_lbl, disease_sim, miRNA_sim])
folds_data.append([train_assoc_list, train_pair_lbl, test_assoc_list, test_pair_lbl, miRNA_sim, disease_sim])
return folds_data
def negative_sampling(pos_samples, n_miRNA, n_disease, negative_rate):
    """Append negative (label 0) pairs to a list of positive pairs.

    Parameters:
        pos_samples: list of ``[miRNA_index, disease_index]`` positive pairs.
        n_miRNA, n_disease: exclusive upper bounds for the two indices.
        negative_rate: if greater than 0, ``len(pos_samples) * negative_rate``
            is the target number of negatives, drawn at random via
            ``np.random.randint`` from a candidate pool 1.2 times that size;
            candidates that duplicate a positive or an earlier negative are
            dropped, so fewer negatives than the target may be returned.
            Otherwise every pair in the ``n_miRNA x n_disease`` grid that is
            not a positive is used as a negative.

    Returns:
        ``(pairs, labels)`` as numpy arrays: the positives followed by the
        negatives, and the matching 1/0 labels.

    Changes over the previous version: the caller's ``pos_samples`` list is no
    longer extended in place, and duplicate detection uses a set instead of
    repeated list scans. The random draws and the output order are unchanged.
    """
    size_of_batch = len(pos_samples)
    print(size_of_batch)
    labels = [1] * size_of_batch
    neg_samples = list()
    # Every pair that may not be emitted (again): positives + chosen negatives.
    taken = {tuple(pair) for pair in pos_samples}

    if negative_rate > 0:
        num_to_generate = size_of_batch * negative_rate
        values1 = np.random.randint(n_miRNA, size=int(num_to_generate * 1.2))
        values2 = np.random.randint(n_disease, size=int(num_to_generate * 1.2))
        for cand_miRNA, cand_disease in zip(values1, values2):
            key = (int(cand_miRNA), int(cand_disease))
            if key in taken:
                continue
            taken.add(key)
            neg_samples.append([cand_miRNA, cand_disease])
            labels.append(0)
            # Stops early only when the target count is a whole number.
            if len(labels) == size_of_batch + num_to_generate:
                break
    else:
        for i in range(n_miRNA):
            for j in range(n_disease):
                if (i, j) not in taken:
                    neg_samples.append([i, j])
                    labels.append(0)

    # Build a new list so the caller's positives are left untouched.
    new_pair_list = list(pos_samples) + neg_samples
    print('len new_pair_list: ', len(new_pair_list))
    return np.asarray(new_pair_list), np.asarray(labels)
def assoc_list_to_adj(n_disease, n_miRNA, assoc_list, assoc_lbl):
......@@ -194,13 +67,17 @@ def assoc_list_to_adj(n_disease, n_miRNA, assoc_list, assoc_lbl):
def get_score(y_test, preds):
    """Return ``(auc_score, aupr_score)`` for predictions against true labels.

    ``auc_score`` is the area under the curve returned by
    ``metrics.roc_curve``; ``aupr_score`` is the area under the curve returned
    by ``metrics.precision_recall_curve`` (recall on the x axis), both
    integrated with ``metrics.auc``.
    """
    false_pos_rate, true_pos_rate, _roc_thresholds = metrics.roc_curve(y_test, preds)
    precision, recall, _pr_thresholds = metrics.precision_recall_curve(y_test, preds)
    roc_area = metrics.auc(false_pos_rate, true_pos_rate)
    pr_area = metrics.auc(recall, precision)
    return roc_area, pr_area
def save_scores(y_preds, y_true, path):
    """Write predictions and targets side by side to a CSV file.

    Each output row is ``prediction,target``; the file has no header and no
    index column. Pairing uses ``zip``, so output stops at the shorter input.
    """
    rows = np.array([[score, label] for score, label in zip(y_preds, y_true)])
    pd.DataFrame(rows).to_csv(path, header=False, index=False)
def clf_eval_fold(X_train, y_train, X_test, y_test, score_save_path=None, classifier='linear'):
if classifier == 'rotation_forest':
clf = RotationForestClassifier(random_state=1337)
......@@ -232,19 +109,11 @@ def clf_eval_fold(X_train, y_train, X_test, y_test, score_save_path=None, classi
return auc_score, aupr_score
import pandas as pd
def adj_to_list(adj_path = './m-d.csv'):
    """Dump the 1-cells of a header-less adjacency CSV as two edge-list files.

    For every cell ``(i, j)`` equal to 1, ``./edge_list1.txt`` receives the
    line ``"i j"`` and ``./edge_list2.txt`` receives ``"i j+n"``, where ``n``
    is the number of rows in the matrix (so column indices are shifted past
    the row indices). Both files are written to the current working
    directory.

    The output files are now managed by a ``with`` block so they are closed
    even if writing fails part-way.
    """
    vals = pd.read_csv(adj_path, header=None).values.tolist()
    n_miRNA = len(vals)
    with open('./edge_list1.txt', 'w') as writer1, \
            open('./edge_list2.txt', 'w') as writer2:
        for i, val in enumerate(vals):
            for j, item in enumerate(val):
                if item == 1:
                    writer1.write(str(i) + ' ' + str(j) + '\n')
                    writer2.write(str(i) + ' ' + str(j + n_miRNA) + '\n')
# adj_to_list()
\ No newline at end of file
def standardize_dir(dir):
    """Return ``dir`` with a trailing separator, creating the directory if needed.

    A ``'/'`` is appended unless the path already ends with ``'/'`` or
    ``'\\'``. The directory, including any missing parent directories, is then
    created if it does not exist yet.

    ``os.makedirs(..., exist_ok=True)`` replaces the former exists-check plus
    ``os.mkdir``: it handles nested paths and does not raise when another
    process creates the same directory between the check and the creation.
    """
    res_dir = dir
    if not res_dir.endswith(('/', '\\')):
        res_dir += '/'
    os.makedirs(res_dir, exist_ok=True)
    return res_dir
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment