Commit cb8fd0da authored by Ngan Thi Dong
Browse files

update epmda evaluation

parent b6b5dd9b
......@@ -321,6 +321,8 @@ class Interactions:
randomIndex = list(range(self.numInteractions))
random.seed(ranseed)
random.shuffle(randomIndex)
pairs = [[key, value] for key, value in self.integerToNodes.items()]
edgesList = [[] for i in range(numFold)]
for i in range(numFold):
......@@ -347,7 +349,7 @@ class Interactions:
for edge in edgesList[j]:
arrays[i][0][edge[0] - 1][edge[1] - 1] = edge[2]
return arrays
return arrays, pairs
def clear(self):
"""
......@@ -519,6 +521,7 @@ class InteractionDataSource:
"""
content = self.content
for curLine in range(self.curLine, len(content)):
# print('curLine: ', curLine)
if len(content[curLine]) == 0:
continue
if -1 != content[curLine].find(u',') or -1 != content[curLine].find(u','):
......@@ -529,6 +532,7 @@ class InteractionDataSource:
while '' in conList:
conList.remove(u'')
# print('conList: ', conList)
for i in range(len(conList)):
conList[i] = conList[i].strip()
......@@ -536,6 +540,7 @@ class InteractionDataSource:
self.interactions.updateNodesIntegerMap(conList[0:2])
interaction = Interaction(self.interactions)
nodes = [self.interactions.nodesToInteger[conList[0]], self.interactions.nodesToInteger[conList[1]]]
# print('nodes: ', nodes, 'conList: ', conList, self.interactions.integerToNodes[nodes[0]], self.interactions.integerToNodes[nodes[1]])
interaction.setNodes(nodes)
if len(conList) == 3:
self.interactions.setWeighted(True)
......@@ -598,7 +603,7 @@ class InteractionDataSource:
class SimpleModel(object):
def __init__(self, trainArray, probeArray, directory, numFeature, classifier=None):
def __init__(self, trainArray, probeArray, directory, numFeature, classifier=None, imbalanced = True, seed = 123):
"""
Construction function. Init variables:
"""
......@@ -610,6 +615,8 @@ class SimpleModel(object):
self.directory = directory
self.classifier = classifier
self.EP = None
self.imbalanced = imbalanced
self.randseed = seed
def readFeatures(self, num):
......@@ -640,19 +647,45 @@ class SimpleModel(object):
testData = []
trainLabel = []
testLabel =[]
test_pairs = list()
test_indexes = list()
count_pos = 0
if not self.imbalanced:
negative_pool = list()
for rowIndex in range(self.numRow):
for colIndex in range(self.numCol):
if self.trainArray[rowIndex][colIndex] == 0:
negative_pool.append([rowIndex, colIndex])
else:
count_pos += 1
negative_indexes = list(range(len(negative_pool)))
random.seed(self.randseed)
random.shuffle(negative_indexes)
negative_training_data = [self.getFeatures(negative_pool[pind][0], negative_pool[pind][1]) for pind in negative_indexes[:count_pos]]
for rowIndex in range(self.numRow):
for colIndex in range(self.numCol):
features = self.getFeatures(rowIndex, colIndex)
trainData.append(features[:])
trainLabel.append(self.trainArray[rowIndex][colIndex])
if self.imbalanced:
trainData.append(features[:])
trainLabel.append(self.trainArray[rowIndex][colIndex])
elif self.trainArray[rowIndex][colIndex] != 0:
trainData.append(features[:])
trainLabel.append(self.trainArray[rowIndex][colIndex])
if self.trainArray[rowIndex][colIndex] == 0:
testData.append(features[:])
testLabel.append(self.probeArray[rowIndex][colIndex])
test_pairs.append([rowIndex, colIndex])
if not self.imbalanced:
trainData = trainData + negative_training_data
trainLabel = trainLabel + [0] * count_pos
return np.array(trainData), np.array(testData), np.array(trainLabel), np.array(testLabel)
return np.array(trainData), np.array(testData), np.array(trainLabel), np.array(testLabel), np.array(test_pairs)
def calAUC(trainArray, probeArray, fold, method, numFeatures, feature_dir, score_save_path = None):
def calAUC(trainArray, probeArray, fold, method, numFeatures, feature_dir, randseed=123, imbalance = True, score_save_path = None):
"""
:param trainArray:
......@@ -666,10 +699,10 @@ def calAUC(trainArray, probeArray, fold, method, numFeatures, feature_dir, score
numFeature = numFeatures
numHidden = 20
simpleModel = SimpleModel(trainArray, probeArray, feature_dir + "/", numFeature)
simpleModel = SimpleModel(trainArray, probeArray, feature_dir + "/", numFeature, imbalanced = imbalance, seed = randseed)
simpleModel.readFeatures(fold)
X_train, X_test, y_train, y_test = simpleModel.toNumArray()
X_train, X_test, y_train, y_test, test_pairs = simpleModel.toNumArray()
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
min_max_scaler = preprocessing.MinMaxScaler()
......@@ -694,9 +727,12 @@ def calAUC(trainArray, probeArray, fold, method, numFeatures, feature_dir, score
aupr_score = metrics.average_precision_score(y_test, values, average='micro')
print ("auc", auc_score, "aupr", aupr_score)
join_list = [(pred, target) for pred,target in zip(values, values)]
join_list = [(pred, target) for pred,target in zip(values, y_test)]
df = pd.DataFrame(np.array(join_list))
df.to_csv(score_save_path, header=False, index=False)
pair_save_path = score_save_path.replace('.csv', '_pair.csv')
pdf = pd.DataFrame(test_pairs)
pdf.to_csv(pair_save_path, index=False, header=False)
return auc_score, aupr_score
......@@ -719,49 +755,104 @@ if __name__ == '__main__':
args = parser.parse_args()
args.save_score = True if str(args.save_score) == 'True' else False
args.faulty = True if str(args.faulty) == 'True' else False
args.data_dir = standardize_dir(args.data_dir)
args.result_dir = standardize_dir(args.result_dir)
if args.save_score:
score_dir = standardize_dir(args.result_dir + str(args.randseed))
for method in methods:
numFold = args.numFold
numFeatures = args.numFeatures
write_path = args.result_dir + str(args.randseed) + '_' + method + '_epmda_'
if args.faulty:
write_path += 'faulty_'
write_path += args.sim_type + '_original_results.txt'
writer = open(write_path, 'w')
writer.write('Fold,AUC,AUPR\n')
auc_list = list()
aupr_list = list()
featuresPath = os.path.join(args.data_dir, str(args.randseed) + "features5FoldCV")
dataSource = InteractionDataSource(os.path.join(args.data_dir, "miRNA-disease.txt"), args.randseed)
sim_dir = args.fold_dir
interactions = dataSource.interactions
arrays = interactions.randomSplitTrainProbeArrary(numFold, ranseed=args.randseed)
print (len(os.listdir(featuresPath)))
# args.data_dir = standardize_dir(args.data_dir)
# args.result_dir = standardize_dir(args.result_dir)
args.randseed = int(args.randseed)
data_dirs = ['../simplifying_mirna_disease/epmda/data/hmdd2_numpy/']#,'../simplifying_mirna_disease/epmda/data/hmdd3_numpy/', '/home/dong/Desktop/miRNA-disease/simplifying_mirna_disease/epmda/data/faultyHmdd2_numpy/']
fold_dirs = ['folds_hmdd2/']#, 'folds/', 'folds_hmdd2/']
result_dirs = ['data/hmdd2/results_epmda/']#, 'data/hmdd3/results/', 'data/hmdd2/results_epmda/']
args.save_score = True
randseeds = [123, 456, 789, 101, 112]
for idir, dir in enumerate(data_dirs):
cur_seeds = randseeds
if dir.find('hmdd3') > 0:
cur_seeds = [456]
for seed in cur_seeds:
if dir.find('faulty') > 0:
args.faulty = True
args.data_dir = dir
args.fold_dir = fold_dirs[idir]
args.result_dir = result_dirs[idir]
args.randseed = seed
if idir == 1 and seed != 456:
continue
for i in range(numFold):
score_save_path = None
if args.save_score:
score_save_path = score_dir + str(i) + '_' + method + '_epmda_'
score_dir = standardize_dir(args.result_dir + str(args.randseed))
for method in methods:
numFold = args.numFold
numFeatures = args.numFeatures
write_path_pre = args.result_dir + str(args.randseed) + '_' + method + '_epmda_'
if args.faulty:
score_save_path += 'faulty_'
score_save_path += args.sim_type + '_original.csv'
auc, aupr = calAUC(arrays[i][0], arrays[i][1], i + 1, method, numFeatures, featuresPath, score_save_path)
auc_list.append(auc)
aupr_list.append(aupr)
writer.write(',' + str(round(auc, 4)) + ',' + str(round(aupr, 4)) + '\n')
auc_arr = np.array(auc_list)
aucmean = np.mean(auc_arr, axis=0).tolist()
aucstd = np.std(auc_arr, axis=0).tolist()
aupr_arr = np.array(aupr_list)
auprmean = np.mean(aupr_arr, axis=0).tolist()
auprstd = np.std(aupr_arr, axis=0).tolist()
writer.write('Average,' + str(round(aucmean,4)) + ' $\pm$ ' + str(round(aucstd, 4)) + ',' + str(round(auprmean,4)) + ' $\pm$ ' + str(round(auprstd, 4)) + '\n')
writer.close()
write_path_pre += 'faulty_'
featuresPath = os.path.join(args.data_dir, str(args.randseed) + "features5FoldCV")
dataSource = InteractionDataSource(os.path.join(args.data_dir, "miRNA-disease.txt"), args.randseed)
sim_dir = args.fold_dir
interactions = dataSource.interactions
write_path = write_path_pre + args.sim_type + '_imbalance_results.txt'
writer = open(write_path, 'w')
writer.write('Fold,AUC,AUPR\n')
auc_list = list()
aupr_list = list()
arrays, map_dict = interactions.randomSplitTrainProbeArrary(numFold, ranseed=args.randseed)
map_df = pd.DataFrame(np.array(map_dict))
map_df.to_csv(args.result_dir + str(args.randseed) + '_map_dict.csv', index=False, header=False)
print (len(os.listdir(featuresPath)))
for iFold in range(numFold):
score_save_path = None
if args.save_score:
score_save_path = score_dir + str(iFold) + '_' + method + '_epmda_'
if args.faulty:
score_save_path += 'faulty_'
score_save_path += args.sim_type + '_imbalance.csv'
auc, aupr = calAUC(arrays[iFold][0], arrays[iFold][1], iFold + 1, method, numFeatures, featuresPath, score_save_path=score_save_path, randseed=seed, imbalance=True)
auc_list.append(auc)
aupr_list.append(aupr)
writer.write(',' + str(round(auc, 4)) + ',' + str(round(aupr, 4)) + '\n')
auc_arr = np.array(auc_list)
aucmean = np.mean(auc_arr, axis=0).tolist()
aucstd = np.std(auc_arr, axis=0).tolist()
aupr_arr = np.array(aupr_list)
auprmean = np.mean(aupr_arr, axis=0).tolist()
auprstd = np.std(aupr_arr, axis=0).tolist()
writer.write('Average,' + str(round(aucmean,4)) + ' $\pm$ ' + str(round(aucstd, 4)) + ',' + str(round(auprmean,4)) + ' $\pm$ ' + str(round(auprstd, 4)) + '\n')
writer.close()
write_path = write_path_pre + args.sim_type + '_balance_results.txt'
writer = open(write_path, 'w')
writer.write('Fold,AUC,AUPR\n')
auc_list = list()
aupr_list = list()
arrays, _ = interactions.randomSplitTrainProbeArrary(numFold, ranseed=args.randseed)
print (len(os.listdir(featuresPath)))
for iFold in range(numFold):
score_save_path = None
if args.save_score:
score_save_path = score_dir + str(iFold) + '_' + method + '_epmda_'
if args.faulty:
score_save_path += 'faulty_'
score_save_path += args.sim_type + '_balance.csv'
auc, aupr = calAUC(arrays[iFold][0], arrays[iFold][1], iFold + 1, method, numFeatures, featuresPath, score_save_path=score_save_path, randseed=seed, imbalance=False)
auc_list.append(auc)
aupr_list.append(aupr)
writer.write(',' + str(round(auc, 4)) + ',' + str(round(aupr, 4)) + '\n')
auc_arr = np.array(auc_list)
aucmean = np.mean(auc_arr, axis=0).tolist()
aucstd = np.std(auc_arr, axis=0).tolist()
aupr_arr = np.array(aupr_list)
auprmean = np.mean(aupr_arr, axis=0).tolist()
auprstd = np.std(aupr_arr, axis=0).tolist()
writer.write('Average,' + str(round(aucmean,4)) + ' $\pm$ ' + str(round(aucstd, 4)) + ',' + str(round(auprmean,4)) + ' $\pm$ ' + str(round(auprstd, 4)) + '\n')
writer.close()
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
import numpy as np
from sklearn import metrics
import argparse
from utility.utils import *
# Seed NumPy's global random number generator once, at import time.
np.random.seed(1337)
# Classifier names evaluated for every fold; eval_dir opens one results file per entry.
classifiers = ['MLP', 'linear']
import os
def eval_fold(featue_directory, foldIdx, train_pair_list, train_lbl_list, test_pair_list, test_lbl_list, clf, score_save_path=None, numFeatures=7):
    """
    Evaluate one cross-validation fold with the classifier named by ``clf``.

    Loads ``feature<foldIdx>.npy`` from ``featue_directory``, gathers the first
    ``numFeatures`` values of ``features[pair[0]][pair[1]]`` for every train and
    test pair, and hands the resulting matrices to ``clf_eval_fold``.

    :param featue_directory: directory holding the per-fold feature files
                             (a trailing path separator is optional)
    :param foldIdx: fold number used in the feature file name
    :param train_pair_list: iterable of index pairs selecting the training rows
    :param train_lbl_list: labels matching ``train_pair_list``
    :param test_pair_list: iterable of index pairs selecting the test rows
    :param test_lbl_list: labels matching ``test_pair_list``
    :param clf: classifier identifier forwarded to ``clf_eval_fold``
    :param score_save_path: optional path forwarded to ``clf_eval_fold``
    :param numFeatures: number of leading features kept for each pair
    :return: the result of ``clf_eval_fold`` (callers unpack it as ``auc, aupr``),
             or ``None`` when the feature file does not exist
    """
    # Build the path once; os.path.join works with or without a trailing separator.
    feature_path = os.path.join(featue_directory, "feature" + str(foldIdx) + ".npy")
    if not os.path.exists(feature_path):
        return None

    features = np.load(feature_path)
    print('feature.shape: ', features.shape)

    X_train = np.array([features[pair[0]][pair[1]][:numFeatures] for pair in train_pair_list])
    X_test = np.array([features[pair[0]][pair[1]][:numFeatures] for pair in test_pair_list])
    return clf_eval_fold(X_train, train_lbl_list, X_test, test_lbl_list, score_save_path, classifier=clf)
def eval_dir(args):
    """
    Run every classifier in ``classifiers`` over all folds of one dataset and
    write the per-fold AUC / AUPR to one results file per classifier.

    :param args: parsed command-line namespace; reads ``data_dir``, ``fold_dir``,
                 ``randseed``, ``result_dir``, ``sim_type``, ``faulty``,
                 ``save_score`` and ``numFeatures``
    :return: None
    """
    data_dir, fold_dir, ranseed = args.data_dir, args.fold_dir, args.randseed
    result_dir = standardize_dir(args.result_dir)
    feature_dir = data_dir + str(ranseed) + 'features5FoldCV/'
    foldsData = load_fold_data(data_dir, fold_dir, ranseed)

    writers = dict()
    try:
        for clf in classifiers:
            result_name = str(ranseed) + '_' + clf + '_epmda_' + args.sim_type + '_balance_results.csv'
            if args.faulty:
                # str.replace returns a new string; the original discarded it, so
                # faulty runs silently overwrote the regular results file.
                # Only the file name is rewritten so the directory stays intact.
                result_name = result_name.replace('epmda', '_faulty')
            writers[clf] = open(result_dir + result_name, 'w')
            writers[clf].write('Fold,AUC,AUPR\n')

        for foldIdx, fold in enumerate(foldsData, start=1):
            train_pair_list, train_lbl_list, test_pair_list, test_lbl_list, _, _ = fold
            for clf in classifiers:
                score_save_path = None
                if args.save_score:
                    score_save_path = standardize_dir(result_dir + str(args.randseed)) + str(foldIdx) + '_' + clf + '_epmda_' + args.sim_type + '_balance.csv'
                    # Rename only when a path exists; None has no .replace().
                    if args.faulty:
                        score_save_path = score_save_path.replace('epmda', '_faulty')

                result = eval_fold(feature_dir, foldIdx, train_pair_list, train_lbl_list, test_pair_list, test_lbl_list, clf,
                                   score_save_path=score_save_path, numFeatures=args.numFeatures)
                if result is None:
                    # eval_fold returns None when the fold's feature file is missing;
                    # skip the row instead of failing on tuple unpacking.
                    print('fold', foldIdx, clf, 'produced no result, skipped')
                    continue
                auc, aupr = result
                writers[clf].write(',' + str(round(auc, 4)) + ',' + str(round(aupr, 4)) + '\n')
    finally:
        # Close every file that was opened, even if a fold raised.
        for writer in writers.values():
            writer.close()
if __name__ == '__main__':
    # Command-line entry point: parse the options, normalise the boolean flags
    # and directory paths, then evaluate the selected dataset.
    parser = argparse.ArgumentParser(description='EPMDA original evaluation method')
    parser.add_argument('--data_dir', default='/home/dong/Desktop/miRNA-disease/mirna-disease/epmda/data/hmdd2_numpy/', help='dataset directory')
    parser.add_argument('--fold_dir', default='data/hmdd2/folds/', help='fold directory')
    parser.add_argument('--result_dir', default='data/hmdd2/results/', help='saved result directory')
    parser.add_argument('--save_score', default=False, help='whether to save the predicted score or not')
    # Numeric options need an explicit type: without it a value given on the
    # command line stays a str and breaks the [:numFeatures] slice in eval_fold.
    parser.add_argument('--randseed', type=int, default=123, help='the random seed')
    parser.add_argument('--numFeatures', type=int, default=7, help='number of leading features used for each pair')
    parser.add_argument('--numFold', type=int, default=5, help='value of K for K-foldCV, default is 5')
    parser.add_argument('--neg_rate', type=float, default=1.0, help='the negative sampling rate')
    parser.add_argument('--faulty', default=False, help='Faulty calculation or not')
    parser.add_argument('--sim_type', default='gip', help='Type of input similarities')
    args = parser.parse_args()

    # Boolean flags arrive as text; only the literal 'True' enables them.
    args.save_score = str(args.save_score) == 'True'
    args.faulty = str(args.faulty) == 'True'
    args.data_dir = standardize_dir(args.data_dir)
    args.result_dir = standardize_dir(args.result_dir)
    eval_dir(args)
# hmdd2_dir_path = '/home/dong/EPMDA/data/hmdd2_numpy/'
# faulty_hmdd2_dir_path = '/home/dong/EPMDA/data/faultyHmdd2_numpy/'
# hmdd2_fold_path = '/home/dong/mirna-disease/folds_hmdd2/'
# hmdd3_dir_path = '/home/dong/EPMDA/data/hmdd3_numpy/'
# hmdd3_fold_path = '/home/dong/mirna-disease/folds/'
# hmdd2_func_path = '/home/dong/EPMDA/data/hmdd2_func/'
# hmdd2_seq_path = 'epmda/data/hmdd2_seq/'
# eval_dir(hmdd2_dir_path.replace('/home/dong/EPMDA/', './'), hmdd2_fold_path.replace('mirna-disease', 'Desktop/miRNA-disease/mirna-disease'))
# # eval_dir(hmdd2_func_path.replace('/home/dong/EPMDA/', './'), hmdd2_fold_path.replace('mirna-disease', 'Desktop/miRNA-disease/mirna-disease'))
# print('finish hmd2 func')
# # eval_dir(hmdd2_seq_path, hmdd2_fold_path.replace('mirna-disease', 'Desktop/miRNA-disease/mirna-disease'))
# print('finish HMDD2 seq')
# # eval_dir(hmdd3_dir_path.replace('/home/dong/EPMDA/', './'), hmdd3_fold_path.replace('mirna-disease', 'Desktop/miRNA-disease/mirna-disease'))
# print('finish hmdd3')
# # eval_dir(faulty_hmdd2_dir_path.replace('/home/dong/EPMDA/', './'), hmdd2_fold_path.replace('mirna-disease', 'Desktop/miRNA-disease/mirna-disease'))
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment