Commit 002582d7 authored by Ngan Thi Dong

add negative sampling based on one class clf

parent 54793bc9
......@@ -7,6 +7,7 @@ import numpy as np
from utility.utils import *
from data.preparation.GIP import *
from data.preparation.miRNA_sim import *
from utility.one_class_clf_negative_sampling import *
import argparse
......@@ -96,12 +97,16 @@ def gen_fold(data_dir, save_dir, numFold=5, negative_rate=1.0, randomseed=123):
save2File(train_miRNA_gip, saving_prefix + mirna_gip_path)
save2File(train_disease_gip, saving_prefix + disease_gip_path)
# get the negative sampling set of training set
train_pair_list, train_pair_lbl = negative_sampling(pos_train_pair_list, n_miRNA, n_disease, negative_rate)
test_pair_list = pos_test_pair_list
test_pair_list.extend(neg_assoc)
test_pair_lbl = test_lbl_list
test_pair_lbl.extend(neglist_lbl)
# get the negative sampling set of training set
train_neg_list = list(neg_assoc)
train_neg_list.extend(test_pair_list)
train_pair_list, train_pair_lbl = negative_sampling(pos_train_pair_list, train_neg_list, train_miRNA_gip, train_disease_gip)
# train_pair_list, train_pair_lbl = negative_sampling(pos_train_pair_list, n_miRNA, n_disease, negative_rate)
save2File(train_pair_list, saving_prefix + 'train.csv')
save2File(train_pair_lbl, saving_prefix + 'train_lbl.csv')
save2File(test_pair_list, saving_prefix + 'test.csv')
......@@ -114,41 +119,41 @@ def save2File(input, out_path):
df.to_csv(out_path, header=False, index=False)
def negative_sampling(pos_samples, n_miRNA, n_disease, negative_rate):
    """Append randomly drawn negative (miRNA, disease) pairs to the positives.

    Args:
        pos_samples: sequence of positive ``[miRNA_index, disease_index]``
            pairs. It is not modified.
        n_miRNA: number of miRNAs; miRNA indices are drawn from
            ``[0, n_miRNA)``.
        n_disease: number of diseases; disease indices are drawn from
            ``[0, n_disease)``.
        negative_rate: negatives to draw per positive. When it is ``<= 0``
            every pair that is not a positive is used as a negative.

    Returns:
        ``(pairs, labels)`` as numpy arrays: the positives followed by the
        negatives, labelled ``1`` and ``0`` respectively.
    """
    size_of_batch = len(pos_samples)
    print(size_of_batch)

    # Hashable copy of the positives: O(1) membership tests instead of
    # scanning the whole list for every candidate.
    seen = {(int(pair[0]), int(pair[1])) for pair in pos_samples}
    neg_samples = []

    if negative_rate > 0:
        # Truncate to an integer so the stop condition is reachable even
        # when size_of_batch * negative_rate is fractional.
        num_to_generate = int(size_of_batch * negative_rate)
        # Oversample by 20% to leave room for duplicates and collisions
        # with known positives.
        n_candidates = int(size_of_batch * negative_rate * 1.2)
        values1 = np.random.randint(n_miRNA, size=n_candidates)
        values2 = np.random.randint(n_disease, size=n_candidates)
        for candidate in zip(values1.tolist(), values2.tolist()):
            if len(neg_samples) >= num_to_generate:
                break
            if candidate in seen:
                continue
            seen.add(candidate)
            neg_samples.append(list(candidate))
    else:
        # Exhaustive mode: every non-positive pair, in row-major order.
        neg_samples = [
            [i, j]
            for i in range(n_miRNA)
            for j in range(n_disease)
            if (i, j) not in seen
        ]

    # Build a new list instead of extending the caller's pos_samples.
    new_pair_list = list(pos_samples) + neg_samples
    labels = [1] * size_of_batch + [0] * len(neg_samples)
    print('len new_pair_list: ', len(new_pair_list))
    return np.asarray(new_pair_list), np.asarray(labels)
# def negative_sampling(pos_samples, n_miRNA, n_disease, negative_rate):
# size_of_batch = len(pos_samples)
# print(size_of_batch)
# if negative_rate > 0:
# num_to_generate = size_of_batch * (negative_rate)
# values1 = np.random.randint(n_miRNA, size=int(num_to_generate * 1.2))
# values2 = np.random.randint(n_disease, size=int(num_to_generate * 1.2))
# labels = [1] * len(pos_samples)
# neg_samples = list()
# #
# # if train_pair_list != None:
# # print(train_pair_list)
# if negative_rate > 0:
# for i in range(len(values1)):
# pair = [values1[i], values2[i]]
# if pair in neg_samples or pair in pos_samples:
# continue
# neg_samples.append(pair)
# labels.append(0)
#
# if len(labels) == size_of_batch + num_to_generate:
# break
# else:
# for i in range(n_miRNA):
# for j in range(n_disease):
# pair = [i,j]
# if pair not in pos_samples:
# neg_samples.append(pair)
# labels.append(0)
#
# new_pair_list = pos_samples
# new_pair_list.extend(neg_samples)
# print('len new_pair_list: ', len(new_pair_list))
# # print('len(labels): ', len(labels))
# return np.asarray(new_pair_list), np.asarray(labels)
# random_seeds=[123,456,789,101,112]
# random_seeds = [123]
......@@ -161,12 +166,16 @@ def negative_sampling(pos_samples, n_miRNA, n_disease, negative_rate):
# gen_fold(hmdd3_dir, hmdd3_savedir, randomseed=randseed)
# Script entry point: read dataset/output locations and fold settings from the
# command line, then generate the cross-validation folds with gen_fold.
# NOTE(review): --data_dir, --save_dir and --randseed are each registered twice
# below (absolute-path defaults first, relative-path defaults second). argparse
# raises on a repeated option string, so only one set can remain -- presumably
# the later one; TODO confirm.
# NOTE(review): the description mentions virus-host PPI while the default paths
# point at an miRNA-disease dataset -- confirm the intended wording.
parser = argparse.ArgumentParser(description='Neural based matrix completion for virus-host PPI')
parser.add_argument('--data_dir', default='/home/dong/simplifying_mirna_disease/hmdd2/', help='dataset directory')
parser.add_argument('--save_dir', default='/home/dong/simplifying_mirna_disease/hmdd2/folds/', help='dataset directory')
parser.add_argument('--randseed', default=456, help='the random seed')
parser.add_argument('--data_dir', default='data/hmdd2/', help='dataset directory')
parser.add_argument('--save_dir', default='data/hmdd2/folds_oneclf/', help='dataset directory')
parser.add_argument('--randseed', default=123, help='the random seed')
# NOTE(review): no type= is given for the numeric options, so values supplied on
# the command line arrive as strings while the defaults stay int/float.
parser.add_argument('--numFold', default=5, help='value of K for K-foldCV, default is 5')
parser.add_argument('--neg_rate', default=1.0, help='the negative sampling rate')
args = parser.parse_args()
args.save_dir = standardize_dir(args.save_dir)
# One run with the parsed seed ...
gen_fold(args.data_dir, args.save_dir, args.numFold, args.neg_rate, args.randseed)
# ... followed by one run per fixed seed, overwriting args.randseed each time.
randseeds = [123,456,789,101,112]
for rand in randseeds:
args.randseed = rand
gen_fold(args.data_dir, args.save_dir, args.numFold, args.neg_rate, args.randseed)
"""
==========================================
One-class SVM for negative sampling
"""
print(__doc__)
import numpy as np
from sklearn import svm
import random
def getData(assoc_list, first_sim, second_sim):
    """Build one feature row per association pair.

    Each row is the ``first_sim`` entry indexed by the pair's first element
    followed by the ``second_sim`` entry indexed by its second element.
    Returns the rows stacked into a numpy array.
    """
    rows = [
        list(first_sim[assoc[0]]) + list(second_sim[assoc[1]])
        for assoc in assoc_list
    ]
    return np.array(rows)
def negative_sampling(pos_assoc_list, neg_assoc_list, first_sim, second_sim, neg_rate=1.0,
                      nu=0.1, kernel="rbf", gamma=0.1):
    """Pick negative training pairs with a one-class SVM.

    A one-class SVM is fitted on the feature rows of the positive pairs
    (built by ``getData``). Candidate pairs that the model predicts as ``-1``
    (outliers relative to the positives) form the negative pool, from which
    a random subset is drawn.

    Args:
        pos_assoc_list: positive association pairs ``[first_idx, second_idx]``.
        neg_assoc_list: candidate pairs to choose negatives from (basically
            the remaining possible pair combinations).
        first_sim: similarity matrix for the first node type (numpy array or
            nested list), indexed by ``first_idx``.
        second_sim: similarity matrix for the second node type, indexed by
            ``second_idx``.
        neg_rate: negatives requested per positive; the request is truncated
            to an integer and capped by the size of the negative pool.
        nu, kernel, gamma: forwarded to ``sklearn.svm.OneClassSVM``.

    Returns:
        ``(pairs, labels)`` as plain lists: a copy of the positives followed
        by the selected negatives, labelled ``1.0`` and ``0.0`` respectively.
    """
    print('Negative sampling with one-class SVM start....')
    new_train_list = list(pos_assoc_list)
    new_train_lbl = [1.0] * len(new_train_list)

    # Nothing to sample: skip the SVM entirely. This also avoids calling
    # predict() on an empty candidate matrix.
    n_neg_samples = int(neg_rate * len(new_train_list))
    if n_neg_samples <= 0 or len(neg_assoc_list) == 0:
        return new_train_list, new_train_lbl

    # Convert each similarity matrix to nested lists once and reuse it for
    # both feature matrices.
    first_rows = np.asarray(first_sim).tolist()
    second_rows = np.asarray(second_sim).tolist()
    train_data = getData(pos_assoc_list, first_rows, second_rows)
    test_data = getData(neg_assoc_list, first_rows, second_rows)
    print('train_data.shape: ', train_data.shape)
    print('test_data.shape: ', test_data.shape)

    clf = svm.OneClassSVM(nu=nu, kernel=kernel, gamma=gamma)
    clf.fit(train_data)
    y_pred_test = clf.predict(test_data)

    # Keep only the candidates the model considers unlike the positives.
    negative_pool = [
        pair
        for pair, pred in zip(neg_assoc_list, y_pred_test.tolist())
        if pred == -1
    ]

    # Shuffle positions and take the first k, so the draw is random and
    # without replacement.
    order = list(range(len(negative_pool)))
    random.shuffle(order)
    for index in order[:min(n_neg_samples, len(negative_pool))]:
        new_train_list.append(negative_pool[index])
        new_train_lbl.append(0.0)
    return new_train_list, new_train_lbl
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment