Commit c54a64a3 authored by Ngan Thi Dong's avatar Ngan Thi Dong
Browse files

Add the denovo and generalized models

parent 427f7917
import argparse
from sklearn.svm import SVC
from utils.utils import *
if __name__ == "__main__":
    # Train an RBF-kernel SVM on de novo (k-mer composition) sequence features
    # for virus-host PPI prediction, then report test-set performance.
    parser = argparse.ArgumentParser(description='Denovo model for virus-host PPI')
    parser.add_argument('--data_dir', default='data/h1n1/', help='dataset directory')
    parser.add_argument('--pos_train_path', default='pos_train_idx.csv', help='pos train path')
    parser.add_argument('--neg_train_path', default='neg_train_idx.csv', help='neg train path')
    parser.add_argument('--pos_test_path', default='pos_test_idx.csv', help='test path')
    parser.add_argument('--neg_test_path', default='neg_test_idx.csv', help='test label path')
    args = parser.parse_args()

    # Resolve every index-file path relative to the dataset directory.
    for attr in ('pos_train_path', 'pos_test_path', 'neg_train_path', 'neg_test_path'):
        setattr(args, attr, args.data_dir + getattr(args, attr))

    # Protein-index -> sequence lookup tables for virus and human proteins.
    vseq_dict = load_seq_dict(args.data_dir + 'virus_seq.csv')
    hseq_dict = load_seq_dict(args.data_dir + 'hprots_seq.csv')

    pos_train_features = get_denovo_feature(args.pos_train_path, vseq_dict, hseq_dict)
    print('finish loading pos train features')
    neg_train_features = get_denovo_feature(args.neg_train_path, vseq_dict, hseq_dict)
    print('finish loading neg train features')

    # Positives labelled 1, negatives 0, concatenated in that order.
    train_data = pos_train_features + neg_train_features
    train_lbl = [1] * len(pos_train_features) + [0] * len(neg_train_features)

    # probability=True is required for predict_proba below.
    clf = SVC(kernel='rbf', C=10, gamma=0.001, probability=True)
    print('Start training ...')
    clf.fit(train_data, train_lbl)
    print('Finish training ...')

    pos_test_features = get_denovo_feature(args.pos_test_path, vseq_dict, hseq_dict)
    print('finish loading pos test features')
    neg_test_features = get_denovo_feature(args.neg_test_path, vseq_dict, hseq_dict)
    print('finish loading neg test features')

    test_data = pos_test_features + neg_test_features
    test_lbl = [1] * len(pos_test_features) + [0] * len(neg_test_features)
    print('Train pairs: ', len(train_data), 'Test pairs: ', len(test_data))

    # Score each pair with the probability of the positive class.
    preds = clf.predict_proba(test_data)[:, 1]
    print('Finish testing ...')
    scores = get_score(test_lbl, preds)
    auc_score, aupr_score, sn, sp, acc, topk, precision, recall, f1 = scores
    print('Performance: ', auc_score, aupr_score, sn, sp, acc, topk, precision, recall, f1)
import argparse
from sklearn.svm import SVC
from multiprocessing import Process, Queue
from utils.utils import *
def eval_fold(args, res_queue, train_rate, iSet, vseq_dict, hseq_dict):
    """Train one de novo SVM on the iSet-th negative training set (sampled at
    train_rate) and evaluate it against every negative test rate / test set.

    Each result is pushed onto res_queue as
    [test_rate, auc, aupr, sn, sp, acc, topk, precision, recall, f1]
    and also appended to a per-fold CSV in args.data_dir.

    NOTE(review): relies on module-level pos_train_features /
    pos_test_features defined in the parent process — presumably inherited
    via fork; would fail under the 'spawn' start method.
    """
    # Negative training file naming scheme: <base>_<rate>_<set>.csv
    neg_train_path = args.neg_train_path.replace('.csv', '_'.join(['', str(train_rate), str(iSet)]) + '.csv')
    neg_train_features = get_denovo_feature(neg_train_path, vseq_dict, hseq_dict)
    print('finish loading neg train features')
    train_data = pos_train_features + neg_train_features
    train_lbl = [1] * len(pos_train_features) + [0] * len(neg_train_features)
    clf = SVC(kernel='rbf', C=10, gamma=0.001, probability=True)
    print('Start training ...')
    clf.fit(train_data, train_lbl)
    print('Finish training ...')
    writer_path = args.data_dir + 'denovo_trainrate_' + str(train_rate) + '_' + str(iSet) + '_res.csv'
    writer = open(writer_path, 'w')
    for test_rate in [1, 2, 5, 10]:
        # BUG FIX: iterate all 10 sampled test sets.  The previous range(1)
        # produced only 4 queue items while the parent collects 40 per fold
        # (4 test rates x 10 sets), blocking forever on Queue.get(); the
        # generalized-model eval_fold already uses range(10).
        for iTest in range(10):
            neg_test_path = args.neg_test_path.replace('.csv', '_'.join(['', str(test_rate), str(iTest)]) + '.csv')
            neg_test_features = get_denovo_feature(neg_test_path, vseq_dict, hseq_dict)
            # BUG FIX: message previously said 'neg train features'.
            print('finish loading neg test features')
            test_data = pos_test_features + neg_test_features
            test_lbl = [1] * len(pos_test_features) + [0] * len(neg_test_features)
            print('Train pairs: ', len(train_data), 'Test pairs: ', len(test_data))
            preds = clf.predict_proba(test_data)[:, 1]
            print('Finish testing ...')
            auc_score, aupr_score, sn, sp, acc, topk, precision, recall, f1 = get_score(test_lbl, preds)
            print('Performance: ', auc_score, aupr_score, sn, sp, acc, topk, precision, recall, f1)
            res_queue.put([test_rate, auc_score, aupr_score, sn, sp, acc, topk, precision, recall, f1])
            write_str = [str(item) for item in [train_rate, test_rate, iSet, iTest, auc_score, aupr_score, sn, sp, acc, topk, precision, recall, f1]]
            writer.write(','.join(write_str) + '\n')
    writer.close()
if __name__ == "__main__":
    # Evaluate the de novo SVM over multiple negative-sampling rates.  For
    # each training rate, 10 negative training sets are evaluated in parallel
    # worker processes; each worker reports 40 results (4 negative test rates
    # x 10 sampled test sets) through its own queue.
    parser = argparse.ArgumentParser(description='Denovo model for virus-host PPI')
    parser.add_argument('--data_dir', default='data/h1n1/', help='dataset directory')
    parser.add_argument('--pos_train_path', default='pos_train_idx.csv', help='pos train path')
    parser.add_argument('--neg_train_path', default='neg_train.csv', help='neg train path')
    parser.add_argument('--pos_test_path', default='pos_test_idx.csv', help='test path')
    parser.add_argument('--neg_test_path', default='neg_test.csv', help='test label path')
    parser.add_argument('--save_path', default='denovo.csv', help='path to the node feature')
    args = parser.parse_args()
    # Resolve every input path relative to the dataset directory.
    args.pos_train_path = args.data_dir + args.pos_train_path
    args.pos_test_path = args.data_dir + args.pos_test_path
    args.neg_train_path = args.data_dir + args.neg_train_path
    args.neg_test_path = args.data_dir + args.neg_test_path
    args.save_path = args.data_dir + 'denovo_emb'
    virus_seq_path = args.data_dir + 'virus_seq.csv'
    human_seq_path = args.data_dir + 'hprots_seq.csv'
    vseq_dict = load_seq_dict(virus_seq_path)
    hseq_dict = load_seq_dict(human_seq_path)
    # Positive features are loaded once here and read by the eval_fold
    # workers (inherited when the processes fork).
    pos_train_features = get_denovo_feature(args.pos_train_path, vseq_dict, hseq_dict)
    print('finish loading pos train features')
    pos_test_features = get_denovo_feature(args.pos_test_path, vseq_dict, hseq_dict)
    print('finish loading pos test features')
    for train_rate in [1, 2, 5, 10]:
        process_list = list()
        # BUG FIX: create one result bucket per negative *test* rate up front.
        # The previous lazy initialisation (performance_dict[i+1] for
        # i in [0,1,4,9]) raised KeyError, because results for every test
        # rate already arrive from the very first queue.
        performance_dict = {rate: list() for rate in [1, 2, 5, 10]}
        queue_dict = dict()  # one result queue per worker process
        for i in range(10):
            queue_dict[i] = Queue(40)  # 4 negative test rates, each with 10 sampled sets
        for iSet in range(10):  # run in parallel, each neg train set in its own process
            process = Process(target=eval_fold, args=(args, queue_dict[iSet], train_rate, iSet, vseq_dict, hseq_dict))
            process_list.append(process)
            process.start()
        for process in process_list:
            process.join()  # wait for all processes to finish
        for i in range(10):  # collect results from all workers
            # NOTE(review): assumes each worker put exactly 40 records;
            # Queue.get() blocks forever otherwise.
            res_list = [queue_dict[i].get() for _ in range(40)]
            for item in res_list:
                performance_dict[item[0]].append(item[1:])  # item[0] is the test rate
        # write results corresponding to each negative training rate
        writer = open(args.save_path + '_' + str(train_rate) + '_trainrate.csv', 'w')
        writer2 = open(args.save_path + '_' + str(train_rate) + '_trainrate_avg.csv', 'w')
        writer.write('train_rate,test_rate,auc_score, aupr_score, sn, sp, acc, topk, precision, recall, f1\n')
        writer2.write('train_rate,test_rate,auc_score, aupr_score, sn, sp, acc, topk, precision, recall, f1\n')
        for test_rate in performance_dict.keys():
            for rec in performance_dict[test_rate]:
                str_rec = [str(item) for item in rec]
                writer.write(str(train_rate) + ',' + str(test_rate) + ',' + ','.join(str_rec) + '\n')
            print('-' * 30)
            print('Train_rate: ', train_rate, ', test_rate: ', test_rate)
            # BUG FIX: performance_dict is keyed by test rate only; the old
            # performance_dict[train_rate][test_rate] indexed into a record
            # list instead of selecting the bucket.
            arr = np.array(performance_dict[test_rate])
            print('all_scores: ', arr)
            mean = np.mean(arr, axis=0)
            std = np.std(arr, axis=0)
            str_avg = [str(m) + '+-' + str(s) for m, s in zip(mean, std)]
            writer2.write(str(train_rate) + ',' + str(test_rate) + ',' + ','.join(str_avg) + '\n')
            print('Mean auc_score, aupr_score, sn, sp, acc, topk:')
            print(mean)
            print('Std auc_score, aupr_score, sn, sp, acc, topk:')
            print(std)
        writer.close()
        writer2.close()
from utils.generalized_model import *
import argparse
from sklearn.svm import SVC
from utils.utils import *
if __name__ == "__main__":
    # Train an RBF-kernel SVM on "generalized" sequence features for
    # virus-host PPI prediction and report test-set performance.
    parser = argparse.ArgumentParser(description='Generalized model for virus-host PPI')
    parser.add_argument('--data_dir', default='data/h1n1/', help='dataset directory')
    parser.add_argument('--pos_train_path', default='pos_train_idx.csv', help='pos train path')
    parser.add_argument('--neg_train_path', default='neg_train.csv', help='neg train path')
    parser.add_argument('--pos_test_path', default='pos_test_idx.csv', help='test path')
    parser.add_argument('--neg_test_path', default='neg_test.csv', help='test label path')
    args = parser.parse_args()
    # Resolve every index-file path relative to the dataset directory.
    args.pos_train_path = args.data_dir + args.pos_train_path
    args.pos_test_path = args.data_dir + args.pos_test_path
    args.neg_train_path = args.data_dir + args.neg_train_path
    args.neg_test_path = args.data_dir + args.neg_test_path
    virus_seq_path = args.data_dir + 'virus_seq.csv'
    human_seq_path = args.data_dir + 'hprots_seq.csv'
    vseq_dict = load_seq_dict(virus_seq_path)
    hseq_dict = load_seq_dict(human_seq_path)
    pos_train_features = get_generalized_feature(args.pos_train_path, vseq_dict, hseq_dict)
    print('finish loading pos train features')
    neg_train_features = get_generalized_feature(args.neg_train_path, vseq_dict, hseq_dict)
    print('finish loading neg train features')
    # Positives labelled 1, negatives 0, concatenated in that order.
    train_data = pos_train_features + neg_train_features
    train_lbl = [1] * len(pos_train_features) + [0] * len(neg_train_features)
    clf = SVC(kernel='rbf', C=32, gamma=0.03125, probability=True)
    print('Start training ...')
    clf.fit(train_data, train_lbl)
    print('Finish training ...')
    pos_test_features = get_generalized_feature(args.pos_test_path, vseq_dict, hseq_dict)
    print('finish loading pos test features')
    neg_test_features = get_generalized_feature(args.neg_test_path, vseq_dict, hseq_dict)
    # BUG FIX: message previously said 'neg train features'.
    print('finish loading neg test features')
    test_data = pos_test_features + neg_test_features
    test_lbl = [1] * len(pos_test_features) + [0] * len(neg_test_features)
    print('Train pairs: ', len(train_data), 'Test pairs: ', len(test_data))
    # Score each pair with the probability of the positive class.
    preds = clf.predict_proba(test_data)[:, 1]
    print('Finish testing ...')
    auc_score, aupr_score, sn, sp, acc, topk, precision, recall, f1 = get_score(test_lbl, preds)
    print('Performance: ', auc_score, aupr_score, sn, sp, acc, topk, precision, recall, f1)
from utils.generalized_model import *
import argparse
from sklearn.svm import SVC
from multiprocessing import Process, Queue
from utils.utils import *
def eval_fold(args, res_queue, train_rate, iSet, vseq_dict, hseq_dict):
    """Train one generalized-feature SVM on the iSet-th negative training set
    (sampled at train_rate) and evaluate it against every negative test rate /
    test set.

    Each result is pushed onto res_queue as
    [test_rate, auc, aupr, sn, sp, acc, topk, precision, recall, f1]
    and also appended to a per-fold CSV in args.data_dir; aggregation into
    performance_dict is done by the parent process from the queue.

    NOTE(review): relies on module-level pos_train_features /
    pos_test_features defined in the parent process — presumably inherited
    via fork; would fail under the 'spawn' start method.
    """
    # Negative training file naming scheme: <base>_<rate>_<set>.csv
    neg_train_path = args.neg_train_path.replace('.csv', '_'.join(['', str(train_rate), str(iSet)]) + '.csv')
    neg_train_features = get_generalized_feature(neg_train_path, vseq_dict, hseq_dict)
    print('finish loading neg train features')
    train_data = pos_train_features + neg_train_features
    train_lbl = [1] * len(pos_train_features) + [0] * len(neg_train_features)
    clf = SVC(kernel='rbf', C=32, gamma=0.03125, probability=True)
    print('Start training ...')
    clf.fit(train_data, train_lbl)
    print('Finish training ...')
    writer_path = args.data_dir + 'generalized_trainrate_' + str(train_rate) + '_' + str(iSet) + '_res.csv'
    writer = open(writer_path, 'w')
    for test_rate in [1, 2, 5, 10]:
        # BUG FIX: removed the mutation of performance_dict here.  That dict
        # lives in the parent process (a child only sees the empty copy taken
        # at fork time, so performance_dict[train_rate] raised KeyError) and
        # the parent already aggregates results from res_queue.
        for iTest in range(10):
            neg_test_path = args.neg_test_path.replace('.csv', '_'.join(['', str(test_rate), str(iTest)]) + '.csv')
            neg_test_features = get_generalized_feature(neg_test_path, vseq_dict, hseq_dict)
            # BUG FIX: message previously said 'neg train features'.
            print('finish loading neg test features')
            test_data = pos_test_features + neg_test_features
            test_lbl = [1] * len(pos_test_features) + [0] * len(neg_test_features)
            print('Train pairs: ', len(train_data), 'Test pairs: ', len(test_data))
            preds = clf.predict_proba(test_data)[:, 1]
            print('Finish testing ...')
            auc_score, aupr_score, sn, sp, acc, topk, precision, recall, f1 = get_score(test_lbl, preds)
            print('Performance: ', auc_score, aupr_score, sn, sp, acc, topk, precision, recall, f1)
            res_queue.put([test_rate, auc_score, aupr_score, sn, sp, acc, topk, precision, recall, f1])
            write_str = [str(item) for item in [train_rate, test_rate, iSet, iTest, auc_score, aupr_score, sn, sp, acc, topk, precision, recall, f1]]
            writer.write(','.join(write_str) + '\n')
    writer.close()
if __name__ == "__main__":
    # Evaluate the generalized-feature SVM over multiple negative-sampling
    # rates.  For each training rate, 10 negative training sets are evaluated
    # in parallel worker processes; each worker reports 40 results (4 negative
    # test rates x 10 sampled test sets) through its own queue.
    parser = argparse.ArgumentParser(description='Generalized model for virus-host PPI')
    parser.add_argument('--data_dir', default='data/h1n1/', help='dataset directory')
    parser.add_argument('--pos_train_path', default='pos_train_idx.csv', help='pos train path')
    parser.add_argument('--neg_train_path', default='neg_train.csv', help='neg train path')
    parser.add_argument('--pos_test_path', default='pos_test_idx.csv', help='test path')
    parser.add_argument('--neg_test_path', default='neg_test.csv', help='test label path')
    parser.add_argument('--save_path', default='generalized.csv', help='path to the node feature')
    args = parser.parse_args()
    # Resolve every input path relative to the dataset directory.
    args.pos_train_path = args.data_dir + args.pos_train_path
    args.pos_test_path = args.data_dir + args.pos_test_path
    args.neg_train_path = args.data_dir + args.neg_train_path
    args.neg_test_path = args.data_dir + args.neg_test_path
    # NOTE(review): trailing '_' plus the '_' added when composing file names
    # yields 'generalized__<rate>_...'; kept as-is for output compatibility.
    args.save_path = args.data_dir + 'generalized_'
    virus_seq_path = args.data_dir + 'virus_seq.csv'
    human_seq_path = args.data_dir + 'hprots_seq.csv'
    vseq_dict = load_seq_dict(virus_seq_path)
    hseq_dict = load_seq_dict(human_seq_path)
    # Positive features are loaded once here and read by the eval_fold
    # workers (inherited when the processes fork).
    pos_train_features = get_generalized_feature(args.pos_train_path, vseq_dict, hseq_dict)
    print('finish loading pos train features')
    pos_test_features = get_generalized_feature(args.pos_test_path, vseq_dict, hseq_dict)
    print('finish loading pos test features')
    for train_rate in [1, 2, 5, 10]:
        process_list = list()
        # BUG FIX: create one result bucket per negative *test* rate up front.
        # The previous lazy initialisation (performance_dict[i+1] for
        # i in [0,1,4,9]) raised KeyError, because results for every test
        # rate already arrive from the very first queue.
        performance_dict = {rate: list() for rate in [1, 2, 5, 10]}
        queue_dict = dict()  # one result queue per worker process
        for i in range(10):
            queue_dict[i] = Queue(40)  # 4 negative test rates, each with 10 sampled sets
        for iSet in range(10):  # run in parallel, each neg train set in its own process
            process = Process(target=eval_fold, args=(args, queue_dict[iSet], train_rate, iSet, vseq_dict, hseq_dict))
            process_list.append(process)
            process.start()
        for process in process_list:
            process.join()  # wait for all processes to finish
        for i in range(10):  # collect results from all workers
            # NOTE(review): assumes each worker put exactly 40 records;
            # Queue.get() blocks forever otherwise.
            res_list = [queue_dict[i].get() for _ in range(40)]
            for item in res_list:
                performance_dict[item[0]].append(item[1:])  # item[0] is the test rate
        # write results corresponding to each negative training rate
        writer = open(args.save_path + '_' + str(train_rate) + '_trainrate.csv', 'w')
        writer2 = open(args.save_path + '_' + str(train_rate) + '_trainrate_avg.csv', 'w')
        writer.write('train_rate,test_rate,auc_score, aupr_score, sn, sp, acc, topk, precision, recall, f1\n')
        writer2.write('train_rate,test_rate,auc_score, aupr_score, sn, sp, acc, topk, precision, recall, f1\n')
        for test_rate in performance_dict.keys():
            for rec in performance_dict[test_rate]:
                str_rec = [str(item) for item in rec]
                writer.write(str(train_rate) + ',' + str(test_rate) + ',' + ','.join(str_rec) + '\n')
            print('-' * 30)
            print('Train_rate: ', train_rate, ', test_rate: ', test_rate)
            # BUG FIX: performance_dict is keyed by test rate only; the old
            # performance_dict[train_rate][test_rate] indexed into a record
            # list instead of selecting the bucket.
            arr = np.array(performance_dict[test_rate])
            print('all_scores: ', arr)
            mean = np.mean(arr, axis=0)
            std = np.std(arr, axis=0)
            str_avg = [str(m) + '+-' + str(s) for m, s in zip(mean, std)]
            writer2.write(str(train_rate) + ',' + str(test_rate) + ',' + ','.join(str_avg) + '\n')
            print('Mean auc_score, aupr_score, sn, sp, acc, topk:')
            print(mean)
            print('Std auc_score, aupr_score, sn, sp, acc, topk:')
            print(std)
        writer.close()
        writer2.close()
import torch as t
from torch import nn
import numpy as np
class Model(nn.Module):
    """Multi-task network: virus-host association prediction plus a human-human
    PPI auxiliary head, sharing the same learned protein encodings.

    Each protein index is embedded (seq_emb_dim), projected to hidden_dim and
    passed through ReLU; a pair score is the sigmoid of a linear layer applied
    to the element-wise product of the two encodings.
    """

    def __init__(self, n_vprot, n_hprot, hidden_dim, seq_emb_dim=1900):
        super(Model, self).__init__()
        self.n_vprot = n_vprot
        self.n_hprot = n_hprot
        self.hidden_dim = hidden_dim
        self.seq_emb_dim = seq_emb_dim
        # Layer creation order is deliberately preserved: together with
        # init_parameters it fixes the order in which the RNG is consumed.
        self.vemb = nn.Embedding(n_vprot, seq_emb_dim)
        self.hemb = nn.Embedding(n_hprot, seq_emb_dim)
        self.vlinear = nn.Linear(seq_emb_dim, hidden_dim)
        self.hlinear = nn.Linear(seq_emb_dim, hidden_dim)
        self.assoc_clf = nn.Linear(hidden_dim, 1)
        self.ppi_clf = nn.Linear(hidden_dim, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.init_parameters()

    def init_parameters(self):
        """Re-initialise all weights uniformly in +-1/sqrt(fan_in); zero biases."""
        bound_emb = 1.0 / np.sqrt(self.seq_emb_dim)
        bound_hid = 1.0 / np.sqrt(self.hidden_dim)
        self.assoc_clf.weight.data.uniform_(-bound_hid, bound_hid)
        self.assoc_clf.bias.data.fill_(0)
        self.ppi_clf.weight.data.uniform_(-bound_hid, bound_hid)
        self.ppi_clf.bias.data.fill_(0)
        self.vlinear.weight.data.uniform_(-bound_emb, bound_emb)
        self.vlinear.bias.data.fill_(0)
        self.hlinear.weight.data.uniform_(-bound_emb, bound_emb)
        self.hlinear.bias.data.fill_(0)
        self.vemb.weight.data.uniform_(-bound_emb, bound_emb)
        self.hemb.weight.data.uniform_(-bound_emb, bound_emb)

    def _encode(self, vindex_tensor, hindex_tensor):
        # Shared encoder: embedding -> linear projection -> ReLU,
        # applied independently to the virus and human protein indices.
        virus_hidden = self.relu(self.vlinear(self.vemb(vindex_tensor)))
        human_hidden = self.relu(self.hlinear(self.hemb(hindex_tensor)))
        return virus_hidden, human_hidden

    def forward(self, vindex_tensor, hindex_tensor, vh_pairs, hppi_pairs):
        """Score virus-host pairs and human-human PPI pairs in one pass.

        vh_pairs / hppi_pairs hold (row, col) index pairs into the encoded
        virus and human tensors; returns two squeezed probability tensors.
        """
        virus_hidden, human_hidden = self._encode(vindex_tensor, hindex_tensor)
        assoc_feat = virus_hidden[vh_pairs[:, 0]] * human_hidden[vh_pairs[:, 1]]
        ppi_feat = human_hidden[hppi_pairs[:, 0]] * human_hidden[hppi_pairs[:, 1]]
        assoc_out = self.sigmoid(self.assoc_clf(assoc_feat))
        hhp_out = self.sigmoid(self.ppi_clf(ppi_feat))
        return assoc_out.squeeze(), hhp_out.squeeze()

    def infer(self, vindex_tensor, hindex_tensor, vh_pairs):
        """Score virus-host pairs only (inference path, no PPI head)."""
        virus_hidden, human_hidden = self._encode(vindex_tensor, hindex_tensor)
        assoc_feat = virus_hidden[vh_pairs[:, 0]] * human_hidden[vh_pairs[:, 1]]
        return self.sigmoid(self.assoc_clf(assoc_feat)).squeeze()
class STT(nn.Module):
    """Single-task variant of the association model: scores virus-host pairs
    only, without the human-human PPI auxiliary head.

    Each protein index is embedded (seq_emb_dim), projected to hidden_dim and
    passed through ReLU; a pair score is the sigmoid of a linear layer applied
    to the element-wise product of the two encodings.
    """

    def __init__(self, n_vprot, n_hprot, hidden_dim, seq_emb_dim=1900):
        super(STT, self).__init__()
        self.n_vprot = n_vprot
        self.n_hprot = n_hprot
        self.hidden_dim = hidden_dim
        self.seq_emb_dim = seq_emb_dim
        # Layer creation order is deliberately preserved: together with
        # init_parameters it fixes the order in which the RNG is consumed.
        self.vemb = nn.Embedding(n_vprot, seq_emb_dim)
        self.hemb = nn.Embedding(n_hprot, seq_emb_dim)
        self.vlinear = nn.Linear(seq_emb_dim, hidden_dim)
        self.hlinear = nn.Linear(seq_emb_dim, hidden_dim)
        self.assoc_clf = nn.Linear(hidden_dim, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.init_parameters()

    def init_parameters(self):
        """Re-initialise all weights uniformly in +-1/sqrt(fan_in); zero biases."""
        bound_emb = 1.0 / np.sqrt(self.seq_emb_dim)
        bound_hid = 1.0 / np.sqrt(self.hidden_dim)
        self.assoc_clf.weight.data.uniform_(-bound_hid, bound_hid)
        self.assoc_clf.bias.data.fill_(0)
        self.vlinear.weight.data.uniform_(-bound_emb, bound_emb)
        self.vlinear.bias.data.fill_(0)
        self.hlinear.weight.data.uniform_(-bound_emb, bound_emb)
        self.hlinear.bias.data.fill_(0)
        self.vemb.weight.data.uniform_(-bound_emb, bound_emb)
        self.hemb.weight.data.uniform_(-bound_emb, bound_emb)

    def forward(self, vindex_tensor, hindex_tensor, vh_pairs):
        """Return squeezed association probabilities for the given index pairs."""
        virus_hidden = self.relu(self.vlinear(self.vemb(vindex_tensor)))
        human_hidden = self.relu(self.hlinear(self.hemb(hindex_tensor)))
        pair_feat = virus_hidden[vh_pairs[:, 0]] * human_hidden[vh_pairs[:, 1]]
        return self.sigmoid(self.assoc_clf(pair_feat)).squeeze()
......@@ -76,16 +76,14 @@ def get_all_feat(hseq, vseq):
cur_feat.extend(vdis)
return cur_feat
def cal_feature(data_path):
    """Build one feature vector per interaction row read from *data_path*.

    Columns 2 and 3 of the CSV hold the human and virus sequences
    respectively; each row is turned into a feature vector via get_all_feat.
    """
    rows = pd.read_csv(data_path).values.tolist()
    return [get_all_feat(row[2], row[3]) for row in rows]
def get_generalized_feature(file_path, vseq_dict, hseq_dict):
    """Build generalized feature vectors for the (virus, human) index pairs
    listed in *file_path*.

    Column 0 of the CSV indexes vseq_dict (virus sequence), column 1 indexes
    hseq_dict (human sequence); each pair is featurised with get_all_feat.
    """
    pairs = pd.read_csv(file_path).values.tolist()
    features = list()
    for pair in pairs:
        features.append(get_all_feat(hseq_dict[pair[1]], vseq_dict[pair[0]]))
    return features
......
import os
import random
from utils.kmer import *
import pandas as pd
import numpy as np
from sklearn import metrics
......@@ -14,6 +14,35 @@ def standardize_dir(dir):
os.mkdir(res_dir)
return res_dir
def load_seq_dict(file_path):
    """Read a header-less two-column CSV and return {row_index: sequence}.

    The sequence is taken from the second column; keys are 0-based row
    positions, matching the index files used elsewhere in this project.
    """
    rows = pd.read_csv(file_path, header=None).values.tolist()
    return {idx: row[1] for idx, row in enumerate(rows)}
def _min_max_scale(values):
    """Min-max normalise a numeric sequence to [0, 1].

    BUG FIX: a constant vector (max == min) previously raised
    ZeroDivisionError; it now maps to all zeros.
    """
    lo = min(values)
    hi = max(values)
    span = hi - lo
    if span == 0:
        return [0.0 for _ in values]
    return [(item - lo) / span for item in values]


def get_denovo_feature(file_path, vseq_dict, hseq_dict):
    """Build de novo feature vectors for the (virus, human) index pairs in
    *file_path*.

    Column 0 of the CSV indexes vseq_dict, column 1 indexes hseq_dict.  Each
    sequence is converted to its k-mer composition (kmerFE), min-max
    normalised per sequence, and the virus and human vectors are concatenated
    (virus first) into one feature list per pair.
    """
    df = pd.read_csv(file_path).values.tolist()
    data = list()
    kmer_obj = kmerFE()  # one extractor reused for every sequence
    for p in df:
        vseq = vseq_dict[p[0]]
        hseq = hseq_dict[p[1]]
        hkmer = kmer_obj.kmer_composition(hseq)
        vkmer = kmer_obj.kmer_composition(vseq)
        data.append(_min_max_scale(vkmer) + _min_max_scale(hkmer))
    return data
def get_threshold(targets, preds):
if type(targets) != list:
targets = targets.tolist()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment