Commit 427f7917 authored by Ngan Thi Dong

add code for mtt and stt

parent cf58497b
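All three scripts in this commit build a `Model` from `model.py`, which is not part of this excerpt. As a reading aid, this is roughly the interface the call sites assume (a sketch only: the layer types and widths are inferred from usage, not the actual implementation):

import torch as t

class Model(t.nn.Module):
    # Sketch of the assumed interface; the real model.py may differ.
    def __init__(self, n_virus, n_human, hid):
        super().__init__()
        # Embedding tables; the scripts overwrite .weight.data with pre-trained
        # sequence embeddings, so the real width must match that feature dimension.
        self.vemb = t.nn.Embedding(n_virus, hid)
        self.hemb = t.nn.Embedding(n_human, hid)

    def forward(self, vindex, hindex, int_edges, hppi_edges):
        # Returns (interaction scores in [0, 1], predicted human-PPI edge weights);
        # the first feeds BCELoss, the second MSELoss against the edge weights.
        raise NotImplementedError

    def infer(self, vindex, hindex, int_edges):
        # Returns interaction scores only, used for validation/test evaluation.
        raise NotImplementedError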
import argparse
import os
import random
import numpy as np
import pandas as pd
import torch as t  # the script uses torch as `t`; assuming the star imports below don't provide it
from sklearn.model_selection import train_test_split
from model import *
from utils.utils import *
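# Helpers expected from utils.utils (signatures inferred from the call sites in this
# commit; not confirmed by the excerpt):
#   standardize_dir(d)            -> directory path with a trailing separator
#   get_score(y_true, y_pred, K)  -> (auc, aupr, sn, sp, acc, topk, precision, recall, f1)
#   get_score2(y_true, y_pred, K) -> (auc, aupr)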
def read_int_data(pos_path, neg_path):
    """Read positive/negative interaction pairs; return (positive pairs, edge tensor, label tensor)."""
    pos_df = pd.read_csv(pos_path).values.tolist()
    neg_df = pd.read_csv(neg_path).values.tolist()
    int_edges = pos_df + neg_df
    int_lbl = [1] * len(pos_df) + [0] * len(neg_df)
    return pos_df, t.LongTensor(int_edges), t.FloatTensor(int_lbl)
def read_train_data(pos_path, neg_path, fixval=False):
    """Read training pairs, subsample negatives to at most 10x the positives, and build a validation set."""
    pos_df = pd.read_csv(pos_path).values.tolist()
    pos_train, pos_val = train_test_split(pos_df, test_size=0.1, random_state=42)
    neg_df = pd.read_csv(neg_path).values.tolist()
    indexes = list(range(len(neg_df)))
    random.shuffle(indexes)
    selected_indexes = indexes[:int(10 * len(pos_df))]
    neg_df = [neg_df[item] for item in selected_indexes]
    neg_train, neg_val = train_test_split(neg_df, test_size=0.1, random_state=42)
    train_data = pos_train + neg_train
    train_lbl = [1] * len(pos_train) + [0] * len(neg_train)
    if not fixval:
        val_data = pos_val + neg_val
        val_lbl = [1] * len(pos_val) + [0] * len(neg_val)
        val_tensor = t.LongTensor(val_data)
        val_lbl_tensor = t.FloatTensor(val_lbl)
    else:
        # use the fixed validation split stored alongside the training files
        val_data, val_tensor, val_lbl_tensor = read_int_data(pos_path.replace('train', 'val'),
                                                             neg_path.replace('train', 'val'))
    return train_data, val_data, t.LongTensor(train_data), t.FloatTensor(train_lbl), val_tensor, val_lbl_tensor
def save_model(model, save_path):
    t.save(model.state_dict(), save_path)

def load_model(model, model_path):
    model.load_state_dict(t.load(model_path))
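# Assumed CSV layouts (inferred from how the files are consumed below):
#   pos_*_idx.csv / neg_*_idx.csv: one (virus_index, human_index) pair per row
#   hppi_edgelist.csv:   columns p1, p2 holding human protein indices
#   hppi_edgeweight.csv: column score holding the edge confidence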
def main():
    parser = argparse.ArgumentParser(description='Multitask transfer model for novel virus-human PPI')
    parser.add_argument('--n_runs', type=int, default=10, metavar='N',
                        help='number of experiment runs')
    parser.add_argument('--data_dir', default='data/h1n1/', help='dataset directory')
    parser.add_argument('--virus_feature_path', default='virus_seq_1900emb.csv', help='virus feature path')
    parser.add_argument('--human_feature_path', default='human_features.csv', help='human feature path')
    parser.add_argument('--hppi_edge_list', default='hppi_edgelist.csv', help='human PPI edge list path')
    parser.add_argument('--hppi_edge_weight', default='hppi_edgeweight.csv', help='human PPI edge weight path')
    parser.add_argument('--pos_train_path', default='pos_train_idx.csv', help='pos train path')
    parser.add_argument('--pos_test_path', default='pos_test_idx.csv', help='pos test path')
    parser.add_argument('--neg_train_path', default='neg_train_idx.csv', help='neg train path')
    parser.add_argument('--neg_test_path', default='neg_test_idx.csv', help='neg test path')
    parser.add_argument('--fixval', default=False, help='use the fixed validation set or not')
    args = parser.parse_args()
    args.data_dir = standardize_dir(args.data_dir)
    negname = 'mtt_' + args.neg_test_path.replace('.csv', '')
    args.virus_feature_path = args.data_dir + args.virus_feature_path
    args.human_feature_path = args.data_dir + args.human_feature_path
    args.hppi_edge_list = args.data_dir + args.hppi_edge_list
    args.hppi_edge_weight = args.data_dir + args.hppi_edge_weight
    args.pos_train_path = args.data_dir + args.pos_train_path
    args.pos_test_path = args.data_dir + args.pos_test_path
    args.neg_train_path = args.data_dir + args.neg_train_path
    args.neg_test_path = args.data_dir + args.neg_test_path
    args.n_runs = int(args.n_runs)
    # bool('False') is True, so compare the string form instead of casting
    args.fixval = str(args.fixval) == 'True'
    human_features = pd.read_csv(args.human_feature_path, header=None).values
    human_features = t.FloatTensor(human_features)
    virus_features = t.FloatTensor(pd.read_csv(args.virus_feature_path, header=None).values)
    n_virus = virus_features.size(0)
    n_human = human_features.size(0)
    print('Finished loading features')
    hppi_edgeweight = pd.read_csv(args.hppi_edge_weight).values
    hppi_edgelist = pd.read_csv(args.hppi_edge_list).values
    hppi_edgeweight = t.FloatTensor(hppi_edgeweight)
    hppi_edgelist = t.LongTensor(hppi_edgelist)
    print('Finished loading human PPI')
    vindex_tensor = t.LongTensor(list(range(n_virus)))
    hindex_tensor = t.LongTensor(list(range(n_human)))
    pos_train_pairs, val_data, train_tensor, train_lbl_tensor, val_tensor, val_lbl_tensor = read_train_data(
        args.pos_train_path, args.neg_train_path, args.fixval)
    _, test_tensor, test_lbl_tensor = read_int_data(args.pos_test_path, args.neg_test_path)
    test_lbl = test_lbl_tensor.numpy()
    val_lbl = val_lbl_tensor.numpy()
    print('Finished loading interaction pairs')
    hppi_edgeweight = hppi_edgeweight.view(-1)
    criterion = t.nn.BCELoss()
    criterion2 = t.nn.MSELoss()
    if t.cuda.is_available():
        hppi_edgelist = hppi_edgelist.cuda()
        hppi_edgeweight = hppi_edgeweight.cuda()
        vindex_tensor = vindex_tensor.cuda()
        hindex_tensor = hindex_tensor.cuda()
        virus_features = virus_features.cuda()
        human_features = human_features.cuda()
        train_tensor = train_tensor.cuda()
        test_tensor = test_tensor.cuda()
        train_lbl_tensor = train_lbl_tensor.cuda()
        criterion = criterion.cuda()
        criterion2 = criterion2.cuda()
        val_tensor = val_tensor.cuda()
    max_auc = [0, 0]
    # hyperparameter grid
    lrs = [0.001, 0.01]
    grid_epochs = [200]
    hiddens = [8, 16, 32, 64]
    ppi_weights = [1e-4, 1e-3, 1e-2, 1e-1, 1]
    model_prefix = args.data_dir + negname + '_'
    performance_dict = dict()
    val_performance_dict = dict()
    for epochs in grid_epochs:
        args.epochs = epochs
        for weight in ppi_weights:
            args.ppi_weight = weight
            for hid in hiddens:
                for lr in lrs:
                    all_scores = list()
                    params = [epochs, weight, hid, lr, 0]
                    params = [str(item) for item in params]
                    save_model_prefix = model_prefix + '_'.join(params) + '.model'
                    val_all_scores = list()
                    for irun in range(args.n_runs):
                        save_model_path = save_model_prefix.replace('0.model', str(irun) + '.model')
                        model = Model(n_virus, n_human, hid)
                        # initialize the embedding layers with the pre-trained sequence embeddings
                        model.vemb.weight.data = virus_features
                        model.hemb.weight.data = human_features
                        optimizer = t.optim.Adam(model.parameters(), lr=lr)
                        if t.cuda.is_available():
                            model = model.cuda()
                        best_ap = 0
                        for epoch in range(args.epochs):
                            model.train()
                            optimizer.zero_grad()
                            score, hppi_out = model(vindex_tensor, hindex_tensor,
                                                    train_tensor, hppi_edgelist)
                            loss = criterion(score, train_lbl_tensor) + args.ppi_weight * criterion2(hppi_out, hppi_edgeweight)
                            loss.backward()
                            optimizer.step()
                            loss_val = loss.item()  # .item() already copies from GPU when needed
                            print('Epoch: ', epoch, ' loss: ', loss_val / train_lbl_tensor.size(0))
                            if epoch % 2 == 0:
                                # evaluate on the validation set every other epoch
                                model.eval()
                                pred_score = model.infer(vindex_tensor, hindex_tensor, val_tensor)
                                pred_score = pred_score.detach().cpu().numpy()
                                val_pred_lbl = pred_score.tolist()
                                val_pred_lbl = [item[0] if isinstance(item, list) else item for item in val_pred_lbl]
                                auc_score, aupr_score, sn, sp, acc, topk, precision, recall, f1 = get_score(val_lbl, val_pred_lbl, K=1)
                                print('Validation set lr:%.4f, auc:%.4f, aupr:%.4f' % (lr, auc_score, aupr_score))
                                if best_ap < aupr_score:
                                    best_ap = aupr_score
                                    save_model(model, save_model_path)  # best model on the validation set
                        print('_'.join(params), 'best_ap: ', best_ap)
                        # reload the best checkpoint for this run
                        model = Model(n_virus, n_human, hid)
                        if t.cuda.is_available():
                            model = model.cuda()
                        load_model(model, save_model_path)
                        os.remove(save_model_path)
                        model.eval()
                        # performance on the test set
                        pred_score = model.infer(vindex_tensor, hindex_tensor, test_tensor)
                        pred_score = pred_score.detach().cpu().numpy()
                        test_pred_lbl = pred_score.tolist()
                        test_pred_lbl = [item[0] if isinstance(item, list) else item for item in test_pred_lbl]
                        auc_score, aupr_score, sn, sp, acc, topk, precision, recall, f1 = get_score(test_lbl, test_pred_lbl)
                        print('lr:%.4f, auc:%.4f, aupr:%.4f' % (lr, auc_score, aupr_score))
                        all_scores.append([auc_score, aupr_score, sn, sp, acc, topk, precision, recall, f1])
                        # record performance on the validation set as well
                        pred_score = model.infer(vindex_tensor, hindex_tensor, val_tensor)
                        pred_score = pred_score.detach().cpu().numpy()
                        val_pred_lbl = pred_score.tolist()
                        val_pred_lbl = [item[0] if isinstance(item, list) else item for item in val_pred_lbl]
                        auc_score, aupr_score, sn, sp, acc, topk, precision, recall, f1 = get_score(val_lbl, val_pred_lbl, K=1)
                        val_all_scores.append([auc_score, aupr_score, sn, sp, acc, topk, precision, recall, f1])
                        if max_auc[0] < auc_score:
                            max_auc = [auc_score, aupr_score]
                        t.cuda.empty_cache()
                    arr = np.array(all_scores)
                    print('all_scores: ', all_scores)
                    mean = np.mean(arr, axis=0)
                    std = np.std(arr, axis=0)
                    print('Mean auc, aupr, sn, sp, acc, topk, precision, recall, f1:')
                    print(mean)
                    print('Std auc, aupr, sn, sp, acc, topk, precision, recall, f1:')
                    print(std)
                    print('max auc, aupr:', max_auc)
                    performance_dict['_'.join(params[:4])] = [all_scores, list(mean), list(std)]
                    arr = np.array(val_all_scores)
                    print('val_all_scores: ', val_all_scores)
                    mean = np.mean(arr, axis=0)
                    std = np.std(arr, axis=0)
                    val_performance_dict['_'.join(params[:4])] = [val_all_scores, list(mean), list(std)]
    writer = open(args.data_dir + negname + 'grid_search_res.txt', 'w')
    print('write to file:', args.data_dir + negname + 'grid_search_res.txt')
    maxf1 = 0
    max_key = None
    for key in performance_dict:
        writer.write('Result for ' + str(key) + '\n')
        writer.write(str(performance_dict[key][0]) + '\n')
        writer.write('mean: ' + str(performance_dict[key][1]) + '\n')
        writer.write('std: ' + str(performance_dict[key][2]) + '\n')
        print('Result for ' + str(key) + '\n')
        print(str(performance_dict[key][0]) + '\n')
        print('mean: ' + str(performance_dict[key][1]) + '\n')
        print('std: ' + str(performance_dict[key][2]) + '\n')
        # select the configuration with the best mean F1 on the validation set
        if val_performance_dict[key][1][-1] > maxf1:
            maxf1 = val_performance_dict[key][1][-1]
            max_key = key
    writer.write('Best results: ' + str(maxf1) + ' key: ' + str(max_key) + '\n')
    print('Best results: ' + str(performance_dict[max_key]) + ' key: ' + str(max_key) + '\n')
    writer.close()
if __name__ == "__main__":
    main()
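# Typical invocation of the grid-search script above (a sketch: the script's file name
# is not shown in this excerpt, and the data files are the argparse defaults):
#   python <grid_search_script> --data_dir data/h1n1/ --n_runs 10
# Pass --fixval True to read the pos/neg *val* files instead of splitting the training set.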
# ---- second script in this commit (file name not shown in this excerpt) ----
import argparse
import os
import numpy as np
import pandas as pd
import torch as t  # the code below refers to torch as `t`
from sklearn.model_selection import train_test_split
from model import *
from utils.utils import *
def read_int_data(pos_path, neg_path):
    """Read positive/negative interaction pairs; return (positive pairs, edge tensor, label tensor)."""
    pos_df = pd.read_csv(pos_path).values.tolist()
    neg_df = pd.read_csv(neg_path).values.tolist()
    int_edges = pos_df + neg_df
    int_lbl = [1] * len(pos_df) + [0] * len(neg_df)
    return pos_df, t.LongTensor(int_edges), t.FloatTensor(int_lbl)
def read_train_data(pos_path, neg_path, fixval=False):
    """Read training pairs and split off a validation set (no negative subsampling in this variant)."""
    pos_df = pd.read_csv(pos_path).values.tolist()
    pos_train, pos_val = train_test_split(pos_df, test_size=0.1)
    neg_df = pd.read_csv(neg_path).values.tolist()
    neg_train, neg_val = train_test_split(neg_df, test_size=0.1)
    train_data = pos_train + neg_train
    train_lbl = [1] * len(pos_train) + [0] * len(neg_train)
    if not fixval:
        val_data = pos_val + neg_val
        val_lbl = [1] * len(pos_val) + [0] * len(neg_val)
        val_tensor = t.LongTensor(val_data)
        val_lbl_tensor = t.FloatTensor(val_lbl)
    else:
        # use the fixed validation split stored alongside the training files
        val_data, val_tensor, val_lbl_tensor = read_int_data(pos_path.replace('train', 'val'),
                                                             neg_path.replace('train', 'val'))
    return train_data, val_data, t.LongTensor(train_data), t.FloatTensor(train_lbl), val_tensor, val_lbl_tensor
def save_model(model, save_path):
    t.save(model.state_dict(), save_path)

def load_model(model, model_path):
    model.load_state_dict(t.load(model_path))
def main():
    parser = argparse.ArgumentParser(description='Multitask transfer model for novel virus-human PPI')
    parser.add_argument('--epochs', type=int, default=200, metavar='N',
                        help='number of epochs to train')
    parser.add_argument('--n_runs', type=int, default=10, metavar='N',
                        help='number of experiment runs')
    parser.add_argument('--ppi_weight', type=float, default=0.001, metavar='N',
                        help='weight of the PPI prediction loss')
    parser.add_argument('--hid', type=int, default=16, metavar='N',
                        help='hidden dimension')
    parser.add_argument('--lr', type=float, default=0.001, metavar='N',
                        help='learning rate')
    parser.add_argument('--data_dir', default='data/h1n1/', help='dataset directory')
    parser.add_argument('--virus_feature_path', default='virus_seq_1900emb.csv', help='virus feature path')
    parser.add_argument('--human_feature_path', default='human_features.csv', help='human feature path')
    parser.add_argument('--hppi_edge_list', default='hppi_edgelist.csv', help='human PPI edge list path')
    parser.add_argument('--hppi_edge_weight', default='hppi_edgeweight.csv', help='human PPI edge weight path')
    parser.add_argument('--pos_train_path', default='pos_train_idx.csv', help='pos train path')
    parser.add_argument('--pos_test_path', default='pos_test_idx.csv', help='pos test path')
    parser.add_argument('--neg_train_path', default='neg_train.csv', help='neg train path')
    parser.add_argument('--neg_test_path', default='neg_test.csv', help='neg test path')
    parser.add_argument('--fixval', default=False, help='use the fixed validation set or not')
    parser.add_argument('--sub', default=False, help='use only the subset of the known human PPI or not')
    args = parser.parse_args()
    # args.data_dir = standardize_dir(args.data_dir)
    args.virus_feature_path = args.data_dir + args.virus_feature_path
    args.human_feature_path = args.data_dir + args.human_feature_path
    args.hppi_edge_list = args.data_dir + args.hppi_edge_list
    args.hppi_edge_weight = args.data_dir + args.hppi_edge_weight
    args.pos_train_path = args.data_dir + args.pos_train_path
    args.pos_test_path = args.data_dir + args.pos_test_path
    args.neg_train_path = args.data_dir + args.neg_train_path
    args.neg_test_path = args.data_dir + args.neg_test_path
    args.ppi_weight = float(args.ppi_weight)
    args.n_runs = int(args.n_runs)
    # bool('False') is True, so compare the string form instead of casting
    args.fixval = str(args.fixval) == 'True'
    args.sub = str(args.sub) == 'True'
    args.lr = float(args.lr)
    args.hid = int(args.hid)
    args.epochs = int(args.epochs)
    human_features = pd.read_csv(args.human_feature_path, header=None).values
    human_features = t.FloatTensor(human_features)
    virus_features = t.FloatTensor(pd.read_csv(args.virus_feature_path, header=None).values)
    n_virus = virus_features.size(0)
    n_human = human_features.size(0)
    print('Finished loading features')
    vindex_tensor = t.LongTensor(list(range(n_virus)))
    hindex_tensor = t.LongTensor(list(range(n_human)))
    criterion = t.nn.BCELoss()
    criterion2 = t.nn.MSELoss()
    if t.cuda.is_available():
        vindex_tensor = vindex_tensor.cuda()
        hindex_tensor = hindex_tensor.cuda()
        virus_features = virus_features.cuda()
        human_features = human_features.cuda()
        criterion = criterion.cuda()
        criterion2 = criterion2.cuda()
    performance_dict = dict()
    sub_str = '_sub' if args.sub else ''
    model_prefix = args.data_dir + 'mtt' + sub_str
    for train_rate in [1, 2, 5, 10]:
        performance_dict[train_rate] = dict()
        for iSet in range(10):
            neg_train_path = args.neg_train_path.replace('.csv', '_'.join(['', str(train_rate), str(iSet)]) + '.csv')
            train_list, val_data, train_tensor, train_lbl_tensor, val_tensor, val_lbl_tensor = read_train_data(
                args.pos_train_path, neg_train_path, args.fixval)
            val_lbl = val_lbl_tensor.numpy()
            if t.cuda.is_available():
                train_tensor = train_tensor.cuda()
                val_tensor = val_tensor.cuda()
                train_lbl_tensor = train_lbl_tensor.cuda()
            # collect every human protein that appears in the train/val/test pairs
            all_hprots = [item[1] for item in train_list]
            val_list = [item[1] for item in val_data]
            all_hprots.extend(val_list)
            # re-read the PPI files each round because the --sub option filters them below
            hppi_edgeweight = pd.read_csv(args.hppi_edge_weight)
            hppi_edgelist = pd.read_csv(args.hppi_edge_list)
            for test_rate in [1, 2, 5, 10]:
                for iTest in range(10):
                    neg_test_path = args.neg_test_path.replace('.csv', '_'.join(['', str(test_rate), str(iTest)]) + '.csv')
                    test_list, _, _ = read_int_data(args.pos_test_path, neg_test_path)
                    test_list = [item[1] for item in test_list]
                    all_hprots.extend(test_list)
            if args.sub:
                # keep only PPI edges whose endpoints occur in the interaction data
                hprot_list = list(set(all_hprots))
                hppi_df = pd.concat([hppi_edgelist, hppi_edgeweight], axis=1)
                hppi_df = hppi_df[(hppi_df['p1'].isin(hprot_list)) & (hppi_df['p2'].isin(hprot_list))]
                hppi_edgelist = hppi_df[['p1', 'p2']]
                hppi_edgeweight = hppi_df[['score']]
            hppi_edgeweight = t.FloatTensor(hppi_edgeweight.values)
            hppi_edgelist = t.LongTensor(hppi_edgelist.values)
            print('Finished loading human PPI')
            hppi_edgeweight = hppi_edgeweight.view(-1)
            print('Finished loading interaction pairs')
            if t.cuda.is_available():
                hppi_edgelist = hppi_edgelist.cuda()
                hppi_edgeweight = hppi_edgeweight.cuda()
            weight = args.ppi_weight
            hid = args.hid
            lr = args.lr
            params = [weight, hid, lr, 0]
            params = [str(item) for item in params]
            save_model_prefix = model_prefix + '_'.join(params) + '.model'
            for irun in range(args.n_runs):
                save_model_path = save_model_prefix.replace('0.model', str(irun) + '.model')
                model = Model(n_virus, n_human, hid)
                # initialize the embedding layers with the pre-trained sequence embeddings
                model.vemb.weight.data = virus_features
                model.hemb.weight.data = human_features
                optimizer = t.optim.Adam(model.parameters(), lr=lr)
                if t.cuda.is_available():
                    model = model.cuda()
                best_ap = 0
                for epoch in range(args.epochs):
                    model.train()
                    optimizer.zero_grad()
                    score, hppi_out = model(vindex_tensor, hindex_tensor,
                                            train_tensor, hppi_edgelist)
                    loss = criterion(score, train_lbl_tensor) + args.ppi_weight * criterion2(hppi_out, hppi_edgeweight)
                    loss.backward()
                    optimizer.step()
                    loss_val = loss.item()  # .item() already copies from GPU when needed
                    print('Epoch: ', epoch, ' loss: ', loss_val / train_lbl_tensor.size(0))
                    if epoch % 2 == 0:
                        # evaluate on the validation set every other epoch
                        model.eval()
                        pred_score = model.infer(vindex_tensor, hindex_tensor, val_tensor)
                        pred_score = pred_score.detach().cpu().numpy()
                        val_pred_lbl = pred_score.tolist()
                        val_pred_lbl = [item[0] if isinstance(item, list) else item for item in val_pred_lbl]
                        auc_score, aupr_score = get_score2(val_lbl, val_pred_lbl, K=10)
                        print('Validation set lr:%.4f, auc:%.4f, aupr:%.4f' % (lr, auc_score, aupr_score))
                        if best_ap < aupr_score:
                            best_ap = aupr_score
                            save_model(model, save_model_path)  # best model on the validation set
                print('_'.join(params), 'best_ap: ', best_ap)
                print('load model', save_model_path, args.sub, model_prefix)
                # reload the best checkpoint for this run
                model = Model(n_virus, n_human, hid)
                if t.cuda.is_available():
                    model = model.cuda()
                load_model(model, save_model_path)
                os.remove(save_model_path)
                model.eval()
                for test_rate in [1, 2, 5, 10]:
                    # accumulate across runs and negative sets instead of resetting the
                    # list on every pass (the original assignment wiped earlier results)
                    performance_dict[train_rate].setdefault(test_rate, list())
                    for iTest in range(10):
                        neg_test_path = args.neg_test_path.replace('.csv', '_'.join(['', str(test_rate), str(iTest)]) + '.csv')
                        test_list, test_tensor, test_lbl_tensor = read_int_data(args.pos_test_path, neg_test_path)
                        test_lbl = test_lbl_tensor.numpy()
                        if t.cuda.is_available():
                            test_tensor = test_tensor.cuda()
                        pred_score = model.infer(vindex_tensor, hindex_tensor, test_tensor)
                        pred_score = pred_score.detach().cpu().numpy()
                        test_pred_lbl = pred_score.tolist()
                        test_pred_lbl = [item[0] if isinstance(item, list) else item for item in test_pred_lbl]
                        auc_score, aupr_score, sn, sp, acc, topk, precision, recall, f1 = get_score(test_lbl, test_pred_lbl)
                        print('lr:%.4f, train rate:%.1f, test rate:%.1f' % (lr, train_rate, test_rate))
                        print('auc_score, aupr_score, sn, sp, acc, topk, precision, recall, f1\n', auc_score, aupr_score, sn, sp, acc, topk, precision, recall, f1)
                        performance_dict[train_rate][test_rate].append([auc_score, aupr_score, sn, sp, acc, topk, precision, recall, f1])
                t.cuda.empty_cache()
    writer = open(model_prefix + '_allscores.csv', 'w')
    writer2 = open(model_prefix + '_avg.csv', 'w')
    writer.write('train_rate,test_rate,auc_score,aupr_score,sn,sp,acc,topk,precision,recall,f1\n')
    writer2.write('train_rate,test_rate,auc_score,aupr_score,sn,sp,acc,topk,precision,recall,f1\n')
    for train_rate in performance_dict.keys():
        for test_rate in performance_dict[train_rate].keys():
            for rec in performance_dict[train_rate][test_rate]:
                str_rec = [str(item) for item in rec]
                writer.write(str(train_rate) + ',' + str(test_rate) + ',' + ','.join(str_rec) + '\n')
            print('-' * 30)
            print('Train_rate: ', train_rate, ', test_rate: ', test_rate)
            arr = np.array(performance_dict[train_rate][test_rate])
            print('all_scores: ', arr)
            mean = np.mean(arr, axis=0)
            std = np.std(arr, axis=0)
            str_avg = [str(m) + '+-' + str(s) for m, s in zip(mean, std)]
            writer2.write(str(train_rate) + ',' + str(test_rate) + ',' + ','.join(str_avg) + '\n')
            print('Mean auc, aupr, sn, sp, acc, topk, precision, recall, f1:')
            print(mean)
            print('Std auc, aupr, sn, sp, acc, topk, precision, recall, f1:')
            print(std)
    writer.close()
    writer2.close()
if __name__ == "__main__":
    main()
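# Typical invocation of the rate-sweep script above (again a sketch; the file name is
# not shown). --sub True restricts the human PPI graph to proteins that occur in the
# interaction pairs before training:
#   python <rate_sweep_script> --data_dir data/h1n1/ --ppi_weight 0.001 --hid 16 --lr 0.001 --sub True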
# ---- third script in this commit: the single-task (stt) variant; file name not shown ----
import argparse
import random
import pandas as pd
import torch as t
from sklearn.model_selection import train_test_split
from model import *
from utils.utils import *
def read_int_data(pos_path, neg_path):
    """Read positive/negative interaction pairs; return (positive pairs, edge tensor, label tensor)."""
    pos_df = pd.read_csv(pos_path).values.tolist()
    neg_df = pd.read_csv(neg_path).values.tolist()
    int_edges = pos_df + neg_df
    int_lbl = [1] * len(pos_df) + [0] * len(neg_df)
    return pos_df, t.LongTensor(int_edges), t.FloatTensor(int_lbl)
def read_train_data(pos_path, neg_path, fixval=False):
    """Read training pairs, subsample negatives to at most 10x the positives, and build a validation set."""
    pos_df = pd.read_csv(pos_path).values.tolist()
    pos_train, pos_val = train_test_split(pos_df, test_size=0.1, random_state=42)
    neg_df = pd.read_csv(neg_path).values.tolist()
    indexes = list(range(len(neg_df)))
    random.shuffle(indexes)
    selected_indexes = indexes[:int(10 * len(pos_df))]
    neg_df = [neg_df[item] for item in selected_indexes]
    neg_train, neg_val = train_test_split(neg_df, test_size=0.1, random_state=42)
    train_data = pos_train + neg_train
    train_lbl = [1] * len(pos_train) + [0] * len(neg_train)
    if not fixval:
        val_data = pos_val + neg_val
        val_lbl = [1] * len(pos_val) + [0] * len(neg_val)
        val_tensor = t.LongTensor(val_data)
        val_lbl_tensor = t.FloatTensor(val_lbl)
    else:
        # use the fixed validation split stored alongside the training files
        val_data, val_tensor, val_lbl_tensor = read_int_data(pos_path.replace('train', 'val'),
                                                             neg_path.replace('train', 'val'))
    return train_data, val_data, t.LongTensor(train_data), t.FloatTensor(train_lbl), val_tensor, val_lbl_tensor
def save_model(model, save_path):
    t.save(model.state_dict(), save_path)

def load_model(model, model_path):
    model.load_state_dict(t.load(model_path))
def main():
    parser = argparse.ArgumentParser(description='Multitask transfer model for novel virus-human PPI')
    parser.add_argument('--n_runs', type=int, default=10, metavar='N',
                        help='number of experiment runs')
    parser.add_argument('--data_dir', default='data/h1n1/', help='dataset directory')
    parser.add_argument('--virus_feature_path', default='virus_seq_1900emb.csv', help='virus feature path')
    parser.add_argument('--human_feature_path', default='human_features.csv', help='human feature path')
    parser.add_argument('--pos_train_path', default='pos_train_idx.csv', help='pos train path')
    parser.add_argument('--pos_test_path', default='pos_test_idx.csv', help='pos test path')
    parser.add_argument('--neg_train_path', default='neg_train_idx.csv', help='neg train path')
    parser.add_argument('--neg_test_path', default='neg_test_idx.csv', help='neg test path')
    parser.add_argument('--fixval', default=False, help='use the fixed validation set or not')
    args = parser.parse_args()
    args.data_dir = standardize_dir(args.data_dir)
    negname = 'stt_' + args.neg_test_path.replace('.csv', '')
    args.virus_feature_path = args.data_dir + args.virus_feature_path
    args.human_feature_path = args.data_dir + args.human_feature_path