genFoldsData.py 6.68 KB
Newer Older
1 2 3 4 5 6
import pandas as pd
import random

import os

import numpy as np
Ngan Thi Dong's avatar
Ngan Thi Dong committed
7 8 9 10 11
from utility.utils import *
from data.preparation.GIP import *
from data.preparation.miRNA_sim import *

import argparse
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29

def adjMatrix2list(csv_path):
    """Split an adjacency matrix stored as CSV into positive/negative pairs.

    The file is loaded with ``pandas.read_csv`` using its defaults, so the
    first line of the file is consumed as the header row and is not part of
    the matrix.

    Args:
        csv_path: Path of the CSV file holding the adjacency matrix.

    Returns:
        A 4-tuple ``(pos_assoc, neg_assoc, n_miRNA, n_disease)``:
        the ``(row, column)`` pairs whose cell equals 1, the pairs whose
        cell equals 0, the number of rows and the number of columns.
    """
    matrix = pd.read_csv(csv_path).values
    n_miRNA, n_disease = matrix.shape[0], matrix.shape[1]

    # Known (positive) associations: every cell equal to 1.
    pos_rows, pos_cols = np.nonzero(matrix == 1.0)
    pos_assoc = list(zip(pos_rows, pos_cols))

    # Every cell equal to 0 goes to the negative list.
    neg_rows, neg_cols = np.nonzero(matrix == 0.0)
    neg_assoc = list(zip(neg_rows, neg_cols))

    return pos_assoc, neg_assoc, n_miRNA, n_disease

def assocList2adjMat(pos_assoc, n_miRNA, n_disease):
    """Build a dense adjacency matrix from a list of positive associations.

    Args:
        pos_assoc: Iterable of index pairs; element 0 of each pair is the
            row index and element 1 the column index.
        n_miRNA: Number of rows of the resulting matrix.
        n_disease: Number of columns of the resulting matrix.

    Returns:
        A float ``numpy.ndarray`` of shape ``(n_miRNA, n_disease)`` holding
        1.0 at every listed position and 0.0 everywhere else.
    """
    mat = np.zeros((n_miRNA, n_disease))
    for pair in pos_assoc:
        mat[pair[0], pair[1]] = 1.0
    return mat


def gen_fold(data_dir, save_dir, numFold=5, negative_rate=1.0, randomseed=123):
    """Generate K-fold train/test splits plus per-fold similarity matrices.

    The adjacency matrix ``<data_dir>m-d.csv`` is read, its positive
    associations are shuffled with ``randomseed`` and cut into ``numFold``
    consecutive slices. For every fold ``i`` the slice is held out as test
    positives and the remaining positives are used to

    * rebuild a training adjacency matrix,
    * compute similarity matrices via ``cal_miRNA_func_sim`` (twice, once per
      disease-similarity input file) and ``calculate_gip``,
    * build the training pairs through ``negative_sampling``.

    The test set of a fold is its held-out positives followed by every zero
    entry of the *full* adjacency matrix.

    All outputs are written with ``save2File`` to
    ``<save_dir><randomseed>_<i>_<name>`` where ``<name>`` is one of
    ``disease_sim.csv``, ``disease_sim2.csv``, ``mirna_func.csv``,
    ``mirna_func2.csv``, ``mirna_gip.csv``, ``disease_gip.csv``,
    ``train.csv``, ``train_lbl.csv``, ``test.csv`` and ``test_lbl.csv``.

    Args:
        data_dir: Directory containing ``m-d.csv``, ``disease_sim.csv``,
            ``disease_sim2.csv`` and ``disease_not_found_list.txt``.
        save_dir: Output directory; created (including parents) if missing.
        numFold: Number of folds. Converted with ``int`` because the command
            line may hand it over as a string.
        negative_rate: Forwarded to ``negative_sampling``. Converted with
            ``float`` for the same reason.
        randomseed: Seed for the shuffle; also part of every output file name.
    """
    # Command-line values can arrive as strings; normalise before use.
    numFold = int(numFold)
    negative_rate = float(negative_rate)

    # Paths are built by plain concatenation below, so this presumably
    # guarantees a trailing separator -- TODO confirm in utility.utils.
    data_dir = standardize_dir(data_dir)
    save_dir = standardize_dir(save_dir)

    # Input files.
    adj_path = data_dir + 'm-d.csv'
    onto_disease_sim_path = data_dir + 'disease_sim.csv'
    onto_disease_sim_path2 = data_dir + 'disease_sim2.csv'
    disease_not_found_path = data_dir + 'disease_not_found_list.txt'

    # Output file names (appended to the per-fold prefix).
    mirna_func_path = 'mirna_func.csv'
    mirna_func_path2 = 'mirna_func2.csv'
    disease_sim_path = 'disease_sim.csv'
    disease_sim_path2 = 'disease_sim2.csv'
    mirna_gip_path = 'mirna_gip.csv'
    disease_gip_path = 'disease_gip.csv'

    pos_assoc, neg_assoc, n_miRNA, n_disease = adjMatrix2list(adj_path)
    print('len(pos_assoc:', len(pos_assoc), 'len(neg_assoc):', len(neg_assoc))

    N = len(pos_assoc)
    randomIndex = list(range(N))
    print('Number of known association: ', N)
    random.seed(randomseed)
    random.shuffle(randomIndex)
    # negative_sampling draws from numpy's global RNG; seed it as well so the
    # files produced for a given seed are reproducible.
    if randomseed is not None:
        np.random.seed(int(randomseed) % (2 ** 32))

    neglist_lbl = [0] * len(neg_assoc)

    # Create the output directory (and any missing parents) if needed.
    os.makedirs(save_dir, exist_ok=True)

    for i in range(numFold):
        # Shuffled positions of the positives held out for this fold; the
        # last fold takes whatever remains after the integer cut points.
        if i < numFold - 1:
            index = randomIndex[int(N / numFold * i): int(N / numFold * (i+1))]
        else:
            index = randomIndex[int(N / numFold * i):]
        test_index = set(index)  # O(1) membership instead of scanning a list

        # Both lists keep the original order of pos_assoc.
        pos_test_pair_list = [edge for idx, edge in enumerate(pos_assoc)
                              if idx in test_index]
        pos_train_pair_list = [edge for idx, edge in enumerate(pos_assoc)
                               if idx not in test_index]

        # The saving prefix: randomseed_foldIdx_
        saving_prefix = save_dir + str(randomseed) + '_' + str(i) + '_'

        # Similarities are computed from training associations only.
        train_adj = assocList2adjMat(pos_train_pair_list, n_miRNA, n_disease)

        # NOTE(review): both calls pass onto_disease_sim_path as the fourth
        # argument, including the "2" variant -- confirm this is intended.
        train_disease_semantic_sim, train_miRNA_func_sim = cal_miRNA_func_sim(
            train_adj, onto_disease_sim_path, disease_not_found_path,
            onto_disease_sim_path)
        save2File(train_disease_semantic_sim, saving_prefix + disease_sim_path)
        save2File(train_miRNA_func_sim, saving_prefix + mirna_func_path)

        train_disease_semantic_sim2, train_miRNA_func_sim2 = cal_miRNA_func_sim(
            train_adj, onto_disease_sim_path2, disease_not_found_path,
            onto_disease_sim_path)
        save2File(train_miRNA_func_sim2, saving_prefix + mirna_func_path2)
        save2File(train_disease_semantic_sim2, saving_prefix + disease_sim_path2)

        train_miRNA_gip, train_disease_gip = calculate_gip(train_adj)
        save2File(train_miRNA_gip, saving_prefix + mirna_gip_path)
        save2File(train_disease_gip, saving_prefix + disease_gip_path)

        # Training set: training positives plus sampled negatives. A copy is
        # passed so the callee cannot alter the list used above.
        train_pair_list, train_pair_lbl = negative_sampling(
            list(pos_train_pair_list), n_miRNA, n_disease, negative_rate)

        # Test set: held-out positives followed by every zero entry of the
        # full adjacency matrix.
        test_pair_list = pos_test_pair_list + neg_assoc
        test_pair_lbl = [1] * len(pos_test_pair_list) + neglist_lbl

        save2File(train_pair_list, saving_prefix + 'train.csv')
        save2File(train_pair_lbl, saving_prefix + 'train_lbl.csv')
        save2File(test_pair_list, saving_prefix + 'test.csv')
        save2File(test_pair_lbl, saving_prefix + 'test_lbl.csv')

def save2File(input, out_path):
    """Write array-like data to ``out_path`` as CSV without header or index.

    Args:
        input: A ``numpy.ndarray`` or anything ``numpy.asarray`` accepts
            (e.g. a list of pairs or a flat list of labels). The name shadows
            the builtin but is kept so keyword callers keep working.
        out_path: Destination file path; an existing file is overwritten.
    """
    # asarray leaves ndarrays untouched and converts everything else.
    df = pd.DataFrame(np.asarray(input))
    df.to_csv(out_path, header=False, index=False)


def negative_sampling(pos_samples, n_miRNA, n_disease, negative_rate):
    """Append negative pairs to a list of positive pairs and label them.

    With ``negative_rate > 0`` one batch of
    ``int(len(pos_samples) * negative_rate * 1.2)`` random candidates is drawn
    from numpy's global RNG. Candidates that duplicate a positive pair or an
    already accepted negative are dropped, and sampling stops once
    ``len(pos_samples) * negative_rate`` negatives have been accepted. Because
    only one batch is drawn, fewer negatives than requested may be returned.

    With ``negative_rate <= 0`` every ``(i, j)`` pair of the
    ``n_miRNA x n_disease`` grid that is not a positive pair is used.

    Args:
        pos_samples: Sequence of positive index pairs (tuples, lists or
            array rows). It is not modified.
        n_miRNA: Exclusive upper bound of the first index.
        n_disease: Exclusive upper bound of the second index.
        negative_rate: Requested negatives-per-positive ratio, or a value
            ``<= 0`` to enumerate all non-positive pairs.

    Returns:
        ``(pairs, labels)`` as numpy arrays: the positives followed by the
        negatives, and the matching labels (1 for positive, 0 for negative).
    """
    size_of_batch = len(pos_samples)
    print(size_of_batch)

    # Positives are normalised to tuples of plain ints: a list pair never
    # compares equal to a tuple pair, so a direct ``in`` test against the
    # caller's sequence would silently miss them.
    known = {(int(p[0]), int(p[1])) for p in pos_samples}
    neg_samples = []

    if negative_rate > 0:
        num_to_generate = size_of_batch * negative_rate
        # Draw 20% more candidates than needed to absorb rejected ones.
        n_candidates = int(num_to_generate * 1.2)
        cand_miRNA = np.random.randint(n_miRNA, size=n_candidates)
        cand_disease = np.random.randint(n_disease, size=n_candidates)

        accepted = set()
        for m, d in zip(cand_miRNA.tolist(), cand_disease.tolist()):
            # ``>=`` also terminates when the target is not a whole number.
            if len(neg_samples) >= num_to_generate:
                break
            key = (m, d)
            if key in known or key in accepted:
                continue
            accepted.add(key)
            neg_samples.append([m, d])
    else:
        neg_samples = [[i, j]
                       for i in range(n_miRNA)
                       for j in range(n_disease)
                       if (i, j) not in known]

    labels = [1] * size_of_batch + [0] * len(neg_samples)

    # Build a fresh list so the caller's pos_samples stays untouched.
    new_pair_list = list(pos_samples)
    new_pair_list.extend(neg_samples)
    print('len new_pair_list: ', len(new_pair_list))
    return np.asarray(new_pair_list), np.asarray(labels)

# Disabled example: generate folds for several seeds and both datasets.
# random_seeds=[123,456,789,101,112]
# random_seeds = [123]
# hmdd2_dir = 'data/hmdd2/'
# hmdd3_dir = 'data/hmdd3/'
# hmdd2_savedir  = hmdd2_dir + 'folds/'
# hmdd3_savedir = hmdd3_dir + 'folds/'
# for randseed in random_seeds:
#     gen_fold(hmdd2_dir, hmdd2_savedir, randomseed=randseed)
#     gen_fold(hmdd3_dir, hmdd3_savedir, randomseed=randseed)

def build_arg_parser():
    """Return the command-line parser for the fold-generation script.

    Numeric options carry an explicit ``type`` so values given on the command
    line are parsed as numbers rather than left as strings (a string
    ``numFold`` would break ``range(numFold)`` in ``gen_fold``).
    """
    parser = argparse.ArgumentParser(
        description='Generate K-fold cross-validation splits for miRNA-disease association data')
    parser.add_argument('--data_dir', default='/home/dong/simplifying_mirna_disease/hmdd2/',
                        help='dataset directory')
    parser.add_argument('--save_dir', default='/home/dong/simplifying_mirna_disease/hmdd2/folds/',
                        help='directory where the generated folds are written')
    parser.add_argument('--randseed', type=int, default=456, help='the random seed')
    parser.add_argument('--numFold', type=int, default=5,
                        help='value of K for K-foldCV, default is 5')
    parser.add_argument('--neg_rate', type=float, default=1.0,
                        help='the negative sampling rate')
    return parser


# Only parse arguments and generate folds when executed as a script, so that
# importing this module for its helpers has no side effects.
if __name__ == '__main__':
    args = build_arg_parser().parse_args()
    gen_fold(args.data_dir, args.save_dir, args.numFold, args.neg_rate, args.randseed)