# test_biobridge_primekg_loader.py
"""
Test cases for the BioBridgePrimeKG loader in datasets.biobridge_primekg
"""
import os
import shutil
import pytest
from ..datasets.biobridge_primekg import BioBridgePrimeKG
# Local directories used by the tests in this module
LOCAL_DIR = "../data/biobridge_primekg_test/"
PRIMEKG_LOCAL_DIR = "../data/primekg_test/"
# Delete the BioBridge test folder if it exists (a missing folder is ignored);
# the PrimeKG test folder is left untouched
shutil.rmtree(LOCAL_DIR, ignore_errors=True)
@pytest.fixture(name="biobridge_primekg")
def biobridge_primekg_fixture():
    """
    Fixture providing a BioBridgePrimeKG instance that uses the test directories.

    The instance is only constructed here; no data is loaded until a test
    calls `load_data()` on it.
    """
    loader = BioBridgePrimeKG(
        primekg_dir=PRIMEKG_LOCAL_DIR,
        local_dir=LOCAL_DIR,
    )
    return loader
def _assert_files_exist(directory, filenames):
    """
    Assert that every entry of `filenames` exists under `directory`.

    Args:
        directory: Directory path used as the prefix of each checked path.
        filenames: Iterable of file names expected inside `directory`.
    """
    for filename in filenames:
        path = f"{directory}/{filename}"
        assert os.path.exists(path), f"Missing expected file: {path}"


def _assert_biobridge_primekg_loaded(biobridge_primekg):
    """
    Load BioBridge-PrimeKG and verify the files on disk and the loaded objects.

    Both loader tests run exactly the same checks, so the checks live here once.

    Args:
        biobridge_primekg: The loader instance supplied by the
            `biobridge_primekg` fixture.
    """
    # Load BioBridge-PrimeKG data and fetch everything up front, so a failing
    # getter surfaces before any of the assertions below
    biobridge_primekg.load_data()
    primekg_nodes = biobridge_primekg.get_primekg().get_nodes()
    primekg_edges = biobridge_primekg.get_primekg().get_edges()
    biobridge_data_config = biobridge_primekg.get_data_config()
    biobridge_emb_dict = biobridge_primekg.get_node_embeddings()
    biobridge_triplets = biobridge_primekg.get_primekg_triplets()
    biobridge_splits = biobridge_primekg.get_train_test_split()
    biobridge_node_info = biobridge_primekg.get_node_info_dict()

    # Check if the local directories exist
    assert os.path.exists(biobridge_primekg.primekg_dir)
    assert os.path.exists(biobridge_primekg.local_dir)

    # Check if downloaded and processed files exist
    # PrimeKG files
    _assert_files_exist(
        biobridge_primekg.primekg_dir,
        ["nodes.tab", "primekg_nodes.tsv.gz", "edges.csv", "primekg_edges.tsv.gz"],
    )
    # BioBridge data config
    assert os.path.exists(f"{biobridge_primekg.local_dir}/data_config.json")
    # BioBridge embeddings
    _assert_files_exist(
        f"{biobridge_primekg.local_dir}/embeddings",
        [
            "protein.pkl",
            "mf.pkl",
            "cc.pkl",
            "bp.pkl",
            "drug.pkl",
            "disease.pkl",
            "embedding_dict.pkl",
        ],
    )
    # BioBridge processed files
    _assert_files_exist(
        f"{biobridge_primekg.local_dir}/processed",
        [
            "protein.csv",
            "mf.csv",
            "cc.csv",
            "bp.csv",
            "drug.csv",
            "disease.csv",
            "triplet_full.tsv.gz",
            "triplet_full_altered.tsv.gz",
            "node_train.tsv.gz",
            "triplet_train.tsv.gz",
            "node_test.tsv.gz",
            "triplet_test.tsv.gz",
        ],
    )

    # Check processed PrimeKG dataframes
    # Nodes
    assert primekg_nodes is not None
    assert len(primekg_nodes) > 0
    assert primekg_nodes.shape[0] == 129375
    # Edges
    assert primekg_edges is not None
    assert len(primekg_edges) > 0
    assert primekg_edges.shape[0] == 8100498

    # Check processed BioBridge data config
    assert biobridge_data_config is not None
    assert len(biobridge_data_config) > 0
    expected_config_sizes = {"node_type": 10, "relation_type": 18, "emb_dim": 6}
    for key, size in expected_config_sizes.items():
        assert len(biobridge_data_config[key]) == size, key

    # Check processed BioBridge embeddings
    assert biobridge_emb_dict is not None
    assert len(biobridge_emb_dict) > 0
    assert len(biobridge_emb_dict) == 85466

    # Check processed BioBridge triplets
    assert biobridge_triplets is not None
    assert len(biobridge_triplets) > 0
    assert biobridge_triplets.shape[0] == 3904610

    # Check train/test splits; the key order is part of the check
    expected_split_sizes = {
        "train": 3510930,
        "node_train": 76486,
        "test": 393680,
        "node_test": 8495,
    }
    assert list(biobridge_splits.keys()) == list(expected_split_sizes)
    for name, size in expected_split_sizes.items():
        assert len(biobridge_splits[name]) == size, name

    # Check node info dictionary; the key order is part of the check
    expected_node_counts = {
        "gene/protein": 19162,
        "molecular_function": 10966,
        "cellular_component": 4013,
        "biological_process": 27478,
        "drug": 6948,
        "disease": 44133,
    }
    assert list(biobridge_node_info.keys()) == list(expected_node_counts)
    for node_type, count in expected_node_counts.items():
        assert len(biobridge_node_info[node_type]) == count, node_type


def test_download_primekg(biobridge_primekg):
    """
    Test the loading method of the BioBridge-PrimeKG class by downloading data from repository.
    """
    _assert_biobridge_primekg_loaded(biobridge_primekg)


def test_load_existing_primekg(biobridge_primekg):
    """
    Test the loading method of the BioBridge-PrimeKG class by loading existing data in local.
    """
    # NOTE(review): presumably relies on an earlier load (e.g. test_download_primekg)
    # having populated the local directories - confirm test ordering.
    _assert_biobridge_primekg_loaded(biobridge_primekg)
# def test_load_existing_primekg_with_negative_triplets(biobridge_primekg):
# """
# Test the loading method of the BioBridge-PrimeKG class by loading existing data in local.
# In addition, it builds negative triplets for training data.
# """
# # Load BioBridge-PrimeKG data
# # Using 1 negative sample per positive triplet
# biobridge_primekg.load_data(build_neg_triplest=True, n_neg_samples=1)
# biobridge_neg_triplets = biobridge_primekg.get_primekg_triplets_negative()
# # Check if the local directories exists
# assert os.path.exists(biobridge_primekg.primekg_dir)
# assert os.path.exists(biobridge_primekg.local_dir)
# # Check if downloaded and processed files exist
# path = f"{biobridge_primekg.local_dir}/processed/triplet_train_negative.tsv.gz"
# assert os.path.exists(path)
# # Check processed BioBridge triplets
# assert biobridge_neg_triplets is not None
# assert len(biobridge_neg_triplets) > 0
# assert biobridge_neg_triplets.shape[0] == 3510930
# assert len(biobridge_neg_triplets.negative_tail_index[0]) == 1