# Table2Image_Functions.py
# Forked from zhuyitan/IGTD
from scipy.stats import spearmanr, rankdata
from scipy.spatial.distance import pdist, squareform
from astropy.stats import median_absolute_deviation
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import shutil
import time
import _pickle as cp
import sys


def ID_mapping(l1, l2):
    '''
    This function maps each element of l2 to its position in l1.
    '''
    pos = {}
    for i in range(len(l1)):
        pos[l1[i]] = i
    idd = np.array([pos[i] for i in l2])
    return idd


def get_full_data(res, ccl, drug, cancer_col_name, drug_col_name, res_col_name=None):
    '''
    This function assembles matched drug data, cancer data, and response labels for the records in res.
    '''
    data = []
    idd = ID_mapping(drug['sample'], res.loc[:, drug_col_name])
    data.append(drug['data'][idd, :, :, :])
    idd = ID_mapping(ccl['sample'], res.loc[:, cancer_col_name])
    data.append(ccl['data'][idd, :, :, :])
    if res_col_name in res.columns:
        label = res.loc[:, res_col_name].values
    else:
        label = None
    return_data = {}
    return_data['data'] = data
    return_data['label'] = label
    return_data['sample'] = res.loc[:, cancer_col_name] + '|' + res.loc[:, drug_col_name]
    return return_data


def load_data(res, cancer_image_data_filepath, drug_image_data_filepath, cancer_id_mapping_filepath,
              drug_id_mapping_filepath, cancer_col_name, drug_col_name, res_col_name):
    '''
    This function generates data of matched cancer, drug, and response.

    Parameters:
    -----------
    res: data frame of drug response for which data need to be assembled.
    cancer_image_data_filepath: string, file path of cancer data in image format.
    drug_image_data_filepath: string, file path of drug data in image format.
    cancer_id_mapping_filepath: string, file path of the mapping between cancer IDs and unique cancer IDs.
    drug_id_mapping_filepath: string, file path of the mapping between drug IDs and unique drug IDs.
    cancer_col_name: string, column name of cancer IDs in the response data frame.
    drug_col_name: string, column name of drug IDs in the response data frame.
    res_col_name: string, column name of response values in the response data frame.

    Returns:
    --------
    matched_data: a dictionary with keys 'data', 'label', and 'sample' holding the matched cancer, drug,
        and response data.
    '''
    ccl_map = pd.read_csv(cancer_id_mapping_filepath, sep='\t', engine='c',
                          na_values=['na', '-', ''], header=0, index_col=None).drop_duplicates()
    ccl_map.index = ccl_map.iloc[:, 0]
    drug_map = pd.read_csv(drug_id_mapping_filepath, sep='\t', engine='c',
                           na_values=['na', '-', ''], header=0, index_col=None).drop_duplicates()
    drug_map.index = drug_map.iloc[:, 0]

    pkl_file = open(cancer_image_data_filepath, 'rb')
    ccl_norm_d = cp.load(pkl_file)
    ccl_data = cp.load(pkl_file)
    ccl_samples = cp.load(pkl_file)
    pkl_file.close()
    ccl_norm_d = None
    ccl_data = ccl_data / 255
    ccl_data = ccl_data.transpose([2, 0, 1])

    pkl_file = open(drug_image_data_filepath, 'rb')
    drug_norm_d = cp.load(pkl_file)
    drug_data = cp.load(pkl_file)
    drug_samples = cp.load(pkl_file)
    pkl_file.close()
    drug_norm_d = None
    drug_data = drug_data / 255
    drug_data = drug_data.transpose([2, 0, 1])

    data = {}

    # Load gene expression data
    data['ge'] = {}
    data['ge']['sample'] = np.unique(res.loc[:, cancer_col_name])
    id = ID_mapping(ccl_samples, ccl_map.loc[data['ge']['sample'], 'Unique_CancID'])
    data['ge']['data'] = np.empty((len(id), ccl_data.shape[1], ccl_data.shape[2], 1))
    data['ge']['data'][:, :, :, 0] = ccl_data[id, :, :]

    # Load drug data
    data['md'] = {}
    data['md']['sample'] = np.unique(res.loc[:, drug_col_name])
    id = ID_mapping(drug_samples, drug_map.loc[data['md']['sample'], 'Unique_DrugID'])
    data['md']['data'] = np.empty((len(id), drug_data.shape[1], drug_data.shape[2], 1))
    data['md']['data'][:, :, :, 0] = drug_data[id, :, :]

    # Load response data
    data['res'] = res.loc[:, [cancer_col_name, drug_col_name, res_col_name]]

    # Match data
    matched_data = get_full_data(data['res'], data['ge'], data['md'], cancer_col_name, drug_col_name, res_col_name)

    return matched_data
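
# Illustrative sketch of calling load_data. All file paths and column names below are hypothetical
# placeholders. The pickle files are assumed to follow the layout written by table_to_image
# (normalized table, image data, sample names, in that order), and the mapping files are assumed to
# be tab-delimited text with a 'Unique_CancID' / 'Unique_DrugID' column, as the code above expects.
#
#     res = pd.read_csv('response.txt', sep='\t', header=0)    # columns include 'CancID', 'DrugID', 'AUC'
#     matched = load_data(res,
#                         cancer_image_data_filepath='CancerImages/Results.pkl',
#                         drug_image_data_filepath='DrugImages/Results.pkl',
#                         cancer_id_mapping_filepath='Cancer_ID_Mapping.txt',
#                         drug_id_mapping_filepath='Drug_ID_Mapping.txt',
#                         cancer_col_name='CancID', drug_col_name='DrugID', res_col_name='AUC')
#     # matched['data'] is [drug_images, cancer_images]; matched['label'] holds the response values.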


def generate_unique_id_mapping(ge):
    '''
    This function generates a mapping between sample IDs and unique sample IDs based on a given data frame.

    Parameters:
    -----------
    ge: a data frame for which to generate a mapping between sample IDs and unique sample IDs.
        Each column is a feature and each row is a sample. Row index is sample ID.

    Returns:
    --------
    ge_map: a data frame of the mapping between sample IDs and unique sample IDs.
    ge_unique_data: a data frame of unique samples. Each column is a feature and each row is a unique sample.
        Row index is unique sample ID.
    '''
    # Concatenate all feature values of a sample into one string, so that identical samples
    # produce identical strings.
    ge_str = ge.iloc[:, 0].map(str)
    for i in range(1, ge.shape[1]):
        ge_str = ge_str + '|' + ge.iloc[:, i].map(str)
    ge_uni_v, ge_uni_id = np.unique(ge_str, return_index=True)
    ge_unique = ge_str.copy()
    for i in ge_uni_id:
        idi = np.where(ge_str == ge_str.iloc[i])[0]
        ge_unique.iloc[idi] = ge.index[i]
    ge_map = pd.DataFrame({'ID': ge.index.values, 'UniqueID': ge_unique.values})
    ge_unique_data = ge.iloc[ge_uni_id, :]
    return ge_map, ge_unique_data
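
# Illustrative sketch (hypothetical values): rows 's1' and 's2' are identical, so both map to the
# unique ID 's1'.
#
#     ge = pd.DataFrame([[1, 2], [1, 2], [3, 4]], index=['s1', 's2', 's3'])
#     ge_map, ge_unique_data = generate_unique_id_mapping(ge)
#     # ge_map['UniqueID'] is ['s1', 's1', 's3']; ge_unique_data keeps rows 's1' and 's3'.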


def min_max_transform(data):
    '''
    This function does a linear transformation of each feature, so that the minimum and maximum values of a
    feature are 0 and 1, respectively.

    Input:
    data: an input data array with a size of [n_sample, n_feature]
    Return:
    norm_data: the data array after transformation
    '''
    norm_data = np.empty(data.shape)
    norm_data.fill(np.nan)
    for i in range(data.shape[1]):
        v = data[:, i].copy()
        if np.max(v) == np.min(v):
            # A constant feature carries no information; map it to all zeros.
            norm_data[:, i] = 0
        else:
            v = (v - np.min(v)) / (np.max(v) - np.min(v))
            norm_data[:, i] = v
    return norm_data
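
# Illustrative sketch (hypothetical values): each feature is rescaled to [0, 1] independently;
# a constant feature is mapped to all zeros.
#
#     d = np.array([[1.0, 10.0], [2.0, 10.0], [3.0, 10.0]])
#     min_max_transform(d)    # column 0 becomes [0.0, 0.5, 1.0]; column 1 becomes [0.0, 0.0, 0.0]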


def select_features_by_variation(data, variation_measure='var', threshold=None, num=None, draw_histogram=False,
                                 bins=100, log=False):
    '''
    This function evaluates the variation of individual features and returns the indices of features with large
    variation. Missing values are ignored when evaluating variation.

    Parameters:
    -----------
    data: numpy array or pandas data frame of numeric values, with a shape of [n_samples, n_features].
    variation_measure: string indicating the metric used for evaluating feature variation. 'var' indicates variance;
        'std' indicates standard deviation; 'mad' indicates median absolute deviation. Default is 'var'.
    threshold: float. Features with a variation larger than threshold will be selected. Default is None.
    num: positive integer. The number of features to be selected based on variation. The number of selected
        features will be the smaller of num and the total number of features with non-missing variation.
        Default is None. threshold and num cannot both take values, i.e. they cannot be used simultaneously.
    draw_histogram: boolean, whether to draw a histogram of feature variations. Default is False.
    bins: positive integer, the number of bins in the histogram. Default is 100. The actual number of bins used
        is the smaller of bins and the number of features with non-missing variation.
    log: boolean, indicating whether the histogram should be drawn on a log scale.

    Returns:
    --------
    indices: 1-D numpy array containing the indices of selected features. If both threshold and
        num are None, indices will be an empty array.
    '''
    if isinstance(data, pd.DataFrame):
        data = data.values
    elif not isinstance(data, np.ndarray):
        print('Input data must be a numpy array or pandas data frame')
        sys.exit(1)

    if variation_measure == 'std':
        v_all = np.nanstd(a=data, axis=0)
    elif variation_measure == 'mad':
        v_all = median_absolute_deviation(data=data, axis=0, ignore_nan=True)
    else:
        v_all = np.nanvar(a=data, axis=0)

    indices = np.where(np.invert(np.isnan(v_all)))[0]
    v = v_all[indices]

    if draw_histogram:
        if len(v) < 50:
            print('There must be at least 50 features with variation measures to draw a histogram')
        else:
            bins = int(min(bins, len(v)))
            _ = plt.hist(v, bins=bins, log=log)
            plt.show()

    if threshold is None and num is None:
        return np.array([])
    elif threshold is not None and num is not None:
        print('threshold and num cannot be used simultaneously. Only one of them can take a real value')
        sys.exit(1)

    if threshold is not None:
        indices = indices[np.where(v > threshold)[0]]
    else:
        n_f = int(min(num, len(v)))
        indices = indices[np.argsort(-v)[:n_f]]

    indices = np.sort(indices)

    return indices
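
# Illustrative sketch (hypothetical values): only the middle feature varies, so it is the only
# one whose variance exceeds the threshold.
#
#     d = np.array([[0.0, 1.0, 5.0], [0.0, 3.0, 5.0], [0.0, 5.0, 5.0]])
#     select_features_by_variation(d, variation_measure='var', threshold=0.5)    # returns array([1])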


def generate_feature_distance_ranking(data, method='Pearson'):
    '''
    This function generates ranking of distances/dissimilarities between features for tabular data.

    Input:
    data: input data, n_sample by n_feature
    method: 'Pearson' uses the Pearson correlation coefficient to evaluate the similarity between features;
        'Spearman' uses the Spearman correlation coefficient to evaluate the similarity between features;
        'Euclidean' uses the Euclidean distance between features, rescaled into a similarity;
        'set' uses the Jaccard index to evaluate the similarity between features that are binary variables.

    Return:
    ranking: symmetric ranking matrix based on dissimilarity
    corr: matrix of dissimilarities between features
    '''
    num = data.shape[1]
    if method == 'Pearson':
        corr = np.corrcoef(np.transpose(data))
    elif method == 'Spearman':
        corr = spearmanr(data).correlation
    elif method == 'Euclidean':
        corr = squareform(pdist(np.transpose(data), metric='euclidean'))
        corr = np.max(corr) - corr
        corr = corr / np.max(corr)
    elif method == 'set':
        # Set-operation-based similarity (Jaccard index). It does not tolerate all-zero features.
        corr1 = np.dot(np.transpose(data), data)
        corr2 = data.shape[0] - np.dot(np.transpose(1 - data), 1 - data)
        corr = corr1 / corr2

    # Convert similarity into dissimilarity, then rank the lower-triangular entries.
    corr = 1 - corr
    corr = np.around(a=corr, decimals=10)

    tril_id = np.tril_indices(num, k=-1)
    rank = rankdata(corr[tril_id])
    ranking = np.zeros((num, num))
    ranking[tril_id] = rank
    ranking = ranking + np.transpose(ranking)

    return ranking, corr
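
# Illustrative sketch (hypothetical values): for 8 features there are 8 * 7 / 2 = 28 feature pairs,
# so the off-diagonal entries of the ranking matrix take values 1 to 28 (ties averaged).
#
#     d = np.random.default_rng(0).random((50, 8))
#     ranking, dissim = generate_feature_distance_ranking(d, method='Pearson')
#     # ranking is an 8 x 8 symmetric matrix with a zero diagonal.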


def generate_matrix_distance_ranking(num_r, num_c, method='Euclidean'):
    '''
    This function calculates the ranking of distances between all pairs of entries in a matrix of size num_r by num_c.

    Input:
    num_r: number of rows in the matrix
    num_c: number of columns in the matrix
    method: method used to calculate distance. Can be 'Euclidean' or 'Manhattan'.

    Return:
    coordinate: a tuple of two arrays of length num_r * num_c, giving the row and column coordinates of
        the elements in the matrix.
    ranking: a num_r * num_c by num_r * num_c matrix giving the ranking of pairwise distances.
    '''
    # generate the coordinates of elements in a matrix, row by row
    for r in range(num_r):
        if r == 0:
            coordinate = np.transpose(np.vstack((np.zeros(num_c), range(num_c))))
        else:
            coordinate = np.vstack((coordinate, np.transpose(np.vstack((np.ones(num_c) * r, range(num_c))))))

    # calculate the closeness of the elements
    num = num_r * num_c
    cord_dist = np.zeros((num, num))
    if method == 'Euclidean':
        for i in range(num):
            cord_dist[i, :] = np.sqrt(np.square(coordinate[i, 0] * np.ones(num) - coordinate[:, 0]) +
                                      np.square(coordinate[i, 1] * np.ones(num) - coordinate[:, 1]))
    elif method == 'Manhattan':
        for i in range(num):
            cord_dist[i, :] = np.abs(coordinate[i, 0] * np.ones(num) - coordinate[:, 0]) + \
                              np.abs(coordinate[i, 1] * np.ones(num) - coordinate[:, 1])

    # generate the ranking based on distance
    tril_id = np.tril_indices(num, k=-1)
    rank = rankdata(cord_dist[tril_id])
    ranking = np.zeros((num, num))
    ranking[tril_id] = rank
    ranking = ranking + np.transpose(ranking)

    coordinate = np.int64(coordinate)
    return (coordinate[:, 0], coordinate[:, 1]), ranking
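
# Illustrative sketch: rank the pairwise pixel distances of a 3 x 3 image. The coordinates list the
# pixels row by row, matching how features are later placed into the image.
#
#     (row_id, col_id), ranking = generate_matrix_distance_ranking(num_r=3, num_c=3, method='Euclidean')
#     # row_id and col_id each have 9 entries; ranking is a 9 x 9 symmetric matrix with a zero diagonal.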


def IGTD_absolute_error(source, target, max_step=1000, switch_t=0, val_step=50, min_gain=0.00001, random_state=1,
                        save_folder=None, file_name=''):
    '''
    This function switches the order of rows (columns) in the source ranking matrix to make it similar to the target
    ranking matrix. In each step, the algorithm randomly picks a row that has not been switched with others for
    the longest time, checks all possible switches of this row, and selects the switch that reduces the
    dissimilarity the most. Dissimilarity (i.e. the error) is the summation of absolute differences of
    lower-triangular elements between the rearranged source ranking matrix and the target ranking matrix.

    Input:
    source: a symmetric ranking matrix with zero diagonal elements.
    target: a symmetric ranking matrix with zero diagonal elements. 'source' and 'target' should have the same size.
    max_step: the maximum number of steps that the algorithm runs if it never converges.
    switch_t: the threshold to determine whether a switch should happen.
    val_step: number of steps for checking gain on the objective function to determine convergence.
    min_gain: if the objective function is not improved by more than 'min_gain' in 'val_step' steps,
        the algorithm terminates.
    random_state: random seed used when picking rows.
    save_folder: a folder path to save the result files generated in the optimization process.
        If None, no files are saved.
    file_name: a string used as part of the file names for saving results.

    Return:
    index_record: indices to rearrange the rows (columns) in source, obtained in the optimization process
    err_record: error obtained in the optimization process
    run_time: the time at which each step is completed in the optimization process
    '''
    # Seed the global random number generator, so that the row picked in each step is reproducible.
    np.random.seed(random_state)

    if save_folder is not None:
        if os.path.exists(save_folder):
            shutil.rmtree(save_folder)
        os.mkdir(save_folder)

    source = source.copy()
    num = source.shape[0]
    tril_id = np.tril_indices(num, k=-1)
    index = np.array(range(num))
    index_record = np.empty((max_step + 1, num))
    index_record.fill(np.nan)
    index_record[0, :] = index.copy()

    # calculate the error associated with each row
    err_v = np.empty(num)
    err_v.fill(np.nan)
    for i in range(num):
        err_v[i] = np.sum(np.abs(source[i, 0:i] - target[i, 0:i])) + \
                   np.sum(np.abs(source[(i + 1):, i] - target[(i + 1):, i]))

    step_record = -np.ones(num)
    err_record = [np.sum(abs(source[tril_id] - target[tril_id]))]
    pre_err = err_record[0]
    t1 = time.time()
    run_time = [0]

    for s in range(max_step):
        delta = np.ones(num) * np.inf

        # randomly pick a row that has not been considered for the longest time
        idr = np.where(step_record == np.min(step_record))[0]
        ii = idr[np.random.permutation(len(idr))[0]]

        for jj in range(num):
            if jj == ii:
                continue

            if ii < jj:
                i = ii
                j = jj
            else:
                i = jj
                j = ii

            err_ori = err_v[i] + err_v[j] - np.abs(source[j, i] - target[j, i])

            err_i = np.sum(np.abs(source[j, :i] - target[i, :i])) + \
                    np.sum(np.abs(source[(i + 1):j, j] - target[(i + 1):j, i])) + \
                    np.sum(np.abs(source[(j + 1):, j] - target[(j + 1):, i])) + np.abs(source[i, j] - target[j, i])
            err_j = np.sum(np.abs(source[i, :i] - target[j, :i])) + \
                    np.sum(np.abs(source[i, (i + 1):j] - target[j, (i + 1):j])) + \
                    np.sum(np.abs(source[(j + 1):, i] - target[(j + 1):, j])) + np.abs(source[i, j] - target[j, i])
            err_test = err_i + err_j - np.abs(source[i, j] - target[j, i])

            delta[jj] = err_test - err_ori

        delta_norm = delta / pre_err
        id = np.where(delta_norm <= switch_t)[0]
        if len(id) > 0:
            jj = np.argmin(delta)

            # Update the error associated with each row
            if ii < jj:
                i = ii
                j = jj
            else:
                i = jj
                j = ii
            for k in range(num):
                if k < i:
                    err_v[k] = err_v[k] - np.abs(source[i, k] - target[i, k]) - np.abs(source[j, k] - target[j, k]) + \
                               np.abs(source[j, k] - target[i, k]) + np.abs(source[i, k] - target[j, k])
                elif k == i:
                    err_v[k] = np.sum(np.abs(source[j, :i] - target[i, :i])) + \
                               np.sum(np.abs(source[(i + 1):j, j] - target[(i + 1):j, i])) + \
                               np.sum(np.abs(source[(j + 1):, j] - target[(j + 1):, i])) + \
                               np.abs(source[i, j] - target[j, i])
                elif k < j:
                    err_v[k] = err_v[k] - np.abs(source[k, i] - target[k, i]) - np.abs(source[j, k] - target[j, k]) + \
                               np.abs(source[k, j] - target[k, i]) + np.abs(source[i, k] - target[j, k])
                elif k == j:
                    err_v[k] = np.sum(np.abs(source[i, :i] - target[j, :i])) + \
                               np.sum(np.abs(source[i, (i + 1):j] - target[j, (i + 1):j])) + \
                               np.sum(np.abs(source[(j + 1):, i] - target[(j + 1):, j])) + \
                               np.abs(source[i, j] - target[j, i])
                else:
                    err_v[k] = err_v[k] - np.abs(source[k, i] - target[k, i]) - np.abs(source[k, j] - target[k, j]) + \
                               np.abs(source[k, j] - target[k, i]) + np.abs(source[k, i] - target[k, j])

            # switch rows i and j
            ii_v = source[ii, :].copy()
            jj_v = source[jj, :].copy()
            source[ii, :] = jj_v
            source[jj, :] = ii_v
            ii_v = source[:, ii].copy()
            jj_v = source[:, jj].copy()
            source[:, ii] = jj_v
            source[:, jj] = ii_v
            err = delta[jj] + pre_err

            # update rearrange index
            t = index[ii]
            index[ii] = index[jj]
            index[jj] = t

            # update step record
            step_record[ii] = s
            step_record[jj] = s
        else:
            # error is not changed due to no switch
            err = pre_err
            # update step record
            step_record[ii] = s

        err_record.append(err)
        print('Step ' + str(s) + ' err: ' + str(err))
        index_record[s + 1, :] = index.copy()
        run_time.append(time.time() - t1)

        if s > val_step:
            if np.sum((err_record[-val_step - 1] - np.array(err_record[(-val_step):])) /
                      err_record[-val_step - 1] >= min_gain) == 0:
                break

        pre_err = err

    index_record = index_record[:len(err_record), :].astype(int)
    if save_folder is not None:
        pd.DataFrame(index_record).to_csv(save_folder + '/' + file_name + '_index.txt', header=False, index=False,
                                          sep='\t', lineterminator='\r\n')
        pd.DataFrame(np.transpose(np.vstack((err_record, np.array(range(s + 2))))),
                     columns=['error', 'steps']).to_csv(save_folder + '/' + file_name + '_error_and_step.txt',
                                                        header=True, index=False, sep='\t', lineterminator='\r\n')
        pd.DataFrame(np.transpose(np.vstack((err_record, run_time))), columns=['error', 'run_time']).to_csv(
            save_folder + '/' + file_name + '_error_and_time.txt', header=True, index=False, sep='\t',
            lineterminator='\r\n')

    return index_record, err_record, run_time


def IGTD_square_error(source, target, max_step=1000, switch_t=0, val_step=50, min_gain=0.00001, random_state=1,
                      save_folder=None, file_name=''):
    '''
    This function switches the order of rows (columns) in the source ranking matrix to make it similar to the target
    ranking matrix. In each step, the algorithm randomly picks a row that has not been switched with others for
    the longest time, checks all possible switches of this row, and selects the switch that reduces the
    dissimilarity the most. Dissimilarity (i.e. the error) is the summation of squared differences of
    lower-triangular elements between the rearranged source ranking matrix and the target ranking matrix.

    Input:
    source: a symmetric ranking matrix with zero diagonal elements.
    target: a symmetric ranking matrix with zero diagonal elements. 'source' and 'target' should have the same size.
    max_step: the maximum number of steps that the algorithm runs if it never converges.
    switch_t: the threshold to determine whether a switch should happen.
    val_step: number of steps for checking gain on the objective function to determine convergence.
    min_gain: if the objective function is not improved by more than 'min_gain' in 'val_step' steps,
        the algorithm terminates.
    random_state: random seed used when picking rows.
    save_folder: a folder path to save the result files generated in the optimization process.
        If None, no files are saved.
    file_name: a string used as part of the file names for saving results.

    Return:
    index_record: ordering indices to rearrange the rows (columns) in 'source', obtained in the optimization process
    err_record: the error history in the optimization process
    run_time: the time at which each step is finished in the optimization process
    '''
    # Seed the global random number generator, so that the row picked in each step is reproducible.
    np.random.seed(random_state)

    if save_folder is not None:
        if os.path.exists(save_folder):
            shutil.rmtree(save_folder)
        os.mkdir(save_folder)

    source = source.copy()
    num = source.shape[0]
    tril_id = np.tril_indices(num, k=-1)
    index = np.array(range(num))
    index_record = np.empty((max_step + 1, num))
    index_record.fill(np.nan)
    index_record[0, :] = index.copy()

    # calculate the error associated with each row
    err_v = np.empty(num)
    err_v.fill(np.nan)
    for i in range(num):
        err_v[i] = np.sum(np.square(source[i, 0:i] - target[i, 0:i])) + \
                   np.sum(np.square(source[(i + 1):, i] - target[(i + 1):, i]))

    step_record = -np.ones(num)
    err_record = [np.sum(np.square(source[tril_id] - target[tril_id]))]
    pre_err = err_record[0]
    t1 = time.time()
    run_time = [0]

    for s in range(max_step):
        delta = np.ones(num) * np.inf

        # randomly pick a row that has not been considered for the longest time
        idr = np.where(step_record == np.min(step_record))[0]
        ii = idr[np.random.permutation(len(idr))[0]]

        for jj in range(num):
            if jj == ii:
                continue

            if ii < jj:
                i = ii
                j = jj
            else:
                i = jj
                j = ii

            err_ori = err_v[i] + err_v[j] - np.square(source[j, i] - target[j, i])

            err_i = np.sum(np.square(source[j, :i] - target[i, :i])) + \
                    np.sum(np.square(source[(i + 1):j, j] - target[(i + 1):j, i])) + \
                    np.sum(np.square(source[(j + 1):, j] - target[(j + 1):, i])) + \
                    np.square(source[i, j] - target[j, i])
            err_j = np.sum(np.square(source[i, :i] - target[j, :i])) + \
                    np.sum(np.square(source[i, (i + 1):j] - target[j, (i + 1):j])) + \
                    np.sum(np.square(source[(j + 1):, i] - target[(j + 1):, j])) + \
                    np.square(source[i, j] - target[j, i])
            err_test = err_i + err_j - np.square(source[i, j] - target[j, i])

            delta[jj] = err_test - err_ori

        delta_norm = delta / pre_err
        id = np.where(delta_norm <= switch_t)[0]
        if len(id) > 0:
            jj = np.argmin(delta)

            # Update the error associated with each row
            if ii < jj:
                i = ii
                j = jj
            else:
                i = jj
                j = ii
            for k in range(num):
                if k < i:
                    err_v[k] = err_v[k] - np.square(source[i, k] - target[i, k]) - \
                               np.square(source[j, k] - target[j, k]) + \
                               np.square(source[j, k] - target[i, k]) + np.square(source[i, k] - target[j, k])
                elif k == i:
                    err_v[k] = np.sum(np.square(source[j, :i] - target[i, :i])) + \
                               np.sum(np.square(source[(i + 1):j, j] - target[(i + 1):j, i])) + \
                               np.sum(np.square(source[(j + 1):, j] - target[(j + 1):, i])) + \
                               np.square(source[i, j] - target[j, i])
                elif k < j:
                    err_v[k] = err_v[k] - np.square(source[k, i] - target[k, i]) - \
                               np.square(source[j, k] - target[j, k]) + \
                               np.square(source[k, j] - target[k, i]) + np.square(source[i, k] - target[j, k])
                elif k == j:
                    err_v[k] = np.sum(np.square(source[i, :i] - target[j, :i])) + \
                               np.sum(np.square(source[i, (i + 1):j] - target[j, (i + 1):j])) + \
                               np.sum(np.square(source[(j + 1):, i] - target[(j + 1):, j])) + \
                               np.square(source[i, j] - target[j, i])
                else:
                    err_v[k] = err_v[k] - np.square(source[k, i] - target[k, i]) - \
                               np.square(source[k, j] - target[k, j]) + \
                               np.square(source[k, j] - target[k, i]) + np.square(source[k, i] - target[k, j])

            # switch rows i and j
            ii_v = source[ii, :].copy()
            jj_v = source[jj, :].copy()
            source[ii, :] = jj_v
            source[jj, :] = ii_v
            ii_v = source[:, ii].copy()
            jj_v = source[:, jj].copy()
            source[:, ii] = jj_v
            source[:, jj] = ii_v
            err = delta[jj] + pre_err

            # update rearrange index
            t = index[ii]
            index[ii] = index[jj]
            index[jj] = t

            # update step record
            step_record[ii] = s
            step_record[jj] = s
        else:
            # error is not changed due to no switch
            err = pre_err
            # update step record
            step_record[ii] = s

        err_record.append(err)
        print('Step ' + str(s) + ' err: ' + str(err))
        index_record[s + 1, :] = index.copy()
        run_time.append(time.time() - t1)

        if s > val_step:
            if np.sum((err_record[-val_step - 1] - np.array(err_record[(-val_step):])) /
                      err_record[-val_step - 1] >= min_gain) == 0:
                break

        pre_err = err

    index_record = index_record[:len(err_record), :].astype(int)
    if save_folder is not None:
        pd.DataFrame(index_record).to_csv(save_folder + '/' + file_name + '_index.txt', header=False, index=False,
                                          sep='\t', lineterminator='\r\n')
        pd.DataFrame(np.transpose(np.vstack((err_record, np.array(range(s + 2))))),
                     columns=['error', 'steps']).to_csv(save_folder + '/' + file_name + '_error_and_step.txt',
                                                        header=True, index=False, sep='\t', lineterminator='\r\n')
        pd.DataFrame(np.transpose(np.vstack((err_record, run_time))), columns=['error', 'run_time']).to_csv(
            save_folder + '/' + file_name + '_error_and_time.txt', header=True, index=False, sep='\t',
            lineterminator='\r\n')

    return index_record, err_record, run_time


def IGTD(source, target, err_measure='abs', max_step=1000, switch_t=0, val_step=50, min_gain=0.00001, random_state=1,
         save_folder=None, file_name=''):
    '''
    This is a wrapper function that dispatches to one of the two search functions according to the error measure.
    '''
    if err_measure == 'abs':
        index_record, err_record, run_time = IGTD_absolute_error(
            source=source, target=target, max_step=max_step, switch_t=switch_t, val_step=val_step,
            min_gain=min_gain, random_state=random_state, save_folder=save_folder, file_name=file_name)
    elif err_measure == 'squared':
        index_record, err_record, run_time = IGTD_square_error(
            source=source, target=target, max_step=max_step, switch_t=switch_t, val_step=val_step,
            min_gain=min_gain, random_state=random_state, save_folder=save_folder, file_name=file_name)
    else:
        raise ValueError("err_measure must be either 'abs' or 'squared'")
    return index_record, err_record, run_time
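
# Illustrative sketch (hypothetical values): rearrange 25 features to fit a 5 x 5 image. The source
# and target ranking matrices must have the same size, i.e. the number of features must equal the
# number of pixels.
#
#     d = np.random.default_rng(0).random((100, 25))
#     fea_rank, _ = generate_feature_distance_ranking(d, method='Pearson')
#     coord, img_rank = generate_matrix_distance_ranking(num_r=5, num_c=5, method='Euclidean')
#     idx_rec, err_rec, t_rec = IGTD(source=fea_rank, target=img_rank, err_measure='abs',
#                                    max_step=1000, val_step=50, save_folder=None)
#     best = idx_rec[np.argmin(err_rec), :]    # feature ordering with the smallest error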


def generate_image_data(data, index, num_row, num_column, coord, image_folder=None, file_name=''):
    '''
    This function generates the data in image format according to rearrangement indices. It saves the data
    sample-by-sample in both txt files and image files.

    Input:
    data: original tabular data, 2D array or data frame, n_samples by n_features
    index: indices of features obtained through optimization, according to which the features can be
        arranged into a num_row by num_column image.
    num_row: number of rows in the image
    num_column: number of columns in the image
    coord: coordinates of features in the image/matrix
    image_folder: directory to save the image and txt data files. If None, no data file is saved.
    file_name: a string as a part of the file names to save data

    Return:
    image_data: the generated data, a 3D numpy array. The third dimension is across samples. The range of values
        is [0, 255]. Small values actually indicate high values in the original data.
    samples: the names or indices of the samples
    '''
    if isinstance(data, pd.DataFrame):
        samples = data.index.map(str)
        data = data.values
    else:
        samples = [str(i) for i in range(data.shape[0])]

    if image_folder is not None:
        if os.path.exists(image_folder):
            shutil.rmtree(image_folder)
        os.mkdir(image_folder)

    data_2 = data.copy()
    data_2 = data_2[:, index]
    max_v = np.max(data_2)
    min_v = np.min(data_2)
    data_2 = 255 - (data_2 - min_v) / (max_v - min_v) * 255  # So that black means high value

    image_data = np.empty((num_row, num_column, data_2.shape[0]))
    image_data.fill(np.nan)
    for i in range(data_2.shape[0]):
        data_i = np.empty((num_row, num_column))
        data_i.fill(np.nan)
        data_i[coord] = data_2[i, :]
        image_data[:, :, i] = data_i
        if image_folder is not None:
            fig = plt.figure()
            plt.imshow(data_i, cmap='gray', vmin=0, vmax=255)
            plt.axis('scaled')
            plt.savefig(fname=image_folder + '/' + file_name + '_' + samples[i] + '_image.png',
                        bbox_inches='tight', pad_inches=0)
            plt.close(fig)
            pd.DataFrame(image_data[:, :, i], index=None, columns=None).to_csv(
                image_folder + '/' + file_name + '_' + samples[i] + '_data.txt',
                header=None, index=None, sep='\t', lineterminator='\r\n')

    return image_data, samples
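
# Illustrative sketch (hypothetical values): arrange a 2-sample, 6-feature table into 2 x 3 images,
# using an identity feature ordering and row-major pixel coordinates, without saving any files.
#
#     tab = np.random.default_rng(0).random((2, 6))
#     coord, _ = generate_matrix_distance_ranking(num_r=2, num_c=3)
#     img, names = generate_image_data(tab, index=np.arange(6), num_row=2, num_column=3,
#                                      coord=coord, image_folder=None)
#     # img.shape == (2, 3, 2); names == ['0', '1']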


def table_to_image(norm_d, scale, fea_dist_method, image_dist_method, save_image_size, max_step, val_step, normDir,
                   error, switch_t=0, min_gain=0.00001):
    '''
    This function converts tabular data into images using the IGTD algorithm.

    Input:
    norm_d: a 2D array or data frame, which is the tabular data. Its size is n_samples by n_features.
    scale: a list of two positive integers. The number of pixel rows and columns in the image representations,
        into which the tabular data will be converted.
    fea_dist_method: a string indicating the method used for calculating the pairwise distances between features,
        for which there are four options.
        'Pearson' uses the Pearson correlation coefficient to evaluate the similarity between features.
        'Spearman' uses the Spearman correlation coefficient to evaluate the similarity between features.
        'Euclidean' uses the Euclidean distance to evaluate the dissimilarity between features.
        'set' uses the Jaccard index to evaluate the similarity between features that are binary variables.
    image_dist_method: a string indicating the method used for calculating the distances between pixels in the image.
        It can be either 'Euclidean' or 'Manhattan'.
    save_image_size: size of images (in inches) for saving visual results.
    max_step: the maximum number of iterations that the IGTD algorithm will run if it never converges.
    val_step: the number of iterations for determining algorithm convergence. If the error reduction rate is
        smaller than min_gain for val_step iterations, the algorithm converges.
    normDir: a string indicating the directory to save result files.
    error: a string indicating the function used to evaluate the difference between the feature distance ranking
        and the pixel distance ranking. 'abs' indicates the absolute function. 'squared' indicates the square function.
    switch_t: the threshold on the error change rate. The error change rate is
        (error after feature swapping - error before feature swapping) / error before feature swapping.
        In each iteration, if the smallest error change rate resulting from all possible feature swappings
        is not larger than switch_t, the feature swapping with the smallest error change rate is performed.
        If switch_t <= 0, the IGTD algorithm monotonically reduces the error during optimization.
    min_gain: if the error reduction rate is not larger than min_gain for val_step iterations, the algorithm converges.

    Return:
    This function does not return any variable, but saves multiple result files, which are the following:
    1. Results.pkl stores the original tabular data, the generated image data, and the names of samples.
       The generated image data is a 3D numpy array. Its size is [number of pixel rows in image,
       number of pixel columns in image, number of samples]. The range of values is [0, 255]. Small values
       in the array actually correspond to high values in the tabular data.
    2. Results_Auxiliary.pkl stores the ranking matrix of pairwise feature distances before optimization,
       the ranking matrix of pairwise pixel distances, the coordinates of pixels when concatenating pixels
       row by row from the image to form the pixel distance ranking matrix, the error in each iteration,
       and the time (in seconds) when completing each iteration.
    3. original_feature_ranking.png shows the feature distance ranking matrix before optimization.
    4. image_ranking.png shows the pixel distance ranking matrix.
    5. error_and_runtime.png shows the change of error vs. time during the optimization process.
    6. error_and_iteration.png shows the change of error vs. iteration during the optimization process.
    7. optimized_feature_ranking.png shows the feature distance ranking matrix after optimization.
    8. The data folder includes two image data files for each sample. The txt file is the image data in
       matrix format. The png file shows the visualization of the image data.
    '''
    if os.path.exists(normDir):
        shutil.rmtree(normDir)
    os.mkdir(normDir)

    ranking_feature, corr = generate_feature_distance_ranking(data=norm_d, method=fea_dist_method)
    fig = plt.figure(figsize=(save_image_size, save_image_size))
    plt.imshow(np.max(ranking_feature) - ranking_feature, cmap='gray', interpolation='nearest')
    plt.savefig(fname=normDir + '/original_feature_ranking.png', bbox_inches='tight', pad_inches=0)
    plt.close(fig)

    coordinate, ranking_image = generate_matrix_distance_ranking(num_r=scale[0], num_c=scale[1],
                                                                 method=image_dist_method)
    fig = plt.figure(figsize=(save_image_size, save_image_size))
    plt.imshow(np.max(ranking_image) - ranking_image, cmap='gray', interpolation='nearest')
    plt.savefig(fname=normDir + '/image_ranking.png', bbox_inches='tight', pad_inches=0)
    plt.close(fig)

    # run_time is a local list of per-iteration timestamps; it is named so that it does not shadow
    # the time module.
    index, err, run_time = IGTD(source=ranking_feature, target=ranking_image, err_measure=error,
                                max_step=max_step, switch_t=switch_t, val_step=val_step, min_gain=min_gain,
                                random_state=1, save_folder=normDir + '/' + error, file_name='')

    fig = plt.figure()
    plt.plot(run_time, err)
    plt.savefig(fname=normDir + '/error_and_runtime.png', bbox_inches='tight', pad_inches=0)
    plt.close(fig)
    fig = plt.figure()
    plt.plot(range(len(err)), err)
    plt.savefig(fname=normDir + '/error_and_iteration.png', bbox_inches='tight', pad_inches=0)
    plt.close(fig)

    min_id = np.argmin(err)
    ranking_feature_random = ranking_feature[index[min_id, :], :]
    ranking_feature_random = ranking_feature_random[:, index[min_id, :]]
    fig = plt.figure(figsize=(save_image_size, save_image_size))
    plt.imshow(np.max(ranking_feature_random) - ranking_feature_random, cmap='gray',
               interpolation='nearest')
    plt.savefig(fname=normDir + '/optimized_feature_ranking.png', bbox_inches='tight', pad_inches=0)
    plt.close(fig)

    data, samples = generate_image_data(data=norm_d, index=index[min_id, :], num_row=scale[0], num_column=scale[1],
                                        coord=coordinate, image_folder=normDir + '/data', file_name='')

    output = open(normDir + '/Results.pkl', 'wb')
    cp.dump(norm_d, output)
    cp.dump(data, output)
    cp.dump(samples, output)
    output.close()

    output = open(normDir + '/Results_Auxiliary.pkl', 'wb')
    cp.dump(ranking_feature, output)
    cp.dump(ranking_image, output)
    cp.dump(coordinate, output)
    cp.dump(err, output)
    cp.dump(run_time, output)
    output.close()
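
# Illustrative end-to-end sketch (the file and directory names are hypothetical, and the table is
# assumed to have at least 400 numeric features): select the most variable features, min-max
# normalize them, and convert the table into 20 x 20 images. The number of selected features must
# equal scale[0] * scale[1].
#
#     data = pd.read_csv('tabular_data.txt', sep='\t', header=0, index_col=0)
#     fid = select_features_by_variation(data, variation_measure='var', num=400)
#     data = data.iloc[:, fid]
#     norm_data = pd.DataFrame(min_max_transform(data.values), columns=data.columns, index=data.index)
#     table_to_image(norm_data, scale=[20, 20], fea_dist_method='Euclidean', image_dist_method='Euclidean',
#                    save_image_size=9, max_step=10000, val_step=300, normDir='IGTD_Results',
#                    error='abs', switch_t=0, min_gain=0.00001)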