-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathexperiment.py
272 lines (232 loc) · 9.85 KB
/
experiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
# __author__ = 'WeiFu'
from __future__ import division, print_function
import time
from hdp import *
import pickle
def readMatch(src="./result/source_target_match.txt"):
    """
    Parse saved source/target match records from a text file.

    Only the first line of the file is used. It is split on "}, {" and every
    piece is sliced by the positions of the field names it contains
    (attr_source, attr_target, group, score, source_src, target_src).

    :param src: path of the text file that holds the match records
    :type string
    :return result: one record per match, carrying score, attr_source,
                    attr_target, source_src and target_name
                    (empty list when the first line is blank)
    :type list of class o instances
    """

    def getStrip(lst):
        # keep only the text between the first and the last single quote
        return [one[one.index("'") + 1:one.rindex("'")] for one in lst]

    def cut(text, key, skip, end_key=None, trim=0):
        # text that follows `key` (after `skip` more characters), ending
        # `trim` characters before `end_key`, or at the end of `text`
        start = text.index(key) + len(key) + skip
        if end_key is None:
            return text[start:]
        return text[start:text.index(end_key) - trim]

    # the context manager closes the file even when parsing fails later on
    with open(src, "r") as f:
        first_line = f.readline()
    if not first_line.strip():
        return []

    result = []
    for each in first_line.split("}, {"):
        attr_source = getStrip(cut(each, "attr_source", 2, "attr_target", 3).split(","))
        attr_target = getStrip(cut(each, "attr_target", 2, "group", 3).split(","))
        score = float(cut(each, "score", 2, "source_src", 2))
        source_src = cut(each, "source_src", 1, "target_src", 2)
        target_src = cut(each, "target_src", 1)
        # the parsed target path is stored under the name `target_name`
        # NOTE(review): `o` is not defined in this file; presumably it comes
        # from `from hdp import *` - confirm
        result.append(o(score=score, attr_source=attr_source, attr_target=attr_target,
                        source_src=source_src, target_name=target_src))
    return result
def getMedian(lst):
    """
    Return the median of an already sorted sequence, rounded to 3 decimals.

    :param lst: sorted numeric values
    :type list
    :return: the middle value for an odd length, otherwise the mean of the
             two middle values
    :type float
    """
    middle = len(lst) // 2
    if len(lst) % 2 == 0:
        # even length: average the two values around the centre
        return round((lst[middle - 1] + lst[middle]) / 2, 3)
    return round(lst[middle], 3)
def getIQR(lst):
    """
    Return the spread between the values at 3/4 and 1/4 of a sorted sequence.

    :param lst: sorted numeric values
    :type list
    :return: lst[3 * q] - lst[q], where q is a quarter of the length
             truncated to an integer
    :type number
    """
    quarter = int(len(lst) * 0.25)
    upper = lst[quarter * 3]
    lower = lst[quarter]
    return upper - lower
def process(source_target_match, target_name, out_hdp):
    """
    Reduce the hdp results of one target data set to a single value.

    :param source_target_match: all feasible source_data and target_data pairs
    :type list of class o instances
    :param target_name: name of the target data set
    :type string, e.g. 'ar1.arff'
    :param out_hdp: results of hdp for one data set in the experiments
    :type list of class o instances
    :return total_median: median over the per-source medians, used as the
                          final prediction; None when there is no result
    :type float
    e.g. out_hdp[0] {:id 695 :result [0.648] :source_src ./dataset/AEEEM/EQ.arff}
    """
    source_medians = []
    for pair in source_target_match:
        if pair.target_name != target_name:
            continue
        # gather every non-empty result that came from this pair's source
        scores = [run.result[0] for run in out_hdp
                  if run.source_src == pair.source_src and run.result != []]
        if not scores:
            continue
        source_medians.append(getMedian(sorted(scores)))
    if not source_medians:
        print("no results for ", target_name)
        return None
    return getMedian(sorted(source_medians))
def run1(source_target_match, option):
    """
    Run one repeat of the hdp experiment for every target data set.

    Each data set under ./dataset is randomized with 10 different seeds and
    split into two folds; hdp is called once per seed and the collected
    results are reduced to one value by process().

    :param source_target_match: all feasible source_data and target_data pairs
    :type list of class o instances
    :param option: specify which experiment is running
    :type list of string
    :return out : results of hdp experiments for all targets in one repeat.
    :type dict of list
    e.g. of source_target_match[0]:
                {:attr_source ['CountStmtExe', 'CountStmt', 'CountStmtDecl']
                 :attr_target ['ck_oo_lcom', 'ck_oo_numberOfLinesOfCode', 'ck_oo_rfc']
                 :group Relink
                 :id 60
                 :score 0.600689279613
                 :source_src ./dataset/Relink/safe.arff
                 :target_name EQ.arff}
    e.g. of out :
                {'camel-1.0': [0.654], 'skarbonka': [0.685].....}
    """
    import sys  # local import: only used to report a swallowed hdp failure

    out = {}
    original_src = "./dataset"
    datasrc = readsrc(original_src)
    for srclst in datasrc.values():
        for target_src in srclst:
            data = loadWekaData(target_src)
            out_hdp = []  # hdp results of all seeds for this target
            target_name = target_src[target_src.rindex("/") + 1:]
            for seed in range(10):
                randomized = filter(data, False, "", "weka.filters.unsupervised.instance.Randomize",
                                    ["-S", str(seed)])
                # N : numFolds, F: whichFold to keep, S: is the seed
                # NOTE(review): the returned folds are not used here; the
                # calls are kept because filter(..., True, name, ...)
                # presumably has side effects that hdp relies on - confirm
                filter(randomized, True, "train", "weka.filters.unsupervised.instance.RemoveFolds",
                       ["-N", "2", "-F", "1", "-S", "1"])
                filter(randomized, True, "test", "weka.filters.unsupervised.instance.RemoveFolds",
                       ["-N", "2", "-F", "2", "-S", "1"])
                try:
                    temp = hdp(option, target_name, source_target_match)
                except (KeyboardInterrupt, SystemExit):
                    # never swallow a user interrupt or an interpreter exit
                    raise
                except:
                    # deliberately broad: a failing repeat is skipped, whatever
                    # the error type, but the reason is no longer hidden
                    print("target", target_name)
                    print("hdp failed:", sys.exc_info()[1])
                    continue
                if len(temp) > 0:
                    out_hdp += temp
            dataset = target_src[target_src.rindex("/") + 1:-5]  # get name of data set
            result = process(source_target_match, target_name, out_hdp)
            out[dataset] = out.get(dataset, []) + [result]
    print(time.strftime("%a, %d %b %Y %H:%M:%S +0000"))
    return out
def printout(result_dict):
    """
    Print the result table: the "method" header row first, then one row per
    remaining key, in dictionary iteration order.

    :param result_dict: result rows keyed by target name plus a "method" row
    :type dict of list
    """
    rows = [result_dict["method"]]
    rows.extend(row for name, row in result_dict.items() if name != "method")
    printm(rows)
def repeat(KSanalyzer, original_src, option, iteration = 20):
    """
    Repeat the whole experiment and summarise every target's results.

    :param KSanalyzer: KSanalyzer function
    :type function
    :param original_src: the original src of data set, e.g : './dataset'
    :type basestring
    :param option: specify which experiment is running
    :type list of string
    :param iteration: set the repeats of the whole experiment.
    :type int
    :return result: [median, IQR] of all repeats for each target;
                    [None, None] when a target never produced a result.
    :type dict of list
    """

    def is_small(flag):
        # True when `flag` is present in option and directly followed by "S"
        if flag not in option:
            return False
        pos = option.index(flag) + 1
        return pos < len(option) and option[pos] == "S"

    collected = {}
    for _ in range(iteration):
        if option and (is_small("-S") or is_small("-T")):
            genSmall(option)  # generate small data sets
            # NOTE(review): the -EPV check is treated as nested inside the
            # small-data branch - confirm against the original layout
            if "-EPV" in option:
                source_dir = "./EPVSmalldataset"
            else:
                source_dir = "./Smalldataset"
            source_target_match = KSanalyzer(source_dir, "./Smalldataset", option)
        else:
            source_target_match = KSanalyzer(original_src, original_src, option)
        out = run1(source_target_match, option)
        for key, val in out.items():
            collected[key] = collected.get(key, []) + val

    result = {}
    for key, val in collected.items():
        # a repeat without any usable result contributes None; drop those so
        # the statistics are computed on real numbers only
        valid = sorted(v for v in val if v is not None)
        if valid:
            result[key] = [getMedian(valid), getIQR(valid)]
        else:
            result[key] = [None, None]
    return result
def addResult(result, title, new_option_result):
    """
    Append the columns of a new experiment to the result table.

    The table is updated in place and also returned.

    :param result: results of all targets in iteration repeats for old exp
    :type dict of list
    :param title: the column titles of this experiment
    :type list of string
    :param new_option_result: results of hdp with the specific option
    :type: dict of list
    :return result: results of all targets in iteration repeats for old and new exp.
    :type: dict of list, the target name is leading in each list
    e.g. of result
        {'ar3': ['ar3', 0.574, 0.823], 'skarbonka': ['skarbonka', 0.569, 0.694].....}
    """
    result["method"] = result["method"] + title
    # a target missing from the new results gets one None per new column, so
    # every row keeps the same width as the header instead of raising KeyError
    padding = [None] * len(title)
    for key in list(result):
        if key == "method":
            continue
        result[key] = result[key] + new_option_result.get(key, padding)
    return result
def run(original_src="./dataset", option=["-S", "S", "-T", "S","-N", 50]):
    """
    Run the baseline experiment plus one experiment per data-set size
    (50, 100, 150, 200), print the combined table and pickle it.

    :param original_src: the original src of data set, e.g : './dataset'
    :type basestring
    :param option: parameters to control the experiment (not modified)
    :type list
    return None
    e.g.
    option=["-S", "S", "-T", "S","-N", 50]
    "-S", "S": means the source is a small set
    "-T", "S": means the target is a small set
    "-N", 50: means the size of data set is 50
    option=["-S", "S", "-T", "S","-EPV",20,"-N", 50]
    if "-EPV" appears in the list, that means this will use the -EPV trick with N=20,
    only 20 defects included in the data.
    """
    # work on a copy: the "-N" value is overwritten below, and doing that on
    # the shared default list (or the caller's list) would leak between calls
    option = list(option)
    print(time.strftime("%a, %d %b %Y %H:%M:%S +0000"))
    # previously obtained scores: [target, WPDP, HDP-JC]
    out = {
        "EQ": ['EQ', 0.583, 0.782],
        "JDT": ['JDT', 0.795, 0.767],
        "LC": ['LC', 0.575, 0.655],
        "ML": ['ML', 0.734, 0.692],
        "PDE": ['PDE', 0.684, 0.693],
        "apache": ['apache', 0.714, 0.731],
        "safe": ['safe', 0.706, 0.837],
        "zxing": ['zxing', 0.605, 0.650],
        "ant-1.3": ['ant-1.3', 0.609, 0.800],
        "arc": ['arc', 0.670, 0.701],
        "camel-1.0": ['camel-1.0', 0.550, 0.639],
        "poi-1.5": ['poi-1.5', 0.707, 0.723],
        "redaktor": ['redaktor', 0.744, 0.528],
        "skarbonka": ['skarbonka', 0.569, 0.689],
        "tomcat": ['tomcat', 0.778, 0.737],
        "velocity-1.4": ['velocity-1.4', 0.725, 0.391],
        "xalan-2.4": ['xalan-2.4', 0.755, 0.673],
        "xerces-1.2": ['xerces-1.2', 0.624, 0.486],
        "JM1": ["JM1", 0.705, 0.688],
        "CM1": ['CM1', 0.653, 0.720],
        "MW1": ['MW1', 0.612, 0.745],
        "PC1": ['PC1', 0.787, 0.751],
        "PC2": ["PC2", 0.748, 0.893],
        "PC3": ['PC3', 0.794, 0.738],
        "PC4": ['PC4', 0.900, 0.681],
        "PC5": ["PC5", 0.954, 0.950],
        "KC3": ["KC3", 0.609, 0.678],
        "MC2": ["MC2", 0.646, 0.686],
        "ar1": ['ar1', 0.582, 0.736],
        "ar3": ['ar3', 0.574, 0.835],
        "MC1": ["MC1", 0.863, 0.893],
        "ar4": ['ar4', 0.657, 0.812],
        "ar5": ['ar5', 0.804, 0.911],
        "ar6": ['ar6', 0.654, 0.667],
        "method": ['Target', 'WPDP', 'HDP-JC'],
    }
    # baseline: empty option list
    out = addResult(out, ['HDP-Scipy', 'HDP-Scipy-IQR'], repeat(KSanalyzer, original_src, []))
    for num in range(50, 250, 50):
        title = ['N-' + str(num), 'N-' + str(num) + '-IQR']
        option[option.index("-N") + 1] = num
        out = addResult(out, title, repeat(KSanalyzer, original_src, option))
    # NOTE(review): printing once after all sizes are done - confirm that the
    # table was not meant to be printed after every size
    printout(out)
    # pickle results; the context manager flushes and closes the file
    with open("result20160806_small_N.pkl", "wb") as pickle_result:
        pickle.dump(out, pickle_result)
def test():
    """
    Print, for every data set under ./dataset, the mean over its matches of
    (num_bug / number of matched target attributes), rounded to 3 decimals.

    Matches are read from ./result/Large_Small_match.txt; the data itself is
    loaded from the corresponding "./Small..." path. Data sets without any
    match are skipped.
    """
    match = readMatch("./result/Large_Small_match.txt")
    original_src = "./dataset"
    datasrc = readsrc(original_src)
    EPV = {}
    for group, val in datasrc.items():
        for src in val:
            temp = 0
            count = 0
            for i in match:
                # readMatch stores the parsed target path as `target_name`;
                # its records carry no `target_src` attribute
                if i.target_name == src:
                    data = loadWekaData("./Small" + i.target_name[2:])
                    labels = data.attributeToDoubleArray(data.classIndex())
                    # number of instances minus the sum of the class values
                    # NOTE(review): presumably the count of defective
                    # instances - confirm the class encoding
                    num_bug = len(labels) - sum(labels)
                    temp += float(num_bug / len(i.attr_target))
                    count += 1
            if count == 0:
                # no match for this data set: nothing to average
                continue
            EPV[src[src.rfind("/") + 1:src.rfind(".")]] = round(temp / count, 3)
    for key, val in EPV.items():
        print(key, ':', val)
    print(EPV)
if __name__ == "__main__":
    # Script entry point: fix the random seed, then run the experiment.
    # NOTE(review): `random` is not imported in this file; presumably it is
    # provided by `from hdp import *` - confirm
    random.seed(1)
    # other entry points used during development: test(), readMatch(), printPCA()
    run()