# This code is part of Qiskit.
#
# (C) Copyright IBM 2023.
#
# This code is licensed under the Apache License, Version 2.0. You may
# obtain a copy of this license in the LICENSE.txt file in the root directory
# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
#
# Any modifications or derivative works of this code must retain this
# copyright notice, and modified files need to carry a notice indicating
# that they have been altered from the originals.
"""A table-like dataset for analysis results."""
from __future__ import annotations
import re
import threading
import uuid
import warnings
from typing import Any
import numpy as np
import pandas as pd
from qiskit_experiments.database_service.exceptions import ExperimentEntryNotFound
class AnalysisResultTable:
"""A table-like dataset for analysis results.
Default table columns are defined in the class attribute :attr:`.DEFAULT_COLUMNS`.
The table is automatically expanded when an extra key is included in the
input dictionary data. Missing columns in the input data are filled with a null value.
Table row index (i.e. entry ID) is created by truncating the result_id string which
is basically a UUID-4 string. A random unique ID is generated when the result_id
is missing in the input data.
Any operation on the table value via the instance methods guarantees thread safety.
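
    Example (an illustrative sketch; the entry values are hypothetical):

    .. code-block:: python

        table = AnalysisResultTable()

        # Add an entry; unspecified default columns are filled with None.
        index = table.add_data(name="T1", value=2.4e-5, quality="good")

        # Retrieve the entry with the minimal column set.
        frame = table.get_data(index, columns="minimal")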
"""

    VALID_ID_REGEX = re.compile(r"\A(?P<short_id>\w{8})-\w{4}-\w{4}-\w{4}-\w{12}\Z")

    DEFAULT_COLUMNS = [
"name",
"experiment",
"components",
"value",
"quality",
"experiment_id",
"result_id",
"tags",
"backend",
"run_time",
"created_time",
]

    def __init__(self):
"""Create new dataset."""
self._data = pd.DataFrame(columns=self.DEFAULT_COLUMNS)
self._lock = threading.RLock()

    @classmethod
def from_dataframe(cls, data: pd.DataFrame) -> "AnalysisResultTable":
"""Create new dataset with existing dataframe.
Args:
data: Bare dataframe object.
Returns:
A new AnalysisResults instance.
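
        Example (a sketch; the empty input dataframe is illustrative):

        .. code-block:: python

            import pandas as pd

            frame = pd.DataFrame(columns=AnalysisResultTable.DEFAULT_COLUMNS)
            table = AnalysisResultTable.from_dataframe(frame)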
"""
        instance = cls()
instance._data = pd.concat([instance._data, data])
return instance

    @property
def dataframe(self) -> pd.DataFrame:
"""Dataframe object of analysis results."""
with self._lock:
return self._data.copy(deep=False)

    @property
def result_ids(self) -> list[str]:
"""Result IDs in current dataset."""
with self._lock:
return list(self._data.result_id)

    @property
def columns(self) -> list[str]:
"""All columns in current dataset."""
with self._lock:
return list(self._data.columns)

    def add_data(
self,
*,
result_id: str | None = None,
**data,
) -> str:
"""Add new data to this dataset.
Args:
result_id: A unique UUID-4 string for this data entry.
The full string is used to identify the data in the experiment service database,
and a short ID is created by truncating this string as a dataframe index.
data: Arbitrary key-value pairs representing a single data entry.
Missing values for default columns are filled with ``None``.
Returns:
Assigned analysis result ID.
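
        Example (a sketch; the UUID shown is arbitrary):

        .. code-block:: python

            table = AnalysisResultTable()
            index = table.add_data(
                result_id="9a0fc480-1234-5678-9abc-def012345678",
                name="T2",
                value=1.8e-5,
            )
            assert index == "9a0fc480"  # first eight characters of result_id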
"""
result_id = result_id or self._create_unique_hash()
        if matched := re.match(self.VALID_ID_REGEX, result_id):
            # A short unique index is generated from the result ID.
            # Showing the full result ID would unnecessarily occupy horizontal
            # space in the HTML table. This mechanism is inspired by GitHub
            # commit hashes.
            index = matched.group("short_id")
        else:
            warnings.warn(
                f"Result ID of {result_id} is not a valid UUID-4 string.",
                UserWarning,
            )
            index = result_id[:8]
with self._lock:
if index in self._data.index:
raise ValueError(
f"Table entry index {index} already exists. "
"Please use another ID to avoid index collision."
)
            # Add missing columns to the table
            if missing := data.keys() - set(self._data.columns):
                for k in data:
                    # Preserve the key order of the input data
                    if k in missing:
                        loc = len(self._data.columns)
                        self._data.insert(loc, k, value=None)
            # A hack to avoid an unwanted dtype update. Appending a new row with the
            # .loc indexer performs enlargement and implicitly changes the dtype. This
            # often causes confusion between NaN (a numeric container) and None (an
            # object container) for missing values. Filling a row with None values
            # before assigning the actual values keeps the column dtype, but this
            # behavior might change in a future pandas version.
            # https://github.com/pandas-dev/pandas/issues/6485
            # Also see test.framework.test_data_table.TestBaseTable.test_type_*
            self._data.loc[index, :] = [None] * len(self._data.columns)
template = dict.fromkeys(self.columns, None)
template["result_id"] = result_id
template.update(data)
self._data.loc[index, :] = pd.array(list(template.values()), dtype=object)
return index

    def get_data(
self,
key: str | int | slice | None = None,
columns: str | list[str] = "default",
) -> pd.DataFrame:
"""Get matched entries from this dataset.
Args:
key: Identifier of the entry of interest.
columns: List of names or a policy (default, minimal, all)
of data columns included in the returned data frame.
Returns:
Matched entries in a single data frame or series.
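
        Example (a sketch, continuing from the class-level example):

        .. code-block:: python

            # Look up entries by short index or by entry name.
            by_name = table.get_data("T1", columns="minimal")

            # Slice over the row order of the table.
            first_two = table.get_data(slice(0, 2), columns="default")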
"""
if key is None:
with self._lock:
out = self._data.copy()
else:
uids = self._resolve_key(key)
with self._lock:
out = self._data.filter(items=uids, axis=0)
if columns != "all":
valid_columns = self._resolve_columns(columns)
out = out[valid_columns]
return out

    def del_data(
self,
key: str | int,
) -> list[str]:
"""Delete matched entries from this dataset.
Args:
key: Identifier of the entry of interest.
Returns:
Deleted analysis result IDs.
"""
uids = self._resolve_key(key)
with self._lock:
self._data.drop(uids, inplace=True)
return uids

    def clear(self):
"""Clear all table entries."""
with self._lock:
self._data = pd.DataFrame(columns=self.DEFAULT_COLUMNS)

    def copy(self, new_ids: bool = True) -> "AnalysisResultTable":
        """Create a new thread-safe instance with the same data.

        Args:
            new_ids: Whether to generate new IDs for the copied entries. Defaults to True.

        Returns:
            A new ``AnalysisResultTable`` instance with shallow-copied data.
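
        Example (a sketch):

        .. code-block:: python

            # Duplicate the table; each copied entry receives a fresh result ID.
            duplicated = table.copy(new_ids=True)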
"""
with self._lock:
# Hold the lock so that no data can be added
new_instance = self.__class__()
new_instance._data = self._data.copy(deep=False)
if new_ids:
new_instance._data["result_id"] = None
for idx, _ in new_instance._data.iterrows():
new_instance._data.at[idx, "result_id"] = new_instance._create_unique_hash()
new_instance._data.index = [
result_id[:8] for result_id in new_instance._data["result_id"]
]
return new_instance

    def _create_unique_hash(self) -> str:
        with self._lock:
            # Try random UUIDs until the first eight characters do not collide
            # with an existing table index. The search is bounded so that a
            # pathological table cannot loop forever.
            for _ in range(1000):
                tmp_id = str(uuid.uuid4())
                if tmp_id[:8] not in self._data.index:
                    return tmp_id
            raise RuntimeError(
                "A unique result_id string could not be generated for this table "
                "within 1000 trials. Reduce the number of entries, or manually "
                "provide a unique result_id."
            )

    def _resolve_columns(self, columns: str | list[str]) -> list[str]:
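        """Return the list of existing column names selected by ``columns``.

        ``columns`` is either an explicit list of names or one of the policy
        strings ``"default"`` and ``"minimal"``; extra (non-default) columns
        are always appended to a policy selection.
        """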
with self._lock:
extra_columns = [c for c in self._data.columns if c not in self.DEFAULT_COLUMNS]
if columns == "default":
return [
"name",
"experiment",
"components",
"value",
"quality",
"backend",
"run_time",
] + extra_columns
if columns == "minimal":
return [
"name",
"components",
"value",
"quality",
] + extra_columns
if not isinstance(columns, str):
out = []
for column in columns:
if column in self._data.columns:
out.append(column)
else:
warnings.warn(
f"Specified column {column} does not exist in this table.",
UserWarning,
)
return out
        raise ValueError(
            f"Column group {columns} is not a valid name. "
            "Use either 'all', 'default', or 'minimal'."
        )

    def _resolve_key(self, key: int | slice | str) -> list[str]:
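        """Return the list of table indices matching ``key``.

        ``key`` may be an integer position, a slice over the row order,
        a short entry index, or an entry name.

        Raises:
            ExperimentEntryNotFound: When no entry matches ``key``.
            TypeError: When ``key`` has an unsupported type.
        """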
with self._lock:
if isinstance(key, int):
if key >= len(self):
raise ExperimentEntryNotFound(f"Analysis result {key} not found.")
return [self._data.index[key]]
if isinstance(key, slice):
keys = list(self._data.index)[key]
if len(keys) == 0:
raise ExperimentEntryNotFound(f"Analysis result {key} not found.")
return keys
if isinstance(key, str):
if key in self._data.index:
return [key]
# This key is name of entry
loc = self._data["name"] == key
if not any(loc):
raise ExperimentEntryNotFound(f"Analysis result {key} not found.")
return list(self._data.index[loc])
raise TypeError(f"Invalid key type {type(key)}. The key must be either int, slice, or str.")

    def __len__(self):
return len(self._data)

    def __contains__(self, item):
return item in self._data.index

    def __json_encode__(self) -> dict[str, Any]:
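        # Serialization hook: JSON encoders that look for a ``__json_encode__``
        # method (e.g. the qiskit-experiments JSON encoder) call this to
        # serialize the table.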
with self._lock:
return {
"class": "AnalysisResultTable",
"data": self._data.to_dict(orient="index"),
}

    @classmethod
def __json_decode__(cls, value: dict[str, Any]) -> "AnalysisResultTable":
        if value.get("class", None) != "AnalysisResultTable":
            raise ValueError("JSON decoded value for AnalysisResultTable is not a valid class type.")
instance = object.__new__(cls)
instance._lock = threading.RLock()
instance._data = pd.DataFrame.from_dict(
data=value.get("data", {}),
orient="index",
).replace({np.nan: None})
return instance

    def __getstate__(self):
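        # threading.RLock cannot be pickled; drop it from the state here and
        # recreate it in __setstate__.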
state = self.__dict__.copy()
del state["_lock"]
return state

    def __setstate__(self, state):
self.__dict__.update(state)
self._lock = threading.RLock()