Skip to content

Commit f9a4bda

Browse files
committed
lint and headers
1 parent a3e1679 commit f9a4bda

File tree

3 files changed

+117
-73
lines changed

3 files changed

+117
-73
lines changed

modin/core/storage_formats/base/query_compiler.py

+20-16
Original file line numberDiff line numberDiff line change
@@ -109,42 +109,46 @@ def axis_setter(self, labels):
109109

110110

111111
class QCCoercionCost(Enum):
112-
'''
112+
"""
113113
Coercion costs between query compilers can be expressed
114114
as integers in the range -1 to 1000, where 1000 is
115115
considered impossible. Since coercion costs can be a
116-
function of many variables ( dataset size, partitioning,
116+
function of many variables ( dataset size, partitioning,
117117
network throughput, and query time ) we define a set range
118118
of cost values to simplify comparisons between two query
119119
compilers / engines in a unified way.
120-
120+
121121
COST_UNKNOWN means we do not know the cost associated with changing
122122
query compilers.
123-
123+
124124
COST_ZERO means there is no cost associated, or that the query compilers
125125
are the same.
126-
126+
127127
COST_IMPOSSIBLE means the coercion is effectively impossible, which can
128128
occur if the target system is unable to store the data as a result
129129
of the coercion.
130-
'''
130+
"""
131+
131132
COST_UNKNOWN = -1
132133
COST_ZERO = 0
133134
COST_LOW = 250
134135
COST_MEDIUM = 500
135136
COST_HIGH = 750
136137
COST_IMPOSSIBLE = 1000
137138

138-
def validate_coercsion_cost(cost:QCCoercionCost):
139-
if int(cost) < int(QCCoercionCost.COST_UNKNOWN) or int(cost) > int(QCCoercionCost.COST_IMPOSSIBLE):
140-
raise ValueError("Query compiler coercsion cost out of range")
141-
139+
def validate_coercsion_cost(cost: QCCoercionCost):
140+
if int(cost) < int(QCCoercionCost.COST_UNKNOWN) or int(cost) > int(
141+
QCCoercionCost.COST_IMPOSSIBLE
142+
):
143+
raise ValueError("Query compiler coercsion cost out of range")
144+
142145
def __int__(self):
143146
return self.value
144-
147+
145148
def __add__(self, other) -> int:
146149
return int(self) + int(other)
147150

151+
148152
# FIXME: many of the BaseQueryCompiler methods are hiding actual arguments
149153
# by using *args and **kwargs. They should be spread into actual parameters.
150154
# Currently actual arguments are placed in the methods docstrings, but since they're
@@ -286,17 +290,17 @@ def default_to_pandas(self, pandas_op, *args, **kwargs) -> Self:
286290
return self.__wrap_in_qc(result)
287291

288292
def qc_engine_switch_cost(self, other_qc) -> dict[type, int]:
289-
'''
293+
"""
290294
Coercion costs to and from other_qc
291-
295+
292296
Returns a map of type to QCCoercionCost, where type is the type we are casting to.
293297
This provides a mechanism for the query compilers to provide information to
294298
modin on the cost of moving data to another query compiler ( or the other way ).
295-
'''
299+
"""
296300
if isinstance(type(self), type(other_qc)):
297-
return {type(self): QCCoercionCost.COST_ZERO}
301+
return {type(self): QCCoercionCost.COST_ZERO}
298302
return {}
299-
303+
300304
# Abstract Methods and Fields: Must implement in children classes
301305
# In some cases, you may be able to use the same implementation for
302306
# some of these abstract methods, but for the sake of generality they are

modin/core/storage_formats/pandas/query_compiler_caster.py

+28-25
Original file line numberDiff line numberDiff line change
@@ -27,19 +27,23 @@
2727

2828
from pandas.core.indexes.frozen import FrozenList
2929

30-
from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler, QCCoercionCost
30+
from modin.core.storage_formats.base.query_compiler import (
31+
BaseQueryCompiler,
32+
QCCoercionCost,
33+
)
3134

3235
Fn = TypeVar("Fn", bound=Any)
3336

37+
3438
class QueryCompilerCasterCalculator:
35-
39+
3640
def __init__(self):
3741
self._caster_costing_map = {}
3842
self._data_cls_map = {}
3943
self._qc_list = []
4044
self._qc_cls_list = []
4145
self._result_type = None
42-
46+
4347
def add_query_compiler(self, query_compiler):
4448
if isinstance(query_compiler, type):
4549
# class
@@ -50,16 +54,16 @@ def add_query_compiler(self, query_compiler):
5054
self._qc_list.append(query_compiler)
5155
self._data_cls_map[qc_type] = query_compiler._modin_frame
5256
self._qc_cls_list.append(qc_type)
53-
57+
5458
def calculate(self):
5559
if self._result_type is not None:
5660
return self._result_type
5761
if len(self._qc_cls_list) == 1:
5862
return self._qc_cls_list[0]
5963
if len(self._qc_cls_list) == 0:
6064
raise ValueError("No query compilers registered")
61-
62-
for (qc_1, qc_2) in combinations(self._qc_list, 2):
65+
66+
for qc_1, qc_2 in combinations(self._qc_list, 2):
6367
costs_1 = qc_1.qc_engine_switch_cost(qc_2)
6468
costs_2 = qc_2.qc_engine_switch_cost(qc_1)
6569
self._add_cost_data(costs_1)
@@ -73,20 +77,24 @@ def calculate(self):
7377
self._result_type = key
7478
break
7579
return self._result_type
76-
77-
def _add_cost_data(self, costs:dict):
80+
81+
def _add_cost_data(self, costs: dict):
7882
for k, v in costs.items():
7983
# filter out any extraneous query compilers not in this operation
8084
if k in self._qc_cls_list:
8185
QCCoercionCost.validate_coercsion_cost(v)
8286
# Adds the costs associated with all coercions to a type, k
83-
self._caster_costing_map[k] = v + self._caster_costing_map[k] if k in self._caster_costing_map else v
84-
87+
self._caster_costing_map[k] = (
88+
v + self._caster_costing_map[k]
89+
if k in self._caster_costing_map
90+
else v
91+
)
92+
8593
def result_data_frame(self):
8694
qc_type = self.calculate()
8795
return self._data_cls_map[qc_type]
88-
89-
96+
97+
9098
class QueryCompilerCaster:
9199
"""Cast all query compiler arguments of the member function to current query compiler."""
92100

@@ -111,9 +119,7 @@ def __init_subclass__(
111119
apply_argument_cast(cls)
112120

113121

114-
def visit_nested_args(arguments,
115-
current_qc:BaseQueryCompiler,
116-
fn:callable):
122+
def visit_nested_args(arguments, current_qc: BaseQueryCompiler, fn: callable):
117123
"""
118124
Cast all arguments in nested fashion to current query compiler.
119125
@@ -166,7 +172,6 @@ def apply_argument_cast(obj: Fn) -> Fn:
166172
if isinstance(obj, type):
167173
all_attrs = dict(inspect.getmembers(obj))
168174

169-
170175
# This is required because inspect converts class methods to member functions
171176
current_class_attrs = vars(obj)
172177
for key in current_class_attrs:
@@ -216,38 +221,36 @@ def arg_needs_casting(arg):
216221
if isinstance(arg, current_qc_type):
217222
return False
218223
return True
219-
224+
220225
def register_query_compilers(arg):
221226
if not arg_needs_casting(arg):
222227
return arg
223228
calculator.add_query_compiler(arg)
224229
return arg
225-
230+
226231
def cast_to_qc(arg):
227232
if not arg_needs_casting(arg):
228233
return arg
229234
qc_type = calculator.calculate()
230-
if qc_type == None or qc_type == type(arg):
235+
if qc_type is None or qc_type is type(arg):
231236
return arg
232237
frame_data = calculator.result_data_frame()
233238
result = qc_type.from_pandas(arg.to_pandas(), frame_data)
234239
return result
235-
236-
240+
237241
if isinstance(current_qc, BaseQueryCompiler):
238242
visit_nested_args(kwargs, current_qc, register_query_compilers)
239243
visit_nested_args(args, current_qc, register_query_compilers)
240-
244+
241245
args = visit_nested_args(args, current_qc, cast_to_qc)
242246
kwargs = visit_nested_args(kwargs, current_qc, cast_to_qc)
243247

244-
245248
qc = calculator.calculate()
246249

247-
if qc == None or qc == type(current_qc):
250+
if qc is None or qc is type(current_qc):
248251
return obj(*args, **kwargs)
249252

250-
#breakpoint()
253+
# breakpoint()
251254
# we need to cast current_qc to a new query compiler
252255
if qc != current_qc:
253256
data_cls = current_qc._modin_frame
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,15 @@
1-
1+
# Licensed to Modin Development Team under one or more contributor license agreements.
2+
# See the NOTICE file distributed with this work for additional information regarding
3+
# copyright ownership. The Modin Development Team licenses this file to you under the
4+
# Apache License, Version 2.0 (the "License"); you may not use this file except in
5+
# compliance with the License. You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software distributed under
10+
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific language
12+
# governing permissions and limitations under the License.
213

314
import pandas
415
import pytest
@@ -7,80 +18,103 @@
718

819

920
class CloudQC(NativeQueryCompiler):
10-
'Represents a cloud-hosted query compiler'
21+
"Represents a cloud-hosted query compiler"
22+
1123
def __init__(self, pandas_frame):
1224
self._modin_frame = pandas_frame
1325
super().__init__(pandas_frame)
14-
26+
1527
def qc_engine_switch_cost(self, other_qc):
16-
return {CloudQC: QCCoercionCost.COST_ZERO,
17-
ClusterQC: QCCoercionCost.COST_MEDIUM,
18-
LocalMachineQC: QCCoercionCost.COST_HIGH,
19-
PicoQC: QCCoercionCost.COST_IMPOSSIBLE}
28+
return {
29+
CloudQC: QCCoercionCost.COST_ZERO,
30+
ClusterQC: QCCoercionCost.COST_MEDIUM,
31+
LocalMachineQC: QCCoercionCost.COST_HIGH,
32+
PicoQC: QCCoercionCost.COST_IMPOSSIBLE,
33+
}
34+
2035

2136
class ClusterQC(NativeQueryCompiler):
22-
'Represents a local network cluster query compiler'
37+
"Represents a local network cluster query compiler"
38+
2339
def __init__(self, pandas_frame):
2440
self._modin_frame = pandas_frame
2541
super().__init__(pandas_frame)
26-
42+
2743
def qc_engine_switch_cost(self, other_qc):
28-
return {CloudQC: QCCoercionCost.COST_MEDIUM,
29-
ClusterQC: QCCoercionCost.COST_ZERO,
30-
LocalMachineQC: QCCoercionCost.COST_MEDIUM,
31-
PicoQC: QCCoercionCost.COST_HIGH}
32-
44+
return {
45+
CloudQC: QCCoercionCost.COST_MEDIUM,
46+
ClusterQC: QCCoercionCost.COST_ZERO,
47+
LocalMachineQC: QCCoercionCost.COST_MEDIUM,
48+
PicoQC: QCCoercionCost.COST_HIGH,
49+
}
50+
51+
3352
class LocalMachineQC(NativeQueryCompiler):
34-
'Represents a local machine query compiler'
53+
"Represents a local machine query compiler"
54+
3555
def __init__(self, pandas_frame):
3656
self._modin_frame = pandas_frame
3757
super().__init__(pandas_frame)
38-
58+
3959
def qc_engine_switch_cost(self, other_qc):
40-
return {CloudQC: QCCoercionCost.COST_MEDIUM,
41-
ClusterQC: QCCoercionCost.COST_LOW,
42-
LocalMachineQC: QCCoercionCost.COST_ZERO,
43-
PicoQC: QCCoercionCost.COST_MEDIUM}
60+
return {
61+
CloudQC: QCCoercionCost.COST_MEDIUM,
62+
ClusterQC: QCCoercionCost.COST_LOW,
63+
LocalMachineQC: QCCoercionCost.COST_ZERO,
64+
PicoQC: QCCoercionCost.COST_MEDIUM,
65+
}
66+
4467

4568
class PicoQC(NativeQueryCompiler):
46-
'Represents a query compiler with very few resources'
69+
"Represents a query compiler with very few resources"
70+
4771
def __init__(self, pandas_frame):
4872
self._modin_frame = pandas_frame
4973
super().__init__(pandas_frame)
50-
74+
5175
def qc_engine_switch_cost(self, other_qc):
52-
return {CloudQC: QCCoercionCost.COST_LOW,
53-
ClusterQC: QCCoercionCost.COST_LOW,
54-
LocalMachineQC: QCCoercionCost.COST_LOW,
55-
PicoQC: QCCoercionCost.COST_ZERO}
76+
return {
77+
CloudQC: QCCoercionCost.COST_LOW,
78+
ClusterQC: QCCoercionCost.COST_LOW,
79+
LocalMachineQC: QCCoercionCost.COST_LOW,
80+
PicoQC: QCCoercionCost.COST_ZERO,
81+
}
82+
5683

5784
@pytest.fixture()
5885
def cloud_df():
5986
return CloudQC(pandas.DataFrame([0, 1, 2]))
6087

88+
6189
@pytest.fixture()
6290
def cluster_df():
6391
return ClusterQC(pandas.DataFrame([0, 1, 2]))
6492

93+
6594
@pytest.fixture()
6695
def local_df():
6796
return LocalMachineQC(pandas.DataFrame([0, 1, 2]))
6897

98+
6999
@pytest.fixture()
70100
def pico_df():
71101
return PicoQC(pandas.DataFrame([0, 1, 2]))
72102

103+
73104
def test_two_same_qc_types_noop(pico_df):
74105
df3 = pico_df.concat(axis=1, other=pico_df)
75-
assert(type(df3) == type(pico_df))
106+
assert type(df3) is type(pico_df)
107+
76108

77109
def test_two_two_qc_types_rhs(pico_df, cluster_df):
78110
df3 = pico_df.concat(axis=1, other=cluster_df)
79-
assert(type(df3) == type(cluster_df)) # should move to cluster
111+
assert type(df3) is type(cluster_df) # should move to cluster
112+
80113

81114
def test_two_two_qc_types_lhs(pico_df, cluster_df):
82115
df3 = cluster_df.concat(axis=1, other=pico_df)
83-
assert(type(df3) == type(cluster_df)) # should move to cluster
116+
assert type(df3) is type(cluster_df) # should move to cluster
117+
84118

85119
@pytest.mark.parametrize(
86120
"df1, df2, df3, df4, result_type",
@@ -103,12 +137,15 @@ def test_mixed_dfs(df1, df2, df3, df4, result_type, request):
103137
df3 = request.getfixturevalue(df3)
104138
df4 = request.getfixturevalue(df4)
105139
result = df1.concat(axis=1, other=[df2, df3, df4])
106-
assert(type(result) == result_type)
140+
assert type(result) is result_type
141+
107142

108143
# This currently passes because we have no "max cost" associated
109144
# with a particular QC, so we would move all data to the PicoQC
110145
# As soon as we can represent "max-cost" the result of this operation
111146
# should be to move all dfs to the CloudQC
112147
def test_extreme_pico(pico_df, cloud_df):
113-
result = cloud_df.concat(axis=1, other=[pico_df, pico_df, pico_df, pico_df, pico_df, pico_df, pico_df])
114-
assert(type(result) == PicoQC)
148+
result = cloud_df.concat(
149+
axis=1, other=[pico_df, pico_df, pico_df, pico_df, pico_df, pico_df, pico_df]
150+
)
151+
assert type(result) is PicoQC

0 commit comments

Comments
 (0)