
Commit 5f3a5bb

Enforce 1 session = 1 run (#1329)
1 parent be04aec commit 5f3a5bb

17 files changed  (+266 −119 lines)


RELEASE.md  (+3 −1)
@@ -78,6 +78,7 @@
 * Removed `RegistrationSpecs` and all registration hooks that belonged to it. Going forward users can register custom library components through `settings.py`.
 * Added the `PluginManager` `hook_manager` argument to `KedroContext` and the `Runner.run()` method, which will be provided by the `KedroSession`.
 * Removed the public method `get_hook_manager()` and replaced its functionality by `_create_hook_manager()`.
+* Enforced that only one run can be successfully executed as part of a `KedroSession`. `run_id` has been renamed to `session_id` as a result of that.

 ## Thanks for supporting contributions

@@ -166,7 +167,8 @@ The parameters should look like this:
 * If you had any `networkx.NetworkXDataSet` entries in your catalog, replace them with `networkx.JSONDataSet`.
 * If you were using the `KedroContext` to access `ConfigLoader`, please use `settings.CONFIG_LOADER_CLASS` to access the currently used `ConfigLoader` instead.
 * To run a pipeline in parallel, use `kedro run --runner=ParallelRunner` rather than `--parallel` or `-p`.
-
+* If you were using `run_id` in the `after_catalog_created` hook, replace it with `save_version` instead.
+* If you were using `run_id` in any of the `before_node_run`, `after_node_run`, `on_node_error`, `before_pipeline_run`, `after_pipeline_run` or `on_pipeline_error` hooks, replace it with `session_id` instead.

 # Release 0.17.7

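As a quick illustration of the second migration note above, a project hook that previously declared `run_id` only needs the parameter renamed. This is a minimal sketch, not part of the commit: the hook class and the print statement are hypothetical, only the parameter rename comes from the release notes.

```python
from typing import Any, Dict

from kedro.framework.hooks import hook_impl
from kedro.io import DataCatalog


class MyProjectHooks:
    @hook_impl
    def before_node_run(
        self, catalog: DataCatalog, inputs: Dict[str, Any], session_id: str
    ) -> None:
        # Before this release the last parameter was `run_id: str`;
        # the same value is now exposed under the name `session_id`.
        print(f"About to load {list(inputs)} (session {session_id})")
```

The same rename applies to `after_node_run`, `on_node_error` and the pipeline-level hooks listed above; `after_catalog_created` instead switches to `save_version`, as shown further below.
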
docs/source/deployment/aws_batch.md  (+4 −4)
@@ -165,7 +165,7 @@ class AWSBatchRunner(ThreadRunner):
         return super()._get_required_workers_count(pipeline)

     def _run(  # pylint: disable=too-many-locals,useless-suppression
-        self, pipeline: Pipeline, catalog: DataCatalog, run_id: str = None
+        self, pipeline: Pipeline, catalog: DataCatalog, session_id: str = None
     ) -> None:
         nodes = pipeline.nodes
         node_dependencies = pipeline.node_dependencies
@@ -206,7 +206,7 @@ class AWSBatchRunner(ThreadRunner):
                         node,
                         node_to_job,
                         node_dependencies[node],
-                        run_id,
+                        session_id,
                     )
                     futures.add(future)
@@ -232,11 +232,11 @@ def _submit_job(
         node: Node,
         node_to_job: Dict[Node, str],
         node_dependencies: Set[Node],
-        run_id: str,
+        session_id: str,
     ) -> Node:
         self._logger.info("Submitting the job for node: %s", str(node))

-        job_name = f"kedro_{run_id}_{node.name}".replace(".", "-")
+        job_name = f"kedro_{session_id}_{node.name}".replace(".", "-")
         depends_on = [{"jobId": node_to_job[dep]} for dep in node_dependencies]
         command = ["kedro", "run", "--node", node.name]

docs/source/deployment/dask.md  (+5 −5)
@@ -112,7 +112,7 @@ class DaskRunner(AbstractRunner):
         node: Node,
         catalog: DataCatalog,
         is_async: bool = False,
-        run_id: str = None,
+        session_id: str = None,
         *dependencies: Node,
     ) -> Node:
         """Run a single `Node` with inputs from and outputs to the `catalog`.
@@ -126,17 +126,17 @@ class DaskRunner(AbstractRunner):
             catalog: A ``DataCatalog`` containing the node's inputs and outputs.
             is_async: If True, the node inputs and outputs are loaded and saved
                 asynchronously with threads. Defaults to False.
-            run_id: The id of the pipeline run.
+            session_id: The session id of the pipeline run.
             dependencies: The upstream ``Node``s to allow Dask to handle
                 dependency tracking. Their values are not actually used.

         Returns:
             The node argument.
         """
-        return run_node(node, catalog, is_async, run_id)
+        return run_node(node, catalog, is_async, session_id)

     def _run(
-        self, pipeline: Pipeline, catalog: DataCatalog, run_id: str = None
+        self, pipeline: Pipeline, catalog: DataCatalog, session_id: str = None
     ) -> None:
         nodes = pipeline.nodes
         load_counts = Counter(chain.from_iterable(n.inputs for n in nodes))
@@ -153,7 +153,7 @@ class DaskRunner(AbstractRunner):
                 node,
                 catalog,
                 self._is_async,
-                run_id,
+                session_id,
                 *dependencies,
             )

docs/source/extend_kedro/hooks.md  (+12 −9)
@@ -68,7 +68,6 @@ def after_catalog_created(
     conf_creds: Dict[str, Any],
     save_version: str,
     load_versions: Dict[str, str],
-    run_id: str,
 ) -> None:
     pass
 ```
@@ -380,23 +379,25 @@ class DataValidationHooks:
 
     @hook_impl
     def before_node_run(
-        self, catalog: DataCatalog, inputs: Dict[str, Any], run_id: str
+        self, catalog: DataCatalog, inputs: Dict[str, Any], session_id: str
     ) -> None:
         """Validate inputs data to a node based on using great expectation
         if an expectation suite is defined in ``DATASET_EXPECTATION_MAPPING``.
         """
-        self._run_validation(catalog, inputs, run_id)
+        self._run_validation(catalog, inputs, session_id)

     @hook_impl
     def after_node_run(
-        self, catalog: DataCatalog, outputs: Dict[str, Any], run_id: str
+        self, catalog: DataCatalog, outputs: Dict[str, Any], session_id: str
     ) -> None:
         """Validate outputs data from a node based on using great expectation
         if an expectation suite is defined in ``DATASET_EXPECTATION_MAPPING``.
         """
-        self._run_validation(catalog, outputs, run_id)
+        self._run_validation(catalog, outputs, session_id)

-    def _run_validation(self, catalog: DataCatalog, data: Dict[str, Any], run_id: str):
+    def _run_validation(
+        self, catalog: DataCatalog, data: Dict[str, Any], session_id: str
+    ):
         for dataset_name, dataset_value in data.items():
             if dataset_name not in self.DATASET_EXPECTATION_MAPPING:
                 continue
@@ -411,7 +412,9 @@ class DataValidationHooks:
                 expectation_suite,
             )
             expectation_context.run_validation_operator(
-                "action_list_operator", assets_to_validate=[batch], run_id=run_id
+                "action_list_operator",
+                assets_to_validate=[batch],
+                session_id=session_id,
             )
 ```

@@ -499,9 +502,9 @@ class ModelTrackingHooks:
     @hook_impl
     def before_pipeline_run(self, run_params: Dict[str, Any]) -> None:
         """Hook implementation to start an MLflow run
-        with the same run_id as the Kedro pipeline run.
+        with the session_id of the Kedro pipeline run.
         """
-        mlflow.start_run(run_name=run_params["run_id"])
+        mlflow.start_run(run_name=run_params["session_id"])
         mlflow.log_params(run_params)

     @hook_impl

docs/source/nodes_and_pipelines/run_a_pipeline.md  (+2 −2)
@@ -82,7 +82,7 @@ class DryRunner(AbstractRunner):
         return MemoryDataSet()

     def _run(
-        self, pipeline: Pipeline, catalog: DataCatalog, run_id: str = None
+        self, pipeline: Pipeline, catalog: DataCatalog, session_id: str = None
     ) -> None:
         """The method implementing dry pipeline running.
         Example logs output using this implementation:
@@ -95,7 +95,7 @@ class DryRunner(AbstractRunner):
         Args:
             pipeline: The ``Pipeline`` to run.
             catalog: The ``DataCatalog`` from which to fetch data.
-            run_id: The id of the run.
+            session_id: The id of the session.

         """
         nodes = pipeline.nodes

kedro/framework/context/context.py  (+0 −17)
@@ -290,7 +290,6 @@ def _get_catalog(
             feed_dict=feed_dict,
             save_version=save_version,
             load_versions=load_versions,
-            run_id=self.run_id or save_version,
         )
         return catalog

@@ -335,22 +334,6 @@ def _get_config_credentials(self) -> Dict[str, Any]:
             conf_creds = {}
         return conf_creds

-    @property
-    def run_id(self) -> Union[None, str]:
-        """Unique identifier for a run, defaults to None.
-        If `run_id` is None, `save_version` will be used instead.
-        """
-        return self._get_run_id()
-
-    def _get_run_id(  # pylint: disable=no-self-use
-        self, *args, **kwargs  # pylint: disable=unused-argument
-    ) -> Union[None, str]:
-        """A hook for generating a unique identifier for a
-        run, defaults to None.
-        If None, `save_version` will be used instead.
-        """
-        return None
-

 class KedroContextError(Exception):
     """Error occurred when loading project and running context pipeline."""

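With `KedroContext.run_id` removed above and `run_id` no longer passed to `after_catalog_created`, catalog-level hooks key off `save_version` instead. A minimal sketch of such a hook implementation, assuming an illustrative class name and log line that are not part of this commit:

```python
from kedro.framework.hooks import hook_impl
from kedro.io import DataCatalog


class CatalogHooks:
    @hook_impl
    def after_catalog_created(self, catalog: DataCatalog, save_version: str) -> None:
        # `run_id` is gone from this hook's signature; `save_version`
        # (which previously backed `run_id` when it was None) remains available.
        print(f"Catalog has {len(catalog.list())} entries, save_version={save_version}")
```
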
kedro/framework/hooks/specs.py  (+9 −11)
@@ -23,7 +23,6 @@ def after_catalog_created( # pylint: disable=too-many-arguments
         feed_dict: Dict[str, Any],
         save_version: str,
         load_versions: Dict[str, str],
-        run_id: str,
     ) -> None:
         """Hooks to be invoked after a data catalog is created.
         It receives the ``catalog`` as well as
@@ -38,7 +37,6 @@ def after_catalog_created( # pylint: disable=too-many-arguments
                 for all datasets in the catalog.
             load_versions: The load_versions used in ``load`` operations
                 for each dataset in the catalog.
-            run_id: The id of the run for which the catalog is loaded.
         """
         pass

@@ -53,7 +51,7 @@ def before_node_run( # pylint: disable=too-many-arguments
         catalog: DataCatalog,
         inputs: Dict[str, Any],
         is_async: bool,
-        run_id: str,
+        session_id: str,
     ) -> Optional[Dict[str, Any]]:
         """Hook to be invoked before a node runs.
         The arguments received are the same as those used by ``kedro.runner.run_node``
@@ -65,7 +63,7 @@ def before_node_run( # pylint: disable=too-many-arguments
                 The keys are dataset names and the values are the actual loaded input data,
                 not the dataset instance.
             is_async: Whether the node was run in ``async`` mode.
-            run_id: The id of the run.
+            session_id: The id of the session.

         Returns:
             Either None or a dictionary mapping dataset name(s) to new value(s).
@@ -82,7 +80,7 @@ def after_node_run( # pylint: disable=too-many-arguments
         inputs: Dict[str, Any],
         outputs: Dict[str, Any],
         is_async: bool,
-        run_id: str,
+        session_id: str,
     ) -> None:
         """Hook to be invoked after a node runs.
         The arguments received are the same as those used by ``kedro.runner.run_node``
@@ -98,7 +96,7 @@ def after_node_run( # pylint: disable=too-many-arguments
                 The keys are dataset names and the values are the actual computed output data,
                 not the dataset instance.
             is_async: Whether the node was run in ``async`` mode.
-            run_id: The id of the run.
+            session_id: The id of the session.
         """
         pass

@@ -110,7 +108,7 @@ def on_node_error( # pylint: disable=too-many-arguments
         catalog: DataCatalog,
         inputs: Dict[str, Any],
         is_async: bool,
-        run_id: str,
+        session_id: str,
     ):
         """Hook to be invoked if a node run throws an uncaught error.
         The signature of this error hook should match the signature of ``before_node_run``
@@ -124,7 +122,7 @@ def on_node_error( # pylint: disable=too-many-arguments
                 The keys are dataset names and the values are the actual loaded input data,
                 not the dataset instance.
             is_async: Whether the node was run in ``async`` mode.
-            run_id: The id of the run.
+            session_id: The id of the session.
         """
         pass

@@ -143,7 +141,7 @@ def before_pipeline_run(
                 Should have the following schema::

                     {
-                        "run_id": str
+                        "session_id": str
                         "project_path": str,
                         "env": str,
                         "kedro_version": str,
@@ -178,7 +176,7 @@ def after_pipeline_run(
                 Should have the following schema::

                     {
-                        "run_id": str
+                        "session_id": str
                         "project_path": str,
                         "env": str,
                         "kedro_version": str,
@@ -217,7 +215,7 @@ def on_pipeline_error(
                 Should have the following schema::

                     {
-                        "run_id": str
+                        "session_id": str
                         "project_path": str,
                         "env": str,
                         "kedro_version": str,

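Because `run_params` now carries `"session_id"` rather than `"run_id"` in all three pipeline-level hooks above, implementations reading that schema only need to swap the key. A small sketch under that assumption; the hook class and the log message are hypothetical:

```python
from typing import Any, Dict

from kedro.framework.hooks import hook_impl
from kedro.pipeline import Pipeline


class RunParamsHooks:
    @hook_impl
    def before_pipeline_run(self, run_params: Dict[str, Any], pipeline: Pipeline) -> None:
        # run_params["run_id"] no longer exists; use "session_id" instead.
        session_id = run_params["session_id"]
        print(f"Running {len(pipeline.nodes)} nodes in session {session_id}")
```
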
kedro/framework/session/session.py  (+27 −4)
@@ -75,6 +75,14 @@ def _jsonify_cli_context(ctx: click.core.Context) -> Dict[str, Any]:
     }


+class KedroSessionError(Exception):
+    """``KedroSessionError`` raised by ``KedroSession``
+    in the case that multiple runs are attempted in one session.
+    """
+
+    pass
+
+
 class KedroSession:
     """``KedroSession`` is the object that is responsible for managing the lifecycle
     of a Kedro run.
@@ -106,6 +114,7 @@ def __init__(
         self.save_on_close = save_on_close
         self._package_name = package_name
         self._store = self._init_store()
+        self._run_called = False

         hook_manager = _create_hook_manager()
         _register_hooks(hook_manager, settings.HOOKS)
@@ -318,6 +327,8 @@ def run( # pylint: disable=too-many-arguments,too-many-locals
                 defined by `register_pipelines`.
             Exception: Any uncaught exception during the run will be re-raised
                 after being passed to ``on_pipeline_error`` hook.
+            KedroSessionError: If more than one run is attempted to be executed during
+                a single session.
         Returns:
             Any node outputs that cannot be processed by the ``DataCatalog``.
             These are returned in a dictionary, where the keys are defined
@@ -327,7 +338,15 @@ def run( # pylint: disable=too-many-arguments,too-many-locals
         # Report project name
         self._logger.info("** Kedro project %s", self._project_path.name)

-        save_version = run_id = self.store["session_id"]
+        if self._run_called:
+            raise KedroSessionError(
+                "A run has already been completed as part of the"
+                " active KedroSession. KedroSession has a 1-1 mapping with"
+                " runs, and thus only one run should be executed per session."
+            )
+
+        session_id = self.store["session_id"]
+        save_version = session_id
         extra_params = self.store.get("extra_params") or {}
         context = self.load_context()

@@ -352,7 +371,7 @@ def run( # pylint: disable=too-many-arguments,too-many-locals
         )

         record_data = {
-            "run_id": run_id,
+            "session_id": session_id,
             "project_path": self._project_path.as_posix(),
             "env": context.env,
             "kedro_version": kedro_version,
@@ -368,7 +387,8 @@ def run( # pylint: disable=too-many-arguments,too-many-locals
         }

         catalog = context._get_catalog(
-            save_version=save_version, load_versions=load_versions
+            save_version=save_version,
+            load_versions=load_versions,
         )

         # Run the runner
@@ -379,7 +399,10 @@ def run( # pylint: disable=too-many-arguments,too-many-locals
         )

         try:
-            run_result = runner.run(filtered_pipeline, catalog, hook_manager, run_id)
+            run_result = runner.run(
+                filtered_pipeline, catalog, hook_manager, session_id
+            )
+            self._run_called = True
         except Exception as error:
             hook_manager.hook.on_pipeline_error(
                 error=error,

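The net effect of the changes above is that a `KedroSession` now maps to exactly one run. A hedged sketch of calling code under the new behaviour, assuming it is executed from inside an existing Kedro project and using the standard interactive bootstrap pattern:

```python
from pathlib import Path

from kedro.framework.session import KedroSession
from kedro.framework.session.session import KedroSessionError
from kedro.framework.startup import bootstrap_project

# Assumes the current working directory is a Kedro project root.
metadata = bootstrap_project(Path.cwd())

with KedroSession.create(metadata.package_name) as session:
    session.run()
    try:
        session.run()  # a second run in the same session is now rejected
    except KedroSessionError:
        print("1 session = 1 run; create a new KedroSession to run again.")

# To run the pipeline again, open a fresh session instead.
with KedroSession.create(metadata.package_name) as session:
    session.run()
```
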