From 73384852c901e086fb35c41e7f900f4f2735248a Mon Sep 17 00:00:00 2001 From: Ahdra Merali Date: Thu, 14 Mar 2024 15:25:54 +0000 Subject: [PATCH 1/5] Improve error messages with more details Signed-off-by: Ahdra Merali --- kedro/pipeline/modular_pipeline.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/kedro/pipeline/modular_pipeline.py b/kedro/pipeline/modular_pipeline.py index 6e89b810b4..bb0dc39434 100644 --- a/kedro/pipeline/modular_pipeline.py +++ b/kedro/pipeline/modular_pipeline.py @@ -2,6 +2,7 @@ from __future__ import annotations import copy +import difflib from typing import AbstractSet, Iterable from kedro.pipeline.node import Node @@ -52,7 +53,7 @@ def _validate_inputs_outputs( free_inputs = {_strip_transcoding(i) for i in pipe.inputs()} if not inputs <= free_inputs: - raise ModularPipelineError("Inputs should be free inputs to the pipeline") + raise ModularPipelineError("Inputs must not be outputs from another node") if outputs & free_inputs: raise ModularPipelineError("Outputs can't contain free inputs to the pipeline") @@ -64,16 +65,20 @@ def _validate_datasets_exist( parameters: AbstractSet[str], pipe: Pipeline, ) -> None: + """Validate that inputs, parameters and outputs map correctly onto the provided nodes.""" inputs = {_strip_transcoding(k) for k in inputs} outputs = {_strip_transcoding(k) for k in outputs} existing = {_strip_transcoding(ds) for ds in pipe.datasets()} non_existent = (inputs | outputs | parameters) - existing if non_existent: - raise ModularPipelineError( - f"Failed to map datasets and/or parameters: " - f"{', '.join(sorted(non_existent))}" - ) + possible_matches = [] + for non_existent_input in non_existent: + possible_matches += difflib.get_close_matches(non_existent_input, existing) + + error_msg = f"Failed to map these inputs onto the nodes provided: {', '.join(sorted(non_existent))}" + suggestions = f" - did you mean one of these instead: {', '.join(possible_matches)}" if possible_matches else "" + raise ModularPipelineError(error_msg + suggestions) def _get_dataset_names_mapping( From 2b40203ca21132fab9890182243709fedc834732 Mon Sep 17 00:00:00 2001 From: Ahdra Merali Date: Thu, 14 Mar 2024 15:30:55 +0000 Subject: [PATCH 2/5] Fix existing tests Signed-off-by: Ahdra Merali --- kedro/pipeline/modular_pipeline.py | 2 +- tests/pipeline/test_modular_pipeline.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kedro/pipeline/modular_pipeline.py b/kedro/pipeline/modular_pipeline.py index bb0dc39434..2d8576bb09 100644 --- a/kedro/pipeline/modular_pipeline.py +++ b/kedro/pipeline/modular_pipeline.py @@ -76,7 +76,7 @@ def _validate_datasets_exist( for non_existent_input in non_existent: possible_matches += difflib.get_close_matches(non_existent_input, existing) - error_msg = f"Failed to map these inputs onto the nodes provided: {', '.join(sorted(non_existent))}" + error_msg = f"Failed to map datasets and/or parameters onto the nodes provided: {', '.join(sorted(non_existent))}" suggestions = f" - did you mean one of these instead: {', '.join(possible_matches)}" if possible_matches else "" raise ModularPipelineError(error_msg + suggestions) diff --git a/tests/pipeline/test_modular_pipeline.py b/tests/pipeline/test_modular_pipeline.py index bfb2e347c3..1d1d9add67 100644 --- a/tests/pipeline/test_modular_pipeline.py +++ b/tests/pipeline/test_modular_pipeline.py @@ -222,7 +222,7 @@ def test_missing_dataset_name( ): # noqa: PLR0913 raw_pipeline = modular_pipeline([node(func, inputs, outputs)]) - with pytest.raises(ModularPipelineError, match=r"Failed to map datasets") as e: + with pytest.raises(ModularPipelineError, match=r"Failed to map datasets and/or parameters") as e: pipeline( raw_pipeline, namespace="PREFIX", inputs=inputs_map, outputs=outputs_map ) @@ -378,11 +378,11 @@ def test_non_existent_parameters_mapped(self): ] ) - pattern = r"Failed to map datasets and/or parameters: params:beta" + pattern = r"Failed to map datasets and/or parameters onto the nodes provided: params:beta" with pytest.raises(ModularPipelineError, match=pattern): pipeline(raw_pipeline, parameters={"beta": "gamma"}) - pattern = r"Failed to map datasets and/or parameters: parameters" + pattern = r"Failed to map datasets and/or parameters onto the nodes provided: parameters" with pytest.raises(ModularPipelineError, match=pattern): pipeline(raw_pipeline, parameters={"parameters": "some_yaml_dataset"}) @@ -394,7 +394,7 @@ def test_bad_inputs_mapping(self): ] ) - pattern = "Inputs should be free inputs to the pipeline" + pattern = "Inputs must not be outputs from another node" with pytest.raises(ModularPipelineError, match=pattern): pipeline(raw_pipeline, inputs={"AA": "CC"}) From c55c53539322753d85f8fd85eb1fba595d2832e5 Mon Sep 17 00:00:00 2001 From: Ahdra Merali Date: Thu, 14 Mar 2024 15:57:01 +0000 Subject: [PATCH 3/5] Add tests Signed-off-by: Ahdra Merali --- kedro/pipeline/modular_pipeline.py | 6 ++- tests/pipeline/test_modular_pipeline.py | 55 ++++++++++++++++++++++++- 2 files changed, 58 insertions(+), 3 deletions(-) diff --git a/kedro/pipeline/modular_pipeline.py b/kedro/pipeline/modular_pipeline.py index 2d8576bb09..bf709c5f19 100644 --- a/kedro/pipeline/modular_pipeline.py +++ b/kedro/pipeline/modular_pipeline.py @@ -77,7 +77,11 @@ def _validate_datasets_exist( possible_matches += difflib.get_close_matches(non_existent_input, existing) error_msg = f"Failed to map datasets and/or parameters onto the nodes provided: {', '.join(sorted(non_existent))}" - suggestions = f" - did you mean one of these instead: {', '.join(possible_matches)}" if possible_matches else "" + suggestions = ( + f" - did you mean one of these instead: {', '.join(sorted(possible_matches))}" + if possible_matches + else "" + ) raise ModularPipelineError(error_msg + suggestions) diff --git a/tests/pipeline/test_modular_pipeline.py b/tests/pipeline/test_modular_pipeline.py index 1d1d9add67..b1c397330a 100644 --- a/tests/pipeline/test_modular_pipeline.py +++ b/tests/pipeline/test_modular_pipeline.py @@ -217,16 +217,67 @@ def test_empty_output(self): ), ], ) - def test_missing_dataset_name( + def test_missing_dataset_name_no_suggestion( self, func, inputs, outputs, inputs_map, outputs_map, expected_missing ): # noqa: PLR0913 raw_pipeline = modular_pipeline([node(func, inputs, outputs)]) - with pytest.raises(ModularPipelineError, match=r"Failed to map datasets and/or parameters") as e: + with pytest.raises( + ModularPipelineError, match=r"Failed to map datasets and/or parameters" + ) as e: pipeline( raw_pipeline, namespace="PREFIX", inputs=inputs_map, outputs=outputs_map ) assert ", ".join(expected_missing) in str(e.value) + assert " - did you mean one of these instead: " not in str(e.value) + + @pytest.mark.parametrize( + "func, inputs, outputs, inputs_map, outputs_map, expected_missing, expected_suggestion", + [ + # Testing inputs + (identity, "IN", "OUT", {"I": "in"}, {}, ["I"], ["IN"]), + ( + biconcat, + ["IN1", "IN2"], + "OUT", + {"IN": "input"}, + {}, + ["IN"], + ["IN1", "IN2"], + ), + # Testing outputs + (identity, "IN", "OUT", {}, {"OUT_": "output"}, ["OUT_"], ["OUT"]), + ( + identity, + "IN", + ["OUT1", "OUT2"], + {}, + {"OUT": "out"}, + ["OUT"], + ["OUT1", "OUT2"], + ), + ], + ) + def test_missing_dataset_with_suggestion( + self, + func, + inputs, + outputs, + inputs_map, + outputs_map, + expected_missing, + expected_suggestion, + ): + raw_pipeline = modular_pipeline([node(func, inputs, outputs)]) + + with pytest.raises( + ModularPipelineError, match=r"Failed to map datasets and/or parameters" + ) as e: + pipeline( + raw_pipeline, namespace="PREFIX", inputs=inputs_map, outputs=outputs_map + ) + assert ", ".join(expected_missing) in str(e.value) + assert ", ".join(expected_suggestion) in str(e.value) def test_node_properties_preserved(self): """ From a669ed9327023acaa50d598fa134036d8eae133d Mon Sep 17 00:00:00 2001 From: Ahdra Merali Date: Mon, 18 Mar 2024 07:25:43 +0000 Subject: [PATCH 4/5] Apply suggestions from code review Signed-off-by: Ahdra Merali --- kedro/pipeline/modular_pipeline.py | 15 ++++++++++----- tests/pipeline/test_modular_pipeline.py | 8 ++++---- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/kedro/pipeline/modular_pipeline.py b/kedro/pipeline/modular_pipeline.py index bf709c5f19..aae29136b8 100644 --- a/kedro/pipeline/modular_pipeline.py +++ b/kedro/pipeline/modular_pipeline.py @@ -53,10 +53,14 @@ def _validate_inputs_outputs( free_inputs = {_strip_transcoding(i) for i in pipe.inputs()} if not inputs <= free_inputs: - raise ModularPipelineError("Inputs must not be outputs from another node") + raise ModularPipelineError( + "Inputs must not be outputs from another node in the same pipeline" + ) if outputs & free_inputs: - raise ModularPipelineError("Outputs can't contain free inputs to the pipeline") + raise ModularPipelineError( + "All outputs must be generated by some node in the pipeline" + ) def _validate_datasets_exist( @@ -72,13 +76,14 @@ def _validate_datasets_exist( existing = {_strip_transcoding(ds) for ds in pipe.datasets()} non_existent = (inputs | outputs | parameters) - existing if non_existent: + sorted_non_existent = sorted(non_existent) possible_matches = [] - for non_existent_input in non_existent: + for non_existent_input in sorted_non_existent: possible_matches += difflib.get_close_matches(non_existent_input, existing) - error_msg = f"Failed to map datasets and/or parameters onto the nodes provided: {', '.join(sorted(non_existent))}" + error_msg = f"Failed to map datasets and/or parameters onto the nodes provided: {', '.join(sorted_non_existent)}" suggestions = ( - f" - did you mean one of these instead: {', '.join(sorted(possible_matches))}" + f" - did you mean one of these instead: {', '.join(possible_matches)}" if possible_matches else "" ) diff --git a/tests/pipeline/test_modular_pipeline.py b/tests/pipeline/test_modular_pipeline.py index b1c397330a..2b003a8401 100644 --- a/tests/pipeline/test_modular_pipeline.py +++ b/tests/pipeline/test_modular_pipeline.py @@ -243,7 +243,7 @@ def test_missing_dataset_name_no_suggestion( {"IN": "input"}, {}, ["IN"], - ["IN1", "IN2"], + ["IN2", "IN1"], ), # Testing outputs (identity, "IN", "OUT", {}, {"OUT_": "output"}, ["OUT_"], ["OUT"]), @@ -254,7 +254,7 @@ def test_missing_dataset_name_no_suggestion( {}, {"OUT": "out"}, ["OUT"], - ["OUT1", "OUT2"], + ["OUT2", "OUT1"], ), ], ) @@ -445,7 +445,7 @@ def test_bad_inputs_mapping(self): ] ) - pattern = "Inputs must not be outputs from another node" + pattern = "Inputs must not be outputs from another node in the same pipeline" with pytest.raises(ModularPipelineError, match=pattern): pipeline(raw_pipeline, inputs={"AA": "CC"}) @@ -457,7 +457,7 @@ def test_bad_outputs_mapping(self): ] ) - pattern = "Outputs can't contain free inputs to the pipeline" + pattern = "All outputs must be generated by some node in the pipeline" with pytest.raises(ModularPipelineError, match=pattern): pipeline(raw_pipeline, outputs={"A": "C"}) From 8c143a327cf4b41f48f5d595250a61848c1fc761 Mon Sep 17 00:00:00 2001 From: Ahdra Merali Date: Mon, 18 Mar 2024 11:17:54 +0000 Subject: [PATCH 5/5] Apply suggestions from code review pt 2 Signed-off-by: Ahdra Merali --- kedro/pipeline/modular_pipeline.py | 2 +- tests/pipeline/test_modular_pipeline.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro/pipeline/modular_pipeline.py b/kedro/pipeline/modular_pipeline.py index aae29136b8..9ac2d6a1d1 100644 --- a/kedro/pipeline/modular_pipeline.py +++ b/kedro/pipeline/modular_pipeline.py @@ -59,7 +59,7 @@ def _validate_inputs_outputs( if outputs & free_inputs: raise ModularPipelineError( - "All outputs must be generated by some node in the pipeline" + "All outputs must be generated by some node within the pipeline" ) diff --git a/tests/pipeline/test_modular_pipeline.py b/tests/pipeline/test_modular_pipeline.py index 2b003a8401..c1e76867b5 100644 --- a/tests/pipeline/test_modular_pipeline.py +++ b/tests/pipeline/test_modular_pipeline.py @@ -457,7 +457,7 @@ def test_bad_outputs_mapping(self): ] ) - pattern = "All outputs must be generated by some node in the pipeline" + pattern = "All outputs must be generated by some node within the pipeline" with pytest.raises(ModularPipelineError, match=pattern): pipeline(raw_pipeline, outputs={"A": "C"})