From a345e26093d4ce2485603c2b940478cf0b6979af Mon Sep 17 00:00:00 2001 From: Galileo Galilei Date: Sun, 7 Feb 2021 21:36:44 +0100 Subject: [PATCH] FIX #159 - Refactor CLI to make mlflow.yml read and write more flexible to different configuration logic --- CHANGELOG.md | 3 + docs/source/02_installation/02_setup.md | 8 +- .../03_getting_started/02_first_steps.md | 4 +- docs/source/07_python_objects/04_CLI.md | 6 +- kedro_mlflow/framework/cli/cli.py | 88 ++++++++------- .../framework/context/mlflow_context.py | 14 ++- kedro_mlflow/utils.py | 9 -- tests/framework/cli/test_cli.py | 101 +++++++++++++++--- .../framework/context/test_mlflow_context.py | 18 +++- tests/test_utils.py | 31 ------ 10 files changed, 174 insertions(+), 108 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 57e5ea8a..58e4e1f5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,14 +5,17 @@ ### Added - A new `long_parameters_strategy` key is added in the `mlflow.yml` (under in the hook/node section). You can specify different strategies (`fail`, `truncate` or `tag`) to handle parameters over 250 characters which cause crashes for some mlflow backend. ([#69](https://github.com/Galileo-Galilei/kedro-mlflow/issues/69)) +- Add an `env` parameter to `kedro mlflow init` command to specify under which `conf/` subfolder the `mlflow.yml` should be created. ([#159](https://github.com/Galileo-Galilei/kedro-mlflow/issues/159)) ### Fixed - Pin the kedro version to force it to be **strictly** inferior to `0.17` which is not compatible with current `kedro-mlflow` version ([#143](https://github.com/Galileo-Galilei/kedro-mlflow/issues/143)) +- It is no longer assumed for the project to run that the `mlflow.yml` is located under `conf/base`. The project will run as soon as the configuration file is discovered by the registered ConfigLoader ([#159](https://github.com/Galileo-Galilei/kedro-mlflow/issues/159)) ### Changed - The `KedroPipelineModel.load_context()` method now loads all the `DataSets` in memory in the `DataCatalog`. It is also now possible to specify the `runner` to execute the model as well as the `copy_mode` when executing the inference pipeline (instead of deepcopying the datasets between each nodes which is kedro's default). This makes the API serving with `mlflow serve` command considerably faster (~20 times faster) for models which needs compiling (i.e. keras, tensorflow ...) ([#133](https://github.com/Galileo-Galilei/kedro-mlflow/issues/133)) +- The CLI projects commands are now always accessible even if you have not called `kedro mlflow init` yet to create a `mlflow.yml` configuration file ([#159](https://github.com/Galileo-Galilei/kedro-mlflow/issues/159)) ## [0.4.1] - 2020-12-03 diff --git a/docs/source/02_installation/02_setup.md b/docs/source/02_installation/02_setup.md index 3b321c41..28a18393 100644 --- a/docs/source/02_installation/02_setup.md +++ b/docs/source/02_installation/02_setup.md @@ -31,7 +31,13 @@ kedro mlflow init you should see the following message: ```console -'conf/base/mlflow.yml' successfully updated. +'conf/local/mlflow.yml' successfully updated. +``` + +*Note: you can create the configuration file in another kedro environment with the `--env` argument:* + +```console +kedro mlflow init --env= ``` ### Declaring kedro-mlflow hooks diff --git a/docs/source/03_getting_started/02_first_steps.md b/docs/source/03_getting_started/02_first_steps.md index f35a0c71..907690c6 100644 --- a/docs/source/03_getting_started/02_first_steps.md +++ b/docs/source/03_getting_started/02_first_steps.md @@ -11,10 +11,10 @@ kedro mlflow init You will see the following message: ```console -'conf/base/mlflow.yml' successfully updated. +'conf/local/mlflow.yml' successfully updated. ``` -The ``conf/base`` folder is updated and you can see the `mlflow.yml` file: +The ``conf/local`` folder is updated and you can see the `mlflow.yml` file: ![initialized_project](../imgs/initialized_project.png) diff --git a/docs/source/07_python_objects/04_CLI.md b/docs/source/07_python_objects/04_CLI.md index f344fb49..4f852075 100644 --- a/docs/source/07_python_objects/04_CLI.md +++ b/docs/source/07_python_objects/04_CLI.md @@ -3,9 +3,13 @@ ## ``init`` ``kedro mlflow init``: this command is needed to initalize your project. You cannot run any other commands before you run this one once. It performs 2 actions: - - creates a ``mlflow.yml`` configuration file in your ``conf/base`` folder + - creates a ``mlflow.yml`` configuration file in your ``conf/local`` folder - replace the ``src/PYTHON_PACKAGE/run.py`` file by an updated version of the template. If your template has been modified since project creation, a warning wil be raised. You can either run ``kedro mlflow init --force`` to ignore this warning (but this will erase your ``run.py``) or [set hooks manually](#new-hooks). +Init has two arguments: +- `--env` which enable to specifiy another environment where the mlflow.yml should be created (e.g, `base`) +- `--force` which overrides the `mlflow.yml` if it already exists and replaces it with the default one. Use it with caution! + ## ``ui`` ``kedro mlflow ui``: this command opens the mlflow UI (basically launches the ``mlflow ui`` command with the configuration of your ``mlflow.yml`` file) diff --git a/kedro_mlflow/framework/cli/cli.py b/kedro_mlflow/framework/cli/cli.py index c4b41666..181097b1 100644 --- a/kedro_mlflow/framework/cli/cli.py +++ b/kedro_mlflow/framework/cli/cli.py @@ -6,7 +6,7 @@ from kedro_mlflow.framework.cli.cli_utils import write_jinja_template from kedro_mlflow.framework.context import get_mlflow_config -from kedro_mlflow.utils import _already_updated, _is_kedro_project +from kedro_mlflow.utils import _is_kedro_project try: from kedro.framework.context import get_static_project_data @@ -26,20 +26,14 @@ def reset_commands(self): # add commands on the fly based on conditions if _is_kedro_project(): self.add_command(init) - if _already_updated(): - self.add_command(ui) - # self.add_command(run) # TODO : IMPLEMENT THIS FUNCTION + self.add_command(ui) + # self.add_command(run) # TODO : IMPLEMENT THIS FUNCTION else: self.add_command(new) def list_commands(self, ctx): self.reset_commands() commands_list = sorted(self.commands) - if commands_list == ["init"]: - click.secho( - """\n\nYou have not updated your template yet.\nThis is mandatory to use 'kedro-mlflow' plugin.\nPlease run the following command before you can access to other commands :\n\n$ kedro mlflow init""", - fg="yellow", - ) return commands_list def get_command(self, ctx, cmd_name): @@ -60,12 +54,18 @@ def mlflow_commands(): @mlflow_commands.command() +@click.option( + "--env", + "-e", + default="local", + help="The name of the kedro environment where the 'mlflow.yml' should be created. Default to 'local'", +) @click.option( "--force", "-f", is_flag=True, default=False, - help="Update the template without any checks. The modifications you made in 'run.py' will be lost.", + help="Update the template without any checks. If a 'mlflow.yml' already eists, it will be overridden.", ) @click.option( "--silent", @@ -74,49 +74,56 @@ def mlflow_commands(): default=False, help="Should message be logged when files are modified?", ) -def init(force, silent): +def init(env, force, silent): """Updates the template of a kedro project. - Running this command is mandatory to use kedro-mlflow. - 2 actions are performed : - 1. Add "conf/base/mlflow.yml": This is a configuration file - used for run parametrization when calling "kedro run" command. - See INSERT_DOC_URL for further details. - 2. Modify "src/YOUR_PACKAGE_NAME/run.py" to add mlflow hooks - to the ProjectContext. This will erase your current "run.py" - script and all your modifications will be lost. - If you do not want to erase "run.py", insert the hooks manually + Running this command is mandatory to use kedro-mlflow. This + adds a configuration file used for run parametrization when + calling "kedro run" command. """ # get constants + mlflow_yml = "mlflow.yml" project_path = Path().cwd() project_globals = get_static_project_data(project_path) context = load_context(project_path) - conf_root = context.CONF_ROOT + mlflow_yml_path = project_path / context.CONF_ROOT / env / mlflow_yml - # mlflow.yml is just a static file, - # but the name of the experiment is set to be the same as the project - mlflow_yml = "mlflow.yml" - write_jinja_template( - src=TEMPLATE_FOLDER_PATH / mlflow_yml, - is_cookiecutter=False, - dst=project_path / conf_root / "base" / mlflow_yml, - python_package=project_globals["package_name"], - ) - if not silent: + # mlflow.yml is just a static file + if mlflow_yml_path.is_file() and not force: click.secho( click.style( - f"'{conf_root}/base/mlflow.yml' successfully updated.", fg="green" + f"A 'mlflow.yml' already exists at '{mlflow_yml_path}' You can use the ``--force`` option to override it.", + fg="red", ) ) + return 1 + else: + try: + write_jinja_template( + src=TEMPLATE_FOLDER_PATH / mlflow_yml, + is_cookiecutter=False, + dst=mlflow_yml_path, + python_package=project_globals["package_name"], + ) + except FileNotFoundError: + click.secho( + click.style( + f"No env '{env}' found. Please check this folder exists inside '{context.CONF_ROOT}' folder.", + fg="red", + ) + ) + return 1 + if not silent: + click.secho( + click.style( + f"'{context.CONF_ROOT}/{env}/{mlflow_yml}' successfully updated.", + fg="green", + ) + ) + return 0 @mlflow_commands.command() -@click.option( - "--project-path", - "-p", - required=False, - help="The environment within conf folder we want to retrieve.", -) @click.option( "--env", "-e", @@ -124,15 +131,14 @@ def init(force, silent): default="local", help="The environment within conf folder we want to retrieve.", ) -def ui(project_path, env): +def ui(env): """Opens the mlflow user interface with the project-specific settings of mlflow.yml. This interface enables to browse and compares runs. """ - if not project_path: - project_path = Path().cwd() + project_path = Path().cwd() context = load_context(project_path=project_path, env=env) # the context must contains the self.mlflow attribues with mlflow configuration mlflow_conf = get_mlflow_config(context) diff --git a/kedro_mlflow/framework/context/mlflow_context.py b/kedro_mlflow/framework/context/mlflow_context.py index 8ed91bae..50824afc 100644 --- a/kedro_mlflow/framework/context/mlflow_context.py +++ b/kedro_mlflow/framework/context/mlflow_context.py @@ -1,14 +1,22 @@ +from kedro.config import MissingConfigException from kedro.framework.context import KedroContext -from kedro_mlflow.framework.context.config import KedroMlflowConfig +from kedro_mlflow.framework.context.config import ( + KedroMlflowConfig, + KedroMlflowConfigError, +) # this could be a read-only property in the context # with a @property decorator # but for consistency with hook system, it is an external function def get_mlflow_config(context: KedroContext): - - conf_mlflow_yml = context.config_loader.get("mlflow*", "mlflow*/**") + try: + conf_mlflow_yml = context.config_loader.get("mlflow*", "mlflow*/**") + except MissingConfigException: + raise KedroMlflowConfigError( + "No 'mlflow.yml' config file found in environment. Use ``kedro mlflow init`` command in CLI to create a default config file." + ) conf_mlflow = KedroMlflowConfig(context.project_path) conf_mlflow.from_dict(conf_mlflow_yml) return conf_mlflow diff --git a/kedro_mlflow/utils.py b/kedro_mlflow/utils.py index 980c95a8..bdff5881 100644 --- a/kedro_mlflow/utils.py +++ b/kedro_mlflow/utils.py @@ -60,15 +60,6 @@ def _validate_project_path(project_path: Union[str, Path, None] = None) -> Path: return project_path -def _already_updated(project_path: Union[str, Path, None] = None) -> bool: - project_path = _validate_project_path(project_path) - flag = False - # TODO : add a better check ... - if (project_path / "conf" / "base" / "mlflow.yml").is_file(): - flag = True - return flag - - def _parse_requirements(path, encoding="utf-8"): with open(path, mode="r", encoding=encoding) as file_handler: requirements = [ diff --git a/tests/framework/cli/test_cli.py b/tests/framework/cli/test_cli.py index 4080c03d..8da4c281 100644 --- a/tests/framework/cli/test_cli.py +++ b/tests/framework/cli/test_cli.py @@ -1,15 +1,18 @@ import re -import subprocess +import subprocess # noqa: F401 import pytest +import yaml from click.testing import CliRunner from cookiecutter.main import cookiecutter from kedro import __version__ as kedro_version from kedro.framework.cli.cli import TEMPLATE_PATH, info +from kedro.framework.context import load_context from kedro_mlflow.framework.cli.cli import init as cli_init from kedro_mlflow.framework.cli.cli import mlflow_commands as cli_mlflow from kedro_mlflow.framework.cli.cli import ui as cli_ui +from kedro_mlflow.framework.context import get_mlflow_config @pytest.fixture @@ -63,22 +66,13 @@ def test_mlflow_commands_outside_kedro_project(monkeypatch, tmp_path): assert {"new"} == set(extract_cmd_from_help(result.output)) -def test_mlflow_commands_inside_non_initialized_kedro_project( - mocker, monkeypatch, tmp_path, kedro_project -): - monkeypatch.chdir(tmp_path / "fake-project") - cli_runner = CliRunner() - result = cli_runner.invoke(cli_mlflow) - assert {"init"} == set(extract_cmd_from_help(result.output)) - assert "You have not updated your template yet" in result.output - - -def test_mlflow_commands_inside_initialized_kedro_project( - mocker, monkeypatch, tmp_path, kedro_project +def test_mlflow_commands_inside_kedro_project( + monkeypatch, + tmp_path, + kedro_project, ): monkeypatch.chdir(tmp_path / "fake-project") # launch the command to initialize the project - subprocess.call(["kedro", "mlflow", "init"]) cli_runner = CliRunner() result = cli_runner.invoke(cli_mlflow) assert {"init", "ui"} == set(extract_cmd_from_help(result.output)) @@ -97,8 +91,83 @@ def test_cli_init(monkeypatch, tmp_path, kedro_project): assert result.exit_code == 0 # check mlflow.yml file - assert "'conf/base/mlflow.yml' successfully updated." in result.output - assert (project_path / "conf" / "base" / "mlflow.yml").is_file() + assert "'conf/local/mlflow.yml' successfully updated." in result.output + assert (project_path / "conf/local/mlflow.yml").is_file() + + +def test_cli_init_existing_config(monkeypatch, tmp_path, kedro_project): + # "kedro_project" is a pytest.fixture declared in conftest + project_path = tmp_path / "fake-project" + monkeypatch.chdir(project_path) + cli_runner = CliRunner() + + project_context = load_context(project_path.as_posix()) + # emulate first call by writing a mlflow.yml file + yaml_str = yaml.dump(dict(mlflow_tracking_uri="toto")) + (project_path / project_context.CONF_ROOT / "local/mlflow.yml").write_text(yaml_str) + + result = cli_runner.invoke(cli_init) + + # check an error message is raised + assert "A 'mlflow.yml' already exists" in result.output + + # check the file remains unmodified + assert get_mlflow_config(project_context).mlflow_tracking_uri.endswith("toto") + + +def test_cli_init_existing_config_force_option(monkeypatch, tmp_path, kedro_project): + # "kedro_project" is a pytest.fixture declared in conftest + project_path = tmp_path / "fake-project" + monkeypatch.chdir(project_path) + cli_runner = CliRunner() + + project_context = load_context(project_path.as_posix()) + # emulate first call by writing a mlflow.yml file + yaml_str = yaml.dump(dict(mlflow_tracking_uri="toto")) + (project_path / project_context.CONF_ROOT / "local/mlflow.yml").write_text(yaml_str) + + result = cli_runner.invoke(cli_init, args="--force") + + # check an error message is raised + assert "successfully updated" in result.output + + # check the file remains unmodified + assert get_mlflow_config(project_context).mlflow_tracking_uri.endswith("mlruns") + + +@pytest.mark.parametrize( + "env", + ["base", "local"], +) +def test_cli_init_with_env(monkeypatch, tmp_path, kedro_project, env): + # "kedro_project" is a pytest.fixture declared in conftest + project_path = tmp_path / "fake-project" + monkeypatch.chdir(project_path) + cli_runner = CliRunner() + result = cli_runner.invoke(cli_init, f"--env {env}") + + # FIRST TEST: + # the command should have executed propery + assert result.exit_code == 0 + + # check mlflow.yml file + assert f"'conf/{env}/mlflow.yml' successfully updated." in result.output + assert (project_path / "conf" / env / "mlflow.yml").is_file() + + +@pytest.mark.parametrize( + "env", + ["debug"], +) +def test_cli_init_with_wrong_env(monkeypatch, tmp_path, kedro_project, env): + # "kedro_project" is a pytest.fixture declared in conftest + project_path = tmp_path / "fake-project" + monkeypatch.chdir(project_path) + cli_runner = CliRunner() + result = cli_runner.invoke(cli_init, f"--env {env}") + + # A warning message should appear + assert f"No env '{env}' found" in result.output # TODO : This is a fake test. add a test to see if ui is properly up diff --git a/tests/framework/context/test_mlflow_context.py b/tests/framework/context/test_mlflow_context.py index f56acf9d..26606fee 100644 --- a/tests/framework/context/test_mlflow_context.py +++ b/tests/framework/context/test_mlflow_context.py @@ -1,11 +1,9 @@ +import pytest import yaml from kedro.framework.context import load_context from kedro_mlflow.framework.context import get_mlflow_config - -# def test_get_mlflow_config_outside_kedro_project(tmp_path, config_with_base_mlflow_conf): -# with pytest.raises(KedroMlflowConfigError, match="not a valid path to a kedro project"): -# get_mlflow_config(project_path=tmp_path,env="local") +from kedro_mlflow.framework.context.config import KedroMlflowConfigError def _write_yaml(filepath, config): @@ -56,6 +54,18 @@ def test_get_mlflow_config(mocker, tmp_path, config_dir): assert get_mlflow_config(context).to_dict() == expected +def test_get_mlflow_config_in_uninitialized_project(mocker, tmp_path, config_dir): + # config_with_base_mlflow_conf is a pytest.fixture in conftest + mocker.patch("logging.config.dictConfig") + mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True) + + context = load_context(tmp_path) + with pytest.raises( + KedroMlflowConfigError, match="No 'mlflow.yml' config file found in environment" + ): + get_mlflow_config(context) + + def test_mlflow_config_with_templated_config(mocker, tmp_path, config_dir): _write_yaml( diff --git a/tests/test_utils.py b/tests/test_utils.py index 74dfb7cb..f1b9b463 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,11 +1,9 @@ from pathlib import Path import pytest -import yaml from kedro_mlflow.utils import ( KEDRO_VERSION, - _already_updated, _get_project_globals, _is_kedro_project, _parse_requirements, @@ -40,35 +38,6 @@ def test_validate_project_path(project_path, project_path_expected): assert project_path_result == project_path_expected -def test_already_updated(tmp_path): - def _write_yaml(filepath, config): - filepath.parent.mkdir(parents=True, exist_ok=True) - yaml_str = yaml.dump(config) - filepath.write_text(yaml_str) - - _write_yaml( - tmp_path / "conf" / "base" / "mlflow.yml", - dict( - mlflow_tracking_uri=(tmp_path / "mlruns").as_posix(), - experiment=dict(name="Default", create=True), - run=dict(id=None, name=None, nested=True), - ui=dict(port=None, host=None), - ), - ) - flag = _already_updated(tmp_path) - expected_flag = True - - return flag == expected_flag - - -def test_not_yet_updated(tmp_path): - - flag = _already_updated(tmp_path) - expected_flag = False - - return flag == expected_flag - - @pytest.mark.parametrize( "project,is_kedro_project", (