import logging
import os
from collections.abc import Callable
from typing import Any, Optional

from cyrxnopt.NestedVenv import NestedVenv
from cyrxnopt.OptimizerABC import OptimizerABC
from cyrxnopt.utilities.config.transforms import use_subkeys

logger = logging.getLogger(__name__)
class OptimizerAmlro(OptimizerABC):
# Private static data member to list dependency packages required
# by this class
_packages = [
"git+https://github.com/RxnRover/amlo",
"numpy",
"pandas",
"joblib",
]
def __init__(self, venv: NestedVenv) -> None:
"""Optimizer class for the AMLRO package.
:param venv: Virtual environment to install the optimizer
:type venv: NestedVenv
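        Example (``venv`` is assumed to be a previously constructed
        :py:class:`NestedVenv`)::

            optimizer = OptimizerAmlro(venv)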
"""
super().__init__(venv)
def get_config(self) -> list[dict[str, Any]]:
"""Gets the configuration options available for this optimizer.
See :py:meth:`OptimizerABC.get_config` for more information about the
config descriptions returned by this method and for general usage
information.
:return: Configuration option descriptions
:rtype: list[dict[str, Any]]
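        For example, the returned description for the ``direction``
        option looks like::

            {
                "name": "direction",
                "type": "str",
                "value": "min",
                "range": ["min", "max"],
            }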
"""
config: list[dict[str, Any]] = [
{
"name": "continuous_feature_names",
"type": "list[str]",
"value": [],
},
{
"name": "continuous_feature_bounds",
"type": "list[list[float]]",
"value": [],
},
{
"name": "continuous_feature_resolutions",
"type": "list[float]",
"value": [],
},
{
"name": "categorical_feature_names",
"type": "list[str]",
"value": [],
},
{
"name": "categorical_feature_values",
"type": "list[list[str]]",
"value": [],
},
{
"name": "budget",
"type": "int",
"value": 100,
},
{
"name": "objectives",
"type": "list[str]",
"value": ["yield"],
},
{
"name": "direction",
"type": "str",
"value": "min",
"range": ["min", "max"],
},
]
# TODO: Budget should be constrained to numbers greater than
# zero once that format is solidified.
# TODO: Should the value of this "config" variable be moved into
# a JSON file to make it easier to modify without changing
# the code?
return config
def set_config(self, experiment_dir: str, config: dict[str, Any]) -> None:
"""Generates necessary data files based on the given config.
See :py:meth:`OptimizerABC.set_config` for more information about how
to form the config dictionary and for general usage information.
:param experiment_dir: Output directory for generated files
:type experiment_dir: str
:param config: Configuration options for this optimizer instance
:type config: dict[str, Any]
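        A minimal sketch of a plausible config (illustrative values only;
        see :py:meth:`OptimizerABC.set_config` for the authoritative
        schema)::

            config = {
                "continuous_feature_names": ["temperature", "time"],
                "continuous_feature_bounds": [[25.0, 100.0], [1.0, 60.0]],
                "continuous_feature_resolutions": [5.0, 1.0],
                "categorical_feature_names": [],
                "categorical_feature_values": [],
                "budget": 100,
                "objectives": ["yield"],
                "direction": "max",
            }
            optimizer.set_config("experiment_dir", config)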
"""
self._import_deps()
self._validate_config(config)
        os.makedirs(experiment_dir, exist_ok=True)
full_combo_list = self._imports[
"generate_combos"
].generate_uniform_grid(use_subkeys(config))
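        # Round the generated grid values to 4 decimal places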
full_combo_list = self._imports["np"].around(
full_combo_list, decimals=4
)
full_combo_df = self._imports["pd"].DataFrame(full_combo_list)
training_combo_df = full_combo_df.sample(20)
if bool(config["categorical_feature_names"]):
feature_names_list = self._imports["np"].concatenate(
(
config["continuous_feature_names"],
config["categorical_feature_names"],
)
)
else:
feature_names_list = config["continuous_feature_names"]
full_combo_df.columns = feature_names_list
full_combo_path = os.path.join(experiment_dir, "full_combo_file.txt")
training_combo_path = os.path.join(
experiment_dir, "training_combo_file.txt"
)
full_combo_df.to_csv(full_combo_path, index=False)
training_combo_df.to_csv(training_combo_path, index=False)
training_set_path = os.path.join(
experiment_dir, "training_set_file.txt"
)
training_set_decoded_path = os.path.join(
experiment_dir, "training_set_decoded_file.txt"
)
        # Write the header row for the training dataset files (encoded and
        # decoded versions); result rows are appended later during training
        header = ",".join(str(elem) for elem in feature_names_list) + ",Yield\n"
        with open(training_set_path, "w") as file_object:
            file_object.write(header)
        with open(training_set_decoded_path, "w") as file_object:
            file_object.write(header)
def train(
self,
prev_param: list[Any],
yield_value: float,
experiment_dir: str,
config: dict[str, Any],
obj_func: Optional[Callable[..., float]] = None,
) -> list[Any]:
"""Suggests and records training data points needed for AMLRO.
:py:meth:`OptimizerAmlro.set_config` must be called prior to this method
to generate the necessary files.
The previous parameter+result is recorded during the *next*
call to either :py:meth:`~OptimizerAmlro.train` or
:py:meth:`~OptimizerAmlro.predict`. This means that on the first call
here, the ``prev_param`` and ``yield_value`` values provided are
ignored. Importantly, this also means that the last suggested
parameter+result pair from training will not be recorded unless either
        another :py:meth:`~OptimizerAmlro.train` or a subsequent
        :py:meth:`~OptimizerAmlro.predict` call is made afterward!
:param prev_param: Experimental parameter combination from the previous
experiment, provide an empty list for the first call
:type prev_param: list[Any]
:param yield_value: Experimental yield
:type yield_value: float
:param experiment_dir: Output directory for saving data files
:type experiment_dir: str
:param config: CyRxnOpt-level config for the optimizer
:type config: dict[str, Any]
:param obj_func: Ignored for this optimizer, defaults to None
:type obj_func: Optional[Callable[..., float]], optional
:return: Next parameter combination to perform, or an empty list (``[]``)
if all training points have been performed
:rtype: list[Any]
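        A sketch of the intended training loop, assuming ``optimizer``,
        ``exp_dir``, and ``config`` match the earlier
        :py:meth:`~OptimizerAmlro.set_config` call (``run_reaction`` is a
        hypothetical user-supplied function, not part of this package)::

            params: list[Any] = []
            result = 0.0
            while True:
                params = optimizer.train(params, result, exp_dir, config)
                if not params:
                    break  # all training points have been performed
                result = run_reaction(params)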
"""
self._import_deps()
# TODO: Set these as properties?
training_set_path = os.path.join(
experiment_dir, "training_set_file.txt"
)
training_set_decoded_path = os.path.join(
experiment_dir, "training_set_decoded_file.txt"
)
training_combo_path = os.path.join(
experiment_dir, "training_combo_file.txt"
)
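        # Convert minimization to maximization by negating the result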
if config["direction"].lower() == "min":
yield_value = -yield_value
# Determine next training row to perform
training_combos = self._imports["pd"].read_csv(training_combo_path)
training_set = self._imports["pd"].read_csv(training_set_path)
next_index = self._get_next_training_index_by_length(
training_combos, training_set
)
# NOTE: Not used due to bug in amlo that causes training_set_file.txt
# to have the decoded feature headers, so training_set_file.txt and
# training_combo_file.txt will never have the prerequisite matching
# column headers.
# next_index = self._get_next_training_index_next_combo(
# training_combos, training_set
# )
# Exit early if all training points have already been performed
if next_index == -1:
return []
        # The training set file lags one entry behind (results are recorded
        # on the *next* call), so advance the index by one when a previous
        # parameter is provided to avoid repeating the first combination
if len(prev_param) > 0:
next_index += 1
# training step
next_parameters = self._imports[
"training_set_generator"
].generate_training_data(
training_set_path,
training_set_decoded_path,
training_combo_path,
use_subkeys(config),
prev_param,
yield_value,
next_index,
)
return next_parameters
def predict(
self,
prev_param: list[Any],
yield_value: float,
experiment_dir: str,
config: dict[str, Any],
obj_func: Optional[Callable[..., float]] = None,
) -> list[Any]:
"""Searches for the best parameters and records results from prior steps.
:py:meth:`OptimizerAmlro.set_config` and :py:meth:`OptimizerAmlro.train`
must be called prior to this method to generate the necessary files and
initial training data for the model.
The previous parameter+result is recorded during the *next*
call to :py:meth:`~OptimizerAmlro.predict`. The ``prev_param`` and
``yield_value`` values provided here are always recorded. Importantly,
this also means that the last suggested parameter+result pair from
        prediction will not be recorded unless another
        :py:meth:`~OptimizerAmlro.predict` call is made afterward!
:param prev_param: Parameters provided from the previous prediction
or from the final call to :py:meth:`OptimizerAmlro.train`
:type prev_param: list[Any]
:param yield_value: Result from the previous suggested conditions
:type yield_value: float
:param experiment_dir: Output directory for saving data files
:type experiment_dir: str
:param config: CyRxnOpt-level config for the optimizer
:type config: dict[str, Any]
:param obj_func: Ignored for this optimizer, defaults to None
:type obj_func: Optional[Callable[..., float]], optional
:return: The next suggested reaction to perform
:rtype: list[Any]
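        A sketch of a prediction loop following training (``run_reaction``
        is a hypothetical user-supplied function, not part of this
        package)::

            # ``params``/``result`` carry over from the final training call
            for _ in range(config["budget"]):
                params = optimizer.predict(params, result, exp_dir, config)
                result = run_reaction(params)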
"""
self._import_deps()
training_set_path = os.path.join(
experiment_dir, "training_set_file.txt"
)
training_set_decoded_path = os.path.join(
experiment_dir, "training_set_decoded_file.txt"
)
full_combo_path = os.path.join(experiment_dir, "full_combo_file.txt")
if config["direction"].lower() == "min":
yield_value = -yield_value
# prediction step
best_combo = self._imports["optimizer_main"].get_optimized_parameters(
training_set_path,
training_set_decoded_path,
full_combo_path,
use_subkeys(config),
prev_param,
yield_value,
)
return best_combo
def _import_deps(self) -> None:
"""importing all the packages and libries needed for running amlro optimizer"""
import numpy as np # type: ignore
import pandas as pd # type: ignore
from amlro import ( # type: ignore
generate_combos,
optimizer,
optimizer_main,
training_set_generator,
)
self._imports = {
"generate_combos": generate_combos,
"training_set_generator": training_set_generator,
"optimizer": optimizer,
"optimizer_main": optimizer_main,
"np": np,
"pd": pd,
}
def _get_next_training_index_by_length( # type: ignore
self, training_combos, training_dataset
) -> int:
"""Gets the index for the next training condition to be performed.
This is implemented by a simple check on the dataset length. It is
assumed that the only rows present in the dataset are those from the
training combo list. This means that if the dataset has 6 rows, then
the index of the next training combo to perform is also 6.
:param training_combos: Training conditions suggested by AMLRO
:type training_combos: pd.DataFrame
:param training_dataset: Current dataset of performed reactions used to
train AMLRO
:type training_dataset: pd.DataFrame
:returns: Index in the training combo list of the next conditions
missing from the training dataset. An index of -1 is returned
if the dataset is longer than the training combo list.
:rtype: int
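        Example (hypothetical frames; only the row counts matter here)::

            combos = pd.DataFrame({"x": range(20)})
            dataset = pd.DataFrame({"x": range(6)})
            self._get_next_training_index_by_length(combos, dataset)  # -> 6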
"""
# Get the row counts of the training combos and dataset
combo_rows = len(training_combos.index)
dataset_rows = len(training_dataset.index)
print(f"DEBUG combo_rows, dataset_rows : {combo_rows}, {dataset_rows}")
# Simple check assuming the dataset only contains the training combos
# that have been run
if dataset_rows >= combo_rows:
# No more training points to run
return -1
# The row count of the dataset will be the next index in the combo
# list due to zero indexing
return dataset_rows
def _get_next_training_index_next_combo( # type: ignore
self, training_combos, training_dataset
) -> int:
"""Gets the index for the next training condition to be performed.
This is implemented by checking which training conditions are missing
in the training dataset, then giving the index in the training condition
list for the first missing condition. Importantly, this means that a
training dataset with the correct number of entries, but none matching
the training combo file, will still receive indices and not be
considered "completed" yet.
:param training_combos: Training conditions suggested by AMLRO
:type training_combos: pd.DataFrame
:param training_dataset: Current dataset of performed reactions used to
train AMLRO
:type training_dataset: pd.DataFrame
:returns: Index in the training combo list of the next conditions
missing from the training dataset. An index of -1 is returned
if no more training conditions are missing from the dataset.
:rtype: int
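        Example (hypothetical frames; the combo at index 1 has not been
        run yet)::

            combos = pd.DataFrame({"x": [1, 2, 3]})
            dataset = pd.DataFrame({"x": [1, 3]})
            self._get_next_training_index_next_combo(combos, dataset)  # -> 1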
"""
# Merge the current training dataset with the training combos suggested
# by AMLRO. A left merge is used to only use keys from the training
# combos and preserve the row index in the training combos.
merged = training_combos.merge(
training_dataset, how="left", indicator=True
)
        # Determine which training combos are missing from the dataset
        is_missing = merged["_merge"].eq("left_only")
        # ``idxmax()`` returns 0 both when the first row is missing and
        # when no rows are missing at all, so exit early with -1 if no
        # training combos are missing
        if not is_missing.any():
            return -1
        # Get the index of the first training combo not found in the
        # provided training dataset (``idxmax()`` returns the first True,
        # since True > False)
        first_missing_index = is_missing.idxmax()
        return first_missing_index