import logging
import os
from collections.abc import Callable
from typing import Any, Optional

from cyrxnopt.NestedVenv import NestedVenv
from cyrxnopt.OptimizerABC import OptimizerABC
from cyrxnopt.utilities.config.transforms import use_subkeys

logger = logging.getLogger(__name__)
class OptimizerAmlro(OptimizerABC):
# Private static data member to list dependency packages required
# by this class
_packages = [
"git+https://github.com/RxnRover/amlo",
"numpy",
"pandas",
"joblib",
]
def __init__(self, venv: NestedVenv) -> None:
"""Optimizer class for the AMLRO package.
:param venv: Virtual environment to install the optimizer
:type venv: NestedVenv
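        Example (``venv`` is assumed to be a previously constructed
        :py:class:`NestedVenv`)::

            optimizer = OptimizerAmlro(venv)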
"""
super().__init__(venv)
def get_config(self) -> list[dict[str, Any]]:
"""Gets the configuration options available for this optimizer.
See :py:meth:`OptimizerABC.get_config` for more information about the
config descriptions returned by this method and for general usage
information.
:return: Configuration option descriptions
:rtype: list[dict[str, Any]]
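        For example, the returned description for the ``direction``
        option looks like::

            {
                "name": "direction",
                "type": "str",
                "value": "min",
                "range": ["min", "max"],
            }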
"""
config: list[dict[str, Any]] = [
{
"name": "continuous_feature_names",
"type": "list[str]",
"value": [],
},
{
"name": "continuous_feature_bounds",
"type": "list[list[float]]",
"value": [],
},
{
"name": "continuous_feature_resolutions",
"type": "list[float]",
"value": [],
},
{
"name": "categorical_feature_names",
"type": "list[str]",
"value": [],
},
{
"name": "categorical_feature_values",
"type": "list[list[str]]",
"value": [],
},
{
"name": "budget",
"type": "int",
"value": 100,
},
{
"name": "objectives",
"type": "list[str]",
"value": ["yield"],
},
{
"name": "direction",
"type": "str",
"value": "min",
"range": ["min", "max"],
},
]
# TODO: Budget should be constrained to numbers greater than
# zero once that format is solidified.
# TODO: Should the value of this "config" variable be moved into
# a JSON file to make it easier to modify without changing
# the code?
return config
def set_config(self, experiment_dir: str, config: dict[str, Any]) -> None:
"""Generates necessary data files based on the given config.
See :py:meth:`OptimizerABC.set_config` for more information about how
to form the config dictionary and for general usage information.
:param experiment_dir: Output directory for generated files
:type experiment_dir: str
:param config: Configuration options for this optimizer instance
:type config: dict[str, Any]
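        A minimal sketch of a plausible config (illustrative values only;
        see :py:meth:`OptimizerABC.set_config` for the authoritative
        schema)::

            config = {
                "continuous_feature_names": ["temperature", "time"],
                "continuous_feature_bounds": [[25.0, 100.0], [1.0, 60.0]],
                "continuous_feature_resolutions": [5.0, 1.0],
                "categorical_feature_names": [],
                "categorical_feature_values": [],
                "budget": 100,
                "objectives": ["yield"],
                "direction": "max",
            }
            optimizer.set_config("experiment_dir", config)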
"""
self._import_deps()
self._validate_config(config)
        os.makedirs(experiment_dir, exist_ok=True)
full_combo_list = self._imports[
"generate_combos"
].generate_uniform_grid(use_subkeys(config))
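        # Round the generated grid values to 4 decimal places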
full_combo_list = self._imports["np"].around(
full_combo_list, decimals=4
)
full_combo_df = self._imports["pd"].DataFrame(full_combo_list)
training_combo_df = full_combo_df.sample(20)
if bool(config["categorical_feature_names"]):
feature_names_list = self._imports["np"].concatenate(
(
config["continuous_feature_names"],
config["categorical_feature_names"],
)
)
else:
feature_names_list = config["continuous_feature_names"]
full_combo_df.columns = feature_names_list
full_combo_path = os.path.join(experiment_dir, "full_combo_file.txt")
training_combo_path = os.path.join(
experiment_dir, "training_combo_file.txt"
)
full_combo_df.to_csv(full_combo_path, index=False)
training_combo_df.to_csv(training_combo_path, index=False)
training_set_path = os.path.join(
experiment_dir, "training_set_file.txt"
)
training_set_decoded_path = os.path.join(
experiment_dir, "training_set_decoded_file.txt"
)
        # Write the header row for the training dataset files (encoded and
        # decoded versions); result rows are appended later during training
        header = ",".join(str(elem) for elem in feature_names_list) + ",Yield\n"
        with open(training_set_path, "w") as file_object:
            file_object.write(header)
        with open(training_set_decoded_path, "w") as file_object:
            file_object.write(header)
def train(
self,
prev_param: list[Any],
yield_value: float,
experiment_dir: str,
config: dict[str, Any],
obj_func: Optional[Callable[..., float]] = None,
) -> list[Any]:
"""Suggests and records training data points needed for AMLRO.
:py:meth:`OptimizerAmlro.set_config` must be called prior to this method
to generate the necessary files.
The previous parameter+result is recorded during the *next*
call to either :py:meth:`~OptimizerAmlro.train` or
:py:meth:`~OptimizerAmlro.predict`. This means that on the first call
here, the ``prev_param`` and ``yield_value`` values provided are
ignored. Importantly, this also means that the last suggested
parameter+result pair from training will not be recorded unless either
        another :py:meth:`~OptimizerAmlro.train` or a subsequent
        :py:meth:`~OptimizerAmlro.predict` call is made afterward!
:param prev_param: Experimental parameter combination from the previous
experiment, provide an empty list for the first call
:type prev_param: list[Any]
:param yield_value: Experimental yield
:type yield_value: float
:param experiment_dir: Output directory for saving data files
:type experiment_dir: str
:param config: CyRxnOpt-level config for the optimizer
:type config: dict[str, Any]
:param obj_func: Ignored for this optimizer, defaults to None
:type obj_func: Optional[Callable[..., float]], optional
:return: Next parameter combination to perform, or an empty list (``[]``)
if all training points have been performed
:rtype: list[Any]
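        A sketch of the intended training loop, assuming ``optimizer``,
        ``exp_dir``, and ``config`` match the earlier
        :py:meth:`~OptimizerAmlro.set_config` call (``run_reaction`` is a
        hypothetical user-supplied function, not part of this package)::

            params: list[Any] = []
            result = 0.0
            while True:
                params = optimizer.train(params, result, exp_dir, config)
                if not params:
                    break  # all training points have been performed
                result = run_reaction(params)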
"""
self._import_deps()
# TODO: Set these as properties?
training_set_path = os.path.join(
experiment_dir, "training_set_file.txt"
)
training_set_decoded_path = os.path.join(
experiment_dir, "training_set_decoded_file.txt"
)
training_combo_path = os.path.join(
experiment_dir, "training_combo_file.txt"
)
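        # Convert minimization to maximization by negating the result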
if config["direction"].lower() == "min":
yield_value = -yield_value
# Determine next training row to perform
training_combos = self._imports["pd"].read_csv(training_combo_path)
training_set = self._imports["pd"].read_csv(training_set_path)
next_index = self._get_next_training_index_by_length(
training_combos, training_set
)
# NOTE: Not used due to bug in amlo that causes training_set_file.txt
# to have the decoded feature headers, so training_set_file.txt and
# training_combo_file.txt will never have the prerequisite matching
# column headers.
# next_index = self._get_next_training_index_next_combo(
# training_combos, training_set
# )
# Exit early if all training points have already been performed
if next_index == -1:
return []
        # The training set file lags one entry behind (results are recorded
        # on the *next* call), so advance the index by one when a previous
        # parameter is provided to avoid repeating the first combination
if len(prev_param) > 0:
next_index += 1
# training step
next_parameters = self._imports[
"training_set_generator"
].generate_training_data(
training_set_path,
training_set_decoded_path,
training_combo_path,
use_subkeys(config),
prev_param,
yield_value,
next_index,
)
return next_parameters
def predict(
self,
prev_param: list[Any],
yield_value: float,
experiment_dir: str,
config: dict[str, Any],
obj_func: Optional[Callable[..., float]] = None,
) -> list[Any]:
"""Searches for the best parameters and records results from prior steps.
:py:meth:`OptimizerAmlro.set_config` and :py:meth:`OptimizerAmlro.train`
must be called prior to this method to generate the necessary files and
initial training data for the model.
The previous parameter+result is recorded during the *next*
call to :py:meth:`~OptimizerAmlro.predict`. The ``prev_param`` and
``yield_value`` values provided here are always recorded. Importantly,
this also means that the last suggested parameter+result pair from
        prediction will not be recorded unless another
        :py:meth:`~OptimizerAmlro.predict` call is made afterward!
:param prev_param: Parameters provided from the previous prediction
or from the final call to :py:meth:`OptimizerAmlro.train`
:type prev_param: list[Any]
:param yield_value: Result from the previous suggested conditions
:type yield_value: float
:param experiment_dir: Output directory for saving data files
:type experiment_dir: str
:param config: CyRxnOpt-level config for the optimizer
:type config: dict[str, Any]
:param obj_func: Ignored for this optimizer, defaults to None
:type obj_func: Optional[Callable[..., float]], optional
:return: The next suggested reaction to perform
:rtype: list[Any]
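        A sketch of a prediction loop following training (``run_reaction``
        is a hypothetical user-supplied function, not part of this
        package)::

            # ``params``/``result`` carry over from the final training call
            for _ in range(config["budget"]):
                params = optimizer.predict(params, result, exp_dir, config)
                result = run_reaction(params)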
"""
self._import_deps()
training_set_path = os.path.join(
experiment_dir, "training_set_file.txt"
)
training_set_decoded_path = os.path.join(
experiment_dir, "training_set_decoded_file.txt"
)
full_combo_path = os.path.join(experiment_dir, "full_combo_file.txt")
if config["direction"].lower() == "min":
yield_value = -yield_value
# prediction step
best_combo = self._imports["optimizer_main"].get_optimized_parameters(
training_set_path,
training_set_decoded_path,
full_combo_path,
use_subkeys(config),
prev_param,
yield_value,
)
return best_combo
def _import_deps(self) -> None:
"""importing all the packages and libries needed for running amlro optimizer"""
import numpy as np # type: ignore
import pandas as pd # type: ignore
from amlro import ( # type: ignore
generate_combos,
optimizer,
optimizer_main,
training_set_generator,
)
self._imports = {
"generate_combos": generate_combos,
"training_set_generator": training_set_generator,
"optimizer": optimizer,
"optimizer_main": optimizer_main,
"np": np,
"pd": pd,
}
def _get_next_training_index_by_length( # type: ignore
self, training_combos, training_dataset
) -> int:
"""Gets the index for the next training condition to be performed.
This is implemented by a simple check on the dataset length. It is
assumed that the only rows present in the dataset are those from the
training combo list. This means that if the dataset has 6 rows, then
the index of the next training combo to perform is also 6.
:param training_combos: Training conditions suggested by AMLRO
:type training_combos: pd.DataFrame
:param training_dataset: Current dataset of performed reactions used to
train AMLRO
:type training_dataset: pd.DataFrame
:returns: Index in the training combo list of the next conditions
missing from the training dataset. An index of -1 is returned
if the dataset is longer than the training combo list.
:rtype: int
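        Example (hypothetical frames; only the row counts matter here)::

            combos = pd.DataFrame({"x": range(20)})
            dataset = pd.DataFrame({"x": range(6)})
            self._get_next_training_index_by_length(combos, dataset)  # -> 6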
"""
# Get the row counts of the training combos and dataset
combo_rows = len(training_combos.index)
dataset_rows = len(training_dataset.index)
print(f"DEBUG combo_rows, dataset_rows : {combo_rows}, {dataset_rows}")
# Simple check assuming the dataset only contains the training combos
# that have been run
if dataset_rows >= combo_rows:
# No more training points to run
return -1
# The row count of the dataset will be the next index in the combo
# list due to zero indexing
return dataset_rows
def _get_next_training_index_next_combo( # type: ignore
self, training_combos, training_dataset
) -> int:
"""Gets the index for the next training condition to be performed.
This is implemented by checking which training conditions are missing
in the training dataset, then giving the index in the training condition
list for the first missing condition. Importantly, this means that a
training dataset with the correct number of entries, but none matching
the training combo file, will still receive indices and not be
considered "completed" yet.
:param training_combos: Training conditions suggested by AMLRO
:type training_combos: pd.DataFrame
:param training_dataset: Current dataset of performed reactions used to
train AMLRO
:type training_dataset: pd.DataFrame
:returns: Index in the training combo list of the next conditions
missing from the training dataset. An index of -1 is returned
if no more training conditions are missing from the dataset.
:rtype: int
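        Example (hypothetical frames; the combo at index 1 has not been
        run yet)::

            combos = pd.DataFrame({"x": [1, 2, 3]})
            dataset = pd.DataFrame({"x": [1, 3]})
            self._get_next_training_index_next_combo(combos, dataset)  # -> 1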
"""
# Merge the current training dataset with the training combos suggested
# by AMLRO. A left merge is used to only use keys from the training
# combos and preserve the row index in the training combos.
merged = training_combos.merge(
training_dataset, how="left", indicator=True
)
        # Determine which training combos are missing from the dataset
        is_missing = merged["_merge"].eq("left_only")
        # ``idxmax()`` returns 0 both when the first row is missing and
        # when no rows are missing at all, so exit early with -1 if no
        # training combos are missing
        if not is_missing.any():
            return -1
        # Get the index of the first training combo not found in the
        # provided training dataset (``idxmax()`` returns the first True,
        # since True > False)
        first_missing_index = is_missing.idxmax()
        return first_missing_index