From 82e1be21fccb01fba526834da0749d3b73e0af54 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Thu, 12 Jun 2025 22:51:02 -0500 Subject: [PATCH 01/17] sanitize more config logs --- mlos_bench/mlos_bench/services/config_persistence.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mlos_bench/mlos_bench/services/config_persistence.py b/mlos_bench/mlos_bench/services/config_persistence.py index a674e6ce75..ff7a55827e 100644 --- a/mlos_bench/mlos_bench/services/config_persistence.py +++ b/mlos_bench/mlos_bench/services/config_persistence.py @@ -190,11 +190,11 @@ def load_config( if any(c in json for c in ("{", "[")): # If the path contains braces, it is likely already a json string, # so just parse it. - _LOG.info("Load config from json string: %s", json) + _LOG.info("Load config from json string: %s", sanitize_config(json)) try: config: Any = json5.loads(json) except ValueError as ex: - _LOG.error("Failed to parse config from JSON string: %s", json) + _LOG.error("Failed to parse config from JSON string: %s", sanitize_config(json)) raise ValueError(f"Failed to parse config from JSON string: {json}") from ex else: json = self.resolve_path(json) @@ -224,7 +224,7 @@ def load_config( # (e.g. Azure ARM templates). del config["$schema"] else: - _LOG.warning("Config %s is not validated against a schema.", json) + _LOG.warning("Config %s is not validated against a schema.", sanitize_config(json)) return config # type: ignore[no-any-return] def prepare_class_load( @@ -700,7 +700,7 @@ def load_services( -------- mlos_bench.services : Examples of service configurations. """ - _LOG.info("Load services: %s parent: %s", jsons, parent.__class__.__name__) + _LOG.info("Load services: %s parent: %s", sanitize_config(jsons), parent.__class__.__name__) service = Service({}, global_config, parent) for json in jsons: config = self.load_config(json, ConfigSchema.SERVICE) @@ -736,7 +736,7 @@ def load_tunables( -------- mlos_bench.tunables : Examples of tunable parameter configurations. """ - _LOG.info("Load tunables: '%s'", jsons) + _LOG.info("Load tunables: '%s'", sanitize_config(jsons)) if parent is None: parent = TunableGroups() tunables = parent.copy() From 9d4ec80a022f54e77f72103304e6a553594a364b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 13 Jun 2025 03:52:54 +0000 Subject: [PATCH 02/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- mlos_bench/mlos_bench/services/config_persistence.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mlos_bench/mlos_bench/services/config_persistence.py b/mlos_bench/mlos_bench/services/config_persistence.py index ff7a55827e..d7bcb74627 100644 --- a/mlos_bench/mlos_bench/services/config_persistence.py +++ b/mlos_bench/mlos_bench/services/config_persistence.py @@ -700,7 +700,9 @@ def load_services( -------- mlos_bench.services : Examples of service configurations. """ - _LOG.info("Load services: %s parent: %s", sanitize_config(jsons), parent.__class__.__name__) + _LOG.info( + "Load services: %s parent: %s", sanitize_config(jsons), parent.__class__.__name__ + ) service = Service({}, global_config, parent) for json in jsons: config = self.load_config(json, ConfigSchema.SERVICE) From 7927211d162c954ad33ba4b348419cea4c54a1a0 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Thu, 12 Jun 2025 23:17:35 -0500 Subject: [PATCH 03/17] expand sanitization processing --- .../mlos_bench/tests/test_sanitize_confs.py | 45 +++++++++++++++++++ mlos_bench/mlos_bench/util.py | 29 +++++++----- 2 files changed, 64 insertions(+), 10 deletions(-) diff --git a/mlos_bench/mlos_bench/tests/test_sanitize_confs.py b/mlos_bench/mlos_bench/tests/test_sanitize_confs.py index a589d2124a..94d840e3a7 100644 --- a/mlos_bench/mlos_bench/tests/test_sanitize_confs.py +++ b/mlos_bench/mlos_bench/tests/test_sanitize_confs.py @@ -21,6 +21,7 @@ def test_sanitize_config_simple() -> None: "other": 42, } sanitized = sanitize_config(config) + assert isinstance(sanitized, dict) assert sanitized["username"] == "user1" assert sanitized["password"] == "[REDACTED]" assert sanitized["token"] == "[REDACTED]" @@ -39,6 +40,7 @@ def test_sanitize_config_nested() -> None: "api_key": "key", } sanitized = sanitize_config(config) + assert isinstance(sanitized, dict) assert sanitized["outer"]["password"] == "[REDACTED]" assert sanitized["outer"]["inner"]["token"] == "[REDACTED]" assert sanitized["outer"]["inner"]["foo"] == "bar" @@ -61,7 +63,50 @@ def test_sanitize_config_mixed_types() -> None: "api_key": {"nested": "val"}, } sanitized = sanitize_config(config) + assert isinstance(sanitized, dict) assert sanitized["password"] == "[REDACTED]" assert sanitized["token"] == "[REDACTED]" assert sanitized["secret"] == "[REDACTED]" assert sanitized["api_key"] == "[REDACTED]" + + +def test_sanitize_config_empty() -> None: + """Test sanitization of an empty configuration.""" + config = {} + sanitized = sanitize_config(config) + assert sanitized == config # Should remain empty dictionary + + +def test_sanitize_array() -> None: + """Test sanitization of an array with sensitive keys.""" + config = [ + {"username": "user1", "password": "pass1"}, + {"username": "user2", "password": "pass2"}, + ] + sanitized = sanitize_config(config) + assert isinstance(sanitized, list) + assert len(sanitized) == 2 + assert sanitized[0]["username"] == "user1" + assert sanitized[0]["password"] == "[REDACTED]" + assert sanitized[1]["username"] == "user2" + assert sanitized[1]["password"] == "[REDACTED]" + + +def test_sanitize_config_with_non_string_values() -> None: + """Test sanitization with non-string values.""" + config = { + "int_value": 42, + "float_value": 3.14, + "bool_value": True, + "none_value": None, + "list_value": [1, "password", 3], + "dict_value": {"key": "value"}, + } + sanitized = sanitize_config(config) + assert isinstance(sanitized, dict) + assert sanitized["int_value"] == 42 + assert sanitized["float_value"] == 3.14 + assert sanitized["bool_value"] is True + assert sanitized["none_value"] is None + assert sanitized["list_value"] == [1, "password", 3] # don't redact raw strings + assert sanitized["dict_value"] == {"key": "value"} diff --git a/mlos_bench/mlos_bench/util.py b/mlos_bench/mlos_bench/util.py index 7cf9bc640c..91c39446cd 100644 --- a/mlos_bench/mlos_bench/util.py +++ b/mlos_bench/mlos_bench/util.py @@ -464,7 +464,7 @@ def datetime_parser( return new_datetime_col -def sanitize_config(config: dict[str, Any]) -> dict[str, Any]: +def sanitize_config(config: dict[str, Any] | list[Any] | Any) -> dict[str, Any] | list[Any] | Any: """ Sanitize a configuration dictionary by obfuscating potentially sensitive keys. @@ -480,16 +480,25 @@ def sanitize_config(config: dict[str, Any]) -> dict[str, Any]: """ sanitize_keys = {"password", "secret", "token", "api_key"} - def recursive_sanitize(conf: dict[str, Any]) -> dict[str, Any]: + def recursive_sanitize( + conf: dict[str, Any] | list[Any] | str, + ) -> dict[str, Any] | list[Any] | str: """Recursively sanitize a dictionary.""" sanitized = {} - for k, v in conf.items(): - if k in sanitize_keys: - sanitized[k] = "[REDACTED]" - elif isinstance(v, dict): - sanitized[k] = recursive_sanitize(v) # type: ignore[assignment] - else: - sanitized[k] = v - return sanitized + if isinstance(conf, list): + return [recursive_sanitize(item) for item in conf] + if isinstance(conf, dict): + for k, v in conf.items(): + if k in sanitize_keys: + sanitized[k] = "[REDACTED]" + elif isinstance(v, dict): + sanitized[k] = recursive_sanitize(v) + elif isinstance(v, list): + sanitized[k] = [recursive_sanitize(item) for item in v] + else: + sanitized[k] = v + return sanitized + # else, return un altered value (e.g., int, float, str) + return conf return recursive_sanitize(config) From 8ef140ea3df0df86d812b49f9d2ba398df9a3cb4 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Thu, 12 Jun 2025 23:19:19 -0500 Subject: [PATCH 04/17] tunables shouldn't be sensitive --- mlos_bench/mlos_bench/services/config_persistence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlos_bench/mlos_bench/services/config_persistence.py b/mlos_bench/mlos_bench/services/config_persistence.py index 1d3da77a86..c18f6afd91 100644 --- a/mlos_bench/mlos_bench/services/config_persistence.py +++ b/mlos_bench/mlos_bench/services/config_persistence.py @@ -745,7 +745,7 @@ def load_tunables( -------- mlos_bench.tunables : Examples of tunable parameter configurations. """ - _LOG.info("Load tunables: '%s'", sanitize_config(jsons)) + _LOG.info("Load tunables: '%s'", jsons) if parent is None: parent = TunableGroups() tunables = parent.copy() From 4c8e7757264b626853eef4af7e8be691ed1c38d6 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Thu, 12 Jun 2025 23:23:35 -0500 Subject: [PATCH 05/17] more sanitizing --- .../environments/base_environment.py | 8 ++++++-- mlos_bench/mlos_bench/launcher.py | 4 ++-- .../mlos_bench/services/base_service.py | 20 +++++++++++++++---- mlos_bench/mlos_bench/storage/base_storage.py | 6 +++--- 4 files changed, 27 insertions(+), 11 deletions(-) diff --git a/mlos_bench/mlos_bench/environments/base_environment.py b/mlos_bench/mlos_bench/environments/base_environment.py index 094085c78b..232016497c 100644 --- a/mlos_bench/mlos_bench/environments/base_environment.py +++ b/mlos_bench/mlos_bench/environments/base_environment.py @@ -21,7 +21,7 @@ from mlos_bench.services.base_service import Service from mlos_bench.tunables.tunable_groups import TunableGroups from mlos_bench.tunables.tunable_types import TunableValue -from mlos_bench.util import instantiate_from_config, merge_parameters +from mlos_bench.util import instantiate_from_config, merge_parameters, sanitize_config if TYPE_CHECKING: from mlos_bench.services.types.config_loader_type import SupportsConfigLoading @@ -174,7 +174,11 @@ def __init__( # pylint: disable=too-many-arguments _LOG.debug("Parameters for '%s' :: %s", name, self._params) if _LOG.isEnabledFor(logging.DEBUG): - _LOG.debug("Config for: '%s'\n%s", name, json.dumps(self.config, indent=2)) + _LOG.debug( + "Config for: '%s'\n%s", + name, + json.dumps(sanitize_config(self.config), indent=2), + ) def _validate_json_config(self, config: dict, name: str) -> None: """Reconstructs a basic json config that this class might have been instantiated diff --git a/mlos_bench/mlos_bench/launcher.py b/mlos_bench/mlos_bench/launcher.py index c728ed7fb2..480c1222d7 100644 --- a/mlos_bench/mlos_bench/launcher.py +++ b/mlos_bench/mlos_bench/launcher.py @@ -31,7 +31,7 @@ from mlos_bench.storage.base_storage import Storage from mlos_bench.tunables.tunable_groups import TunableGroups from mlos_bench.tunables.tunable_types import TunableValue -from mlos_bench.util import try_parse_val +from mlos_bench.util import sanitize_config, try_parse_val _LOG_LEVEL = logging.INFO _LOG_FORMAT = "%(asctime)s %(filename)s:%(lineno)d %(funcName)s %(levelname)s %(message)s" @@ -478,7 +478,7 @@ def _try_parse_extra_args(cmdline: Iterable[str]) -> dict[str, TunableValue]: # other CLI options to use as common python/json variable replacements. config = {k.replace("-", "_"): v for k, v in config.items()} - _LOG.debug("Parsed config: %s", config) + _LOG.debug("Parsed config: %s", sanitize_config(config)) return config def _load_config( diff --git a/mlos_bench/mlos_bench/services/base_service.py b/mlos_bench/mlos_bench/services/base_service.py index 41eebfbb98..fdc4eb7f79 100644 --- a/mlos_bench/mlos_bench/services/base_service.py +++ b/mlos_bench/mlos_bench/services/base_service.py @@ -16,7 +16,7 @@ from mlos_bench.config.schemas import ConfigSchema from mlos_bench.services.types.bound_method import BoundMethod from mlos_bench.services.types.config_loader_type import SupportsConfigLoading -from mlos_bench.util import instantiate_from_config +from mlos_bench.util import instantiate_from_config, sanitize_config _LOG = logging.getLogger(__name__) @@ -99,9 +99,21 @@ def __init__( self._config_loader_service = parent if _LOG.isEnabledFor(logging.DEBUG): - _LOG.debug("Service: %s Config:\n%s", self, json.dumps(self.config, indent=2)) - _LOG.debug("Service: %s Globals:\n%s", self, json.dumps(global_config or {}, indent=2)) - _LOG.debug("Service: %s Parent: %s", self, parent.pprint() if parent else None) + _LOG.debug( + "Service: %s Config:\n%s", + self, + json.dumps(sanitize_config(self.config), indent=2), + ) + _LOG.debug( + "Service: %s Globals:\n%s", + self, + json.dumps(sanitize_config(global_config or {}), indent=2), + ) + _LOG.debug( + "Service: %s Parent: %s", + self, + parent.pprint() if parent else None, + ) @staticmethod def merge_methods( diff --git a/mlos_bench/mlos_bench/storage/base_storage.py b/mlos_bench/mlos_bench/storage/base_storage.py index 75a84bf0b2..48514eaa78 100644 --- a/mlos_bench/mlos_bench/storage/base_storage.py +++ b/mlos_bench/mlos_bench/storage/base_storage.py @@ -38,7 +38,7 @@ from mlos_bench.services.base_service import Service from mlos_bench.storage.base_experiment_data import ExperimentData from mlos_bench.tunables.tunable_groups import TunableGroups -from mlos_bench.util import get_git_info +from mlos_bench.util import get_git_info, sanitize_config _LOG = logging.getLogger(__name__) @@ -62,7 +62,7 @@ def __init__( config : dict Free-format key/value pairs of configuration parameters. """ - _LOG.debug("Storage config: %s", config) + _LOG.debug("Storage config: %s", sanitize_config(config)) self._validate_json_config(config) self._service = service self._config = config.copy() @@ -431,7 +431,7 @@ def new_trial( _config = DictTemplater(config).expand_vars() assert isinstance(_config, dict) except ValueError as e: - _LOG.error("Non-serializable config: %s", config, exc_info=e) + _LOG.error("Non-serializable config: %s", sanitize_config(config), exc_info=e) raise e return self._new_trial(tunables, ts_start, config) From 96cbb3a4c86d3dbdaef70b730b87d174de9a04f2 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Thu, 12 Jun 2025 23:50:12 -0500 Subject: [PATCH 06/17] fixups --- mlos_bench/mlos_bench/tests/test_sanitize_confs.py | 2 +- mlos_bench/mlos_bench/util.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mlos_bench/mlos_bench/tests/test_sanitize_confs.py b/mlos_bench/mlos_bench/tests/test_sanitize_confs.py index 94d840e3a7..702149bc89 100644 --- a/mlos_bench/mlos_bench/tests/test_sanitize_confs.py +++ b/mlos_bench/mlos_bench/tests/test_sanitize_confs.py @@ -72,7 +72,7 @@ def test_sanitize_config_mixed_types() -> None: def test_sanitize_config_empty() -> None: """Test sanitization of an empty configuration.""" - config = {} + config: dict = {} sanitized = sanitize_config(config) assert sanitized == config # Should remain empty dictionary diff --git a/mlos_bench/mlos_bench/util.py b/mlos_bench/mlos_bench/util.py index 91c39446cd..1804816552 100644 --- a/mlos_bench/mlos_bench/util.py +++ b/mlos_bench/mlos_bench/util.py @@ -484,17 +484,17 @@ def recursive_sanitize( conf: dict[str, Any] | list[Any] | str, ) -> dict[str, Any] | list[Any] | str: """Recursively sanitize a dictionary.""" - sanitized = {} if isinstance(conf, list): return [recursive_sanitize(item) for item in conf] if isinstance(conf, dict): + sanitized = {} for k, v in conf.items(): if k in sanitize_keys: sanitized[k] = "[REDACTED]" elif isinstance(v, dict): - sanitized[k] = recursive_sanitize(v) + sanitized[k] = recursive_sanitize(v) # type: ignore[assignment] elif isinstance(v, list): - sanitized[k] = [recursive_sanitize(item) for item in v] + sanitized[k] = [recursive_sanitize(item) for item in v] # type: ignore[assignment] else: sanitized[k] = v return sanitized From 8e5d4a32a6cc72707a646bb30cafa0d0c24e01ad Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Thu, 12 Jun 2025 23:57:47 -0500 Subject: [PATCH 07/17] handle json strings --- mlos_bench/mlos_bench/util.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/mlos_bench/mlos_bench/util.py b/mlos_bench/mlos_bench/util.py index 1804816552..e54f047acc 100644 --- a/mlos_bench/mlos_bench/util.py +++ b/mlos_bench/mlos_bench/util.py @@ -480,6 +480,16 @@ def sanitize_config(config: dict[str, Any] | list[Any] | Any) -> dict[str, Any] """ sanitize_keys = {"password", "secret", "token", "api_key"} + # Try and parse the config as a JSON string first, if it's a string. + was_json = False + if isinstance(config, str): + try: + config = json.loads(config) + was_json = True + except json.JSONDecodeError: + # If it fails to parse, return the original string. + return config + def recursive_sanitize( conf: dict[str, Any] | list[Any] | str, ) -> dict[str, Any] | list[Any] | str: @@ -501,4 +511,8 @@ def recursive_sanitize( # else, return un altered value (e.g., int, float, str) return conf - return recursive_sanitize(config) + sanitized = recursive_sanitize(config) + if was_json: + # If the original config was a JSON string, return it as a JSON string. + return json.dumps(sanitized, indent=2) + return sanitized From ead7b78bf97331a5eb51aee4f72dae026ae7c1c1 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Thu, 12 Jun 2025 23:57:52 -0500 Subject: [PATCH 08/17] lint --- mlos_bench/mlos_bench/util.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mlos_bench/mlos_bench/util.py b/mlos_bench/mlos_bench/util.py index e54f047acc..df9f627036 100644 --- a/mlos_bench/mlos_bench/util.py +++ b/mlos_bench/mlos_bench/util.py @@ -504,7 +504,9 @@ def recursive_sanitize( elif isinstance(v, dict): sanitized[k] = recursive_sanitize(v) # type: ignore[assignment] elif isinstance(v, list): - sanitized[k] = [recursive_sanitize(item) for item in v] # type: ignore[assignment] + sanitized[k] = [ + recursive_sanitize(item) for item in v # type: ignore[assignment] + ] else: sanitized[k] = v return sanitized From dccb45b4acdfabac9ef9591bdbeb8f1ed9c6d01f Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Thu, 12 Jun 2025 23:59:50 -0500 Subject: [PATCH 09/17] style --- mlos_bench/mlos_bench/services/config_persistence.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mlos_bench/mlos_bench/services/config_persistence.py b/mlos_bench/mlos_bench/services/config_persistence.py index c18f6afd91..8e74621c24 100644 --- a/mlos_bench/mlos_bench/services/config_persistence.py +++ b/mlos_bench/mlos_bench/services/config_persistence.py @@ -708,7 +708,9 @@ def load_services( mlos_bench.services : Examples of service configurations. """ _LOG.info( - "Load services: %s parent: %s", sanitize_config(jsons), parent.__class__.__name__ + "Load services: %s parent: %s", + sanitize_config(jsons), + parent.__class__.__name__, ) service = Service({}, global_config, parent) for json in jsons: From b8291a6e8998c88ef9de9fb09baded6de24e34a7 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Fri, 13 Jun 2025 00:06:05 -0500 Subject: [PATCH 10/17] logging --- mlos_bench/mlos_bench/launcher.py | 3 ++- .../mlos_bench/services/config_persistence.py | 14 ++++++++------ mlos_bench/mlos_bench/storage/base_storage.py | 3 ++- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/mlos_bench/mlos_bench/launcher.py b/mlos_bench/mlos_bench/launcher.py index 480c1222d7..0b69a14950 100644 --- a/mlos_bench/mlos_bench/launcher.py +++ b/mlos_bench/mlos_bench/launcher.py @@ -478,7 +478,8 @@ def _try_parse_extra_args(cmdline: Iterable[str]) -> dict[str, TunableValue]: # other CLI options to use as common python/json variable replacements. config = {k.replace("-", "_"): v for k, v in config.items()} - _LOG.debug("Parsed config: %s", sanitize_config(config)) + if _LOG.isEnabledFor(logging.DEBUG): + _LOG.debug("Parsed config: %s", sanitize_config(config)) return config def _load_config( diff --git a/mlos_bench/mlos_bench/services/config_persistence.py b/mlos_bench/mlos_bench/services/config_persistence.py index 8e74621c24..6da47d11de 100644 --- a/mlos_bench/mlos_bench/services/config_persistence.py +++ b/mlos_bench/mlos_bench/services/config_persistence.py @@ -191,7 +191,8 @@ def load_config( if any(c in json for c in ("{", "[")): # If the path contains braces, it is likely already a json string, # so just parse it. - _LOG.info("Load config from json string: %s", sanitize_config(json)) + if _LOG.isEnabledFor(logging.INFO): + _LOG.info("Load config from json string: %s", sanitize_config(json)) try: config: Any = json5.loads(json) except ValueError as ex: @@ -707,11 +708,12 @@ def load_services( -------- mlos_bench.services : Examples of service configurations. """ - _LOG.info( - "Load services: %s parent: %s", - sanitize_config(jsons), - parent.__class__.__name__, - ) + if _LOG.isEnabledFor(logging.INFO): + _LOG.info( + "Load services: %s parent: %s", + sanitize_config(jsons), + parent.__class__.__name__, + ) service = Service({}, global_config, parent) for json in jsons: config = self.load_config(json, ConfigSchema.SERVICE) diff --git a/mlos_bench/mlos_bench/storage/base_storage.py b/mlos_bench/mlos_bench/storage/base_storage.py index 48514eaa78..ce0f4ca069 100644 --- a/mlos_bench/mlos_bench/storage/base_storage.py +++ b/mlos_bench/mlos_bench/storage/base_storage.py @@ -62,7 +62,8 @@ def __init__( config : dict Free-format key/value pairs of configuration parameters. """ - _LOG.debug("Storage config: %s", sanitize_config(config)) + if _LOG.isEnabledFor(logging.DEBUG): + _LOG.debug("Storage config: %s", sanitize_config(config)) self._validate_json_config(config) self._service = service self._config = config.copy() From 0542bccd6117e9b4cffda3d343bc149f82a3831d Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Fri, 13 Jun 2025 00:06:14 -0500 Subject: [PATCH 11/17] complexity refactor --- mlos_bench/mlos_bench/util.py | 60 +++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/mlos_bench/mlos_bench/util.py b/mlos_bench/mlos_bench/util.py index df9f627036..2cef347ffc 100644 --- a/mlos_bench/mlos_bench/util.py +++ b/mlos_bench/mlos_bench/util.py @@ -464,6 +464,38 @@ def datetime_parser( return new_datetime_col +_SANITIZE_KEYS = { + "password", + "secret", + "token", + "api_key", +} + + +def _recursive_sanitize( + conf: dict[str, Any] | list[Any] | str, +) -> dict[str, Any] | list[Any] | str: + """Recursively sanitize a dictionary.""" + if isinstance(conf, list): + return [_recursive_sanitize(item) for item in conf] + if isinstance(conf, dict): + sanitized = {} + for k, v in conf.items(): + if k in _SANITIZE_KEYS: + sanitized[k] = "[REDACTED]" + elif isinstance(v, dict): + sanitized[k] = _recursive_sanitize(v) # type: ignore[assignment] + elif isinstance(v, list): + sanitized[k] = [ + _recursive_sanitize(item) for item in v # type: ignore[assignment] + ] + else: + sanitized[k] = v + return sanitized + # else, return un altered value (e.g., int, float, str) + return conf + + def sanitize_config(config: dict[str, Any] | list[Any] | Any) -> dict[str, Any] | list[Any] | Any: """ Sanitize a configuration dictionary by obfuscating potentially sensitive keys. @@ -478,8 +510,6 @@ def sanitize_config(config: dict[str, Any] | list[Any] | Any) -> dict[str, Any] dict Sanitized configuration dictionary. """ - sanitize_keys = {"password", "secret", "token", "api_key"} - # Try and parse the config as a JSON string first, if it's a string. was_json = False if isinstance(config, str): @@ -489,31 +519,7 @@ def sanitize_config(config: dict[str, Any] | list[Any] | Any) -> dict[str, Any] except json.JSONDecodeError: # If it fails to parse, return the original string. return config - - def recursive_sanitize( - conf: dict[str, Any] | list[Any] | str, - ) -> dict[str, Any] | list[Any] | str: - """Recursively sanitize a dictionary.""" - if isinstance(conf, list): - return [recursive_sanitize(item) for item in conf] - if isinstance(conf, dict): - sanitized = {} - for k, v in conf.items(): - if k in sanitize_keys: - sanitized[k] = "[REDACTED]" - elif isinstance(v, dict): - sanitized[k] = recursive_sanitize(v) # type: ignore[assignment] - elif isinstance(v, list): - sanitized[k] = [ - recursive_sanitize(item) for item in v # type: ignore[assignment] - ] - else: - sanitized[k] = v - return sanitized - # else, return un altered value (e.g., int, float, str) - return conf - - sanitized = recursive_sanitize(config) + sanitized = _recursive_sanitize(config) if was_json: # If the original config was a JSON string, return it as a JSON string. return json.dumps(sanitized, indent=2) From 8000f561c9e72fbc1d384dae656653a562902bc6 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Fri, 13 Jun 2025 00:19:22 -0500 Subject: [PATCH 12/17] fixups and tests --- .../mlos_bench/tests/test_sanitize_confs.py | 54 +++++++++++++++++++ mlos_bench/mlos_bench/util.py | 7 +-- 2 files changed, 58 insertions(+), 3 deletions(-) diff --git a/mlos_bench/mlos_bench/tests/test_sanitize_confs.py b/mlos_bench/mlos_bench/tests/test_sanitize_confs.py index 702149bc89..af14c0c58d 100644 --- a/mlos_bench/mlos_bench/tests/test_sanitize_confs.py +++ b/mlos_bench/mlos_bench/tests/test_sanitize_confs.py @@ -7,6 +7,7 @@ Tests cover obfuscation of sensitive keys and recursive sanitization. """ +import json5 from mlos_bench.util import sanitize_config @@ -110,3 +111,56 @@ def test_sanitize_config_with_non_string_values() -> None: assert sanitized["none_value"] is None assert sanitized["list_value"] == [1, "password", 3] # don't redact raw strings assert sanitized["dict_value"] == {"key": "value"} + + +def test_sanitize_config_json_string() -> None: + """Test sanitization when input is a JSON string.""" + config = { + "username": "user1", + "password": "mypassword", + "token": "abc123", + "nested": {"api_key": "key", "other": 1}, + "list": [{"secret": "shh"}, {"foo": "bar"}], + } + config_json = json5.dumps(config) + sanitized = sanitize_config(config_json) + # Should return a JSON string + assert isinstance(sanitized, str) + sanitized_dict = json5.loads(sanitized) + assert isinstance(sanitized_dict, dict) + assert sanitized_dict["username"] == "user1" + assert sanitized_dict["password"] == "[REDACTED]" + assert sanitized_dict["token"] == "[REDACTED]" + assert sanitized_dict["nested"]["api_key"] == "[REDACTED]" + assert sanitized_dict["nested"]["other"] == 1 + assert sanitized_dict["list"][0]["secret"] == "[REDACTED]" + assert sanitized_dict["list"][1]["foo"] == "bar" + + +def test_sanitize_config_invalid_json_string() -> None: + """Test sanitization with an invalid JSON string input.""" + invalid_json = '{"username": "user1", "password": "pw"' # missing closing brace + assert sanitize_config(invalid_json) == invalid_json + + +def test_sanitize_config_json5_string() -> None: + """Test sanitization with an invalid JSON5 string input.""" + invalid_json = '{"username": "user1", "password": "pw", }' # with trailing comma + # Should return processed json as string + sanitized = sanitize_config(invalid_json) + assert isinstance(sanitized, str) + sanitize_dict = json5.loads(sanitized) + assert isinstance(sanitize_dict, dict) + assert len(sanitize_dict) == 2 + assert sanitize_dict["username"] == "user1" + assert sanitize_dict["password"] == "[REDACTED]" + + +def test_sanitize_config_json_string_no_sensitive_keys() -> None: + """Test sanitization of a JSON string with no sensitive keys.""" + config = {"foo": 1, "bar": {"baz": 2}} + config_json = json5.dumps(config) + sanitized = sanitize_config(config_json) + assert isinstance(sanitized, str) + sanitized_dict = json5.loads(sanitized) + assert sanitized_dict == config diff --git a/mlos_bench/mlos_bench/util.py b/mlos_bench/mlos_bench/util.py index 2cef347ffc..cdd9a317e7 100644 --- a/mlos_bench/mlos_bench/util.py +++ b/mlos_bench/mlos_bench/util.py @@ -15,6 +15,7 @@ from datetime import datetime from typing import TYPE_CHECKING, Any, Literal, TypeVar, Union +import json5 import pandas import pytz @@ -512,11 +513,11 @@ def sanitize_config(config: dict[str, Any] | list[Any] | Any) -> dict[str, Any] """ # Try and parse the config as a JSON string first, if it's a string. was_json = False - if isinstance(config, str): + if isinstance(config, str) and config: try: - config = json.loads(config) + config = json5.loads(config) was_json = True - except json.JSONDecodeError: + except (json.JSONDecodeError, ValueError): # If it fails to parse, return the original string. return config sanitized = _recursive_sanitize(config) From fd43a4a42c4f84434f979bbf8e79dfdf74776eff Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 13 Jun 2025 05:19:45 +0000 Subject: [PATCH 13/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- mlos_bench/mlos_bench/tests/test_sanitize_confs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mlos_bench/mlos_bench/tests/test_sanitize_confs.py b/mlos_bench/mlos_bench/tests/test_sanitize_confs.py index af14c0c58d..6ea9ed0f80 100644 --- a/mlos_bench/mlos_bench/tests/test_sanitize_confs.py +++ b/mlos_bench/mlos_bench/tests/test_sanitize_confs.py @@ -8,6 +8,7 @@ Tests cover obfuscation of sensitive keys and recursive sanitization. """ import json5 + from mlos_bench.util import sanitize_config From 6ff358cd76a855af78f1e42261730574b1b51ac5 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Fri, 13 Jun 2025 12:15:55 -0500 Subject: [PATCH 14/17] tweak --- mlos_bench/mlos_bench/util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlos_bench/mlos_bench/util.py b/mlos_bench/mlos_bench/util.py index cdd9a317e7..cc57b22d6f 100644 --- a/mlos_bench/mlos_bench/util.py +++ b/mlos_bench/mlos_bench/util.py @@ -518,8 +518,8 @@ def sanitize_config(config: dict[str, Any] | list[Any] | Any) -> dict[str, Any] config = json5.loads(config) was_json = True except (json.JSONDecodeError, ValueError): - # If it fails to parse, return the original string. - return config + # If it fails to parse, use the original string. + pass sanitized = _recursive_sanitize(config) if was_json: # If the original config was a JSON string, return it as a JSON string. From e7c444d49da390227906bc217632a372777c9572 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Fri, 13 Jun 2025 12:19:16 -0500 Subject: [PATCH 15/17] comments --- mlos_bench/mlos_bench/util.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/mlos_bench/mlos_bench/util.py b/mlos_bench/mlos_bench/util.py index cc57b22d6f..659903eddf 100644 --- a/mlos_bench/mlos_bench/util.py +++ b/mlos_bench/mlos_bench/util.py @@ -499,16 +499,30 @@ def _recursive_sanitize( def sanitize_config(config: dict[str, Any] | list[Any] | Any) -> dict[str, Any] | list[Any] | Any: """ - Sanitize a configuration dictionary by obfuscating potentially sensitive keys. + Attempts to sanitize a configuration dictionary by obfuscating potentially sensitive keys. + + Notes + ----- + Mostly used to make CodeQL scans happy by redacting sensitive information + (e.g., passwords, tokens, API keys) in the configuration. + + Will also attempt to parse the input as a JSON string if it is a string, + and return a JSON string if the original input was a JSON string. + Therefore this function is somewhat expensive so logging should be blocked with + ``if _LOG.isEnabledFor(logging.INFO):`` checks (or similar) before calling it. + + Finally, it will also replace bare strings that match the sensitive keys + with "[REDACTED]" to avoid leaking sensitive information in the logs, though + this is obviously a less effective approach and may hinder useful debugging. Parameters ---------- - config : dict + config : dict | list | Any Configuration dictionary to sanitize. Returns ------- - dict + dict | list | Any Sanitized configuration dictionary. """ # Try and parse the config as a JSON string first, if it's a string. From ef4c16443e82db5817f5551771e055443cee3a6a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 13 Jun 2025 17:19:41 +0000 Subject: [PATCH 16/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- mlos_bench/mlos_bench/util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mlos_bench/mlos_bench/util.py b/mlos_bench/mlos_bench/util.py index 659903eddf..732b04eda6 100644 --- a/mlos_bench/mlos_bench/util.py +++ b/mlos_bench/mlos_bench/util.py @@ -499,7 +499,8 @@ def _recursive_sanitize( def sanitize_config(config: dict[str, Any] | list[Any] | Any) -> dict[str, Any] | list[Any] | Any: """ - Attempts to sanitize a configuration dictionary by obfuscating potentially sensitive keys. + Attempts to sanitize a configuration dictionary by obfuscating potentially sensitive + keys. Notes ----- From 018f8b564f99dc6ba11d15fa80bb9816dfe787c4 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Fri, 13 Jun 2025 12:40:18 -0500 Subject: [PATCH 17/17] tweaks to replace bad string --- mlos_bench/mlos_bench/util.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mlos_bench/mlos_bench/util.py b/mlos_bench/mlos_bench/util.py index 732b04eda6..3d070b53d4 100644 --- a/mlos_bench/mlos_bench/util.py +++ b/mlos_bench/mlos_bench/util.py @@ -477,6 +477,8 @@ def _recursive_sanitize( conf: dict[str, Any] | list[Any] | str, ) -> dict[str, Any] | list[Any] | str: """Recursively sanitize a dictionary.""" + if isinstance(conf, str) and conf in _SANITIZE_KEYS: + return "[REDACTED]" if isinstance(conf, list): return [_recursive_sanitize(item) for item in conf] if isinstance(conf, dict):