From 20e37a2d30f182695a10fea50a887cd67e24cd0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Wed, 31 Dec 2025 02:06:06 +0100 Subject: [PATCH 1/2] experimental model framework --- openml/__init__.py | 2 + openml/_get.py | 9 ++ openml/base/__init__.py | 6 + openml/{base.py => base/_base.py} | 3 +- openml/base/_base_pkg.py | 120 +++++++++++++++++++ openml/models/__init__.py | 5 + openml/models/_get.py | 63 ++++++++++ openml/models/apis/__init__.py | 5 + openml/models/apis/_classifier.py | 24 ++++ openml/models/base/__init__.py | 5 + openml/models/base/_base.py | 41 +++++++ openml/models/classification/__init__.py | 1 + openml/models/classification/auto_sklearn.py | 14 +++ openml/models/classification/xgboost.py | 14 +++ pyproject.toml | 1 + 15 files changed, 311 insertions(+), 2 deletions(-) create mode 100644 openml/_get.py create mode 100644 openml/base/__init__.py rename openml/{base.py => base/_base.py} (98%) create mode 100644 openml/base/_base_pkg.py create mode 100644 openml/models/__init__.py create mode 100644 openml/models/_get.py create mode 100644 openml/models/apis/__init__.py create mode 100644 openml/models/apis/_classifier.py create mode 100644 openml/models/base/__init__.py create mode 100644 openml/models/base/_base.py create mode 100644 openml/models/classification/__init__.py create mode 100644 openml/models/classification/auto_sklearn.py create mode 100644 openml/models/classification/xgboost.py diff --git a/openml/__init__.py b/openml/__init__.py index c49505eb9..f93cbb5d3 100644 --- a/openml/__init__.py +++ b/openml/__init__.py @@ -48,6 +48,7 @@ OpenMLSupervisedTask, OpenMLTask, ) +from openml._get import get def populate_cache( @@ -120,4 +121,5 @@ def populate_cache( "utils", "_api_calls", "__version__", + "get", ] diff --git a/openml/_get.py b/openml/_get.py new file mode 100644 index 000000000..b576668db --- /dev/null +++ b/openml/_get.py @@ -0,0 +1,9 @@ +"""Global get dispatch utility.""" + +# currently just a forward to models +# to discuss and possibly +# todo: add global get utility here +# in general, e.g., datasets will not have same name as models etc +from openml.models import get + +__all__ = ["get"] diff --git a/openml/base/__init__.py b/openml/base/__init__.py new file mode 100644 index 000000000..76a88c42b --- /dev/null +++ b/openml/base/__init__.py @@ -0,0 +1,6 @@ +"""Module of base classes.""" + +from openml.base._base import OpenMLBase +from openml.base._base_pkg import _BasePkg + +__all__ = ["_BasePkg", "OpenMLBase"] diff --git a/openml/base.py b/openml/base/_base.py similarity index 98% rename from openml/base.py rename to openml/base/_base.py index fbfb9dfc8..de2b387bf 100644 --- a/openml/base.py +++ b/openml/base/_base.py @@ -10,8 +10,7 @@ import openml._api_calls import openml.config - -from .utils import _get_rest_api_type_alias, _tag_openml_base +from openml.utils import _get_rest_api_type_alias, _tag_openml_base class OpenMLBase(ABC): diff --git a/openml/base/_base_pkg.py b/openml/base/_base_pkg.py new file mode 100644 index 000000000..9f5d6005e --- /dev/null +++ b/openml/base/_base_pkg.py @@ -0,0 +1,120 @@ +"""Base Packager class.""" + +import inspect +from pathlib import Path +import sys +import textwrap + +from skbase.base import BaseObject +from skbase.utils.dependencies import _check_estimator_deps + + +class _BasePkg(BaseObject): + + _tags = { + "python_dependencies": None, + "python_version": None, + # package register and manifest + "pkg_id": None, # object id contained, "__multiple" if multiple + "pkg_obj": "reference", # or "code" + "pkg_obj_type": None, # openml API type + "pkg_compression": "zlib", # compression + } + + def __init__(self): + super().__init__() + + def materialize(self): + try: + _check_estimator_deps(obj=self) + except ModuleNotFoundError as e: + # prettier message, so the reference is to the pkg_id + # currently, we cannot simply pass the object name to skbase + # in the error message, so this is a hack + # todo: fix this in scikit-base + msg = str(e) + if len(msg) > 11: + msg = msg[11:] + raise ModuleNotFoundError(msg) from e + + return self._materialize() + + def _materialize(self): + raise RuntimeError("abstract method") + + def serialize(self): + cls_str = class_to_source(type(self)) + compress_method = self.get_tag("pkg_compression") + if compress_method in [None, "None"]: + return cls_str + + cls_str = cls_str.encode("utf-8") + exec(f"import {compress_method}") + compressed_str = eval(f"{compress_method}.compress(cls_str)") + + return compressed_str + + +def _has_source(obj) -> bool: + """ + Return True if inspect.getsource(obj) should succeed. + """ + module_name = getattr(obj, "__module__", None) + if not module_name or module_name not in sys.modules: + return False + + module = sys.modules[module_name] + file = getattr(module, "__file__", None) + if not file: + return False + + return Path(file).suffix == ".py" + + +def class_to_source(cls) -> str: + """Return full source definition of python class as string. + + Parameters + ---------- + cls : class to serialize + + Returns + ------- + str : complete definition of cls, as str. + Imports are not contained or serialized. + """"" + + # Fast path: class has retrievable source + if _has_source(cls): + source = inspect.getsource(cls) + return textwrap.dedent(source) + + # Fallback for dynamically created classes + lines = [] + + bases = [base.__name__ for base in cls.__bases__ if base is not object] + base_str = f"({', '.join(bases)})" if bases else "" + lines.append(f"class {cls.__name__}{base_str}:") + + body_added = False + + for name, value in cls.__dict__.items(): + if name.startswith("__") and name.endswith("__"): + continue + + if inspect.isfunction(value): + if _has_source(value): + method_src = inspect.getsource(value) + method_src = textwrap.indent(textwrap.dedent(method_src), " ") + lines.append(method_src) + else: + lines.append(f" def {name}(self): ...") + body_added = True + else: + lines.append(f" {name} = {repr(value)}") + body_added = True + + if not body_added: + lines.append(" pass") + + return "\n".join(lines) diff --git a/openml/models/__init__.py b/openml/models/__init__.py new file mode 100644 index 000000000..ae833fc63 --- /dev/null +++ b/openml/models/__init__.py @@ -0,0 +1,5 @@ +"""Module with packaging adapters.""" + +from openml.models._get import get + +__all__ = ["get"] diff --git a/openml/models/_get.py b/openml/models/_get.py new file mode 100644 index 000000000..b270ec0b6 --- /dev/null +++ b/openml/models/_get.py @@ -0,0 +1,63 @@ + +"""Model retrieval utility.""" + +from functools import lru_cache + + +def get(id: str): + """Retrieve model object with unique identifier. + + Parameter + --------- + id : str + unique identifier of object to retrieve + + Returns + ------- + class + retrieved object + + Raises + ------ + ModuleNotFoundError + if dependencies of object to retrieve are not satisfied + """ + + id_lookup = _id_lookup() + obj = id_lookup.get(id) + if obj is None: + raise ValueError( + f"Error in openml.get, object with package id {id} " + "does not exist." + ) + return obj().materialize() + + +# todo: need to generalize this later to more types +# currently intentionally retrieves only classifiers +# todo: replace this, optionally, by database backend +def _id_lookup(obj_type=None): + return _id_lookup_cached(obj_type=obj_type).copy() + + +@lru_cache +def _id_lookup_cached(obj_type=None): + all_objs = _all_objects(obj_type=obj_type) + + # todo: generalize that pkg can contain more than one object + lookup_dict = {obj.get_class_tag("pkg_id"): obj for obj in all_objs} + + return lookup_dict + + +@lru_cache +def _all_objects(obj_type=None): + from skbase.lookup import all_objects + + from openml.models.apis._classifier import _ModelPkgClassifier + + clses = all_objects( + object_types=_ModelPkgClassifier, package_name="openml", return_names=False + ) + + return clses diff --git a/openml/models/apis/__init__.py b/openml/models/apis/__init__.py new file mode 100644 index 000000000..f560dcf6f --- /dev/null +++ b/openml/models/apis/__init__.py @@ -0,0 +1,5 @@ +"""Module with packaging adapters.""" + +from openml.models.apis._classifier import _ModelPkgClassifier + +__all__ = ["_ModelPkgClassifier"] diff --git a/openml/models/apis/_classifier.py b/openml/models/apis/_classifier.py new file mode 100644 index 000000000..a6d75b967 --- /dev/null +++ b/openml/models/apis/_classifier.py @@ -0,0 +1,24 @@ +"""Base package for sklearn classifiers.""" + +from openml.models.base import _OpenmlModelPkg + + +class _ModelPkgClassifier(_OpenmlModelPkg): + + _tags = { + # tags specific to API type + "pkg_obj_type": "classifier", + } + + def get_obj_tags(self): + """Return tags of the object as a dictionary.""" + return {} # this needs to be implemented + + def get_obj_param_names(self): + """Return parameter names of the object as a list. + + Returns + ------- + list: names of object parameters + """ + return list(self.materialize()().get_params().keys()) diff --git a/openml/models/base/__init__.py b/openml/models/base/__init__.py new file mode 100644 index 000000000..a60e1e404 --- /dev/null +++ b/openml/models/base/__init__.py @@ -0,0 +1,5 @@ +"""Module with packaging adapters.""" + +from openml.models.base._base import _OpenmlModelPkg + +__all__ = ["_OpenmlModelPkg"] diff --git a/openml/models/base/_base.py b/openml/models/base/_base.py new file mode 100644 index 000000000..4384e754c --- /dev/null +++ b/openml/models/base/_base.py @@ -0,0 +1,41 @@ +"""Base model package class.""" + +from openml.base import _BasePkg + + +class _OpenmlModelPkg(_BasePkg): + + _obj = None + + def _materialize(self): + pkg_obj = self.get_tag("pkg_obj") + + _obj = self._obj + + if _obj is None: + raise ValueError( + "Error in materialize." + "Either _materialize must be implemented, or" + "the _obj attribute must be not None." + ) + + if pkg_obj == "reference": + from skbase.utils.dependencies import _safe_import + + obj = _safe_import(self._obj) + return obj + + elif pkg_obj == "code": + exec(self._obj) + + return obj + + # elif pkg_obj == "craft": + # identify and call appropriate craft method + + else: + raise ValueError( + 'Error in package tag "pkg_obj", ' + 'must be one of "reference", "code", "craft", ' + f'but found value {pkg_obj}, of type {type(pkg_obj)}' + ) diff --git a/openml/models/classification/__init__.py b/openml/models/classification/__init__.py new file mode 100644 index 000000000..e547a50cf --- /dev/null +++ b/openml/models/classification/__init__.py @@ -0,0 +1 @@ +"""Sklearn classification models.""" diff --git a/openml/models/classification/auto_sklearn.py b/openml/models/classification/auto_sklearn.py new file mode 100644 index 000000000..0be641394 --- /dev/null +++ b/openml/models/classification/auto_sklearn.py @@ -0,0 +1,14 @@ +"""Auto-sklearn classifier.""" + + +from openml.models.apis import _ModelPkgClassifier + + +class OpenmlPkg__AutoSklearnClassifier(_ModelPkgClassifier): + + _tags = { + "pkg_id": "AutoSklearnClassifier", + "python_dependencies": "auto-sklearn", + } + + _obj = "autosklearn.classification.AutoSklearnClassifier" diff --git a/openml/models/classification/xgboost.py b/openml/models/classification/xgboost.py new file mode 100644 index 000000000..44f3173fe --- /dev/null +++ b/openml/models/classification/xgboost.py @@ -0,0 +1,14 @@ +"""Xgboost classifier.""" + + +from openml.models.apis import _ModelPkgClassifier + + +class OpenmlPkg__XGBClassifier(_ModelPkgClassifier): + + _tags = { + "pkg_id": "XGBClassifier", + "python_dependencies": "xgboost", + } + + _obj = "xgboost.XGBClassifier" diff --git a/pyproject.toml b/pyproject.toml index 2bf762b09..83b62554d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ dependencies = [ "minio", "pyarrow", "tqdm", # For MinIO download progress bars + "scikit-base", ] requires-python = ">=3.8" maintainers = [ From d79fbe52dceca77b70b1c46a7b32aceecef2aa71 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 31 Dec 2025 01:22:05 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- openml/__init__.py | 3 ++- openml/_get.py | 2 ++ openml/base/_base_pkg.py | 17 +++++++---------- openml/models/_get.py | 19 +++++-------------- openml/models/apis/_classifier.py | 3 ++- openml/models/base/_base.py | 19 +++++++++---------- openml/models/classification/auto_sklearn.py | 2 +- openml/models/classification/xgboost.py | 2 +- 8 files changed, 29 insertions(+), 38 deletions(-) diff --git a/openml/__init__.py b/openml/__init__.py index f93cbb5d3..7eb077057 100644 --- a/openml/__init__.py +++ b/openml/__init__.py @@ -18,6 +18,8 @@ # License: BSD 3-Clause from __future__ import annotations +from openml._get import get + from . import ( _api_calls, config, @@ -48,7 +50,6 @@ OpenMLSupervisedTask, OpenMLTask, ) -from openml._get import get def populate_cache( diff --git a/openml/_get.py b/openml/_get.py index b576668db..0c5e9739e 100644 --- a/openml/_get.py +++ b/openml/_get.py @@ -4,6 +4,8 @@ # to discuss and possibly # todo: add global get utility here # in general, e.g., datasets will not have same name as models etc +from __future__ import annotations + from openml.models import get __all__ = ["get"] diff --git a/openml/base/_base_pkg.py b/openml/base/_base_pkg.py index 9f5d6005e..690b93a86 100644 --- a/openml/base/_base_pkg.py +++ b/openml/base/_base_pkg.py @@ -1,16 +1,17 @@ """Base Packager class.""" +from __future__ import annotations + import inspect -from pathlib import Path import sys import textwrap +from pathlib import Path from skbase.base import BaseObject from skbase.utils.dependencies import _check_estimator_deps class _BasePkg(BaseObject): - _tags = { "python_dependencies": None, "python_version": None, @@ -50,15 +51,11 @@ def serialize(self): cls_str = cls_str.encode("utf-8") exec(f"import {compress_method}") - compressed_str = eval(f"{compress_method}.compress(cls_str)") - - return compressed_str + return eval(f"{compress_method}.compress(cls_str)") def _has_source(obj) -> bool: - """ - Return True if inspect.getsource(obj) should succeed. - """ + """Return True if inspect.getsource(obj) should succeed.""" module_name = getattr(obj, "__module__", None) if not module_name or module_name not in sys.modules: return False @@ -82,7 +79,7 @@ def class_to_source(cls) -> str: ------- str : complete definition of cls, as str. Imports are not contained or serialized. - """"" + """ "" # Fast path: class has retrievable source if _has_source(cls): @@ -111,7 +108,7 @@ def class_to_source(cls) -> str: lines.append(f" def {name}(self): ...") body_added = True else: - lines.append(f" {name} = {repr(value)}") + lines.append(f" {name} = {value!r}") body_added = True if not body_added: diff --git a/openml/models/_get.py b/openml/models/_get.py index b270ec0b6..75b807ca7 100644 --- a/openml/models/_get.py +++ b/openml/models/_get.py @@ -1,6 +1,7 @@ - """Model retrieval utility.""" +from __future__ import annotations + from functools import lru_cache @@ -22,14 +23,10 @@ def get(id: str): ModuleNotFoundError if dependencies of object to retrieve are not satisfied """ - id_lookup = _id_lookup() obj = id_lookup.get(id) if obj is None: - raise ValueError( - f"Error in openml.get, object with package id {id} " - "does not exist." - ) + raise ValueError(f"Error in openml.get, object with package id {id} " "does not exist.") return obj().materialize() @@ -45,9 +42,7 @@ def _id_lookup_cached(obj_type=None): all_objs = _all_objects(obj_type=obj_type) # todo: generalize that pkg can contain more than one object - lookup_dict = {obj.get_class_tag("pkg_id"): obj for obj in all_objs} - - return lookup_dict + return {obj.get_class_tag("pkg_id"): obj for obj in all_objs} @lru_cache @@ -56,8 +51,4 @@ def _all_objects(obj_type=None): from openml.models.apis._classifier import _ModelPkgClassifier - clses = all_objects( - object_types=_ModelPkgClassifier, package_name="openml", return_names=False - ) - - return clses + return all_objects(object_types=_ModelPkgClassifier, package_name="openml", return_names=False) diff --git a/openml/models/apis/_classifier.py b/openml/models/apis/_classifier.py index a6d75b967..c1198ee32 100644 --- a/openml/models/apis/_classifier.py +++ b/openml/models/apis/_classifier.py @@ -1,10 +1,11 @@ """Base package for sklearn classifiers.""" +from __future__ import annotations + from openml.models.base import _OpenmlModelPkg class _ModelPkgClassifier(_OpenmlModelPkg): - _tags = { # tags specific to API type "pkg_obj_type": "classifier", diff --git a/openml/models/base/_base.py b/openml/models/base/_base.py index 4384e754c..6b3fa2a92 100644 --- a/openml/models/base/_base.py +++ b/openml/models/base/_base.py @@ -1,10 +1,11 @@ """Base model package class.""" +from __future__ import annotations + from openml.base import _BasePkg class _OpenmlModelPkg(_BasePkg): - _obj = None def _materialize(self): @@ -22,10 +23,9 @@ def _materialize(self): if pkg_obj == "reference": from skbase.utils.dependencies import _safe_import - obj = _safe_import(self._obj) - return obj + return _safe_import(self._obj) - elif pkg_obj == "code": + if pkg_obj == "code": exec(self._obj) return obj @@ -33,9 +33,8 @@ def _materialize(self): # elif pkg_obj == "craft": # identify and call appropriate craft method - else: - raise ValueError( - 'Error in package tag "pkg_obj", ' - 'must be one of "reference", "code", "craft", ' - f'but found value {pkg_obj}, of type {type(pkg_obj)}' - ) + raise ValueError( + 'Error in package tag "pkg_obj", ' + 'must be one of "reference", "code", "craft", ' + f"but found value {pkg_obj}, of type {type(pkg_obj)}" + ) diff --git a/openml/models/classification/auto_sklearn.py b/openml/models/classification/auto_sklearn.py index 0be641394..1d29044da 100644 --- a/openml/models/classification/auto_sklearn.py +++ b/openml/models/classification/auto_sklearn.py @@ -1,11 +1,11 @@ """Auto-sklearn classifier.""" +from __future__ import annotations from openml.models.apis import _ModelPkgClassifier class OpenmlPkg__AutoSklearnClassifier(_ModelPkgClassifier): - _tags = { "pkg_id": "AutoSklearnClassifier", "python_dependencies": "auto-sklearn", diff --git a/openml/models/classification/xgboost.py b/openml/models/classification/xgboost.py index 44f3173fe..5b91e647c 100644 --- a/openml/models/classification/xgboost.py +++ b/openml/models/classification/xgboost.py @@ -1,11 +1,11 @@ """Xgboost classifier.""" +from __future__ import annotations from openml.models.apis import _ModelPkgClassifier class OpenmlPkg__XGBClassifier(_ModelPkgClassifier): - _tags = { "pkg_id": "XGBClassifier", "python_dependencies": "xgboost",