diff --git a/pyproject.toml b/pyproject.toml
index 5f90c70..697aef3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -62,6 +62,12 @@ eda = [
     "pyarrow"
 ]
 
+models = [
+    "xgboost",
+    "pandas",
+    "numpy",
+]
+
 [project.urls]
diff --git a/src/alphapulse/models/__init__.py b/src/alphapulse/models/__init__.py
new file mode 100644
index 0000000..502a48b
--- /dev/null
+++ b/src/alphapulse/models/__init__.py
@@ -0,0 +1,4 @@
+from .model_abstract import ModelAbstract
+from .model_xgboost import ModelXgboost
+
+__all__ = ["ModelAbstract", "ModelXgboost"]
diff --git a/src/alphapulse/models/model_abstract.py b/src/alphapulse/models/model_abstract.py
new file mode 100644
index 0000000..234f2ad
--- /dev/null
+++ b/src/alphapulse/models/model_abstract.py
@@ -0,0 +1,24 @@
+from abc import ABC, abstractmethod
+from typing import Any
+
+import pandas as pd
+import xgboost as xgb
+
+
+class ModelAbstract(ABC):
+    """Abstract class for all models"""
+
+    @abstractmethod
+    def train(self, *_args: Any, **_kwargs: Any) -> xgb.Booster:
+        """Train the initial model"""
+        raise NotImplementedError("Train method needs to be overridden")
+
+    @abstractmethod
+    def finetune(self, *_args: Any, **_kwargs: Any) -> xgb.Booster:
+        """Finetune the trained model"""
+        raise NotImplementedError("Finetune method needs to be overridden")
+
+    @abstractmethod
+    def predict(self, *_args: Any, **_kwargs: Any) -> pd.Series:
+        """Predict the result of the trained model"""
+        raise NotImplementedError("Predict method needs to be overridden")
diff --git a/src/alphapulse/models/model_xgboost.py b/src/alphapulse/models/model_xgboost.py
new file mode 100644
index 0000000..37a69b7
--- /dev/null
+++ b/src/alphapulse/models/model_xgboost.py
@@ -0,0 +1,57 @@
+from collections.abc import Mapping
+from typing import Any
+
+import pandas as pd
+import xgboost as xgb
+
+from .model_abstract import ModelAbstract
+
+
+class ModelXgboost(ModelAbstract):
+    def __init__(self) -> None:
+        self.model: xgb.Booster | None = None
+
+    def train(
+        self,
+        X: pd.DataFrame,
+        y: pd.Series,
+        params: Mapping[str, Any],
+        num_boost_round: int = 10,
+        **kwargs: Any,
+    ) -> xgb.Booster:
+        dtrain = xgb.DMatrix(X, label=y)
+
+        self.model = xgb.train(
+            params=params, dtrain=dtrain, num_boost_round=num_boost_round, **kwargs
+        )
+        return self.model
+
+    def finetune(
+        self,
+        X: pd.DataFrame,
+        y: pd.Series,
+        params: Mapping[str, Any],
+        num_boost_round: int = 10,
+        **kwargs: Any,
+    ) -> xgb.Booster:
+        if self.model is None:
+            raise RuntimeError("Train initial model")
+
+        dtrain = xgb.DMatrix(X, label=y)
+        self.model = xgb.train(
+            params=params,
+            dtrain=dtrain,
+            num_boost_round=num_boost_round,
+            xgb_model=self.model,
+            **kwargs,
+        )
+        return self.model
+
+    def predict(self, X: pd.DataFrame, **kwargs: Any) -> pd.Series:
+        if self.model is None:
+            raise RuntimeError("Train a model first")
+
+        dtest = xgb.DMatrix(X)
+
+        preds = self.model.predict(dtest, **kwargs)
+        return pd.Series(preds, index=X.index, name="prediction")
diff --git a/tests/models/__init__.py b/tests/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/models/test_models_xgboost.py b/tests/models/test_models_xgboost.py
new file mode 100644
index 0000000..cc20ffd
--- /dev/null
+++ b/tests/models/test_models_xgboost.py
@@ -0,0 +1,108 @@
+import json
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from alphapulse.models.model_xgboost import ModelXgboost
+
+ROOT = Path(__file__).parent.parent.parent
+TRAIN_DATA_PATH = ROOT / "data" / "v5.2" / "train.parquet"
+FEATURES_JSON_PATH = ROOT / "data" / "v5.2" / "features.json"
+TEST_DATA_PATH = ROOT / "data" / "v5.2" / "live.parquet"
+
+
+@pytest.fixture
+def test_data() -> tuple[pd.DataFrame, list[str]]:
+    """Load Numerai data"""
+    with open(FEATURES_JSON_PATH, encoding="utf-8") as f:
+        feature_metadata = json.load(f)
+    feature_cols = feature_metadata["feature_sets"]["small"]
+    target_cols = feature_metadata["targets"]
+    train = pd.read_parquet(
+        TRAIN_DATA_PATH, columns=["era"] + feature_cols + target_cols
+    )
+    return train, feature_cols
+
+
+@pytest.fixture
+def xgb_params() -> dict[str, Any]:
+    return {
+        "learning_rate": 0.1,
+        "max_depth": 6,
+        "min_child_weight": 1,
+        "gamma": 0,
+        "subsample": 0.8,
+        "colsample_bytree": 0.8,
+        "lambda": 1,
+        "alpha": 0,
+    }
+
+
+def test_train_creates_model(
+    test_data: tuple[pd.DataFrame, list[str]], xgb_params: dict[str, Any]
+) -> None:
+    """Checks if model was created"""
+    train, feature_cols = test_data
+
+    model = ModelXgboost()
+    booster = model.train(
+        train[feature_cols],
+        train["target"],
+        params=xgb_params,
+        num_boost_round=10,
+    )
+
+    assert booster is not None
+    assert model.model is booster
+
+
+def test_finetune_updates_model(
+    test_data: tuple[pd.DataFrame, list[str]], xgb_params: dict[str, Any]
+) -> None:
+    """Check if finetuning actually changes the model"""
+    train, feature_cols = test_data
+
+    model = ModelXgboost()
+
+    booster_before = model.train(
+        train[feature_cols],
+        train["target"],
+        params=xgb_params,
+        num_boost_round=5,
+    )
+
+    booster_after = model.finetune(
+        train[feature_cols],
+        train["target"],
+        params=xgb_params,
+        num_boost_round=5,
+    )
+
+    assert booster_after is not None
+    assert booster_after is not booster_before
+
+
+def test_predict_output_shape_and_range(
+    test_data: tuple[pd.DataFrame, list[str]], xgb_params: dict[str, Any]
+) -> None:
+    """Checks if the number of predictions is equal to the number of test samples
+    and if each prediction is in [0,1]
+    """
+    train, feature_cols = test_data
+    test = pd.read_parquet(TEST_DATA_PATH, columns=feature_cols)
+    model = ModelXgboost()
+    model.train(
+        train[feature_cols],
+        train["target"],
+        params=xgb_params,
+        num_boost_round=10,
+    )
+
+    preds = model.predict(test)
+
+    assert preds.shape[0] == test.shape[0]
+    assert np.all(preds >= 0.0)
+    assert np.all(preds <= 1.0)
diff --git a/uv.lock b/uv.lock
index 963a1c7..03d4698 100644
--- a/uv.lock
+++ b/uv.lock
@@ -47,6 +47,11 @@ eda = [
     { name = "streamlit" },
     { name = "types-networkx" },
 ]
+models = [
+    { name = "numpy" },
+    { name = "pandas" },
+    { name = "xgboost" },
+]
 
 [package.dev-dependencies]
 dev = [
@@ -63,8 +68,10 @@ requires-dist = [
     { name = "networkx", marker = "extra == 'eda'" },
     { name = "numerapi" },
     { name = "numpy", marker = "extra == 'eda'" },
+    { name = "numpy", marker = "extra == 'models'" },
     { name = "pandas" },
     { name = "pandas", marker = "extra == 'eda'" },
+    { name = "pandas", marker = "extra == 'models'" },
     { name = "pandas-stubs", marker = "extra == 'eda'" },
     { name = "plotly", marker = "extra == 'eda'" },
     { name = "plotly-stubs", marker = "extra == 'eda'" },
@@ -86,8 +93,9 @@
     { name = "types-requests", marker = "extra == 'dev'", specifier = ">=2.32" },
     { name = "tyro" },
     { name = "vulture", marker = "extra == 'dev'", specifier = ">=2.3" },
+    { name = "xgboost", marker = "extra == 'models'" },
 ]
-provides-extras = ["dev", "eda"]
+provides-extras = ["dev", "eda", "models"]
 
 [package.metadata.requires-dev]
 dev = [
@@ -1033,6 +1041,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/48/6f/dde8e2a79a3b6cbc31bc1037c1a1dbc07c90d52d946851bd7cba67e730a8/numpy_typing_compat-20251206.2.3-py3-none-any.whl", hash = "sha256:bfa2e4c4945413e84552cbd34a6d368c88a06a54a896e77ced760521b08f0f61", size = 6300, upload-time = "2025-12-06T20:01:56.664Z" },
 ]
 
+[[package]]
+name = "nvidia-nccl-cu12"
+version = "2.29.2"
+source = { registry = "https://pypi.org/simple" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/38/b2/e4dc7b33020645746710040cb2a6ac0de8332687d3ce902156dd3d7c351a/nvidia_nccl_cu12-2.29.2-py3-none-manylinux_2_18_aarch64.whl", hash = "sha256:0712e55c067965c6093cc793a9bbcc5f37b5b47248e9ebf8ae3af06867757587", size = 289707761, upload-time = "2026-01-07T00:21:30.514Z" },
+    { url = "https://files.pythonhosted.org/packages/23/2d/609d0392d992259c6dc39881688a7fc13b1397a668bc360fbd68d1396f85/nvidia_nccl_cu12-2.29.2-py3-none-manylinux_2_18_x86_64.whl", hash = "sha256:3a9a0bf4142126e0d0ed99ec202579bef8d007601f9fab75af60b10324666b12", size = 289762233, upload-time = "2026-01-07T00:21:56.124Z" },
+]
+
 [[package]]
 name = "optype"
 version = "0.15.0"
@@ -2161,3 +2178,21 @@ sdist = { url = "https://files.pythonhosted.org/packages/b3/8f/705086c9d734d3b66
 wheels = [
     { url = "https://files.pythonhosted.org/packages/e1/07/c6fe3ad3e685340704d314d765b7912993bcb8dc198f0e7a89382d37974b/win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390", size = 4083, upload-time = "2024-12-07T15:28:26.465Z" },
 ]
+
+[[package]]
+name = "xgboost"
+version = "3.1.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "numpy" },
+    { name = "nvidia-nccl-cu12", marker = "sys_platform == 'linux'" },
+    { name = "scipy" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/42/db/ff3eb8ff8cdf87a57cbb0f484234b4353178587236c4c84c1d307165c1f8/xgboost-3.1.3.tar.gz", hash = "sha256:0aeaa59d7ba09221a6fa75f70406751cfafdf3f149d0a91b197a1360404a28f3", size = 1237662, upload-time = "2026-01-10T00:20:13.458Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1b/a9/8668a5662c497c32ab127b7ca57d91153f499b31c725969a1e4147782e64/xgboost-3.1.3-py3-none-macosx_10_15_x86_64.whl", hash = "sha256:e16a6c352ee1a4c19372a7b2bb75129e10e63adeeabd3d11f21b7787378e5a50", size = 2378032, upload-time = "2026-01-10T00:18:14.103Z" },
+    { url = "https://files.pythonhosted.org/packages/52/39/ec5c53228b091387e934d3d419e8e3a5ce98c1650d458987d6e254a15304/xgboost-3.1.3-py3-none-macosx_12_0_arm64.whl", hash = "sha256:a7a1d59f3529de0ad9089c59b6cc595cd7b4424feabcc06463c4bde41f202f74", size = 2211477, upload-time = "2026-01-10T00:18:34.409Z" },
+    { url = "https://files.pythonhosted.org/packages/99/f7/ceb06e6b959e5a8b303883482ecad346495641947679e3f735ae8ac1caa7/xgboost-3.1.3-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:2e31482633883b2e95fda6055db654bbfac82e10d91ad3d9929086ebd28eb1c4", size = 115346575, upload-time = "2026-01-10T00:19:11.44Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/9c/9d4ad7f586698bad52a570d2bf81138e500a5d9f32723c2b4ed1dd9252d8/xgboost-3.1.3-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:687504d1d76dc797df08b0dbe8b83d58629cdc06df52378f617164d16142bf2c", size = 115926894, upload-time = "2026-01-10T00:19:49.123Z" },
+    { url = "https://files.pythonhosted.org/packages/3a/d8/4d4ae25452577f2dfabc66b60e712e7c01f9fe6c389fa88c546c2f427c4d/xgboost-3.1.3-py3-none-win_amd64.whl", hash = "sha256:3fe349b4c6030f0d66e166a3a6b7d470e776d530ea240d77335e36144cbe132a", size = 72011993, upload-time = "2026-01-10T00:17:42.98Z" },
+]