diff --git a/.gitignore b/.gitignore index b7faf40..a136db4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,72 @@ +# Created by https://www.toptal.com/developers/gitignore/api/python,linux,windows,macos,jupyternotebooks,visualstudiocode +# Edit at https://www.toptal.com/developers/gitignore?templates=python,linux,windows,macos,jupyternotebooks,visualstudiocode + +### JupyterNotebooks ### +# gitignore template for Jupyter Notebooks +# website: http://jupyter.org/ + +.ipynb_checkpoints +*/.ipynb_checkpoints/* + +# IPython +profile_default/ +ipython_config.py + +# Remove previous ipynb_checkpoints +# git rm -r .ipynb_checkpoints/ + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### macOS ### +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### macOS Patch ### +# iCloud generated files +*.icloud + +### Python ### # Byte-compiled / optimized / DLL files __pycache__/ -*.py[codz] +*.py[cod] *$py.class # C extensions @@ -46,7 +112,7 @@ htmlcov/ nosetests.xml coverage.xml *.cover -*.py.cover +*.py,cover .hypothesis/ .pytest_cache/ cover/ @@ -76,11 +142,8 @@ docs/_build/ target/ # Jupyter Notebook -.ipynb_checkpoints # IPython -profile_default/ -ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is @@ -94,35 +157,20 @@ ipython_config.py # install 
all needed dependencies. #Pipfile.lock -# UV -# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -#uv.lock - # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock -#poetry.toml # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. -# https://pdm-project.org/en/latest/usage/project/#working-with-version-control #pdm.lock -#pdm.toml -.pdm-python -.pdm-build/ - -# pixi -# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. -#pixi.lock -# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one -# in the .venv directory. It is recommended not to include this directory in version control. -.pixi +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ @@ -136,7 +184,6 @@ celerybeat.pid # Environments .env -.envrc .venv env/ venv/ @@ -175,33 +222,99 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ -# Abstra -# Abstra is an AI-powered process automation framework. -# Ignore directories containing user credentials, local state, and settings. 
-# Learn more at https://abstra.io/docs -.abstra/ - -# Visual Studio Code -# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore -# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore -# and can be added to the global gitignore or merged into this file. However, if you prefer, -# you could uncomment the following to ignore the entire vscode folder -# .vscode/ +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml -# Ruff stuff: +# ruff .ruff_cache/ -# PyPI configuration file -.pypirc +# LSP config files +pyrightconfig.json + +### VisualStudioCode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +### VisualStudioCode Patch ### +# Ignore all local history of files +.history +.ionide + +### Windows ### +# Windows thumbnail cache files +Thumbs.db +Thumbs.db:encryptable +ehthumbs.db +ehthumbs_vista.db + +# Dump file +*.stackdump + +# Folder config file +[Dd]esktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msix +*.msm +*.msp + +# Windows shortcuts +*.lnk + +# End of https://www.toptal.com/developers/gitignore/api/python,linux,windows,macos,jupyternotebooks,visualstudiocode + + +lightning_logs/ +logs/ +tmp/ + + +#/**/data/** +#!/**/data/**/ + +#/**/models/** +#!/**/models/**/ + + +#*.json +#*.h5 +#*.pickle +*.csv +*.pkl +*.zip +*.h5 +*.hdf5 +*.joblib +*.feather +*.parquet + +auto_model/ -# Cursor -# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to -# exclude from AI features like autocomplete and code analysis. 
Recommended for sensitive data -# refer to https://docs.cursor.com/context/ignore-files -.cursorignore -.cursorindexingignore +#*.ckpt +#*.joblib +#*.pkl +#*.pq +#*.parquet -# Marimo -marimo/_static/ -marimo/_lsp/ -__marimo__/ +#!**/.gitkeep +#!**/dvc.yml +#!**/dvc.yaml +#!**/*.dvc +#!**/*.py +#!**/dvc.lock diff --git a/CountLOC.py b/CountLOC.py new file mode 100644 index 0000000..e46255a --- /dev/null +++ b/CountLOC.py @@ -0,0 +1,171 @@ +from pathlib import Path +import re +import json +import os +from rich.console import Console +from rich.syntax import Syntax +from rich.text import Text +import sys + +NOTEBOOKS_PATH = Path("./JupyterNotebooks") +CONFIGS_PATH = Path("./LOCALIZE_Configs") + +def is_blank(line): + return line.strip() == "" + +def strip_comment(line): + line = line.split("#", 1)[0] + line = re.sub(r'("""|\'\'\')(.*?)\1', '', line) + return line.rstrip() + +def strip_structural(line): + return "".join([token for token in line if token not in "{}[](),"]) + +def is_meaningful_line(line): + if is_blank(line): + return False + if is_blank(strip_comment(line)): + return False + if is_blank(strip_structural(line)): + return False + return True + +def strip_meaningless(lines): + return [strip_comment(line) for line in lines if is_meaningful_line(line)] + +def extract_code_lines_notebook(ipynb_path): + """Extracts code lines from a Jupyter notebook (.ipynb) as a list of strings.""" + with open(ipynb_path, 'r', encoding='utf-8') as f: + notebook = json.load(f) + + lines = [] + for cell in notebook.get("cells", []): + if cell.get("cell_type") == "code": + lines.extend(cell.get("source", [])) # source is already a list of lines + lines.append("\n") # separate cells with a newline + return lines + +def keep_unique_lines(lines, other): + out = [] + other = [line.strip() for line in other] + for line in lines: + stripped = line.strip() + if stripped in other: + other.remove(stripped) + else: + out.append(line) + return out + +def remove_unwanted_lines(lines, 
notebook): + return [line for line in lines if notebook.stem not in line] + +def color_diff_lines(lines, prefix, prefix_style): + styled = [] + for line in lines: + # Create a Text object + text = Text(end = "") + text.append(f"{prefix} ", style=prefix_style) + code = line.strip("\n") + syntax = Syntax(code, "python", theme="monokai", line_numbers=False, word_wrap=False) + hlcd = syntax.highlight(code) + hlcd.rstrip() + text.append(hlcd) # only highlight the actual code + styled.append(text) + return styled + +notebooks = sorted([Path(file) for file in os.listdir(NOTEBOOKS_PATH) if file.endswith(".ipynb")]) + +pr = strip_meaningless(extract_code_lines_notebook(NOTEBOOKS_PATH / notebooks[0])) +pr = remove_unwanted_lines(pr, notebooks[0]) + +chj = { + "deleted": [], + "added": [] +} + +print("Jupyter:") +for notebook in notebooks[1:5]: + c = strip_meaningless(extract_code_lines_notebook(NOTEBOOKS_PATH / notebook)) + c = remove_unwanted_lines(c, notebook) + + deleted = keep_unique_lines(pr, c) + added = keep_unique_lines(c, pr) + + deleted_styled = color_diff_lines(deleted, "-", "bold red") + added_styled = color_diff_lines(added, "+", "bold green") + + pr = c + + console = Console( + force_jupyter=False, + force_terminal=True, + file=sys.stdout, + ) + + console.print(f" [bold red]lines deleted:[/] {len(deleted)}") + console.print(f" [green]lines added:[/] {len(added)}") + print("____________________\n") + chj["deleted"].append(len(deleted)) + chj["added"].append(len(added)) + +configs = sorted([Path(file) for file in os.listdir(CONFIGS_PATH)])[1:6] + +def extract_code_lines_yaml(path): + with open(path, 'r', encoding='utf-8') as f: + return f.readlines() + +def color_diff_lines(lines, prefix, prefix_style): + styled = [] + for line in lines: + # Create a Text object + text = Text(end = "") + text.append(f"{prefix} ", style=prefix_style) + code = line.strip("\n") + syntax = Syntax(code, "yaml", theme="monokai", line_numbers=False, word_wrap=False) + hlcd = 
syntax.highlight(code) + hlcd.rstrip() + text.append(hlcd) # only highlight the actual code + styled.append(text) + return styled + +print("Framework:") +chf = { + "dvc.yaml":{ + "deleted": [], + "added": [] + }, + "params.yaml":{ + "deleted": [], + "added": [] + } +} +for cnf in ["dvc.yaml", "params.yaml"]: + pr = strip_meaningless(extract_code_lines_yaml(CONFIGS_PATH / configs[0] / cnf)) + pr = remove_unwanted_lines(pr, configs[0]) + for config in configs[1:]: + print(cnf) + c = strip_meaningless(extract_code_lines_yaml(CONFIGS_PATH / config / cnf)) + c = remove_unwanted_lines(c, config) + + deleted = keep_unique_lines(pr, c) + added = keep_unique_lines(c, pr) + + deleted_styled = color_diff_lines(deleted, "-", "bold red") + added_styled = color_diff_lines(added, "+", "bold green") + + pr = c + + console = Console( + force_jupyter=False, + force_terminal=True, + file=sys.stdout, + width = 120 + ) + + console.print(f" [bold red]lines deleted:[/] {len(deleted)}") + console.print(f" [green]lines added:[/] {len(added)}") + + chf[cnf]["deleted"].append(len(deleted)) + chf[cnf]["added"].append(len(added)) + + print("____________________\n") \ No newline at end of file diff --git a/JupyterNotebooks/00-Initial.ipynb b/JupyterNotebooks/00-Initial.ipynb new file mode 100644 index 0000000..2b31707 --- /dev/null +++ b/JupyterNotebooks/00-Initial.ipynb @@ -0,0 +1,213 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "91d230e0-cc84-4ad5-9005-9f166d483d03", + "metadata": {}, + "outputs": [], + "source": [ + "import joblib\n", + "from pathlib import Path\n", + "import pandas as pd\n", + "import numpy as np\n", + "import sklearn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0304a74-ede7-4e21-8929-8e06206f4d5d", + "metadata": {}, + "outputs": [], + "source": [ + "data = Path(\"DataSets/umu\")\n", + "results_path = Path(\"00-Initial-results\")\n", + "random_seed = 42\n", + "n_splits = 5" + ] + }, + { + "cell_type": "code", 
+ "execution_count": null, + "id": "e320cd24-f026-4fae-ab48-0604c6ec8de0", + "metadata": {}, + "outputs": [], + "source": [ + "# Praparation\n", + "# We assume the dataset has been downloaded and unzipped manually\n", + "\n", + "df = pd.read_excel(data / \"umu\" / \"tcp_nokia_20240325.xlsx\")\n", + "\n", + "df = df[\n", + " [\"Column7\",\"Column8\",\"Column14\",\"Column15\",\"Column42\",\"Column43\",\"Column45\",\n", + " \"Column46\",\"Column47\",\"Column48\",\"Column87\",\"Column88\",\"Column78\",\"Column79\"]\n", + "]\n", + "df.columns = df.iloc[0]\n", + "df = df[1:]\n", + "\n", + "# convert all columns to numeric\n", + "for column in df.columns:\n", + " if df[column].dtype == \"object\":\n", + " df[column] = pd.to_numeric(df[column], errors=\"coerce\")\n", + "\n", + "df.columns = [col.replace(\"nas_value_nr5g_\", \"\") for col in df.columns]\n", + "\n", + "df = df.dropna() # drom the ~2 rows with NaN\n", + "df = df.loc[:, df.nunique() > 1] # keep only columns with more than one uniqe value" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "814ec0fe-4140-49f3-ae9b-8ab752ab54ed", + "metadata": {}, + "outputs": [], + "source": [ + "#Feature generation\n", + "import math\n", + "\n", + "def lat_lon_to_meters(origin_lat, origin_lon, point_lat, point_lon) -> tuple[float, float]:\n", + " \"\"\"Works \"fine\" for distances less than 100km\"\"\"\n", + "\n", + " R = 6_378_137 # Earth's radius in meters\n", + "\n", + " origin_lat_rad = math.radians(origin_lat) # Convert latitude and longitude from degrees to radians\n", + " delta_lat_rad = math.radians(point_lat - origin_lat)\n", + " delta_lon_rad = math.radians(point_lon - origin_lon)\n", + "\n", + " delta_meters_lat = delta_lat_rad * R # Calculate distance in the latitude direction (North-South)\n", + "\n", + " delta_meters_lon = delta_lon_rad * R * math.cos(origin_lat_rad) # Calculate distance in the longitude direction (East-West)\n", + "\n", + " return delta_meters_lat, delta_meters_lon\n", + 
"\n", + "origin_lat, origin_lon = df.gpsd_tpv_lat.min(), df.gpsd_tpv_lon.min()\n", + "\n", + "df[[\"target_x\", \"target_y\"]] = df.apply(\n", + " lambda row: lat_lon_to_meters(origin_lat, origin_lon, row[\"gpsd_tpv_lat\"], row[\"gpsd_tpv_lon\"]),\n", + " axis=1,\n", + " result_type=\"expand\",\n", + ")\n", + "\n", + "df.drop(columns=[\"gpsd_tpv_lat\", \"gpsd_tpv_lon\"], inplace=True)\n", + "\n", + "targets = [\"target_x\", \"target_y\"] # Find target column(s)\n", + "\n", + "features, targets = df.drop(targets, axis=1), df[targets] # X are features, y are target(s)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "997a14f2-afe8-4f6c-b2dd-7312a31a54b9", + "metadata": {}, + "outputs": [], + "source": [ + "#Split generation\n", + "from sklearn import model_selection\n", + "groups = None\n", + "\n", + "cv = model_selection.KFold(\n", + " n_splits=n_splits,\n", + " shuffle=True,\n", + " random_state=random_seed,\n", + ")\n", + "\n", + "indices = []\n", + "\n", + "for train_indices, test_indices in cv.split(features, targets, groups):\n", + " indices.append((train_indices, test_indices))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1593cd46-3eb7-4a64-b358-fb3b7b2bd821", + "metadata": {}, + "outputs": [], + "source": [ + "class PredefinedSplit(model_selection.BaseCrossValidator):\n", + " \"\"\"Simple cross-validator for predefined train-test splits.\"\"\"\n", + "\n", + " def __init__(self, indices_pairs: list[tuple[np.ndarray, np.ndarray]]):\n", + " self.idx_pairs = indices_pairs\n", + "\n", + " def get_n_splits(self, X=None, y=None, groups=None):\n", + " \"\"\"Return the number of splitting iterations in the cross-validator\"\"\"\n", + " return len(self.idx_pairs)\n", + "\n", + " def split(self, X, y=None, groups=None):\n", + " \"\"\"Generate indices to split data into training and test set.\"\"\"\n", + " for train_idx, test_idx in self.idx_pairs:\n", + " yield train_idx, test_idx" + ] + }, + { + "cell_type": "code", 
+ "execution_count": null, + "id": "5d3ec933-69ce-4259-baee-2d539bcfa6b7", + "metadata": {}, + "outputs": [], + "source": [ + "#Train&Evaluate\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.metrics import make_scorer, root_mean_squared_error\n", + "cv = PredefinedSplit(indices)\n", + "\n", + "gs = model_selection.GridSearchCV(\n", + " estimator = LinearRegression(),\n", + " param_grid = {\n", + " \"fit_intercept\":[True, False]\n", + " },\n", + " n_jobs = 4,\n", + " error_score = \"raise\",\n", + " refit = True,\n", + " scoring = make_scorer(root_mean_squared_error, greater_is_better=False),\n", + " cv = cv,\n", + ")\n", + "\n", + "gs.fit(features, targets)\n", + "\n", + "results_df = pd.DataFrame(gs.cv_results_)\n", + "\n", + "# Select key columns to display\n", + "cols_to_show = [\n", + " 'params',\n", + " 'mean_test_score',\n", + " 'std_test_score',\n", + " 'rank_test_score',\n", + " 'mean_fit_time',\n", + " 'mean_score_time',\n", + "]\n", + "\n", + "# Print as a table\n", + "print(results_df[cols_to_show].to_string(index=False))\n", + "Path(results_path).mkdir(parents=True, exist_ok=True)\n", + "joblib.dump(gs.best_estimator_, results_path / f\"Model_{estimators[index].__class__.__name__}-KFoldSplit.pkl\") # Note it's only possible to get the best estimator, as the framework uses a modified version of the class to save all the models\n", + "joblib.dump(results_df, results_path / f\"Results_{estimators[index].__class__.__name__}-KFoldSplit.pkl\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (nancy)", + "language": "python", + "name": "nancy" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git 
a/JupyterNotebooks/01-Changed_and_added_model.ipynb b/JupyterNotebooks/01-Changed_and_added_model.ipynb new file mode 100644 index 0000000..8ff3091 --- /dev/null +++ b/JupyterNotebooks/01-Changed_and_added_model.ipynb @@ -0,0 +1,231 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "91d230e0-cc84-4ad5-9005-9f166d483d03", + "metadata": {}, + "outputs": [], + "source": [ + "import joblib\n", + "from pathlib import Path\n", + "import pandas as pd\n", + "import numpy as np\n", + "import sklearn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0304a74-ede7-4e21-8929-8e06206f4d5d", + "metadata": {}, + "outputs": [], + "source": [ + "data = Path(\"DataSets/umu\")\n", + "results_path = Path(\"01-Changed_and_added_model-results\")\n", + "random_seed = 42\n", + "n_splits = 5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e320cd24-f026-4fae-ab48-0604c6ec8de0", + "metadata": {}, + "outputs": [], + "source": [ + "# Praparation\n", + "# We assume the dataset has been downloaded and unzipped manually\n", + "\n", + "df = pd.read_excel(data / \"umu\" / \"tcp_nokia_20240325.xlsx\")\n", + "\n", + "df = df[\n", + " [\"Column7\",\"Column8\",\"Column14\",\"Column15\",\"Column42\",\"Column43\",\"Column45\",\n", + " \"Column46\",\"Column47\",\"Column48\",\"Column87\",\"Column88\",\"Column78\",\"Column79\"]\n", + "]\n", + "df.columns = df.iloc[0]\n", + "df = df[1:]\n", + "\n", + "# convert all columns to numeric\n", + "for column in df.columns:\n", + " if df[column].dtype == \"object\":\n", + " df[column] = pd.to_numeric(df[column], errors=\"coerce\")\n", + "\n", + "df.columns = [col.replace(\"nas_value_nr5g_\", \"\") for col in df.columns]\n", + "\n", + "df = df.dropna() # drom the ~2 rows with NaN\n", + "df = df.loc[:, df.nunique() > 1] # keep only columns with more than one uniqe value" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "814ec0fe-4140-49f3-ae9b-8ab752ab54ed", + 
"metadata": {}, + "outputs": [], + "source": [ + "#Feature generation\n", + "import math\n", + "\n", + "def lat_lon_to_meters(origin_lat, origin_lon, point_lat, point_lon) -> tuple[float, float]:\n", + " \"\"\"Works \"fine\" for distances less than 100km\"\"\"\n", + "\n", + " R = 6_378_137 # Earth's radius in meters\n", + "\n", + " origin_lat_rad = math.radians(origin_lat) # Convert latitude and longitude from degrees to radians\n", + " delta_lat_rad = math.radians(point_lat - origin_lat)\n", + " delta_lon_rad = math.radians(point_lon - origin_lon)\n", + "\n", + " delta_meters_lat = delta_lat_rad * R # Calculate distance in the latitude direction (North-South)\n", + "\n", + " delta_meters_lon = delta_lon_rad * R * math.cos(origin_lat_rad) # Calculate distance in the longitude direction (East-West)\n", + "\n", + " return delta_meters_lat, delta_meters_lon\n", + "\n", + "origin_lat, origin_lon = df.gpsd_tpv_lat.min(), df.gpsd_tpv_lon.min()\n", + "\n", + "df[[\"target_x\", \"target_y\"]] = df.apply(\n", + " lambda row: lat_lon_to_meters(origin_lat, origin_lon, row[\"gpsd_tpv_lat\"], row[\"gpsd_tpv_lon\"]),\n", + " axis=1,\n", + " result_type=\"expand\",\n", + ")\n", + "\n", + "df.drop(columns=[\"gpsd_tpv_lat\", \"gpsd_tpv_lon\"], inplace=True)\n", + "\n", + "targets = [\"target_x\", \"target_y\"] # Find target column(s)\n", + "\n", + "features, targets = df.drop(targets, axis=1), df[targets] # X are features, y are target(s)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "997a14f2-afe8-4f6c-b2dd-7312a31a54b9", + "metadata": {}, + "outputs": [], + "source": [ + "#Split generation\n", + "from sklearn import model_selection\n", + "groups = None\n", + "\n", + "cv = model_selection.KFold(\n", + " n_splits=n_splits,\n", + " shuffle=True,\n", + " random_state=random_seed,\n", + ")\n", + "\n", + "indices = []\n", + "\n", + "for train_indices, test_indices in cv.split(features, targets, groups):\n", + " indices.append((train_indices, test_indices))\n" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1593cd46-3eb7-4a64-b358-fb3b7b2bd821", + "metadata": {}, + "outputs": [], + "source": [ + "class PredefinedSplit(model_selection.BaseCrossValidator):\n", + " \"\"\"Simple cross-validator for predefined train-test splits.\"\"\"\n", + "\n", + " def __init__(self, indices_pairs: list[tuple[np.ndarray, np.ndarray]]):\n", + " self.idx_pairs = indices_pairs\n", + "\n", + " def get_n_splits(self, X=None, y=None, groups=None):\n", + " \"\"\"Return the number of splitting iterations in the cross-validator\"\"\"\n", + " return len(self.idx_pairs)\n", + "\n", + " def split(self, X, y=None, groups=None):\n", + " \"\"\"Generate indices to split data into training and test set.\"\"\"\n", + " for train_idx, test_idx in self.idx_pairs:\n", + " yield train_idx, test_idx" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d3ec933-69ce-4259-baee-2d539bcfa6b7", + "metadata": {}, + "outputs": [], + "source": [ + "#Train&Evaluate\n", + "from sklearn.ensemble import RandomForestRegressor \n", + "from sklearn.neighbors import KNeighborsRegressor\n", + "from sklearn.metrics import make_scorer, root_mean_squared_error\n", + "cv = PredefinedSplit(indices)\n", + "\n", + "estimators = [\n", + " RandomForestRegressor(random_state=42),\n", + " KNeighborsRegressor()\n", + "]\n", + "params=[\n", + " {\n", + " \"n_estimators\": [10, 50, 100, 250, 400], \n", + " \"max_depth\": [5, 10, 30, 50, 150, 200, None]\n", + " },\n", + " {\n", + " \"n_neighbors\": [3, 5, 10], \n", + " \"weights\": [\"uniform\", \"distance\"], \n", + " \"p\": [1, 2], \n", + " \"leaf_size\": [10, 15, 30], \n", + " \"metric\": [\"minkowski\", \"euclidean\"] \n", + " }\n", + "]\n", + "\n", + "for index in range(2):\n", + " gs = model_selection.GridSearchCV(\n", + " estimator = estimators[index],\n", + " param_grid = params[index],\n", + " n_jobs = 4,\n", + " error_score = \"raise\",\n", + " refit = True,\n", + " scoring = 
make_scorer(root_mean_squared_error, greater_is_better=False),\n", + " cv = cv,\n", + " )\n", + " \n", + " gs.fit(features, targets)\n", + " \n", + " results_df = pd.DataFrame(gs.cv_results_)\n", + " \n", + " # Select key columns to display\n", + " cols_to_show = [\n", + " 'params',\n", + " 'mean_test_score',\n", + " 'std_test_score',\n", + " 'rank_test_score',\n", + " 'mean_fit_time',\n", + " 'mean_score_time',\n", + " ]\n", + " \n", + " # Print as a table\n", + " print(results_df[cols_to_show].to_string(index=False))\n", + " Path(results_path).mkdir(parents=True, exist_ok=True)\n", + " joblib.dump(gs.best_estimator_, results_path / f\"Model_{estimators[index].__class__.__name__}-KFoldSplit.pkl\") # Note it's only possible to get the best estimator, as the framework uses a modified version of the class to save all the models\n", + " joblib.dump(results_df, results_path / f\"Results_{estimators[index].__class__.__name__}-KFoldSplit.pkl\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (nancy)", + "language": "python", + "name": "nancy" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/JupyterNotebooks/02-Changed_dataset_to_logatec.ipynb b/JupyterNotebooks/02-Changed_dataset_to_logatec.ipynb new file mode 100644 index 0000000..0edf0cd --- /dev/null +++ b/JupyterNotebooks/02-Changed_dataset_to_logatec.ipynb @@ -0,0 +1,268 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "91d230e0-cc84-4ad5-9005-9f166d483d03", + "metadata": {}, + "outputs": [], + "source": [ + "import joblib\n", + "from pathlib import Path\n", + "import pandas as pd\n", + "import numpy as np\n", + "import sklearn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "e0304a74-ede7-4e21-8929-8e06206f4d5d", + "metadata": {}, + "outputs": [], + "source": [ + "data = Path(\"DataSets/logatec\")\n", + "results_path = Path(\"02-Changed_dataset_to_logatec-results\")\n", + "random_seed = 42\n", + "n_splits = 5\n", + "test_size = 0.20\n", + "subsets = [\"spring\", \"winter\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a584e20f-f567-4716-bf99-2c631e2fb789", + "metadata": {}, + "outputs": [], + "source": [ + "def load_raw_data(path: Path) -> pd.DataFrame:\n", + " with open(path, mode=\"r\") as fp:\n", + " data = json.load(fp)\n", + "\n", + " df = []\n", + "\n", + " for position, measurements in data.items():\n", + " digits = re.findall(r\"\\d+\", position)\n", + " location = tuple(int(i) for i in digits)\n", + "\n", + " # Winter dataset has measurements only in the middle (3rd) row.\n", + " if len(location) == 1:\n", + " location = (3, *location)\n", + "\n", + " assert len(location) == 2, f\"location identifier is not length 2: {location}\"\n", + "\n", + " pos_x, pos_y = location\n", + "\n", + " for device_id, samples in measurements.items():\n", + " device_id = int(device_id)\n", + " for sample in samples:\n", + " timestamp, value = sample[\"timestamp\"], sample[\"rss\"]\n", + "\n", + " item = {\"pos_x\": pos_x, \"pos_y\": pos_y, \"node\": device_id, \"timestamp\": timestamp, \"value\": value}\n", + " df.append(item)\n", + "\n", + " df = pd.DataFrame(df)\n", + " df.timestamp = pd.to_datetime(df.timestamp, unit=\"s\", origin=\"unix\").astype(\"datetime64[s]\")\n", + " df = df.astype({\"pos_x\": \"uint8\", \"pos_y\": \"uint8\", \"value\": \"int8\", \"node\": \"uint8\"})\n", + "\n", + " return df\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e320cd24-f026-4fae-ab48-0604c6ec8de0", + "metadata": {}, + "outputs": [], + "source": [ + "# Praparation\n", + "# We assume the dataset has been downloaded and unzipped manually\n", + "\n", + "import json\n", + "import re\n", + "\n", + "df 
= [load_raw_data(data / f\"{subsets[0]}_data.json\"), load_raw_data(data / f\"{subsets[1]}_data.json\")]\n", + "\n", + "for idx, subset in enumerate(subsets):\n", + " dat = []\n", + " \n", + " # Average the sample value within a second.\n", + " for (x, y, node, ts), subset in df[idx].groupby(by=[\"pos_x\", \"pos_y\", \"node\", \"timestamp\"]):\n", + " avg_value = subset.value.sum(min_count=1) / subset.value.count()\n", + " item = {\"pos_x\": x, \"pos_y\": y, \"node\": node, \"timestamp\": ts, \"value\": avg_value}\n", + " dat.append(item)\n", + " \n", + " df[idx] = pd.DataFrame(dat)\n", + " df[idx] = df[idx].pivot(index=[\"timestamp\", \"pos_x\", \"pos_y\"], columns=[\"node\"], values=[\"value\"])\n", + " df[idx] = df[idx].reset_index(drop=False)\n", + " \n", + " # After pivot, column names become tuples. Fix that.\n", + " df[idx].columns = [\"\".join(map(str, col)).strip().replace(\"value\", \"node\") for col in df[idx].columns.values]\n", + " \n", + " # Fill the NaN values with some extremely low RSS value\n", + " df[idx] = df[idx].fillna(-180)\n", + " \n", + " # TODO: Should this be part of prepare-feature stage?\n", + " # Remove datetime column\n", + " df[idx] = df[idx].drop(columns=[\"timestamp\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "814ec0fe-4140-49f3-ae9b-8ab752ab54ed", + "metadata": {}, + "outputs": [], + "source": [ + "#Feature generation\n", + "for idx, subset in enumerate(subsets):\n", + " # Convert discrete values to meters\n", + " df[idx].pos_x = (df[idx].pos_x - 1) * 1.2 # meters\n", + " df[idx].pos_y = (df[idx].pos_y - 1) * 1.2 # meters\n", + " \n", + " df[idx] = df[idx].rename(columns={\"pos_x\": \"target_x\", \"pos_y\": \"target_y\"})\n", + "\n", + "# Find target column(s)\n", + "targets = [\"target_x\", \"target_y\"]\n", + "\n", + "# X are features, y are target(s)\n", + "features, targets = [df[0].drop(targets, axis=1), df[1].drop(targets, axis=1)], [df[0][targets], df[1][targets]]" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "997a14f2-afe8-4f6c-b2dd-7312a31a54b9", + "metadata": {}, + "outputs": [], + "source": [ + "#Split generation\n", + "from sklearn import model_selection\n", + "\n", + "groups = None\n", + "\n", + "cv = model_selection.KFold(\n", + " n_splits=n_splits,\n", + " shuffle=True,\n", + " random_state=random_seed,\n", + " )\n", + "\n", + "indices = indices = [[] for _ in range(len(subsets))]\n", + "\n", + "for idx, subset in enumerate(subsets):\n", + " for train_indices, test_indices in cv.split(features[idx], targets[idx], groups):\n", + " indices[idx].append((train_indices, test_indices))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1593cd46-3eb7-4a64-b358-fb3b7b2bd821", + "metadata": {}, + "outputs": [], + "source": [ + "class PredefinedSplit(model_selection.BaseCrossValidator):\n", + " \"\"\"Simple cross-validator for predefined train-test splits.\"\"\"\n", + "\n", + " def __init__(self, indices_pairs: list[tuple[np.ndarray, np.ndarray]]):\n", + " self.idx_pairs = indices_pairs\n", + "\n", + " def get_n_splits(self, X=None, y=None, groups=None):\n", + " \"\"\"Return the number of splitting iterations in the cross-validator\"\"\"\n", + " return len(self.idx_pairs)\n", + "\n", + " def split(self, X, y=None, groups=None):\n", + " \"\"\"Generate indices to split data into training and test set.\"\"\"\n", + " for train_idx, test_idx in self.idx_pairs:\n", + " yield train_idx, test_idx" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d3ec933-69ce-4259-baee-2d539bcfa6b7", + "metadata": {}, + "outputs": [], + "source": [ + "#Train&Evaluate\n", + "from sklearn.ensemble import RandomForestRegressor \n", + "from sklearn.neighbors import KNeighborsRegressor\n", + "from sklearn.metrics import make_scorer, root_mean_squared_error\n", + "\n", + "for idx, subset in enumerate(subsets):\n", + " cv = PredefinedSplit(indices[idx])\n", + " \n", + " estimators = [\n", + " 
RandomForestRegressor(random_state=42),\n", + " KNeighborsRegressor()\n", + " ]\n", + " params=[\n", + " {\n", + " \"n_estimators\": [10, 50, 100, 250, 400], \n", + " \"max_depth\": [5, 10, 30, 50, 150, 200, None]\n", + " },\n", + " {\n", + " \"n_neighbors\": [3, 5, 10], \n", + " \"weights\": [\"uniform\", \"distance\"], \n", + " \"p\": [1, 2], \n", + " \"leaf_size\": [10, 15, 30], \n", + " \"metric\": [\"minkowski\", \"euclidean\"] \n", + " }\n", + " ]\n", + " \n", + " for index in range(len(estimators)):\n", + " gs = model_selection.GridSearchCV(\n", + " estimator = estimators[index],\n", + " param_grid = params[index],\n", + " n_jobs = -1,\n", + " error_score = \"raise\",\n", + " refit = True,\n", + " scoring = make_scorer(root_mean_squared_error, greater_is_better=False),\n", + " cv = cv,\n", + " )\n", + " \n", + " gs.fit(features[idx], targets[idx])\n", + " \n", + " results_df = pd.DataFrame(gs.cv_results_)\n", + " \n", + " # Select key columns to display\n", + " cols_to_show = [\n", + " 'params',\n", + " 'mean_test_score',\n", + " 'std_test_score',\n", + " 'rank_test_score',\n", + " 'mean_fit_time',\n", + " 'mean_score_time',\n", + " ]\n", + " \n", + " # Print as a table\n", + " print(results_df[cols_to_show].to_string(index=False))\n", + " Path(results_path).mkdir(parents=True, exist_ok=True)\n", + " joblib.dump(gs.best_estimator_, results_path / f\"Model_{estimators[index].__class__.__name__}-KFoldSplit-{subset}Subset.pkl\") \n", + " joblib.dump(results_df, results_path / f\"Results_{estimators[index].__class__.__name__}-KFoldSplit-{subset}Subset.pkl\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (nancy)", + "language": "python", + "name": "nancy" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + 
"nbformat_minor": 5 +} diff --git a/JupyterNotebooks/03-Added_split_and_metric.ipynb b/JupyterNotebooks/03-Added_split_and_metric.ipynb new file mode 100644 index 0000000..8161553 --- /dev/null +++ b/JupyterNotebooks/03-Added_split_and_metric.ipynb @@ -0,0 +1,281 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "91d230e0-cc84-4ad5-9005-9f166d483d03", + "metadata": {}, + "outputs": [], + "source": [ + "import joblib\n", + "from pathlib import Path\n", + "import pandas as pd\n", + "import numpy as np\n", + "import sklearn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0304a74-ede7-4e21-8929-8e06206f4d5d", + "metadata": {}, + "outputs": [], + "source": [ + "data = Path(\"DataSets/logatec\")\n", + "results_path = Path(\"03-Added_split_and_metric-results\")\n", + "random_seed = 42\n", + "n_splits = 5\n", + "test_size = 0.20\n", + "subsets = [\"spring\", \"winter\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a584e20f-f567-4716-bf99-2c631e2fb789", + "metadata": {}, + "outputs": [], + "source": [ + "def load_raw_data(path: Path) -> pd.DataFrame:\n", + " with open(path, mode=\"r\") as fp:\n", + " data = json.load(fp)\n", + "\n", + " df = []\n", + "\n", + " for position, measurements in data.items():\n", + " digits = re.findall(r\"\\d+\", position)\n", + " location = tuple(int(i) for i in digits)\n", + "\n", + " # Winter dataset has measurements only in the middle (3rd) row.\n", + " if len(location) == 1:\n", + " location = (3, *location)\n", + "\n", + " assert len(location) == 2, f\"location identifier is not length 2: {location}\"\n", + "\n", + " pos_x, pos_y = location\n", + "\n", + " for device_id, samples in measurements.items():\n", + " device_id = int(device_id)\n", + " for sample in samples:\n", + " timestamp, value = sample[\"timestamp\"], sample[\"rss\"]\n", + "\n", + " item = {\"pos_x\": pos_x, \"pos_y\": pos_y, \"node\": device_id, \"timestamp\": timestamp, \"value\": 
value}\n", + " df.append(item)\n", + "\n", + " df = pd.DataFrame(df)\n", + " df.timestamp = pd.to_datetime(df.timestamp, unit=\"s\", origin=\"unix\").astype(\"datetime64[s]\")\n", + " df = df.astype({\"pos_x\": \"uint8\", \"pos_y\": \"uint8\", \"value\": \"int8\", \"node\": \"uint8\"})\n", + "\n", + " return df\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e320cd24-f026-4fae-ab48-0604c6ec8de0", + "metadata": {}, + "outputs": [], + "source": [ + "# Preparation\n", + "# We assume the dataset has been downloaded and unzipped manually\n", + "\n", + "import json\n", + "import re\n", + "\n", + "df = [load_raw_data(data / f\"{subsets[0]}_data.json\"), load_raw_data(data / f\"{subsets[1]}_data.json\")]\n", + "\n", + "for idx, subset in enumerate(subsets):\n", + " dat = []\n", + " \n", + " # Average the sample value within a second.\n", + " for (x, y, node, ts), subset in df[idx].groupby(by=[\"pos_x\", \"pos_y\", \"node\", \"timestamp\"]):\n", + " avg_value = subset.value.sum(min_count=1) / subset.value.count()\n", + " item = {\"pos_x\": x, \"pos_y\": y, \"node\": node, \"timestamp\": ts, \"value\": avg_value}\n", + " dat.append(item)\n", + " \n", + " df[idx] = pd.DataFrame(dat)\n", + " df[idx] = df[idx].pivot(index=[\"timestamp\", \"pos_x\", \"pos_y\"], columns=[\"node\"], values=[\"value\"])\n", + " df[idx] = df[idx].reset_index(drop=False)\n", + " \n", + " # After pivot, column names become tuples. 
Fix that.\n", + " df[idx].columns = [\"\".join(map(str, col)).strip().replace(\"value\", \"node\") for col in df[idx].columns.values]\n", + " \n", + " # Fill the NaN values with some extremely low RSS value\n", + " df[idx] = df[idx].fillna(-180)\n", + " \n", + " # TODO: Should this be part of prepare-feature stage?\n", + " # Remove datetime column\n", + " df[idx] = df[idx].drop(columns=[\"timestamp\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "814ec0fe-4140-49f3-ae9b-8ab752ab54ed", + "metadata": {}, + "outputs": [], + "source": [ + "#Feature generation\n", + "for idx, subset in enumerate(subsets):\n", + " # Convert discrete values to meters\n", + " df[idx].pos_x = (df[idx].pos_x - 1) * 1.2 # meters\n", + " df[idx].pos_y = (df[idx].pos_y - 1) * 1.2 # meters\n", + " \n", + " df[idx] = df[idx].rename(columns={\"pos_x\": \"target_x\", \"pos_y\": \"target_y\"})\n", + "\n", + "# Find target column(s)\n", + "targets = [\"target_x\", \"target_y\"]\n", + "\n", + "# X are features, y are target(s)\n", + "features, targets = [df[0].drop(targets, axis=1), df[1].drop(targets, axis=1)], [df[0][targets], df[1][targets]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "997a14f2-afe8-4f6c-b2dd-7312a31a54b9", + "metadata": {}, + "outputs": [], + "source": [ + "#Split generation\n", + "from sklearn import model_selection\n", + "\n", + "groups = None\n", + "\n", + "cv = [model_selection.KFold(\n", + " n_splits=n_splits,\n", + " shuffle=True,\n", + " random_state=random_seed,\n", + " ),\n", + " model_selection.ShuffleSplit(\n", + " n_splits=n_splits,\n", + " test_size=test_size,\n", + " random_state=random_seed,\n", + " )\n", + "]\n", + "\n", + "cv_name = [\"KFold\", \"Random\"]\n", + "\n", + "indices = indices = [[[] for _ in range(len(cv))] for _ in range(len(subsets))]\n", + "\n", + "for idx, subset in enumerate(subsets):\n", + " for i in range(len(cv)):\n", + " for train_indices, test_indices in cv[i].split(features[idx], 
targets[idx], groups):\n", + " indices[idx][i].append((train_indices, test_indices))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1593cd46-3eb7-4a64-b358-fb3b7b2bd821", + "metadata": {}, + "outputs": [], + "source": [ + "class PredefinedSplit(model_selection.BaseCrossValidator):\n", + " \"\"\"Simple cross-validator for predefined train-test splits.\"\"\"\n", + "\n", + " def __init__(self, indices_pairs: list[tuple[np.ndarray, np.ndarray]]):\n", + " self.idx_pairs = indices_pairs\n", + "\n", + " def get_n_splits(self, X=None, y=None, groups=None):\n", + " \"\"\"Return the number of splitting iterations in the cross-validator\"\"\"\n", + " return len(self.idx_pairs)\n", + "\n", + " def split(self, X, y=None, groups=None):\n", + " \"\"\"Generate indices to split data into training and test set.\"\"\"\n", + " for train_idx, test_idx in self.idx_pairs:\n", + " yield train_idx, test_idx" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d3ec933-69ce-4259-baee-2d539bcfa6b7", + "metadata": {}, + "outputs": [], + "source": [ + "#Train&Evaluate\n", + "from sklearn.ensemble import RandomForestRegressor \n", + "from sklearn.neighbors import KNeighborsRegressor\n", + "from sklearn.metrics import make_scorer, root_mean_squared_error, r2_score\n", + "\n", + "for idx, subset in enumerate(subsets):\n", + " cv = [PredefinedSplit(indices[idx][0]), PredefinedSplit(indices[idx][1])]\n", + " \n", + " estimators = [\n", + " RandomForestRegressor(random_state=42),\n", + " KNeighborsRegressor()\n", + " ]\n", + " params=[\n", + " {\n", + " \"n_estimators\": [10, 50, 100, 250, 400], \n", + " \"max_depth\": [5, 10, 30, 50, 150, 200, None]\n", + " },\n", + " {\n", + " \"n_neighbors\": [3, 5, 10], \n", + " \"weights\": [\"uniform\", \"distance\"], \n", + " \"p\": [1, 2], \n", + " \"leaf_size\": [10, 15, 30], \n", + " \"metric\": [\"minkowski\", \"euclidean\"] \n", + " }\n", + " ]\n", + " \n", + " for split_index in range(len(cv)):\n", + " for 
index in range(len(estimators)):\n", + " gs = model_selection.GridSearchCV(\n", + " estimator = estimators[index],\n", + " param_grid = params[index],\n", + " n_jobs = -1,\n", + " error_score = \"raise\",\n", + " refit = \"rmse\",\n", + " scoring = {\"rmse\": make_scorer(root_mean_squared_error, greater_is_better=False), \"r_squared\": make_scorer(r2_score, greater_is_better=True)},\n", + " cv = cv[split_index],\n", + " )\n", + " \n", + " gs.fit(features[idx], targets[idx])\n", + " \n", + " results_df = pd.DataFrame(gs.cv_results_)\n", + " \n", + " # Select key columns to display\n", + " cols_to_show = [\n", + " 'params',\n", + " 'mean_test_rmse',\n", + " 'std_test_rmse',\n", + " 'rank_test_rmse',\n", + " 'mean_test_r_squared',\n", + " 'std_test_r_squared',\n", + " 'rank_test_r_squared',\n", + " 'mean_fit_time',\n", + " 'mean_score_time',\n", + " ]\n", + " \n", + " # Print as a table\n", + " print(results_df[cols_to_show].to_string(index=False))\n", + " Path(results_path).mkdir(parents=True, exist_ok=True)\n", + " joblib.dump(gs.best_estimator_, results_path / f\"Model_{estimators[index].__class__.__name__}-{cv_name[split_index]}Split-{subset}Subset.pkl\") \n", + " joblib.dump(results_df, results_path / f\"Results_{estimators[index].__class__.__name__}-{cv_name[split_index]}Split-{subset}Subset.pkl\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (nancy)", + "language": "python", + "name": "nancy" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/JupyterNotebooks/04-Added_automl_model.ipynb b/JupyterNotebooks/04-Added_automl_model.ipynb new file mode 100644 index 0000000..cd92012 --- /dev/null +++ b/JupyterNotebooks/04-Added_automl_model.ipynb @@ -0,0 +1,477 @@ +{ + 
"cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "91d230e0-cc84-4ad5-9005-9f166d483d03", + "metadata": {}, + "outputs": [], + "source": [ + "import joblib\n", + "from pathlib import Path\n", + "import pandas as pd\n", + "import numpy as np\n", + "import sklearn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0304a74-ede7-4e21-8929-8e06206f4d5d", + "metadata": {}, + "outputs": [], + "source": [ + "data = Path(\"DataSets/logatec\")\n", + "results_path = Path(\"04-Added_automl_model-results\")\n", + "random_seed = 42\n", + "n_splits = 5\n", + "test_size = 0.20\n", + "subsets = [\"spring\", \"winter\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a584e20f-f567-4716-bf99-2c631e2fb789", + "metadata": {}, + "outputs": [], + "source": [ + "def load_raw_data(path: Path) -> pd.DataFrame:\n", + " with open(path, mode=\"r\") as fp:\n", + " data = json.load(fp)\n", + "\n", + " df = []\n", + "\n", + " for position, measurements in data.items():\n", + " digits = re.findall(r\"\\d+\", position)\n", + " location = tuple(int(i) for i in digits)\n", + "\n", + " # Winter dataset has measurements only in the middle (3rd) row.\n", + " if len(location) == 1:\n", + " location = (3, *location)\n", + "\n", + " assert len(location) == 2, f\"location identifier is not length 2: {location}\"\n", + "\n", + " pos_x, pos_y = location\n", + "\n", + " for device_id, samples in measurements.items():\n", + " device_id = int(device_id)\n", + " for sample in samples:\n", + " timestamp, value = sample[\"timestamp\"], sample[\"rss\"]\n", + "\n", + " item = {\"pos_x\": pos_x, \"pos_y\": pos_y, \"node\": device_id, \"timestamp\": timestamp, \"value\": value}\n", + " df.append(item)\n", + "\n", + " df = pd.DataFrame(df)\n", + " df.timestamp = pd.to_datetime(df.timestamp, unit=\"s\", origin=\"unix\").astype(\"datetime64[s]\")\n", + " df = df.astype({\"pos_x\": \"uint8\", \"pos_y\": \"uint8\", \"value\": \"int8\", \"node\": 
\"uint8\"})\n", + "\n", + " return df\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e320cd24-f026-4fae-ab48-0604c6ec8de0", + "metadata": {}, + "outputs": [], + "source": [ + "# Preparation\n", + "# We assume the dataset has been downloaded and unzipped manually\n", + "\n", + "import json\n", + "import re\n", + "\n", + "df = [load_raw_data(data / f\"{subsets[0]}_data.json\"), load_raw_data(data / f\"{subsets[1]}_data.json\")]\n", + "\n", + "for idx, subset in enumerate(subsets):\n", + " dat = []\n", + " \n", + " # Average the sample value within a second.\n", + " for (x, y, node, ts), subset in df[idx].groupby(by=[\"pos_x\", \"pos_y\", \"node\", \"timestamp\"]):\n", + " avg_value = subset.value.sum(min_count=1) / subset.value.count()\n", + " item = {\"pos_x\": x, \"pos_y\": y, \"node\": node, \"timestamp\": ts, \"value\": avg_value}\n", + " dat.append(item)\n", + " \n", + " df[idx] = pd.DataFrame(dat)\n", + " df[idx] = df[idx].pivot(index=[\"timestamp\", \"pos_x\", \"pos_y\"], columns=[\"node\"], values=[\"value\"])\n", + " df[idx] = df[idx].reset_index(drop=False)\n", + " \n", + " # After pivot, column names become tuples. 
Fix that.\n", + " df[idx].columns = [\"\".join(map(str, col)).strip().replace(\"value\", \"node\") for col in df[idx].columns.values]\n", + " \n", + " # Fill the NaN values with some extremely low RSS value\n", + " df[idx] = df[idx].fillna(-180)\n", + " \n", + " # TODO: Should this be part of prepare-feature stage?\n", + " # Remove datetime column\n", + " df[idx] = df[idx].drop(columns=[\"timestamp\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "814ec0fe-4140-49f3-ae9b-8ab752ab54ed", + "metadata": {}, + "outputs": [], + "source": [ + "#Feature generation\n", + "for idx, subset in enumerate(subsets):\n", + " # Convert discrete values to meters\n", + " df[idx].pos_x = (df[idx].pos_x - 1) * 1.2 # meters\n", + " df[idx].pos_y = (df[idx].pos_y - 1) * 1.2 # meters\n", + " \n", + " df[idx] = df[idx].rename(columns={\"pos_x\": \"target_x\", \"pos_y\": \"target_y\"})\n", + "\n", + "# Find target column(s)\n", + "targets = [\"target_x\", \"target_y\"]\n", + "\n", + "# X are features, y are target(s)\n", + "features, targets = [df[0].drop(targets, axis=1), df[1].drop(targets, axis=1)], [df[0][targets], df[1][targets]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "997a14f2-afe8-4f6c-b2dd-7312a31a54b9", + "metadata": {}, + "outputs": [], + "source": [ + "#Split generation\n", + "from sklearn import model_selection\n", + "\n", + "groups = None\n", + "\n", + "cv = [model_selection.KFold(\n", + " n_splits=n_splits,\n", + " shuffle=True,\n", + " random_state=random_seed,\n", + " ),\n", + " model_selection.ShuffleSplit(\n", + " n_splits=n_splits,\n", + " test_size=test_size,\n", + " random_state=random_seed,\n", + " )\n", + "]\n", + "\n", + "cv_name = [\"KFold\", \"Random\"]\n", + "\n", + "split_indices = [[[] for _ in range(len(cv))] for _ in range(len(subsets))]\n", + "\n", + "for idx, subset in enumerate(subsets):\n", + " for i in range(len(cv)):\n", + " for train_indices, test_indices in cv[i].split(features[idx], 
targets[idx], groups):\n", + " split_indices[idx][i].append((train_indices, test_indices))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1593cd46-3eb7-4a64-b358-fb3b7b2bd821", + "metadata": {}, + "outputs": [], + "source": [ + "class PredefinedSplit(model_selection.BaseCrossValidator):\n", + " \"\"\"Simple cross-validator for predefined train-test splits.\"\"\"\n", + "\n", + " def __init__(self, indices_pairs: list[tuple[np.ndarray, np.ndarray]]):\n", + " self.idx_pairs = indices_pairs\n", + "\n", + " def get_n_splits(self, X=None, y=None, groups=None):\n", + " \"\"\"Return the number of splitting iterations in the cross-validator\"\"\"\n", + " return len(self.idx_pairs)\n", + "\n", + " def split(self, X, y=None, groups=None):\n", + " \"\"\"Generate indices to split data into training and test set.\"\"\"\n", + " for train_idx, test_idx in self.idx_pairs:\n", + " yield train_idx, test_idx" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d3ec933-69ce-4259-baee-2d539bcfa6b7", + "metadata": {}, + "outputs": [], + "source": [ + "#Train&Evaluate\n", + "from sklearn.ensemble import RandomForestRegressor \n", + "from sklearn.neighbors import KNeighborsRegressor\n", + "from sklearn.metrics import make_scorer, root_mean_squared_error, r2_score\n", + "\n", + "for idx, subset in enumerate(subsets):\n", + " cv = [PredefinedSplit(split_indices[idx][0]), PredefinedSplit(split_indices[idx][1])]\n", + " \n", + " estimators = [\n", + " RandomForestRegressor(random_state=42),\n", + " KNeighborsRegressor()\n", + " ]\n", + " params=[\n", + " {\n", + " \"n_estimators\": [10, 50, 100, 250, 400], \n", + " \"max_depth\": [5, 10, 30, 50, 150, 200, None]\n", + " },\n", + " {\n", + " \"n_neighbors\": [3, 5, 10], \n", + " \"weights\": [\"uniform\", \"distance\"], \n", + " \"p\": [1, 2], \n", + " \"leaf_size\": [10, 15, 30], \n", + " \"metric\": [\"minkowski\", \"euclidean\"] \n", + " }\n", + " ]\n", + " \n", + " for split_index in 
range(len(cv)):\n", + " for index in range(len(estimators)):\n", + " gs = model_selection.GridSearchCV(\n", + " estimator = estimators[index],\n", + " param_grid = params[index],\n", + " n_jobs = -1,\n", + " error_score = \"raise\",\n", + " refit = \"rmse\",\n", + " scoring = {\"rmse\": make_scorer(root_mean_squared_error, greater_is_better=False), \"r_squared\": make_scorer(r2_score, greater_is_better=True)},\n", + " cv = cv[split_index],\n", + " )\n", + " \n", + " gs.fit(features[idx], targets[idx])\n", + " \n", + " results_df = pd.DataFrame(gs.cv_results_)\n", + " \n", + " # Select key columns to display\n", + " cols_to_show = [\n", + " 'params',\n", + " 'mean_test_rmse',\n", + " 'std_test_rmse',\n", + " 'rank_test_rmse',\n", + " 'mean_test_r_squared',\n", + " 'std_test_r_squared',\n", + " 'rank_test_r_squared',\n", + " 'mean_fit_time',\n", + " 'mean_score_time',\n", + " ]\n", + " \n", + " # Print as a table\n", + " print(results_df[cols_to_show].to_string(index=False))\n", + " Path(results_path).mkdir(parents=True, exist_ok=True)\n", + " joblib.dump(gs.best_estimator_, results_path / f\"Model_{estimators[index].__class__.__name__}-{cv_name[split_index]}Split-{subset}Subset.pkl\") \n", + " joblib.dump(results_df, results_path / f\"Results_{estimators[index].__class__.__name__}-{cv_name[split_index]}Split-{subset}Subset.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "544a5a45-c77b-4d13-91c6-62c39d71ded0", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"TF_DETERMINISTIC_OPS\"] = \"1\"\n", + "import tensorflow as tf\n", + "tf.config.experimental.enable_op_determinism()\n", + "import autokeras as ak\n", + "from sklearn.model_selection import train_test_split\n", + "import keras\n", + "import time\n", + "import gc\n", + "import shutil\n", + "from sklearn.metrics import make_scorer, root_mean_squared_error, r2_score\n", + "from multiprocess import Process\n", + "\n", + "def 
get_best_models(num_models, auto_model):\n", + " top_trials = auto_model.tuner.oracle.get_best_trials(num_models)\n", + " for trial in top_trials:\n", + " model = auto_model.tuner.load_model(trial)\n", + " yield model, trial.hyperparameters\n", + "\n", + "metrics = {\"rmse\": root_mean_squared_error, \"r_squared\": r2_score}\n", + "\n", + "def run_candidate(**kwargs):\n", + " features = kwargs[\"features\"]\n", + " targets = kwargs[\"targets\"]\n", + " metrics = kwargs[\"metrics\"]\n", + " subset = kwargs[\"subset\"]\n", + " cv_name = kwargs[\"cv_name\"]\n", + " split_index = kwargs[\"split_index\"]\n", + " idx = kwargs[\"idx\"]\n", + " random_seed = kwargs.get(\"random_seed\", 42)\n", + " test_size = kwargs.get(\"test_size\", 0.2)\n", + " results_path = kwargs[\"results_path\"]\n", + " split_indices = kwargs[\"split_indices\"]\n", + "\n", + " tf.config.experimental.enable_op_determinism()\n", + " keras.utils.set_random_seed(random_seed)\n", + " \n", + " inputs = [ak.Input(name=\"data_input\")]\n", + " outputs = [ak.RegressionHead(name=\"x_out\"), ak.RegressionHead(name=\"y_out\")]\n", + " \n", + " keras.utils.set_random_seed(random_seed)\n", + " \n", + " auto_model = ak.AutoModel(\n", + " inputs = inputs,\n", + " outputs = outputs,\n", + " seed = 42,\n", + " max_trials = 10,\n", + " overwrite = True,\n", + " directory = results_path,\n", + " project_name = f\"ExampleModel-{subset}-{cv_name[split_index]}\"\n", + " )\n", + "\n", + " print(f\"{subset}-{cv_name[split_index]}\")\n", + " preped_features = [features[idx].to_numpy()]\n", + " preped_targets = np.hsplit(targets[idx].to_numpy(), 2)\n", + "\n", + " n_samples = preped_features[0].shape[0]\n", + " indices = np.arange(n_samples)\n", + " \n", + " train_indices, val_indices = train_test_split(\n", + " indices, \n", + " test_size = test_size, \n", + " random_state = random_seed, \n", + " shuffle = True\n", + " )\n", + "\n", + " X_train_list = [_[train_indices] for _ in preped_features]\n", + " X_val_list = 
[_[val_indices] for _ in preped_features]\n", + " y_train_list = [_[train_indices] for _ in preped_targets]\n", + " y_val_list = [_[val_indices] for _ in preped_targets]\n", + "\n", + "\n", + "\n", + " auto_model.fit(\n", + " X_train_list, \n", + " y_train_list, \n", + " validation_data = (X_val_list, y_val_list), \n", + " verbose = 2\n", + " )\n", + "\n", + " save_top_n = max(1, int(len(auto_model.tuner.oracle.trials) * 0.1))\n", + "\n", + " cv = [PredefinedSplit(split_indices[idx][i]) for i in range(len(cv_name))]\n", + " \n", + " results = []\n", + " for idx, (model, hyperparameters) in enumerate(get_best_models(save_top_n, auto_model)):\n", + " print(f\"\\nProcessing model {idx + 1} out of {save_top_n}\")\n", + "\n", + " optimizer = model.optimizer\n", + " del model\n", + "\n", + " scores = {name: [] for name in metrics.keys()}\n", + " train_times = []\n", + " predict_times = []\n", + "\n", + " for split_idx, (train_indices, test_indices) in enumerate(\n", + " cv[split_index].split(preped_features[0], preped_targets[0])\n", + " ):\n", + "\n", + " print(f\"\\tProcessing split {split_idx + 1} out of {cv[split_index].get_n_splits()}\")\n", + " \n", + " keras.utils.set_random_seed(random_seed)\n", + " model = auto_model.tuner.hypermodel.build(hyperparameters)\n", + " new_optimizer = type(optimizer).from_config(optimizer.get_config())\n", + " model.compile(optimizer = new_optimizer, loss=\"mse\")\n", + "\n", + " save_path = os.path.join(results_path / f\"ExampleModel-{subset}-{cv_name[split_index]}\", f\"model-{idx}-{split_idx}.keras\")\n", + " model.save(save_path)\n", + " model_size = os.path.getsize(save_path)\n", + "\n", + " X_train = [x[train_indices] for x in preped_features]\n", + " X_test = [x[test_indices] for x in preped_features]\n", + " y_train = [y[train_indices] for y in preped_targets]\n", + " y_test = [y[test_indices] for y in preped_targets]\n", + "\n", + " #train\n", + " start_time = time.perf_counter()\n", + " model.fit(\n", + " X_train, y_train, 
\n", + " validation_data = (X_test, y_test), \n", + " epochs = 1000, \n", + " callbacks = [keras.callbacks.EarlyStopping(patience=10, min_delta=1e-4)],\n", + " verbose = 2\n", + " )\n", + " train_times.append(time.perf_counter() - start_time)\n", + "\n", + " # predict\n", + " start_time = time.perf_counter()\n", + " y_pred = model.predict(X_test, batch_size=32, verbose=2)\n", + " y_pred = np.squeeze(np.stack(y_pred, axis=0), axis=-1)\n", + " predict_time = time.perf_counter() - start_time\n", + " predict_times.append(predict_time)\n", + "\n", + " y_test = np.squeeze(np.stack(y_test, axis=0), axis=-1)\n", + " for name, func in metrics.items():\n", + " scores[name].append(func(y_test, y_pred))\n", + "\n", + " del model\n", + " keras.backend.clear_session()\n", + " gc.collect()\n", + " \n", + " results.append({\n", + " \"scores\": {\n", + " name: {\n", + " \"mean\": np.mean(arr), \n", + " \"std\": np.std(arr)\n", + " } \n", + " for name, arr in scores.items()\n", + " },\n", + " \"params\" : hyperparameters.values,\n", + " \"fit_time\" : {\n", + " \"mean\": np.mean(train_times), \n", + " \"std\" : np.std(train_times)\n", + " },\n", + " \"score_time\": {\n", + " \"mean\": np.mean(predict_times), \n", + " \"std\" : np.std(predict_times)\n", + " }\n", + " })\n", + " print(results, flush = True)\n", + " joblib.dump(results, results_path / f\"Results_ExampleModel-{cv_name[split_index]}Split-{subset}Subset.pkl\")\n", + "\n", + " shutil.rmtree(results_path / f\"ExampleModel-{subset}-{cv_name[split_index]}\")\n", + "\n", + "for idx, subset in enumerate(subsets):\n", + " for split_index in range(len(cv)):\n", + " # We use multiprocess (the multiprocessing library doesn't work) to isolate each training run.\n", + " # This resolves an issue where model training is dependent on the order in which the models are trained;\n", + " # this is due to a state persisting between runs (despite all the efforts to reset the environment).\n", + " \n", + " p = Process(target=run_candidate, 
kwargs={\n", + " \"features\" : features,\n", + " \"targets\" : targets,\n", + " \"metrics\" : metrics,\n", + " \"subset\" : subset,\n", + " \"cv_name\" : cv_name,\n", + " \"split_index\" : split_index,\n", + " \"idx\" : idx,\n", + " \"random_seed\" : random_seed,\n", + " \"test_size\" : test_size,\n", + " \"results_path\" : results_path,\n", + " \"split_indices\" : split_indices\n", + " })\n", + " p.start()\n", + " p.join()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (nancy)", + "language": "python", + "name": "nancy" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/JupyterNotebooks/Benchmarking.ipynb b/JupyterNotebooks/Benchmarking.ipynb new file mode 100644 index 0000000..547fd43 --- /dev/null +++ b/JupyterNotebooks/Benchmarking.ipynb @@ -0,0 +1,510 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "e17b8d4d-6f08-47e5-af37-b2f891b3d5d8", + "metadata": {}, + "outputs": [], + "source": [ + "import importlib.util\n", + "import os\n", + "\n", + "os.environ[\"TF_DETERMINISTIC_OPS\"] = \"1\"\n", + "os.environ[\"PYTHONHASHSEED\"] = \"42\"\n", + "\n", + "target_path = os.path.abspath(\n", + " os.path.join(os.getcwd(), \"..\", \"performance.py\")\n", + ")\n", + "\n", + "spec = importlib.util.spec_from_file_location(\"performance\", target_path)\n", + "performance = importlib.util.module_from_spec(spec)\n", + "spec.loader.exec_module(performance)\n", + "\n", + "globals().update({k: v for k, v in performance.__dict__.items() if not k.startswith(\"__\")})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91d230e0-cc84-4ad5-9005-9f166d483d03", + "metadata": {}, + "outputs": [], + "source": [ + "import joblib\n", + "from 
pathlib import Path\n", + "import pandas as pd\n", + "import numpy as np\n", + "import sklearn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92d10d68-95d7-475f-b2e3-13b44ee79c4f", + "metadata": {}, + "outputs": [], + "source": [ + "data = Path(\"DataSets/logatec\")\n", + "results_path = Path(\"Benchmarking-results\")\n", + "random_seed = 42\n", + "n_splits = 5\n", + "test_size = 0.20\n", + "subsets = [\"spring\", \"winter\"]\n", + "\n", + "Path(results_path).mkdir(parents=True, exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e320cd24-f026-4fae-ab48-0604c6ec8de0", + "metadata": {}, + "outputs": [], + "source": [ + "start_resource_monitor()\n", + "\n", + "# Preparation\n", + "# We assume the dataset has been downloaded and unzipped manually\n", + "\n", + "import json\n", + "import re\n", + "\n", + "def load_raw_data(path: Path) -> pd.DataFrame:\n", + " with open(path, mode=\"r\") as fp:\n", + " data = json.load(fp)\n", + "\n", + " df = []\n", + "\n", + " for position, measurements in data.items():\n", + " digits = re.findall(r\"\\d+\", position)\n", + " location = tuple(int(i) for i in digits)\n", + "\n", + " # Winter dataset has measurements only in the middle (3rd) row.\n", + " if len(location) == 1:\n", + " location = (3, *location)\n", + "\n", + " assert len(location) == 2, f\"location identifier is not length 2: {location}\"\n", + "\n", + " pos_x, pos_y = location\n", + "\n", + " for device_id, samples in measurements.items():\n", + " device_id = int(device_id)\n", + " for sample in samples:\n", + " timestamp, value = sample[\"timestamp\"], sample[\"rss\"]\n", + "\n", + " item = {\"pos_x\": pos_x, \"pos_y\": pos_y, \"node\": device_id, \"timestamp\": timestamp, \"value\": value}\n", + " df.append(item)\n", + "\n", + " df = pd.DataFrame(df)\n", + " df.timestamp = pd.to_datetime(df.timestamp, unit=\"s\", origin=\"unix\").astype(\"datetime64[s]\")\n", + " df = df.astype({\"pos_x\": \"uint8\", 
\"pos_y\": \"uint8\", \"value\": \"int8\", \"node\": \"uint8\"})\n", + "\n", + " return df\n", + "\n", + "df = [load_raw_data(data / f\"{subsets[i]}_data.json\") for i in range(len(subsets))]\n", + "\n", + "for idx, subset in enumerate(subsets):\n", + " dat = []\n", + " \n", + " # Average the sample value within a second.\n", + " for (x, y, node, ts), subset in df[idx].groupby(by=[\"pos_x\", \"pos_y\", \"node\", \"timestamp\"]):\n", + " avg_value = subset.value.sum(min_count=1) / subset.value.count()\n", + " item = {\"pos_x\": x, \"pos_y\": y, \"node\": node, \"timestamp\": ts, \"value\": avg_value}\n", + " dat.append(item)\n", + " \n", + " df[idx] = pd.DataFrame(dat)\n", + " df[idx] = df[idx].pivot(index=[\"timestamp\", \"pos_x\", \"pos_y\"], columns=[\"node\"], values=[\"value\"])\n", + " df[idx] = df[idx].reset_index(drop=False)\n", + " \n", + " # After pivot, column names become tuples. Fix that.\n", + " df[idx].columns = [\"\".join(map(str, col)).strip().replace(\"value\", \"node\") for col in df[idx].columns.values]\n", + " \n", + " # Fill the NaN values with some extremely low RSS value\n", + " df[idx] = df[idx].fillna(-180)\n", + " \n", + " # TODO: Should this be part of prepare-feature stage?\n", + " # Remove datetime column\n", + " df[idx] = df[idx].drop(columns=[\"timestamp\"])\n", + "\n", + "stop_resource_monitor(results_path / f\"prepare_usage.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "814ec0fe-4140-49f3-ae9b-8ab752ab54ed", + "metadata": {}, + "outputs": [], + "source": [ + "start_resource_monitor()\n", + "\n", + "#Feature generation\n", + "for idx, subset in enumerate(subsets):\n", + " # Convert discrete values to meters\n", + " df[idx].pos_x = (df[idx].pos_x - 1) * 1.2 # meters\n", + " df[idx].pos_y = (df[idx].pos_y - 1) * 1.2 # meters\n", + " \n", + " df[idx] = df[idx].rename(columns={\"pos_x\": \"target_x\", \"pos_y\": \"target_y\"})\n", + "\n", + "# Find target column(s)\n", + "targets = [\"target_x\", 
\"target_y\"]\n", + "\n", + "# X are features, y are target(s)\n", + "features, targets = [df[i].drop(targets, axis=1) for i in range(len(subsets))], [df[i][targets] for i in range(len(subsets))]\n", + "\n", + "stop_resource_monitor(results_path / f\"featurize_usage.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "997a14f2-afe8-4f6c-b2dd-7312a31a54b9", + "metadata": {}, + "outputs": [], + "source": [ + "start_resource_monitor()\n", + "\n", + "#Split generation\n", + "from sklearn import model_selection\n", + "\n", + "class PredefinedSplit(model_selection.BaseCrossValidator):\n", + " \"\"\"Simple cross-validator for predefined train-test splits.\"\"\"\n", + "\n", + " def __init__(self, indices_pairs: list[tuple[np.ndarray, np.ndarray]]):\n", + " self.idx_pairs = indices_pairs\n", + "\n", + " def get_n_splits(self, X=None, y=None, groups=None):\n", + " \"\"\"Return the number of splitting iterations in the cross-validator\"\"\"\n", + " return len(self.idx_pairs)\n", + "\n", + " def split(self, X, y=None, groups=None):\n", + " \"\"\"Generate indices to split data into training and test set.\"\"\"\n", + " for train_idx, test_idx in self.idx_pairs:\n", + " yield train_idx, test_idx\n", + "\n", + "groups = None\n", + "\n", + "cv = [model_selection.KFold(\n", + " n_splits=n_splits,\n", + " shuffle=True,\n", + " random_state=random_seed,\n", + " ),\n", + " model_selection.ShuffleSplit(\n", + " n_splits=n_splits,\n", + " test_size=test_size,\n", + " random_state=random_seed,\n", + " )\n", + "]\n", + "\n", + "cv_name = [\"KFold\", \"Random\"]\n", + "\n", + "split_indices = [[[] for _ in range(len(cv))] for _ in range(len(subsets))]\n", + "\n", + "for idx, subset in enumerate(subsets):\n", + " for i in range(len(cv)):\n", + " for train_indices, test_indices in cv[i].split(features[idx], targets[idx], groups):\n", + " split_indices[idx][i].append((train_indices, test_indices))\n", + "\n", + "stop_resource_monitor(results_path / 
f\"split_usage.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d3ec933-69ce-4259-baee-2d539bcfa6b7", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "start_resource_monitor()\n", + "\n", + "#Train&Evaluate\n", + "from sklearn.ensemble import RandomForestRegressor \n", + "from sklearn.neighbors import KNeighborsRegressor\n", + "from sklearn.metrics import make_scorer, root_mean_squared_error, r2_score\n", + "\n", + "for idx, subset in enumerate(subsets):\n", + " cv = [PredefinedSplit(split_indices[idx][i])for i in range(len(cv_name))]\n", + " \n", + " estimators = [\n", + " RandomForestRegressor(random_state=42),\n", + " KNeighborsRegressor()\n", + " ]\n", + " params=[\n", + " {\n", + " \"n_estimators\": [10, 50, 100, 250, 400], \n", + " \"max_depth\": [5, 10, 30, 50, 150, 200, None]\n", + " },\n", + " {\n", + " \"n_neighbors\": [3, 5, 10], \n", + " \"weights\": [\"uniform\", \"distance\"], \n", + " \"p\": [1, 2], \n", + " \"leaf_size\": [10, 15, 30], \n", + " \"metric\": [\"minkowski\", \"euclidean\"] \n", + " }\n", + " ]\n", + " \n", + " for split_index in range(len(cv)):\n", + " for index in range(len(estimators)):\n", + " print(f\"{subset}-{cv_name[split_index]}-{estimators[index].__class__.__name__}\")\n", + " gs = model_selection.GridSearchCV(\n", + " estimator = estimators[index],\n", + " param_grid = params[index],\n", + " n_jobs = 5,\n", + " error_score = \"raise\",\n", + " refit = \"rmse\",\n", + " scoring = {\"rmse\": make_scorer(root_mean_squared_error, greater_is_better=False), \"r_squared\": make_scorer(r2_score, greater_is_better=True)},\n", + " cv = cv[split_index],\n", + " )\n", + "\n", + "\n", + " gs.fit(features[idx], targets[idx])\n", + " \n", + " results_df = pd.DataFrame(gs.cv_results_)\n", + " \n", + " # Select key columns to display\n", + " cols_to_show = [\n", + " 'params',\n", + " 'mean_test_rmse',\n", + " 'std_test_rmse',\n", + " 'rank_test_rmse',\n", + " 'mean_test_r_squared',\n", 
+ " 'std_test_r_squared',\n", + " 'rank_test_r_squared',\n", + " 'mean_fit_time',\n", + " 'mean_score_time',\n", + " ]\n", + " \n", + " #print(results_df[cols_to_show].to_string(index=False))\n", + " Path(results_path).mkdir(parents=True, exist_ok=True)\n", + " joblib.dump(gs.best_estimator_, results_path / f\"Model_{estimators[index].__class__.__name__}-{cv_name[split_index]}Split-{subset}Subset.pkl\") \n", + " joblib.dump(results_df, results_path / f\"Results_{estimators[index].__class__.__name__}-{cv_name[split_index]}Split-{subset}Subset.pkl\")\n", + "\n", + "stop_resource_monitor(results_path / f\"gridsearch_usage.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "544a5a45-c77b-4d13-91c6-62c39d71ded0", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "start_resource_monitor()\n", + "\n", + "import os\n", + "os.environ[\"TF_DETERMINISTIC_OPS\"] = \"1\"\n", + "import tensorflow as tf\n", + "tf.config.experimental.enable_op_determinism()\n", + "import autokeras as ak\n", + "from sklearn.model_selection import train_test_split\n", + "import keras\n", + "import time\n", + "import gc\n", + "import shutil\n", + "from sklearn.metrics import make_scorer, root_mean_squared_error, r2_score\n", + "from multiprocess import Process\n", + "\n", + "def get_best_models(num_models, auto_model):\n", + " top_trials = auto_model.tuner.oracle.get_best_trials(num_models)\n", + " for trial in top_trials:\n", + " model = auto_model.tuner.load_model(trial)\n", + " yield model, trial.hyperparameters\n", + "\n", + "metrics = {\"rmse\": root_mean_squared_error, \"r_squared\": r2_score}\n", + "\n", + "def run_candidate(**kwargs):\n", + " features = kwargs[\"features\"]\n", + " targets = kwargs[\"targets\"]\n", + " metrics = kwargs[\"metrics\"]\n", + " subset = kwargs[\"subset\"]\n", + " cv_name = kwargs[\"cv_name\"]\n", + " split_index = kwargs[\"split_index\"]\n", + " idx = kwargs[\"idx\"]\n", + " random_seed = 
kwargs.get(\"random_seed\", 42)\n", + " test_size = kwargs.get(\"test_size\", 0.2)\n", + " results_path = kwargs[\"results_path\"]\n", + " split_indices = kwargs[\"split_indices\"]\n", + "\n", + " tf.config.experimental.enable_op_determinism()\n", + " keras.utils.set_random_seed(random_seed)\n", + " \n", + " inputs = [ak.Input(name=\"data_input\")]\n", + " outputs = [ak.RegressionHead(name=\"x_out\"), ak.RegressionHead(name=\"y_out\")]\n", + " \n", + " keras.utils.set_random_seed(random_seed)\n", + " \n", + " auto_model = ak.AutoModel(\n", + " inputs = inputs,\n", + " outputs = outputs,\n", + " seed = 42,\n", + " max_trials = 10,\n", + " overwrite = True,\n", + " directory = results_path,\n", + " project_name = f\"ExampleModel-{subset}-{cv_name[split_index]}\"\n", + " )\n", + "\n", + " print(f\"{subset}-{cv_name[split_index]}\")\n", + " preped_features = [features[idx].to_numpy()]\n", + " preped_targets = np.hsplit(targets[idx].to_numpy(), 2)\n", + "\n", + " n_samples = preped_features[0].shape[0]\n", + " indices = np.arange(n_samples)\n", + " \n", + " train_indices, val_indices = train_test_split(\n", + " indices, \n", + " test_size = test_size, \n", + " random_state = random_seed, \n", + " shuffle = True\n", + " )\n", + "\n", + " X_train_list = [_[train_indices] for _ in preped_features]\n", + " X_val_list = [_[val_indices] for _ in preped_features]\n", + " y_train_list = [_[train_indices] for _ in preped_targets]\n", + " y_val_list = [_[val_indices] for _ in preped_targets]\n", + "\n", + "\n", + "\n", + " auto_model.fit(\n", + " X_train_list, \n", + " y_train_list, \n", + " validation_data = (X_val_list, y_val_list), \n", + " verbose = 2\n", + " )\n", + "\n", + " save_top_n = max(1, int(len(auto_model.tuner.oracle.trials) * 0.1))\n", + "\n", + " cv = [PredefinedSplit(split_indices[idx][i]) for i in range(len(cv_name))]\n", + " \n", + " results = []\n", + " for idx, (model, hyperparameters) in enumerate(get_best_models(save_top_n, auto_model)):\n", + " 
print(f\"\\nProcessing model {idx + 1} out of {save_top_n}\")\n", + "\n", + " optimizer = model.optimizer\n", + " del model\n", + "\n", + " scores = {name: [] for name in metrics.keys()}\n", + " train_times = []\n", + " predict_times = []\n", + "\n", + " for split_idx, (train_indices, test_indices) in enumerate(\n", + " cv[split_index].split(preped_features[0], preped_targets[0])\n", + " ):\n", + "\n", + " print(f\"\\tProcessing split {split_idx + 1} out of {cv[split_index].get_n_splits()}\")\n", + " \n", + " keras.utils.set_random_seed(random_seed)\n", + " model = auto_model.tuner.hypermodel.build(hyperparameters)\n", + " new_optimizer = type(optimizer).from_config(optimizer.get_config())\n", + " model.compile(optimizer = new_optimizer, loss=\"mse\")\n", + "\n", + " save_path = os.path.join(results_path / f\"ExampleModel-{subset}-{cv_name[split_index]}\", f\"model-{idx}-{split_idx}.keras\")\n", + " model.save(save_path)\n", + " model_size = os.path.getsize(save_path)\n", + "\n", + " X_train = [x[train_indices] for x in preped_features]\n", + " X_test = [x[test_indices] for x in preped_features]\n", + " y_train = [y[train_indices] for y in preped_targets]\n", + " y_test = [y[test_indices] for y in preped_targets]\n", + "\n", + " #train\n", + " start_time = time.perf_counter()\n", + " model.fit(\n", + " X_train, y_train, \n", + " validation_data = (X_test, y_test), \n", + " epochs = 1000, \n", + " callbacks = [keras.callbacks.EarlyStopping(patience=10, min_delta=1e-4)],\n", + " verbose = 2\n", + " )\n", + " train_times.append(time.perf_counter() - start_time)\n", + "\n", + " # predict\n", + " start_time = time.perf_counter()\n", + " y_pred = model.predict(X_test, batch_size=32, verbose=2)\n", + " y_pred = np.squeeze(np.stack(y_pred, axis=0), axis=-1)\n", + " predict_time = time.perf_counter() - start_time\n", + " predict_times.append(predict_time)\n", + "\n", + " y_test = np.squeeze(np.stack(y_test, axis=0), axis=-1)\n", + " for name, func in metrics.items():\n", + 
" scores[name].append(func(y_test, y_pred))\n", + "\n", + " del model\n", + " keras.backend.clear_session()\n", + " gc.collect()\n", + " \n", + " results.append({\n", + " \"scores\": {\n", + " name: {\n", + " \"mean\": np.mean(arr), \n", + " \"std\": np.std(arr)\n", + " } \n", + " for name, arr in scores.items()\n", + " },\n", + " \"params\" : hyperparameters.values,\n", + " \"fit_time\" : {\n", + " \"mean\": np.mean(train_times), \n", + " \"std\" : np.std(train_times)\n", + " },\n", + " \"score_time\": {\n", + " \"mean\": np.mean(predict_times), \n", + " \"std\" : np.std(predict_times)\n", + " }\n", + " })\n", + " print(results, flush = True)\n", + " joblib.dump(results, results_path / f\"Results_ExampleModel-{cv_name[split_index]}Split-{subset}Subset.pkl\")\n", + "\n", + " shutil.rmtree(results_path / f\"ExampleModel-{subset}-{cv_name[split_index]}\")\n", + "\n", + "for idx, subset in enumerate(subsets):\n", + " for split_index in range(len(cv)):\n", + " # We use multiprocess (the multiprocessing library doesn't work) to isolate each training run.\n", + " # This resolves an issue where model training is dependant on the order in which the models are trained,\n", + " # this is due to a state persisting between runs (despite all the efforts to reset the environment).\n", + " \n", + " p = Process(target=run_candidate, kwargs={\n", + " \"features\" : features,\n", + " \"targets\" : targets,\n", + " \"metrics\" : metrics,\n", + " \"subset\" : subset,\n", + " \"cv_name\" : cv_name,\n", + " \"split_index\" : split_index,\n", + " \"idx\" : idx,\n", + " \"random_seed\" : random_seed,\n", + " \"test_size\" : test_size,\n", + " \"results_path\" : results_path,\n", + " \"split_indices\" : split_indices\n", + " })\n", + " p.start()\n", + " p.join()\n", + " \n", + "stop_resource_monitor(results_path / f\"automl_usage.pkl\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (nancy)", + "language": "python", + "name": "nancy" + }, + "language_info": { + 
"codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/LOCALIZE_Configs/00-Initial/dvc.yaml b/LOCALIZE_Configs/00-Initial/dvc.yaml new file mode 100644 index 0000000..092e9fc --- /dev/null +++ b/LOCALIZE_Configs/00-Initial/dvc.yaml @@ -0,0 +1,77 @@ +# 00-Initial +vars: + - path: + # Path are relative to location of dvc.yaml file + scripts: ../../src/umu + common: ../../src + data: ../../artifacts/00-Initial/data + models: ../../artifacts/00-Initial/models + reports: ../../artifacts/00-Initial/reports + + +stages: + prepare: + desc: Prepare UMU dataset for the MLOps pipeline + cmd: | + unzip ${path.data}/raw/umu.zip -d ${path.data}/raw/ + python ${path.scripts}/prepare.py --method average --input ${path.data}/raw/umu/tcp_nokia_20240325.xlsx --output ${path.data}/interim/umu.pkl + rm ${path.data}/raw/umu/*.xlsx + rm ${path.data}/raw/__MACOSX/umu/.*.xlsx + deps: + - ${path.scripts}/prepare.py + - ${path.data}/raw/umu.zip + outs: + - ${path.data}/interim/umu.pkl + + featurize: + desc: Enrich dataset with additional features + cmd: > + python ${path.scripts}/featurize.py + --input ${path.data}/interim/umu.pkl + --output ${path.data}/prepared/umu.pkl + deps: + - ${path.scripts}/featurize.py + - ${path.data}/interim/umu.pkl + outs: + - ${path.data}/prepared/umu.pkl + + + split: + desc: Split dataset and store indices + matrix: + split: ${split.types} + cmd: > + python ${path.scripts}/split.py + --input ${path.data}/prepared/umu.pkl + --split ${item.split} + --output-indices ${path.data}/splits/${item.split}Split.pkl + params: + - split + deps: + - ${path.scripts}/split.py + - ${path.data}/prepared/umu.pkl + outs: + - ${path.data}/splits/${item.split}Split.pkl + + + gridsearch: + desc: Determine best hyper-parameters for algorithm on 
several CVs + matrix: + split: ${split.types} + model: ${gridsearch} + cmd: > + python ${path.common}/benchmark.py + --use ${item.model} + --optimizer gridsearch + --data ${path.data}/prepared/umu.pkl + --split-indices ${path.data}/splits/${item.split}Split.pkl + --output-results ${path.models}/gridsearch-${item.split}-${item.model}/${item.model}-${item.split}Split-results.pkl + params: + - gridsearch.${item.model} + deps: + - ${path.common}/benchmark.py + - ${path.data}/prepared/umu.pkl + - ${path.data}/splits/${item.split}Split.pkl + outs: + - ${path.models}/gridsearch-${item.split}-${item.model}/${item.model}-${item.split}Split-results.pkl + diff --git a/LOCALIZE_Configs/00-Initial/params.yaml b/LOCALIZE_Configs/00-Initial/params.yaml new file mode 100644 index 0000000..37685ca --- /dev/null +++ b/LOCALIZE_Configs/00-Initial/params.yaml @@ -0,0 +1,18 @@ +# 00-Initial +evaluation: + metrics: {'rmse': False} + save_top_models: 0.1 + score_with: 'rmse' + +split: + types: [KFold] + seed: 42 + n_splits: 5 + test_size: 0.20 + +gridsearch: + LinearRegression: + module: sklearn.linear_model + class: LinearRegression + hyperparameters: + fit_intercept: [true, false] diff --git a/LOCALIZE_Configs/01-Changed_and_added_model/dvc.yaml b/LOCALIZE_Configs/01-Changed_and_added_model/dvc.yaml new file mode 100644 index 0000000..4f03be4 --- /dev/null +++ b/LOCALIZE_Configs/01-Changed_and_added_model/dvc.yaml @@ -0,0 +1,77 @@ +# 01-Initial +vars: + - path: + # Path are relative to location of dvc.yaml file + scripts: ../../src/umu + common: ../../src + data: ../../artifacts/01-Changed_and_added_model/data + models: ../../artifacts/01-Changed_and_added_model/models + reports: ../../artifacts/01-Changed_and_added_model/reports + + +stages: + prepare: + desc: Prepare UMU dataset for the MLOps pipeline + cmd: | + unzip ${path.data}/raw/umu.zip -d ${path.data}/raw/ + python ${path.scripts}/prepare.py --method average --input ${path.data}/raw/umu/tcp_nokia_20240325.xlsx --output 
${path.data}/interim/umu.pkl + rm ${path.data}/raw/umu/*.xlsx + rm ${path.data}/raw/__MACOSX/umu/.*.xlsx + deps: + - ${path.scripts}/prepare.py + - ${path.data}/raw/umu.zip + outs: + - ${path.data}/interim/umu.pkl + + featurize: + desc: Enrich dataset with additional features + cmd: > + python ${path.scripts}/featurize.py + --input ${path.data}/interim/umu.pkl + --output ${path.data}/prepared/umu.pkl + deps: + - ${path.scripts}/featurize.py + - ${path.data}/interim/umu.pkl + outs: + - ${path.data}/prepared/umu.pkl + + + split: + desc: Split dataset and store indices + matrix: + split: ${split.types} + cmd: > + python ${path.scripts}/split.py + --input ${path.data}/prepared/umu.pkl + --split ${item.split} + --output-indices ${path.data}/splits/${item.split}Split.pkl + params: + - split + deps: + - ${path.scripts}/split.py + - ${path.data}/prepared/umu.pkl + outs: + - ${path.data}/splits/${item.split}Split.pkl + + + gridsearch: + desc: Determine best hyper-parameters for algorithm on several CVs + matrix: + split: ${split.types} + model: ${gridsearch} + cmd: > + python ${path.common}/benchmark.py + --use ${item.model} + --optimizer gridsearch + --data ${path.data}/prepared/umu.pkl + --split-indices ${path.data}/splits/${item.split}Split.pkl + --output-results ${path.models}/gridsearch-${item.split}-${item.model}/${item.model}-${item.split}Split-results.pkl + params: + - gridsearch.${item.model} + deps: + - ${path.common}/benchmark.py + - ${path.data}/prepared/umu.pkl + - ${path.data}/splits/${item.split}Split.pkl + outs: + - ${path.models}/gridsearch-${item.split}-${item.model}/${item.model}-${item.split}Split-results.pkl + diff --git a/LOCALIZE_Configs/01-Changed_and_added_model/params.yaml b/LOCALIZE_Configs/01-Changed_and_added_model/params.yaml new file mode 100644 index 0000000..01f1bba --- /dev/null +++ b/LOCALIZE_Configs/01-Changed_and_added_model/params.yaml @@ -0,0 +1,31 @@ +# 01-Initial +evaluation: + metrics: {'rmse': False} + save_top_models: 0.1 + 
score_with: 'rmse' + +split: + types: [KFold] + seed: 42 + n_splits: 5 + test_size: 0.20 + +gridsearch: + RandomForestRegressor: + module: sklearn.ensemble + class: RandomForestRegressor + parameters: + random_state: 42 + hyperparameters: + n_estimators: [10, 50, 100, 250, 400] + max_depth: [5, 10, 30, 50, 150, 200, null] + + KNeighborsRegressor: + module: sklearn.neighbors + class: KNeighborsRegressor + hyperparameters: + n_neighbors: [3, 5, 10] + weights: [uniform, distance] + p: [1, 2] + leaf_size: [10, 15, 30] + metric: [minkowski, euclidean] diff --git a/LOCALIZE_Configs/02-Changed_dataset_to_logatec/dvc.yaml b/LOCALIZE_Configs/02-Changed_dataset_to_logatec/dvc.yaml new file mode 100644 index 0000000..dbd64e6 --- /dev/null +++ b/LOCALIZE_Configs/02-Changed_dataset_to_logatec/dvc.yaml @@ -0,0 +1,79 @@ +# 02-Initial +vars: + - path: + # Path are relative to location of dvc.yaml file + scripts: ../../src/logatec + common: ../../src + data: ../../artifacts/02-Changed_dataset_to_logatec/data + models: ../../artifacts/02-Changed_dataset_to_logatec/models + reports: ../../artifacts/02-Changed_dataset_to_logatec/reports + + - subsets: [winter, spring] + +stages: + prepare: + desc: Download, unzip, and convert dataset to pickle format + matrix: + subset: ${subsets} + cmd: | + unzip -j ${path.data}/raw/logatec.zip ${item.subset}_data.json -d ${path.data}/raw/ + python ${path.scripts}/prepare.py --method average --input ${path.data}/raw/${item.subset}_data.json --output ${path.data}/interim/${item.subset}.pkl + rm ${path.data}/raw/${item.subset}_data.json + deps: + - ${path.scripts}/prepare.py + - ${path.data}/raw/logatec.zip + outs: + - ${path.data}/interim/${item.subset}.pkl + + featurize: + desc: Enrich dataset with additional features + matrix: + subset: ${subsets} + cmd: > + python ${path.scripts}/featurize.py + --input ${path.data}/interim/${item.subset}.pkl + --output ${path.data}/prepared/${item.subset}.pkl + deps: + - ${path.scripts}/featurize.py + - 
${path.data}/interim/${item.subset}.pkl + outs: + - ${path.data}/prepared/${item.subset}.pkl + + + split: + desc: Prepare train-test split indices for model training and evaluation + matrix: + subset: ${subsets} + split: ${split.types} + cmd: > + python ${path.scripts}/split.py + --input ${path.data}/prepared/${item.subset}.pkl + --split ${item.split} + --output-indices ${path.data}/splits/${item.subset}-${item.split}Split.pkl + deps: + - ${path.scripts}/split.py + - ${path.data}/prepared/${item.subset}.pkl + outs: + - ${path.data}/splits/${item.subset}-${item.split}Split.pkl + + + gridsearch: + desc: Determine best hyper-parameters for algorithm on several CVs + matrix: + subset: ${subsets} + split: ${split.types} + model: ${gridsearch} + cmd: > + python ${path.common}/benchmark.py + --use ${item.model} + --optimizer gridsearch + --data ${path.data}/prepared/${item.subset}.pkl + --split-indices ${path.data}/splits/${item.subset}-${item.split}Split.pkl + --output-results ${path.models}/gridsearch-${item.subset}-${item.split}-${item.model}/${item.subset}-${item.model}-${item.split}Split-results.pkl + deps: + - ${path.common}/benchmark.py + - ${path.data}/prepared/${item.subset}.pkl + - ${path.data}/splits/${item.subset}-${item.split}Split.pkl + outs: + - ${path.models}/gridsearch-${item.subset}-${item.split}-${item.model}/${item.subset}-${item.model}-${item.split}Split-results.pkl + diff --git a/LOCALIZE_Configs/02-Changed_dataset_to_logatec/params.yaml b/LOCALIZE_Configs/02-Changed_dataset_to_logatec/params.yaml new file mode 100644 index 0000000..42c2c97 --- /dev/null +++ b/LOCALIZE_Configs/02-Changed_dataset_to_logatec/params.yaml @@ -0,0 +1,31 @@ +# 02-Initial +evaluation: + metrics: {'rmse': False} + save_top_models: 0.1 + score_with: 'rmse' + +split: + types: [KFold] + seed: 42 + n_splits: 5 + test_size: 0.20 + +gridsearch: + RandomForestRegressor: + module: sklearn.ensemble + class: RandomForestRegressor + parameters: + random_state: 42 + hyperparameters: + 
n_estimators: [10, 50, 100, 250, 400] + max_depth: [5, 10, 30, 50, 150, 200, null] + + KNeighborsRegressor: + module: sklearn.neighbors + class: KNeighborsRegressor + hyperparameters: + n_neighbors: [3, 5, 10] + weights: [uniform, distance] + p: [1, 2] + leaf_size: [10, 15, 30] + metric: [minkowski, euclidean] diff --git a/LOCALIZE_Configs/03-Added_split_and_metric/dvc.yaml b/LOCALIZE_Configs/03-Added_split_and_metric/dvc.yaml new file mode 100644 index 0000000..15db467 --- /dev/null +++ b/LOCALIZE_Configs/03-Added_split_and_metric/dvc.yaml @@ -0,0 +1,79 @@ +# 03-Initial +vars: + - path: + # Path are relative to location of dvc.yaml file + scripts: ../../src/logatec + common: ../../src + data: ../../artifacts/03-Added_split_and_metric/data + models: ../../artifacts/03-Added_split_and_metric/models + reports: ../../artifacts/03-Added_split_and_metric/reports + + - subsets: [winter, spring] + +stages: + prepare: + desc: Download, unzip, and convert dataset to pickle format + matrix: + subset: ${subsets} + cmd: | + unzip -j ${path.data}/raw/logatec.zip ${item.subset}_data.json -d ${path.data}/raw/ + python ${path.scripts}/prepare.py --method average --input ${path.data}/raw/${item.subset}_data.json --output ${path.data}/interim/${item.subset}.pkl + rm ${path.data}/raw/${item.subset}_data.json + deps: + - ${path.scripts}/prepare.py + - ${path.data}/raw/logatec.zip + outs: + - ${path.data}/interim/${item.subset}.pkl + + featurize: + desc: Enrich dataset with additional features + matrix: + subset: ${subsets} + cmd: > + python ${path.scripts}/featurize.py + --input ${path.data}/interim/${item.subset}.pkl + --output ${path.data}/prepared/${item.subset}.pkl + deps: + - ${path.scripts}/featurize.py + - ${path.data}/interim/${item.subset}.pkl + outs: + - ${path.data}/prepared/${item.subset}.pkl + + + split: + desc: Prepare train-test split indices for model training and evaluation + matrix: + subset: ${subsets} + split: ${split.types} + cmd: > + python 
${path.scripts}/split.py + --input ${path.data}/prepared/${item.subset}.pkl + --split ${item.split} + --output-indices ${path.data}/splits/${item.subset}-${item.split}Split.pkl + deps: + - ${path.scripts}/split.py + - ${path.data}/prepared/${item.subset}.pkl + outs: + - ${path.data}/splits/${item.subset}-${item.split}Split.pkl + + + gridsearch: + desc: Determine best hyper-parameters for algorithm on several CVs + matrix: + subset: ${subsets} + split: ${split.types} + model: ${gridsearch} + cmd: > + python ${path.common}/benchmark.py + --use ${item.model} + --optimizer gridsearch + --data ${path.data}/prepared/${item.subset}.pkl + --split-indices ${path.data}/splits/${item.subset}-${item.split}Split.pkl + --output-results ${path.models}/gridsearch-${item.subset}-${item.split}-${item.model}/${item.subset}-${item.model}-${item.split}Split-results.pkl + deps: + - ${path.common}/benchmark.py + - ${path.data}/prepared/${item.subset}.pkl + - ${path.data}/splits/${item.subset}-${item.split}Split.pkl + outs: + - ${path.models}/gridsearch-${item.subset}-${item.split}-${item.model}/${item.subset}-${item.model}-${item.split}Split-results.pkl + diff --git a/LOCALIZE_Configs/03-Added_split_and_metric/params.yaml b/LOCALIZE_Configs/03-Added_split_and_metric/params.yaml new file mode 100644 index 0000000..1848da6 --- /dev/null +++ b/LOCALIZE_Configs/03-Added_split_and_metric/params.yaml @@ -0,0 +1,31 @@ +# 03-Initial +evaluation: + metrics: {'rmse': False, 'r_squared': True} + save_top_models: 0.1 + score_with: 'rmse' + +split: + types: [KFold, Random] + seed: 42 + n_splits: 5 + test_size: 0.20 + +gridsearch: + RandomForestRegressor: + module: sklearn.ensemble + class: RandomForestRegressor + parameters: + random_state: 42 + hyperparameters: + n_estimators: [10, 50, 100, 250, 400] + max_depth: [5, 10, 30, 50, 150, 200, null] + + KNeighborsRegressor: + module: sklearn.neighbors + class: KNeighborsRegressor + hyperparameters: + n_neighbors: [3, 5, 10] + weights: [uniform, distance] 
+ p: [1, 2] + leaf_size: [10, 15, 30] + metric: [minkowski, euclidean] diff --git a/LOCALIZE_Configs/04-Added_automl_model/dvc.yaml b/LOCALIZE_Configs/04-Added_automl_model/dvc.yaml new file mode 100644 index 0000000..1a881af --- /dev/null +++ b/LOCALIZE_Configs/04-Added_automl_model/dvc.yaml @@ -0,0 +1,101 @@ +# 04-Added_automl_model +vars: + - path: + # Path are relative to location of dvc.yaml file + scripts: ../../src/logatec + common: ../../src + data: ../../artifacts/04-Added_automl_model/data + models: ../../artifacts/04-Added_automl_model/models + reports: ../../artifacts/04-Added_automl_model/reports + + - subsets: [winter, spring] + +stages: + prepare: + desc: Download, unzip, and convert dataset to pickle format + matrix: + subset: ${subsets} + cmd: | + unzip -j ${path.data}/raw/logatec.zip ${item.subset}_data.json -d ${path.data}/raw/ + python ${path.scripts}/prepare.py --method average --input ${path.data}/raw/${item.subset}_data.json --output ${path.data}/interim/${item.subset}.pkl + rm ${path.data}/raw/${item.subset}_data.json + deps: + - ${path.scripts}/prepare.py + - ${path.data}/raw/logatec.zip + outs: + - ${path.data}/interim/${item.subset}.pkl + + featurize: + desc: Enrich dataset with additional features + matrix: + subset: ${subsets} + cmd: > + python ${path.scripts}/featurize.py + --input ${path.data}/interim/${item.subset}.pkl + --output ${path.data}/prepared/${item.subset}.pkl + deps: + - ${path.scripts}/featurize.py + - ${path.data}/interim/${item.subset}.pkl + outs: + - ${path.data}/prepared/${item.subset}.pkl + + + split: + desc: Prepare train-test split indices for model training and evaluation + matrix: + subset: ${subsets} + split: ${split.types} + cmd: > + python ${path.scripts}/split.py + --input ${path.data}/prepared/${item.subset}.pkl + --split ${item.split} + --output-indices ${path.data}/splits/${item.subset}-${item.split}Split.pkl + deps: + - ${path.scripts}/split.py + - ${path.data}/prepared/${item.subset}.pkl + outs: + - 
${path.data}/splits/${item.subset}-${item.split}Split.pkl + + + gridsearch: + desc: Determine best hyper-parameters for algorithm on several CVs + matrix: + subset: ${subsets} + split: ${split.types} + model: ${gridsearch} + cmd: > + python ${path.common}/benchmark.py + --use ${item.model} + --optimizer gridsearch + --data ${path.data}/prepared/${item.subset}.pkl + --split-indices ${path.data}/splits/${item.subset}-${item.split}Split.pkl + --output-results ${path.models}/gridsearch-${item.subset}-${item.split}-${item.model}/${item.subset}-${item.model}-${item.split}Split-results.pkl + deps: + - ${path.common}/benchmark.py + - ${path.data}/prepared/${item.subset}.pkl + - ${path.data}/splits/${item.subset}-${item.split}Split.pkl + outs: + - ${path.models}/gridsearch-${item.subset}-${item.split}-${item.model}/${item.subset}-${item.model}-${item.split}Split-results.pkl + + automl: + desc: Determine best hyper-parameters for algorithm on several CVs + matrix: + subset: ${subsets} + split: ${split.types} + model: ${automl} + cmd: > + python ${path.common}/benchmark.py + --use ${item.model} + --optimizer automl + --data ${path.data}/prepared/${item.subset}.pkl + --split-indices ${path.data}/splits/${item.subset}-${item.split}Split.pkl + --output-results ${path.models}/automl-${item.subset}-${item.split}-${item.model}/${item.subset}-${item.model}-${item.split}Split-results.pkl + params: + - automl.${item.model} + deps: + - ${path.common}/benchmark.py + - ${path.data}/prepared/${item.subset}.pkl + - ${path.data}/splits/${item.subset}-${item.split}Split.pkl + outs: + - ${path.models}/automl-${item.subset}-${item.split}-${item.model}/${item.subset}-${item.model}-${item.split}Split-results.pkl + diff --git a/LOCALIZE_Configs/04-Added_automl_model/params.yaml b/LOCALIZE_Configs/04-Added_automl_model/params.yaml new file mode 100644 index 0000000..ec84d3d --- /dev/null +++ b/LOCALIZE_Configs/04-Added_automl_model/params.yaml @@ -0,0 +1,51 @@ +# 04-Added_automl_model +evaluation: 
+ metrics: {'rmse': False, 'r_squared': True} + save_top_models: 0.1 + score_with: 'rmse' + +split: + types: [KFold, Random] + seed: 42 + n_splits: 5 + test_size: 0.20 + +gridsearch: + RandomForestRegressor: + module: sklearn.ensemble + class: RandomForestRegressor + parameters: + random_state: 42 + n_jobs: -1 + hyperparameters: + n_estimators: [10, 50, 100, 250, 400] + max_depth: [5, 10, 30, 50, 150, 200, null] + + KNeighborsRegressor: + module: sklearn.neighbors + class: KNeighborsRegressor + parameters: + n_jobs: -1 + hyperparameters: + n_neighbors: [3, 5, 10] + weights: [uniform, distance] + p: [1, 2] + leaf_size: [10, 15, 30] + metric: [minkowski, euclidean] + +automl: + ExampleModel: + inputs: + - name: data_input + + outputs: + - name: x_out + - name: y_out + + settings: + seed: 42 + max_trials: 10 + overwrite: True + + fit_settings: + verbose: 2 \ No newline at end of file diff --git a/LOCALIZE_Configs/Benchmarking/README.txt b/LOCALIZE_Configs/Benchmarking/README.txt new file mode 100644 index 0000000..c70af5e --- /dev/null +++ b/LOCALIZE_Configs/Benchmarking/README.txt @@ -0,0 +1 @@ +For benchmarking, the code for each stage was modified so that it calls `start_resource_monitor` at the start and `stop_resource_monitor` at the end.
\ No newline at end of file diff --git a/LOCALIZE_Configs/Benchmarking/dvc.yaml b/LOCALIZE_Configs/Benchmarking/dvc.yaml new file mode 100644 index 0000000..8eecd97 --- /dev/null +++ b/LOCALIZE_Configs/Benchmarking/dvc.yaml @@ -0,0 +1,101 @@ +# Benchmarking +vars: + - path: + # Path are relative to location of dvc.yaml file + scripts: ../../src/logatec + common: ../../src + data: ../../artifacts/Benchmarking/data + models: ../../artifacts/Benchmarking/models + reports: ../../artifacts/Benchmarking/reports + + - subsets: [winter, spring] + +stages: + prepare: + desc: Download, unzip, and convert dataset to pickle format + matrix: + subset: ${subsets} + cmd: | + unzip -j ${path.data}/raw/logatec.zip ${item.subset}_data.json -d ${path.data}/raw/ + python ${path.scripts}/prepare.py --method average --input ${path.data}/raw/${item.subset}_data.json --output ${path.data}/interim/${item.subset}.pkl + rm ${path.data}/raw/${item.subset}_data.json + deps: + - ${path.scripts}/prepare.py + - ${path.data}/raw/logatec.zip + outs: + - ${path.data}/interim/${item.subset}.pkl + + featurize: + desc: Enrich dataset with additional features + matrix: + subset: ${subsets} + cmd: > + python ${path.scripts}/featurize.py + --input ${path.data}/interim/${item.subset}.pkl + --output ${path.data}/prepared/${item.subset}.pkl + deps: + - ${path.scripts}/featurize.py + - ${path.data}/interim/${item.subset}.pkl + outs: + - ${path.data}/prepared/${item.subset}.pkl + + + split: + desc: Prepare train-test split indices for model training and evaluation + matrix: + subset: ${subsets} + split: ${split.types} + cmd: > + python ${path.scripts}/split.py + --input ${path.data}/prepared/${item.subset}.pkl + --split ${item.split} + --output-indices ${path.data}/splits/${item.subset}-${item.split}Split.pkl + deps: + - ${path.scripts}/split.py + - ${path.data}/prepared/${item.subset}.pkl + outs: + - ${path.data}/splits/${item.subset}-${item.split}Split.pkl + + + gridsearch: + desc: Determine best 
hyper-parameters for algorithm on several CVs + matrix: + subset: ${subsets} + split: ${split.types} + model: ${gridsearch} + cmd: > + python ${path.common}/benchmark.py + --use ${item.model} + --optimizer gridsearch + --data ${path.data}/prepared/${item.subset}.pkl + --split-indices ${path.data}/splits/${item.subset}-${item.split}Split.pkl + --output-results ${path.models}/gridsearch-${item.subset}-${item.split}-${item.model}/${item.subset}-${item.model}-${item.split}Split-results.pkl + deps: + - ${path.common}/benchmark.py + - ${path.data}/prepared/${item.subset}.pkl + - ${path.data}/splits/${item.subset}-${item.split}Split.pkl + outs: + - ${path.models}/gridsearch-${item.subset}-${item.split}-${item.model}/${item.subset}-${item.model}-${item.split}Split-results.pkl + + automl: + desc: Determine best hyper-parameters for algorithm on several CVs + matrix: + subset: ${subsets} + split: ${split.types} + model: ${automl} + cmd: > + python ${path.common}/benchmark.py + --use ${item.model} + --optimizer automl + --data ${path.data}/prepared/${item.subset}.pkl + --split-indices ${path.data}/splits/${item.subset}-${item.split}Split.pkl + --output-results ${path.models}/automl-${item.subset}-${item.split}-${item.model}/${item.subset}-${item.model}-${item.split}Split-results.pkl + params: + - automl.${item.model} + deps: + - ${path.common}/benchmark.py + - ${path.data}/prepared/${item.subset}.pkl + - ${path.data}/splits/${item.subset}-${item.split}Split.pkl + outs: + - ${path.models}/automl-${item.subset}-${item.split}-${item.model}/${item.subset}-${item.model}-${item.split}Split-results.pkl + diff --git a/LOCALIZE_Configs/Benchmarking/params.yaml b/LOCALIZE_Configs/Benchmarking/params.yaml new file mode 100644 index 0000000..ec84d3d --- /dev/null +++ b/LOCALIZE_Configs/Benchmarking/params.yaml @@ -0,0 +1,51 @@ +# 04-Added_automl_model +evaluation: + metrics: {'rmse': False, 'r_squared': True} + save_top_models: 0.1 + score_with: 'rmse' + +split: + types: [KFold, Random] 
+ seed: 42 + n_splits: 5 + test_size: 0.20 + +gridsearch: + RandomForestRegressor: + module: sklearn.ensemble + class: RandomForestRegressor + parameters: + random_state: 42 + n_jobs: -1 + hyperparameters: + n_estimators: [10, 50, 100, 250, 400] + max_depth: [5, 10, 30, 50, 150, 200, null] + + KNeighborsRegressor: + module: sklearn.neighbors + class: KNeighborsRegressor + parameters: + n_jobs: -1 + hyperparameters: + n_neighbors: [3, 5, 10] + weights: [uniform, distance] + p: [1, 2] + leaf_size: [10, 15, 30] + metric: [minkowski, euclidean] + +automl: + ExampleModel: + inputs: + - name: data_input + + outputs: + - name: x_out + - name: y_out + + settings: + seed: 42 + max_trials: 10 + overwrite: True + + fit_settings: + verbose: 2 \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..a047bc9 --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +Evauluation code/configs for LOCALIZE framework diff --git a/performance.py b/performance.py new file mode 100644 index 0000000..12ad690 --- /dev/null +++ b/performance.py @@ -0,0 +1,157 @@ +INITIAL_PROFILING_DURATION = 0.25 +DEFAULT_PROFILING_INTERVAL = 0.1 +NUM_CORES_TO_ALLOCATE = 5 + +DEBUG = False + + +import psutil, os +from pathlib import Path + +def get_cores_by_usage(interval: float = INITIAL_PROFILING_DURATION): + cpu_usage = psutil.cpu_percent(percpu=True, interval=interval) + return sorted(range(len(cpu_usage)), key=lambda i: cpu_usage[i]) + +cores_by_usage = get_cores_by_usage() +logging_core_id = cores_by_usage[0] +task_execution_core_ids = cores_by_usage[1:1+NUM_CORES_TO_ALLOCATE] + +os.sched_setaffinity(0, {*task_execution_core_ids}) + +print("Executing on core:", task_execution_core_ids) +print("Logging on core:", logging_core_id) + +core = task_execution_core_ids +proc = psutil.Process(os.getpid()) + +for t in proc.threads(): + os.sched_setaffinity(t.id, {*core}) + +for p in proc.children(recursive=True): + os.sched_setaffinity(p.id, {*core}) + + +from multiprocessing 
def _sample_once(root, exclude_pids: set[int] = frozenset()):
    """Take one CPU/RAM sample of ``root`` and all of its descendants.

    Process objects are kept in a cache stored on the function itself
    (``_sample_once.cache``, keyed by pid) so that consecutive calls query
    the same objects and ``cpu_percent(None)`` measures usage since the
    previous sample.

    Args:
        root: Process object to sample; it must provide ``pid``,
            ``children(recursive=True)``, ``cpu_percent``, ``is_running``
            and ``memory_info`` (a ``psutil.Process`` in this module).
        exclude_pids: pids that must never be added to the cache, e.g. the
            monitoring process itself.

    Returns:
        A tuple ``(cpu_cores, ram_mb, per_process_cpu)``: the summed CPU
        percentage divided by 100, the summed RSS in MiB, and the list of
        per-process CPU percentages in cache order.
    """
    if not hasattr(_sample_once, "cache"):
        _sample_once.cache = {}  # pid -> Process object
    cache = _sample_once.cache

    # Register processes seen for the first time. The first cpu_percent(None)
    # call only establishes the reference point for later samples.
    for candidate in [root] + root.children(recursive=True):
        if candidate.pid in exclude_pids or candidate.pid in cache:
            continue
        try:
            candidate.cpu_percent(None)
        except (psutil.NoSuchProcess, psutil.ZombieProcess):
            continue  # exited before it could be primed; nothing to track
        cache[candidate.pid] = candidate

    total_cpu = 0.0
    total_mem = 0
    per_process_cpu = []
    for pid, tracked in list(cache.items()):
        # A worker can exit between is_running() and the reads below; treat
        # that the same as "not running" instead of crashing the monitor.
        try:
            alive = tracked.is_running()
            if alive:
                cpu = tracked.cpu_percent(None)  # non-blocking, since last call
                rss = tracked.memory_info().rss
        except (psutil.NoSuchProcess, psutil.ZombieProcess):
            alive = False

        if not alive:
            cache.pop(pid, None)  # forget dead workers
            continue

        per_process_cpu.append(cpu)
        total_cpu += cpu
        total_mem += rss

    return total_cpu / 100.0, total_mem / (1024 ** 2), per_process_cpu
def get_directory(path: Path) -> Path:
    """Return the directory that ``path`` refers to.

    The path is resolved to an absolute path first. If the resolved path is
    an existing directory it is returned as is; otherwise (a file, or a path
    that does not exist) its parent is returned.
    """
    resolved = path.resolve()
    if resolved.is_dir():
        return resolved
    return resolved.parent