@@ -31,36 +31,48 @@ lightgbm_inferencing:
  benchmark_name: "benchmark-inferencing" # needs to be provided at runtime!

  tasks:
- dataset: "data-synthetic-regression-10cols-10000samples-inference"
model: "model-synthetic-regression-10cols-10trees-31leaves"
- dataset: "data-synthetic-regression-10cols-10000samples-inference"
model: "model-synthetic-regression-10cols-100trees-31leaves"
- dataset: "data-synthetic-regression-10cols-10000samples-inference"
model: "model-synthetic-regression-10cols-1000trees-31leaves"
- dataset: "data-synthetic-regression-10cols-10000samples-inference"
model: "model-synthetic-regression-10cols-5000trees-31leaves"
- dataset: "data-synthetic-regression-100cols-10000samples-inference"
model: "model-synthetic-regression-100cols-10trees-31leaves"
- dataset: "data-synthetic-regression-100cols-10000samples-inference"
model: "model-synthetic-regression-100cols-100trees-31leaves"
- dataset: "data-synthetic-regression-100cols-10000samples-inference"
model: "model-synthetic-regression-100cols-1000trees-31leaves"
- dataset: "data-synthetic-regression-100cols-10000samples-inference"
model: "model-synthetic-regression-100cols-5000trees-31leaves"
- dataset: "data-synthetic-regression-1000cols-10000samples-inference"
model: "model-synthetic-regression-1000cols-10trees-31leaves"
- dataset: "data-synthetic-regression-1000cols-10000samples-inference"
model: "model-synthetic-regression-1000cols-100trees-31leaves"
- dataset: "data-synthetic-regression-1000cols-10000samples-inference"
model: "model-synthetic-regression-1000cols-1000trees-31leaves"
- dataset: "data-synthetic-regression-1000cols-10000samples-inference"
model: "model-synthetic-regression-1000cols-5000trees-31leaves"
- inferencing_dataset: "data-synthetic-regression-10cols-10000samples-inference"
model_dataset: "model-synthetic-regression-10cols-10trees-31leaves"
- inferencing_dataset: "data-synthetic-regression-10cols-10000samples-inference"
model_dataset: "model-synthetic-regression-10cols-100trees-31leaves"
- inferencing_dataset: "data-synthetic-regression-10cols-10000samples-inference"
model_dataset: "model-synthetic-regression-10cols-1000trees-31leaves"
- inferencing_dataset: "data-synthetic-regression-10cols-10000samples-inference"
model_dataset: "model-synthetic-regression-10cols-5000trees-31leaves"
- inferencing_dataset: "data-synthetic-regression-100cols-10000samples-inference"
model_dataset: "model-synthetic-regression-100cols-10trees-31leaves"
- inferencing_dataset: "data-synthetic-regression-100cols-10000samples-inference"
model_dataset: "model-synthetic-regression-100cols-100trees-31leaves"
- inferencing_dataset: "data-synthetic-regression-100cols-10000samples-inference"
model_dataset: "model-synthetic-regression-100cols-1000trees-31leaves"
- inferencing_dataset: "data-synthetic-regression-100cols-10000samples-inference"
model_dataset: "model-synthetic-regression-100cols-5000trees-31leaves"
- inferencing_dataset: "data-synthetic-regression-1000cols-10000samples-inference"
model_dataset: "model-synthetic-regression-1000cols-10trees-31leaves"
- inferencing_dataset: "data-synthetic-regression-1000cols-10000samples-inference"
model_dataset: "model-synthetic-regression-1000cols-100trees-31leaves"
- inferencing_dataset: "data-synthetic-regression-1000cols-10000samples-inference"
model_dataset: "model-synthetic-regression-1000cols-1000trees-31leaves"
- inferencing_dataset: "data-synthetic-regression-1000cols-10000samples-inference"
model_dataset: "model-synthetic-regression-1000cols-5000trees-31leaves"

  variants:
    - framework: lightgbm_python
      build: dockers/lightgbm_cpu_mpi_pip.dockerfile
      batch_size: 1
      data_loader: "numpy"
      n_threads: 1
    - framework: lightgbm_python
      build: dockers/lightgbm_cpu_mpi_build.dockerfile
      batch_size: 1
      data_loader: "numpy"
      n_threads: 1
    - framework: lightgbm_python
      build: dockers/lightgbm_cpu_mpi_custom.dockerfile
      batch_size: 1
      data_loader: "numpy"
      n_threads: 1
    - framework: treelite_python
      batch_size: 1
      data_loader: "numpy"
      n_threads: 1
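To make the new per-variant knobs concrete, here is a minimal sketch of how data_loader, batch_size and n_threads could drive one scoring run. The score_variant helper is hypothetical (it is not part of this PR); it relies on the DataBatch and numpy_data_load helpers added to src/common/io.py later in this diff.

import lightgbm as lgb
from common.io import DataBatch, numpy_data_load

def score_variant(model_file, data_file, data_loader="numpy", batch_size=0, n_threads=1):
    """Hypothetical sketch: load the data, batch it, and score it with a LightGBM model."""
    if data_loader == "numpy":
        data, n_rows, n_cols = numpy_data_load(data_file)
    else:
        raise NotImplementedError(f"data_loader '{data_loader}' is not covered by this sketch")

    booster = lgb.Booster(model_file=model_file)
    batches = DataBatch(data, batch_size=batch_size)  # batch_size=0 keeps all rows in a single batch

    for i in range(len(batches)):
        features, _ = batches[i]
        # num_threads is forwarded to LightGBM as a prediction-time parameter
        booster.predict(features, num_threads=n_threads)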
4 changes: 2 additions & 2 deletions pipelines/azureml/conf/experiments/lightgbm-inferencing.yaml
@@ -35,8 +35,8 @@ lightgbm_inferencing:

  # list all the data/model pairs to run inferencing with
  tasks:
    - dataset: "data-synthetic-regression-100cols-10000samples-inference"
      model: "model-synthetic-regression-100cols-10trees-31leaves"
    - inferencing_dataset: "data-synthetic-regression-100cols-10000samples-inference"
      inferencing_model: "model-synthetic-regression-100cols-10trees-31leaves"

  # list all inferencing frameworks and their builds
  variants:
45 changes: 36 additions & 9 deletions pipelines/azureml/pipelines/lightgbm_inferencing.py
@@ -13,7 +13,7 @@
import sys
import json
from dataclasses import dataclass
from omegaconf import MISSING
from omegaconf import MISSING, OmegaConf
from typing import Optional, List
from azure.ml.component import dsl
from shrike.pipeline.pipeline_helper import AMLPipelineHelper
@@ -27,6 +27,7 @@
sys.path.append(str(LIGHTGBM_BENCHMARK_ROOT))

from common.tasks import inferencing_task, inferencing_variants
from common.aml import dataset_from_dstore_path

class LightGBMInferencing(AMLPipelineHelper):
"""Runnable/reusable pipeline helper class
@@ -127,6 +128,9 @@ def lightgbm_inferencing_pipeline_function(benchmark_custom_properties, data, mo
inferencing_step = treelite_score_module(
data = data,
compiled_model = treelite_compile_step.outputs.compiled_model,
data_loader = variant.data_loader,
batch_size = variant.batch_size,
n_threads = variant.n_threads,
verbose = False,
custom_properties = custom_properties
)
@@ -138,10 +142,13 @@
data = data,
model = model,
predict_disable_shape_check = predict_disable_shape_check,
data_loader = variant.data_loader,
batch_size = variant.batch_size,
n_threads = variant.n_threads,
verbose = False,
custom_properties = custom_properties
)
self.apply_smart_runsettings(inferencing_step)
self.apply_smart_runsettings(inferencing_step, windows=(variant.os == "Windows"))

else:
raise NotImplementedError(f"framework {variant.framework} not implemented (yet)")
@@ -179,23 +186,43 @@ def pipeline_instance(self, pipeline_function, config):
"""
# Here you should create an instance of a pipeline function (using your custom config dataclass)
@dsl.pipeline(name="inferencing_all_tasks", # pythonic name
description="Inferencing on all specified tasks",
description=("```yaml\n"+OmegaConf.to_yaml(config)+"```"),
default_datastore=config.compute.noncompliant_datastore)
def inferencing_all_tasks():
for inferencing_task in config.lightgbm_inferencing.tasks:
data = self.dataset_load(inferencing_task.dataset)
model = self.dataset_load(inferencing_task.model)

# load the given inferencing dataset
if inferencing_task.inferencing_dataset:
inferencing_data = self.dataset_load(
name = inferencing_task.inferencing_dataset,
version = inferencing_task.inferencing_dataset_version # use latest if None
)
elif inferencing_task.inferencing_datastore and inferencing_task.inferencing_datastore_path:
inferencing_data = dataset_from_dstore_path(self.workspace(), inferencing_task.inferencing_datastore, inferencing_task.inferencing_datastore_path, validate=inferencing_task.inferencing_datastore_path_validate)
else:
raise ValueError(f"In inferencing_task {inferencing_task}, you need to provide either inferencing_dataset or inferencing_datastore+inferencing_datastore_path")

# load the given inferencing model (from a dataset)
if inferencing_task.model_dataset:
model_data = self.dataset_load(
name = inferencing_task.model_dataset,
version = inferencing_task.model_dataset_version # use latest if None
)
elif inferencing_task.model_datastore and inferencing_task.model_datastore_path:
model_data = dataset_from_dstore_path(self.workspace(), inferencing_task.model_datastore, inferencing_task.model_datastore_path, validate=inferencing_task.model_datastore_path_validate)
else:
raise ValueError(f"In inferencing_task {inferencing_task}, you need to provide either model_dataset or model_datastore+model_datastore_path")

# create custom properties for this task
benchmark_custom_properties = {
'benchmark_name' : config.lightgbm_inferencing.benchmark_name,
'benchmark_dataset' : inferencing_task.dataset,
'benchmark_model' : inferencing_task.model,
'benchmark_dataset' : inferencing_task.inferencing_dataset,
'benchmark_model' : inferencing_task.model_dataset,
}

inferencing_task_subgraph_step = pipeline_function(
data=data,
model=model,
data=inferencing_data,
model=model_data,
predict_disable_shape_check=inferencing_task.predict_disable_shape_check or False,
benchmark_custom_properties=benchmark_custom_properties
)
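Note that dataset_from_dstore_path, imported above from common.aml, is not shown in this diff. A plausible sketch using the azureml-core SDK follows; this is an assumption about its behaviour, not the actual implementation.

from azureml.core import Dataset, Datastore

def dataset_from_dstore_path(workspace, datastore_name, path, validate=True):
    """Sketch: build a FileDataset from a datastore name and a path on that datastore."""
    datastore = Datastore.get(workspace, datastore_name)
    return Dataset.File.from_files(path=[(datastore, path)], validate=validate)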
5 changes: 3 additions & 2 deletions requirements.txt
@@ -2,10 +2,11 @@ lightgbm==3.2.1
pytest==6.2.4
pytest-cov==2.12.1
pytest-mock==3.6.1
mlflow==1.19.0
shrike[pipeline]==1.11.1
mlflow==1.20.2
shrike[pipeline]==1.11.5
hydra-core==1.0.7
omegaconf==2.0.6
treelite==1.3.0
treelite_runtime==1.3.0
mpi4py==3.1.1
scikit-learn~=0.24.1
51 changes: 51 additions & 0 deletions src/common/io.py
@@ -1,6 +1,7 @@
import os
import argparse
import logging
import numpy as np

def input_file_path(path):
""" Resolve input path from AzureML.
@@ -156,3 +157,53 @@ def run(self, input_path, output_path):
self.split_by_append(input_files, output_path, self.number)
else:
raise NotImplementedError(f"Mode {self.mode} not implemented.")


class DataBatch():
# taken from https://datascience.stackexchange.com/questions/47623/how-feed-a-numpy-array-in-batches-in-keras
def __init__(self, x, y=None, batch_size=0):
self.x = x
self.y = y
if batch_size == 0:
self.batch_size = x.shape[0]
self.num_batches = 1
else:
self.batch_size = batch_size
self.num_batches = np.ceil(x.shape[0] / batch_size)

self.batch_idx = np.array_split(range(x.shape[0]), self.num_batches)
logging.getLogger(__name__).info(f"Creating data batch with {self.num_batches} batches")

def __len__(self):
return len(self.batch_idx)

def __getitem__(self, idx):
return self.x[self.batch_idx[idx]], (self.y[self.batch_idx[idx]] if self.y is not None else None)


def numpy_data_load(path, delimiter=","):
"""Loads data using numpy (csv).

Args:
path (str): path to data file
Returns:
numpy_array, number_of_rows (int), number of cols (int)
"""
raw_data = np.loadtxt(path, delimiter=delimiter)

return raw_data, raw_data.shape[0], raw_data.shape[1]

def libsvm_data_load(path):
"""Loads data using libsvm.

Args:
path (str): path to data file
Returns:
(y, x), number_of_rows (int), number of cols (int)
"""
# importing at the last minute intentionally (sklearn is only needed for libsvm loading)
from sklearn.datasets import load_svmlight_file

x, y = load_svmlight_file(path)

return (x,y), x.shape[0], x.shape[1]
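As a usage note, the libsvm loader returns a scipy sparse matrix together with its labels, and DataBatch can slice that row-wise just like a dense array. A short illustrative example (the file path is a placeholder):

from common.io import DataBatch, libsvm_data_load

(x, y), n_rows, n_cols = libsvm_data_load("inference_data.svm")
batches = DataBatch(x, y, batch_size=500)   # roughly 500 rows per batch
features, labels = batches[0]               # first batch of sparse rows and their labels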
25 changes: 23 additions & 2 deletions src/common/tasks.py
@@ -4,9 +4,26 @@

@dataclass
class inferencing_task:
dataset: str = MISSING
model: str = MISSING
# specify either by dataset name
inferencing_dataset: Optional[str] = None
inferencing_dataset_version: Optional[str] = None
# or by datastore+path
inferencing_datastore: Optional[str] = None
inferencing_datastore_path: Optional[str] = None
inferencing_datastore_path_validate: bool = True

# specify either by model dataset name
model_dataset: Optional[str] = None
model_dataset_version: Optional[str] = None
# or by datastore+path
model_datastore: Optional[str] = None
model_datastore_path: Optional[str] = None
model_datastore_path_validate: bool = True

# task tag
task_key: Optional[str] = None

# turn to True if model and dataset have different shapes
predict_disable_shape_check: bool = False

@dataclass
@@ -15,6 +32,10 @@ class inferencing_variants:
build: Optional[str] = None
os: str = "Linux" # linux or windows, linux by default

data_loader: str = "lightgbm"
batch_size: int = 0 # all data in 1 batch by default
n_threads: int = 1

@dataclass
class data_generation_task:
task: str = MISSING
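Since the repository loads these dataclasses as OmegaConf structured configs (it depends on hydra-core and omegaconf), the renamed task keys and the new variant fields get validated when the experiment YAML is loaded. A minimal sketch of that validation, assuming only the dataclasses shown above:

from omegaconf import OmegaConf
from common.tasks import inferencing_task, inferencing_variants

# A task entry under the new schema merges cleanly against the structured config...
task = OmegaConf.merge(
    OmegaConf.structured(inferencing_task),
    {"inferencing_dataset": "data-synthetic-regression-100cols-10000samples-inference",
     "model_dataset": "model-synthetic-regression-100cols-10trees-31leaves"},
)
assert task.predict_disable_shape_check is False  # defaults are preserved

# ...while unknown keys (such as the old 'dataset'/'model' names) would raise a validation error.
variant = OmegaConf.merge(
    OmegaConf.structured(inferencing_variants),
    {"framework": "lightgbm_python", "data_loader": "numpy", "batch_size": 1, "n_threads": 1},
)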
@@ -0,0 +1,29 @@
FROM mcr.microsoft.com/azureml/windows-servercore-1809

ENV AZUREML_CONDA_ENVIRONMENT_PATH /azureml-envs/lightgbm

# Create conda environment
RUN conda create -p $AZUREML_CONDA_ENVIRONMENT_PATH \
python=3.8 pip=20.2.4

# Prepend path to AzureML conda environment
ENV PATH $AZUREML_CONDA_ENVIRONMENT_PATH/bin:$PATH

# Install pip dependencies
RUN HOROVOD_WITH_TENSORFLOW=1 \
pip install 'pandas>=1.1,<1.2' \
'numpy>=1.10,<1.20' \
'scipy~=1.5.0' \
'scikit-learn~=0.24.1' \
'azureml-core==1.30.0' \
'azureml-defaults==1.30.0' \
'azureml-mlflow==1.30.0' \
'azureml-telemetry==1.30.0'

# install lightgbm with mpi
RUN pip install --upgrade pip setuptools wheel && \
pip install 'cmake==3.21.0' && \
pip install 'lightgbm==3.2.1'

# This is needed for mpi to locate libpython
ENV LD_LIBRARY_PATH $AZUREML_CONDA_ENVIRONMENT_PATH/lib:$LD_LIBRARY_PATH