@@ -31,36 +31,48 @@ lightgbm_inferencing:
  benchmark_name: "benchmark-inferencing" # needs to be provided at runtime!

  tasks:
- dataset: "data-synthetic-regression-10cols-10000samples-inference"
model: "model-synthetic-regression-10cols-10trees-31leaves"
- dataset: "data-synthetic-regression-10cols-10000samples-inference"
model: "model-synthetic-regression-10cols-100trees-31leaves"
- dataset: "data-synthetic-regression-10cols-10000samples-inference"
model: "model-synthetic-regression-10cols-1000trees-31leaves"
- dataset: "data-synthetic-regression-10cols-10000samples-inference"
model: "model-synthetic-regression-10cols-5000trees-31leaves"
- dataset: "data-synthetic-regression-100cols-10000samples-inference"
model: "model-synthetic-regression-100cols-10trees-31leaves"
- dataset: "data-synthetic-regression-100cols-10000samples-inference"
model: "model-synthetic-regression-100cols-100trees-31leaves"
- dataset: "data-synthetic-regression-100cols-10000samples-inference"
model: "model-synthetic-regression-100cols-1000trees-31leaves"
- dataset: "data-synthetic-regression-100cols-10000samples-inference"
model: "model-synthetic-regression-100cols-5000trees-31leaves"
- dataset: "data-synthetic-regression-1000cols-10000samples-inference"
model: "model-synthetic-regression-1000cols-10trees-31leaves"
- dataset: "data-synthetic-regression-1000cols-10000samples-inference"
model: "model-synthetic-regression-1000cols-100trees-31leaves"
- dataset: "data-synthetic-regression-1000cols-10000samples-inference"
model: "model-synthetic-regression-1000cols-1000trees-31leaves"
- dataset: "data-synthetic-regression-1000cols-10000samples-inference"
model: "model-synthetic-regression-1000cols-5000trees-31leaves"
- inferencing_dataset: "data-synthetic-regression-10cols-10000samples-inference"
model_dataset: "model-synthetic-regression-10cols-10trees-31leaves"
- inferencing_dataset: "data-synthetic-regression-10cols-10000samples-inference"
model_dataset: "model-synthetic-regression-10cols-100trees-31leaves"
- inferencing_dataset: "data-synthetic-regression-10cols-10000samples-inference"
model_dataset: "model-synthetic-regression-10cols-1000trees-31leaves"
- inferencing_dataset: "data-synthetic-regression-10cols-10000samples-inference"
model_dataset: "model-synthetic-regression-10cols-5000trees-31leaves"
- inferencing_dataset: "data-synthetic-regression-100cols-10000samples-inference"
model_dataset: "model-synthetic-regression-100cols-10trees-31leaves"
- inferencing_dataset: "data-synthetic-regression-100cols-10000samples-inference"
model_dataset: "model-synthetic-regression-100cols-100trees-31leaves"
- inferencing_dataset: "data-synthetic-regression-100cols-10000samples-inference"
model_dataset: "model-synthetic-regression-100cols-1000trees-31leaves"
- inferencing_dataset: "data-synthetic-regression-100cols-10000samples-inference"
model_dataset: "model-synthetic-regression-100cols-5000trees-31leaves"
- inferencing_dataset: "data-synthetic-regression-1000cols-10000samples-inference"
model_dataset: "model-synthetic-regression-1000cols-10trees-31leaves"
- inferencing_dataset: "data-synthetic-regression-1000cols-10000samples-inference"
model_dataset: "model-synthetic-regression-1000cols-100trees-31leaves"
- inferencing_dataset: "data-synthetic-regression-1000cols-10000samples-inference"
model_dataset: "model-synthetic-regression-1000cols-1000trees-31leaves"
- inferencing_dataset: "data-synthetic-regression-1000cols-10000samples-inference"
model_dataset: "model-synthetic-regression-1000cols-5000trees-31leaves"

  variants:
    - framework: lightgbm_python
      build: dockers/lightgbm_cpu_mpi_pip.dockerfile
      batch_size: 1
      data_loader: "numpy"
      n_threads: 1
    - framework: lightgbm_python
      build: dockers/lightgbm_cpu_mpi_build.dockerfile
      batch_size: 1
      data_loader: "numpy"
      n_threads: 1
    - framework: lightgbm_python
      build: dockers/lightgbm_cpu_mpi_custom.dockerfile
      batch_size: 1
      data_loader: "numpy"
      n_threads: 1
    - framework: treelite_python
      batch_size: 1
      data_loader: "numpy"
      n_threads: 1
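To make the new per-variant knobs concrete, here is a minimal sketch of how data_loader, batch_size and n_threads could drive one scoring run. The score_variant helper is hypothetical (it is not part of this PR); it relies on the DataBatch and numpy_data_load helpers added to src/common/io.py later in this diff.

import lightgbm as lgb
from common.io import DataBatch, numpy_data_load

def score_variant(model_file, data_file, data_loader="numpy", batch_size=0, n_threads=1):
    """Hypothetical sketch: load the data, batch it, and score it with a LightGBM model."""
    if data_loader == "numpy":
        data, n_rows, n_cols = numpy_data_load(data_file)
    else:
        raise NotImplementedError(f"data_loader '{data_loader}' is not covered by this sketch")

    booster = lgb.Booster(model_file=model_file)
    batches = DataBatch(data, batch_size=batch_size)  # batch_size=0 keeps all rows in a single batch

    for i in range(len(batches)):
        features, _ = batches[i]
        # num_threads is forwarded to LightGBM as a prediction-time parameter
        booster.predict(features, num_threads=n_threads)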
4 changes: 2 additions & 2 deletions pipelines/azureml/conf/experiments/lightgbm-inferencing.yaml
@@ -35,8 +35,8 @@ lightgbm_inferencing:

  # list all the data/model pairs to run inferencing with
  tasks:
    - dataset: "data-synthetic-regression-100cols-10000samples-inference"
      model: "model-synthetic-regression-100cols-10trees-31leaves"
    - inferencing_dataset: "data-synthetic-regression-100cols-10000samples-inference"
      inferencing_model: "model-synthetic-regression-100cols-10trees-31leaves"

  # list all inferencing frameworks and their builds
  variants:
45 changes: 36 additions & 9 deletions pipelines/azureml/pipelines/lightgbm_inferencing.py
@@ -13,7 +13,7 @@
import sys
import json
from dataclasses import dataclass
from omegaconf import MISSING
from omegaconf import MISSING, OmegaConf
from typing import Optional, List
from azure.ml.component import dsl
from shrike.pipeline.pipeline_helper import AMLPipelineHelper
@@ -27,6 +27,7 @@
sys.path.append(str(LIGHTGBM_BENCHMARK_ROOT))

from common.tasks import inferencing_task, inferencing_variants
from common.aml import dataset_from_dstore_path

class LightGBMInferencing(AMLPipelineHelper):
"""Runnable/reusable pipeline helper class
@@ -127,6 +128,9 @@ def lightgbm_inferencing_pipeline_function(benchmark_custom_properties, data, mo
inferencing_step = treelite_score_module(
data = data,
compiled_model = treelite_compile_step.outputs.compiled_model,
data_loader = variant.data_loader,
batch_size = variant.batch_size,
n_threads = variant.n_threads,
verbose = False,
custom_properties = custom_properties
)
@@ -138,10 +142,13 @@
data = data,
model = model,
predict_disable_shape_check = predict_disable_shape_check,
data_loader = variant.data_loader,
batch_size = variant.batch_size,
n_threads = variant.n_threads,
verbose = False,
custom_properties = custom_properties
)
self.apply_smart_runsettings(inferencing_step)
self.apply_smart_runsettings(inferencing_step, windows=(variant.os == "Windows"))

else:
raise NotImplementedError(f"framework {variant.framework} not implemented (yet)")
@@ -179,23 +186,43 @@ def pipeline_instance(self, pipeline_function, config):
"""
# Here you should create an instance of a pipeline function (using your custom config dataclass)
@dsl.pipeline(name="inferencing_all_tasks", # pythonic name
description="Inferencing on all specified tasks",
description=("```yaml\n"+OmegaConf.to_yaml(config)+"```"),
default_datastore=config.compute.noncompliant_datastore)
def inferencing_all_tasks():
for inferencing_task in config.lightgbm_inferencing.tasks:
data = self.dataset_load(inferencing_task.dataset)
model = self.dataset_load(inferencing_task.model)

# load the given inferencing dataset
if inferencing_task.inferencing_dataset:
inferencing_data = self.dataset_load(
name = inferencing_task.inferencing_dataset,
version = inferencing_task.inferencing_dataset_version # use latest if None
)
elif inferencing_task.inferencing_datastore and inferencing_task.inferencing_datastore_path:
inferencing_data = dataset_from_dstore_path(self.workspace(), inferencing_task.inferencing_datastore, inferencing_task.inferencing_datastore_path, validate=inferencing_task.inferencing_datastore_path_validate)
else:
raise ValueError(f"In inferencing_task {inferencing_task}, you need to provide either inferencing_dataset or inferencing_datastore+inferencing_datastore_path")

# load the given inferencing model (from a dataset)
if inferencing_task.model_dataset:
model_data = self.dataset_load(
name = inferencing_task.model_dataset,
version = inferencing_task.model_dataset_version # use latest if None
)
elif inferencing_task.model_datastore and inferencing_task.model_datastore_path:
model_data = dataset_from_dstore_path(self.workspace(), inferencing_task.model_datastore, inferencing_task.model_datastore_path, validate=inferencing_task.model_datastore_path_validate)
else:
raise ValueError(f"In inferencing_task {inferencing_task}, you need to provide either model_dataset or model_datastore+model_datastore_path")

# create custom properties for this task
benchmark_custom_properties = {
'benchmark_name' : config.lightgbm_inferencing.benchmark_name,
'benchmark_dataset' : inferencing_task.dataset,
'benchmark_model' : inferencing_task.model,
'benchmark_dataset' : inferencing_task.inferencing_dataset,
'benchmark_model' : inferencing_task.model_dataset,
}

inferencing_task_subgraph_step = pipeline_function(
data=data,
model=model,
data=inferencing_data,
model=model_data,
predict_disable_shape_check=inferencing_task.predict_disable_shape_check or False,
benchmark_custom_properties=benchmark_custom_properties
)
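Note that dataset_from_dstore_path, imported above from common.aml, is not shown in this diff. A plausible sketch using the azureml-core SDK follows; this is an assumption about its behaviour, not the actual implementation.

from azureml.core import Dataset, Datastore

def dataset_from_dstore_path(workspace, datastore_name, path, validate=True):
    """Sketch: build a FileDataset from a datastore name and a path on that datastore."""
    datastore = Datastore.get(workspace, datastore_name)
    return Dataset.File.from_files(path=[(datastore, path)], validate=validate)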
5 changes: 3 additions & 2 deletions requirements.txt
@@ -2,10 +2,11 @@ lightgbm==3.2.1
pytest==6.2.4
pytest-cov==2.12.1
pytest-mock==3.6.1
mlflow==1.19.0
shrike[pipeline]==1.11.1
mlflow==1.20.2
shrike[pipeline]==1.11.5
hydra-core==1.0.7
omegaconf==2.0.6
treelite==1.3.0
treelite_runtime==1.3.0
mpi4py==3.1.1
scikit-learn~=0.24.1
51 changes: 51 additions & 0 deletions src/common/io.py
@@ -1,6 +1,7 @@
import os
import argparse
import logging
import numpy as np

def input_file_path(path):
""" Resolve input path from AzureML.
@@ -156,3 +157,53 @@ def run(self, input_path, output_path):
self.split_by_append(input_files, output_path, self.number)
else:
raise NotImplementedError(f"Mode {self.mode} not implemented.")


class DataBatch():
# taken from https://datascience.stackexchange.com/questions/47623/how-feed-a-numpy-array-in-batches-in-keras
def __init__(self, x, y=None, batch_size=0):
self.x = x
self.y = y
if batch_size == 0:
self.batch_size = x.shape[0]
self.num_batches = 1
else:
self.batch_size = batch_size
self.num_batches = np.ceil(x.shape[0] / batch_size)

self.batch_idx = np.array_split(range(x.shape[0]), self.num_batches)
logging.getLogger(__name__).info(f"Creating data batch with {self.num_batches} batches")

def __len__(self):
return len(self.batch_idx)

def __getitem__(self, idx):
return self.x[self.batch_idx[idx]], (self.y[self.batch_idx[idx]] if self.y is not None else None)


def numpy_data_load(path, delimiter=","):
"""Loads data using numpy (csv).

Args:
path (str): path to data file
Returns:
numpy_array, number_of_rows (int), number of cols (int)
"""
raw_data = np.loadtxt(path, delimiter=delimiter)

return raw_data, raw_data.shape[0], raw_data.shape[1]

def libsvm_data_load(path):
"""Loads data using libsvm.

Args:
path (str): path to data file
Returns:
(y, x), number_of_rows (int), number of cols (int)
"""
# importing at the last minute intentionally (sklearn is only needed for libsvm loading)
from sklearn.datasets import load_svmlight_file

x, y = load_svmlight_file(path)

return (x,y), x.shape[0], x.shape[1]
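As a usage note, the libsvm loader returns a scipy sparse matrix together with its labels, and DataBatch can slice that row-wise just like a dense array. A short illustrative example (the file path is a placeholder):

from common.io import DataBatch, libsvm_data_load

(x, y), n_rows, n_cols = libsvm_data_load("inference_data.svm")
batches = DataBatch(x, y, batch_size=500)   # roughly 500 rows per batch
features, labels = batches[0]               # first batch of sparse rows and their labels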
25 changes: 23 additions & 2 deletions src/common/tasks.py
@@ -4,9 +4,26 @@

@dataclass
class inferencing_task:
dataset: str = MISSING
model: str = MISSING
# specify either by dataset name
inferencing_dataset: Optional[str] = None
inferencing_dataset_version: Optional[str] = None
# or by datastore+path
inferencing_datastore: Optional[str] = None
inferencing_datastore_path: Optional[str] = None
inferencing_datastore_path_validate: bool = True

# specify either by model dataset name
model_dataset: Optional[str] = None
model_dataset_version: Optional[str] = None
# or by datastore+path
model_datastore: Optional[str] = None
model_datastore_path: Optional[str] = None
model_datastore_path_validate: bool = True

# task tag
task_key: Optional[str] = None

# turn to True if model and dataset have different shapes
predict_disable_shape_check: bool = False

@dataclass
@@ -15,6 +32,10 @@ class inferencing_variants:
build: Optional[str] = None
os: str = "Linux" # linux or windows, linux by default

data_loader: str = "lightgbm"
batch_size: int = 0 # all data in 1 batch by default
n_threads: int = 1

@dataclass
class data_generation_task:
task: str = MISSING
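Since the repository loads these dataclasses as OmegaConf structured configs (it depends on hydra-core and omegaconf), the renamed task keys and the new variant fields get validated when the experiment YAML is loaded. A minimal sketch of that validation, assuming only the dataclasses shown above:

from omegaconf import OmegaConf
from common.tasks import inferencing_task, inferencing_variants

# A task entry under the new schema merges cleanly against the structured config...
task = OmegaConf.merge(
    OmegaConf.structured(inferencing_task),
    {"inferencing_dataset": "data-synthetic-regression-100cols-10000samples-inference",
     "model_dataset": "model-synthetic-regression-100cols-10trees-31leaves"},
)
assert task.predict_disable_shape_check is False  # defaults are preserved

# ...while unknown keys (such as the old 'dataset'/'model' names) would raise a validation error.
variant = OmegaConf.merge(
    OmegaConf.structured(inferencing_variants),
    {"framework": "lightgbm_python", "data_loader": "numpy", "batch_size": 1, "n_threads": 1},
)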
@@ -0,0 +1,29 @@
FROM mcr.microsoft.com/azureml/windows-servercore-1809

ENV AZUREML_CONDA_ENVIRONMENT_PATH /azureml-envs/lightgbm

# Create conda environment
RUN conda create -p $AZUREML_CONDA_ENVIRONMENT_PATH \
python=3.8 pip=20.2.4

# Prepend path to AzureML conda environment
ENV PATH $AZUREML_CONDA_ENVIRONMENT_PATH/bin:$PATH

# Install pip dependencies
RUN HOROVOD_WITH_TENSORFLOW=1 \
pip install 'pandas>=1.1,<1.2' \
'numpy>=1.10,<1.20' \
'scipy~=1.5.0' \
'scikit-learn~=0.24.1' \
'azureml-core==1.30.0' \
'azureml-defaults==1.30.0' \
'azureml-mlflow==1.30.0' \
'azureml-telemetry==1.30.0'

# install lightgbm with mpi
RUN pip install --upgrade pip setuptools wheel && \
pip install 'cmake==3.21.0' && \
pip install 'lightgbm==3.2.1'

# This is needed for mpi to locate libpython
ENV LD_LIBRARY_PATH $AZUREML_CONDA_ENVIRONMENT_PATH/lib:$LD_LIBRARY_PATH