From a77feddc12615b4853df5b66b7a28857299ab87b Mon Sep 17 00:00:00 2001
From: Jeff Omhover
Date: Mon, 18 Oct 2021 12:00:08 -0700
Subject: [PATCH 01/12] update requirements

---
 requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index f434d05c..cfa12faf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,8 +2,8 @@ lightgbm==3.2.1
 pytest==6.2.4
 pytest-cov==2.12.1
 pytest-mock==3.6.1
-mlflow==1.19.0
-shrike[pipeline]==1.11.1
+mlflow==1.20.2
+shrike[pipeline]==1.11.5
 hydra-core==1.0.7
 omegaconf==2.0.6
 treelite==1.3.0

From 92f63be3228833df5f48ae2966513eae183931ce Mon Sep 17 00:00:00 2001
From: Jeff Omhover
Date: Mon, 18 Oct 2021 12:00:47 -0700
Subject: [PATCH 02/12] use dstore+path as alternative data spec in inferencing_task

---
 .../azureml/pipelines/lightgbm_inferencing.py | 33 +++++++++++++++----
 src/common/tasks.py                           | 21 ++++++++++--
 2 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/pipelines/azureml/pipelines/lightgbm_inferencing.py b/pipelines/azureml/pipelines/lightgbm_inferencing.py
index 1823d675..3ba6d614 100644
--- a/pipelines/azureml/pipelines/lightgbm_inferencing.py
+++ b/pipelines/azureml/pipelines/lightgbm_inferencing.py
@@ -27,6 +27,7 @@
 sys.path.append(str(LIGHTGBM_BENCHMARK_ROOT))
 
 from common.tasks import inferencing_task, inferencing_variants
+from common.aml import dataset_from_dstore_path
 
 class LightGBMInferencing(AMLPipelineHelper):
     """Runnable/reusable pipeline helper class
@@ -183,19 +184,39 @@ def pipeline_instance(self, pipeline_function, config):
                       default_datastore=config.compute.noncompliant_datastore)
         def inferencing_all_tasks():
             for inferencing_task in config.lightgbm_inferencing.tasks:
-                data = self.dataset_load(inferencing_task.dataset)
-                model = self.dataset_load(inferencing_task.model)
+
+                # load the given inferencing dataset
+                if inferencing_task.inferencing_dataset:
+                    inferencing_data = self.dataset_load(
+                        name = inferencing_task.inferencing_dataset,
+                        version = inferencing_task.inferencing_dataset_version # use latest if None
+                    )
+                elif inferencing_task.inferencing_datastore and inferencing_task.inferencing_datastore_path:
+                    inferencing_data = dataset_from_dstore_path(self.workspace(), inferencing_task.inferencing_datastore, inferencing_task.inferencing_datastore_path, validate=inferencing_task.inferencing_datastore_path_validate)
+                else:
+                    raise ValueError(f"In inferencing_task {inferencing_task}, you need to provide either inferencing_dataset or inferencing_datastore+inferencing_datastore_path")
+
+                # load the given inferencing model (from a dataset)
+                if inferencing_task.model_dataset:
+                    model_data = self.dataset_load(
+                        name = inferencing_task.model_dataset,
+                        version = inferencing_task.model_dataset_version # use latest if None
+                    )
+                elif inferencing_task.model_datastore and inferencing_task.model_datastore_path:
+                    model_data = dataset_from_dstore_path(self.workspace(), inferencing_task.model_datastore, inferencing_task.model_datastore_path, validate=inferencing_task.model_datastore_path_validate)
+                else:
+                    raise ValueError(f"In inferencing_task {inferencing_task}, you need to provide either model_dataset or model_datastore+model_datastore_path")
 
                 # create custom properties for this task
                 benchmark_custom_properties = {
                     'benchmark_name' : config.lightgbm_inferencing.benchmark_name,
-                    'benchmark_dataset' : inferencing_task.dataset,
-                    'benchmark_model' : inferencing_task.model,
+                    'benchmark_dataset' : inferencing_task.inferencing_dataset,
+                    'benchmark_model' : inferencing_task.model_dataset,
                 }
 
                 inferencing_task_subgraph_step = pipeline_function(
-                    data=data,
-                    model=model,
+                    data=inferencing_data,
+                    model=model_data,
                     predict_disable_shape_check=inferencing_task.predict_disable_shape_check or False,
                     benchmark_custom_properties=benchmark_custom_properties
                 )
diff --git a/src/common/tasks.py b/src/common/tasks.py
index c5fd4417..af97ad62 100644
--- a/src/common/tasks.py
+++ b/src/common/tasks.py
@@ -4,9 +4,26 @@
 
 @dataclass
 class inferencing_task:
-    dataset: str = MISSING
-    model: str = MISSING
+    # specify either by dataset name
+    inferencing_dataset: Optional[str] = None
+    inferencing_dataset_version: Optional[str] = None
+    # or by datastore+path
+    inferencing_datastore: Optional[str] = None
+    inferencing_datastore_path: Optional[str] = None
+    inferencing_datastore_path_validate: bool = True
+
+    # specify either by model dataset name
+    model_dataset: Optional[str] = None
+    model_dataset_version: Optional[str] = None
+    # or by datastore+path
+    model_datastore: Optional[str] = None
+    model_datastore_path: Optional[str] = None
+    model_datastore_path_validate: bool = True
+
+    # task tag
     task_key: Optional[str] = None
+
+    # turn to True if model and dataset have different shapes
    predict_disable_shape_check: bool = False
 
 @dataclass

From cefcb616a6f06ee5a0af256e30356ac872e3b710 Mon Sep 17 00:00:00 2001
From: Jeff Omhover
Date: Mon, 18 Oct 2021 12:02:42 -0700
Subject: [PATCH 03/12] propagate changes to yaml config files

---
 .../benchmarks/lightgbm-inferencing.yaml  | 48 +++++++++----------
 .../experiments/lightgbm-inferencing.yaml |  4 +-
 2 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/pipelines/azureml/conf/experiments/benchmarks/lightgbm-inferencing.yaml b/pipelines/azureml/conf/experiments/benchmarks/lightgbm-inferencing.yaml
index 664cf987..2e1a0ea6 100644
--- a/pipelines/azureml/conf/experiments/benchmarks/lightgbm-inferencing.yaml
+++ b/pipelines/azureml/conf/experiments/benchmarks/lightgbm-inferencing.yaml
@@ -31,30 +31,30 @@ lightgbm_inferencing:
   benchmark_name: "benchmark-inferencing" # need to be provided at runtime!
   tasks:
-    - dataset: "data-synthetic-regression-10cols-10000samples-inference"
-      model: "model-synthetic-regression-10cols-10trees-31leaves"
-    - dataset: "data-synthetic-regression-10cols-10000samples-inference"
-      model: "model-synthetic-regression-10cols-100trees-31leaves"
-    - dataset: "data-synthetic-regression-10cols-10000samples-inference"
-      model: "model-synthetic-regression-10cols-1000trees-31leaves"
-    - dataset: "data-synthetic-regression-10cols-10000samples-inference"
-      model: "model-synthetic-regression-10cols-5000trees-31leaves"
-    - dataset: "data-synthetic-regression-100cols-10000samples-inference"
-      model: "model-synthetic-regression-100cols-10trees-31leaves"
-    - dataset: "data-synthetic-regression-100cols-10000samples-inference"
-      model: "model-synthetic-regression-100cols-100trees-31leaves"
-    - dataset: "data-synthetic-regression-100cols-10000samples-inference"
-      model: "model-synthetic-regression-100cols-1000trees-31leaves"
-    - dataset: "data-synthetic-regression-100cols-10000samples-inference"
-      model: "model-synthetic-regression-100cols-5000trees-31leaves"
-    - dataset: "data-synthetic-regression-1000cols-10000samples-inference"
-      model: "model-synthetic-regression-1000cols-10trees-31leaves"
-    - dataset: "data-synthetic-regression-1000cols-10000samples-inference"
-      model: "model-synthetic-regression-1000cols-100trees-31leaves"
-    - dataset: "data-synthetic-regression-1000cols-10000samples-inference"
-      model: "model-synthetic-regression-1000cols-1000trees-31leaves"
-    - dataset: "data-synthetic-regression-1000cols-10000samples-inference"
-      model: "model-synthetic-regression-1000cols-5000trees-31leaves"
+    - inferencing_dataset: "data-synthetic-regression-10cols-10000samples-inference"
+      model_dataset: "model-synthetic-regression-10cols-10trees-31leaves"
+    - inferencing_dataset: "data-synthetic-regression-10cols-10000samples-inference"
+      model_dataset: "model-synthetic-regression-10cols-100trees-31leaves"
+    - inferencing_dataset: "data-synthetic-regression-10cols-10000samples-inference"
+      model_dataset: "model-synthetic-regression-10cols-1000trees-31leaves"
+    - inferencing_dataset: "data-synthetic-regression-10cols-10000samples-inference"
+      model_dataset: "model-synthetic-regression-10cols-5000trees-31leaves"
+    - inferencing_dataset: "data-synthetic-regression-100cols-10000samples-inference"
+      model_dataset: "model-synthetic-regression-100cols-10trees-31leaves"
+    - inferencing_dataset: "data-synthetic-regression-100cols-10000samples-inference"
+      model_dataset: "model-synthetic-regression-100cols-100trees-31leaves"
+    - inferencing_dataset: "data-synthetic-regression-100cols-10000samples-inference"
+      model_dataset: "model-synthetic-regression-100cols-1000trees-31leaves"
+    - inferencing_dataset: "data-synthetic-regression-100cols-10000samples-inference"
+      model_dataset: "model-synthetic-regression-100cols-5000trees-31leaves"
+    - inferencing_dataset: "data-synthetic-regression-1000cols-10000samples-inference"
+      model_dataset: "model-synthetic-regression-1000cols-10trees-31leaves"
+    - inferencing_dataset: "data-synthetic-regression-1000cols-10000samples-inference"
+      model_dataset: "model-synthetic-regression-1000cols-100trees-31leaves"
+    - inferencing_dataset: "data-synthetic-regression-1000cols-10000samples-inference"
+      model_dataset: "model-synthetic-regression-1000cols-1000trees-31leaves"
+    - inferencing_dataset: "data-synthetic-regression-1000cols-10000samples-inference"
+      model_dataset: "model-synthetic-regression-1000cols-5000trees-31leaves"
 
   variants:
     - framework: lightgbm_python
diff --git a/pipelines/azureml/conf/experiments/lightgbm-inferencing.yaml b/pipelines/azureml/conf/experiments/lightgbm-inferencing.yaml
index 8bd1c27f..18d9a937 100644
--- a/pipelines/azureml/conf/experiments/lightgbm-inferencing.yaml
+++ b/pipelines/azureml/conf/experiments/lightgbm-inferencing.yaml
@@ -35,8 +35,8 @@ lightgbm_inferencing:
 
   # list all the data/model pairs to run inferencing with
   tasks:
-    - dataset: "data-synthetic-regression-100cols-10000samples-inference"
-      model: "model-synthetic-regression-100cols-10trees-31leaves"
+    - inferencing_dataset: "data-synthetic-regression-100cols-10000samples-inference"
+      model_dataset: "model-synthetic-regression-100cols-10trees-31leaves"
 
   # list all inferencing frameworks and their builds
   variants:

From b9ecb195407d3834e9537d00224273e72ab406f1 Mon Sep 17 00:00:00 2001
From: Jeff Omhover
Date: Mon, 18 Oct 2021 17:34:12 -0700
Subject: [PATCH 04/12] implement common data loader + batches in lightgbm python score

---
 .../azureml/pipelines/lightgbm_inferencing.py |   7 +-
 requirements.txt                              |   1 +
 src/common/io.py                              | 129 ++++++++++++++++++
 src/common/tasks.py                           |   4 +
 .../dockers/lightgbm_cpu_mpi_pip.dockerfile   |   1 +
 src/scripts/lightgbm_python/score.py          |  47 +++++--
 src/scripts/lightgbm_python/score_spec.yaml   |  11 ++
 7 files changed, 183 insertions(+), 17 deletions(-)

diff --git a/pipelines/azureml/pipelines/lightgbm_inferencing.py b/pipelines/azureml/pipelines/lightgbm_inferencing.py
index 3ba6d614..c68f5c10 100644
--- a/pipelines/azureml/pipelines/lightgbm_inferencing.py
+++ b/pipelines/azureml/pipelines/lightgbm_inferencing.py
@@ -13,7 +13,7 @@
 import sys
 import json
 from dataclasses import dataclass
-from omegaconf import MISSING
+from omegaconf import MISSING, OmegaConf
 from typing import Optional, List
 from azure.ml.component import dsl
 from shrike.pipeline.pipeline_helper import AMLPipelineHelper
@@ -139,6 +139,9 @@ def lightgbm_inferencing_pipeline_function(benchmark_custom_properties, data, mo
                 data = data,
                 model = model,
                 predict_disable_shape_check = predict_disable_shape_check,
+                data_loader = variant.data_loader,
+                batch_size = variant.batch_size,
+                n_threads = variant.n_threads,
                 verbose = False,
                 custom_properties = custom_properties
             )
@@ -180,7 +183,7 @@ def pipeline_instance(self, pipeline_function, config):
         """
         # Here you should create an instance of a pipeline function (using your custom config dataclass)
         @dsl.pipeline(name="inferencing_all_tasks", # pythonic name
-                      description="Inferencing on all specified tasks",
+                      description=("```yaml\n"+OmegaConf.to_yaml(config)+"```"),
                       default_datastore=config.compute.noncompliant_datastore)
         def inferencing_all_tasks():
             for inferencing_task in config.lightgbm_inferencing.tasks:
diff --git a/requirements.txt b/requirements.txt
index cfa12faf..8954f0da 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,3 +9,4 @@ omegaconf==2.0.6
 treelite==1.3.0
 treelite_runtime==1.3.0
 mpi4py==3.1.1
+libsvm==3.23.0
\ No newline at end of file
diff --git a/src/common/io.py b/src/common/io.py
index 8512a824..260d683b 100644
--- a/src/common/io.py
+++ b/src/common/io.py
@@ -1,6 +1,8 @@
 import os
 import argparse
 import logging
+import numpy as np
+from lightgbm import Dataset as lightgbm_Dataset
 
 def input_file_path(path):
     """ Resolve input path from AzureML.
@@ -156,3 +158,130 @@ def run(self, input_path, output_path):
             self.split_by_append(input_files, output_path, self.number)
         else:
             raise NotImplementedError(f"Mode {self.mode} not implemented.")
+
+
+class DataBatch():
+    # taken from https://datascience.stackexchange.com/questions/47623/how-feed-a-numpy-array-in-batches-in-keras
+    def __init__(self, x, y=None, batch_size=0):
+        self.x = x
+        self.y = y
+        if batch_size == 0:
+            self.batch_size = x.shape[0]
+            self.num_batches = 1
+        else:
+            self.batch_size = batch_size
+            self.num_batches = np.ceil(x.shape[0] / batch_size)
+
+        self.batch_idx = np.array_split(range(x.shape[0]), self.num_batches)
+        logging.getLogger(__name__).info(f"Creating data batch with {self.num_batches} batches")
+
+    def __len__(self):
+        return len(self.batch_idx)
+
+    def __getitem__(self, idx):
+        return self.x[self.batch_idx[idx]], (self.y[self.batch_idx[idx]] if self.y is not None else None)
+
+
+class InputDataLoader():
+    """Utility class to load input data with flexible options from argparse"""
+    # current list of supported loaders
+    SUPPORTED_LOADERS = ['lightgbm', 'numpy', 'libsvm']
+
+    # prefix used for all argparse
+    DEFAULT_ARG_PREFIX = "input_data"
+
+    def __init__(self,
+                 allowed_loaders=SUPPORTED_LOADERS,
+                 arg_prefix=DEFAULT_ARG_PREFIX,
+                 default_loader=None):
+        """Initialize data loader.
+        Args:
+            allowed_loaders (List[str]): list of supported loaders (can restrict to avoid incompatibilities)
+            arg_prefix (str): which prefix to use for all argparse
+            default_loader (str): name of default loader (if None, will use first in allowed_loaders)
+        """
+        self.allowed_loaders = allowed_loaders
+        self.arg_prefix = arg_prefix
+        self.default_loader = default_loader or allowed_loaders[0]
+        self.logger = logging.getLogger(__name__)
+
+    def get_arg_parser(self, parser=None):
+        """Adds arguments for this class
+        Args:
+            parser (argparse.ArgumentParser): an argument parser instance
+        Returns:
+            ArgumentParser: the argument parser instance
+        Notes:
+            if parser is None, creates a new parser instance
+        """
+        # add arguments that are specific to the script
+        if parser is None:
+            parser = argparse.ArgumentParser(__doc__)
+
+        parser.add_argument(f"--{self.arg_prefix}_loader",
+            required=False, type=str, default=self.default_loader, choices=self.allowed_loaders, help="use numpy for csv, libsvm for libsvm, or lightgbm for both")
+        parser.add_argument(f"--{self.arg_prefix}_batch_size",
+            required=False, type=int, default=0, help="size of batches (default: all data in 1 batch")
+
+        return parser
+
+    def _lightgbm_loader_load(self, path):
+        """Loads data using lightgbm construct().
+
+        Args:
+            path (str): path to data file
+        Returns:
+            lightgbm_data_reference, number_of_rows (int), number of cols (int)
+        """
+        self.logger.info(f"Loading {path} with lightgbm")
+        # importing at last minute intentionally
+        data = lightgbm_Dataset(path, free_raw_data=False).construct()
+        raw_data = data.get_data()
+
+        self.logger.info(f"Loaded {path} data has {data.num_data()} rows and {data.num_feature()} cols")
+        return raw_data, data.num_data(), data.num_feature()
+
+    def _numpy_loader_load(self, path, batch_size=0):
+        """Loads data using numpy (csv).
+
+        Args:
+            path (str): path to data file
+        Returns:
+            numpy_array, number_of_rows (int), number of cols (int)
+        """
+        self.logger.info(f"Loading {path} with numpy")
+        # importing at last minute intentionally
+        raw_data = np.loadtxt(path, delimiter=",")
+
+        self.logger.info(f"Loaded {path} data has {raw_data.shape[0]} rows and {raw_data.shape[1]} cols")
+
+        return DataBatch(x=raw_data, y=None, batch_size=batch_size), raw_data.shape[0], raw_data.shape[1]
+
+    def _libsvm_loader_load(self, path, batch_size=0):
+        """Loads data using libsvm.
+
+        Args:
+            path (str): path to data file
+        Returns:
+            (y, x), number_of_rows (int), number of cols (int)
+        """
+        self.logger.info(f"Loading {path} with libsvm")
+        # importing at last minute intentionally
+        from libsvm.svmutil import svm_read_problem
+
+        y, x = svm_read_problem(path, return_scipy=True)
+
+        self.logger.info(f"Loaded {path}, data (X) has {x.shape[0]} rows and {x.shape[1]} cols")
+        return DataBatch(x=x.toarray(), y=y, batch_size=batch_size), x.shape[0], x.shape[1]
+
+    def load(self, args, path):
+        """Loads data using the right loader"""
+        loader = getattr(args, f"{self.arg_prefix}_loader")
+        batch_size = getattr(args, f"{self.arg_prefix}_batch_size")
+        if loader == "lightgbm":
+            return self._lightgbm_loader_load(path)
+        if loader == "numpy":
+            return self._numpy_loader_load(path, batch_size=batch_size)
+        if loader == "libsvm":
+            return self._libsvm_loader_load(path, batch_size=batch_size)
+        raise NotImplementedError(f"Data loader '{loader}' is not implemented")
diff --git a/src/common/tasks.py b/src/common/tasks.py
index af97ad62..0237933b 100644
--- a/src/common/tasks.py
+++ b/src/common/tasks.py
@@ -32,6 +32,10 @@ class inferencing_variants:
     build: Optional[str] = None
     os: str = "Linux" # linux or windows, linux by default
 
+    data_loader: str = "lightgbm"
+    batch_size: int = 0 # all data in 1 batch by default
+    n_threads: int = 1
+
 @dataclass
 class data_generation_task:
     task: str = MISSING
diff --git a/src/scripts/lightgbm_python/dockers/lightgbm_cpu_mpi_pip.dockerfile b/src/scripts/lightgbm_python/dockers/lightgbm_cpu_mpi_pip.dockerfile
index 2c664a50..b2897e40 100644
--- a/src/scripts/lightgbm_python/dockers/lightgbm_cpu_mpi_pip.dockerfile
+++ b/src/scripts/lightgbm_python/dockers/lightgbm_cpu_mpi_pip.dockerfile
@@ -19,6 +19,7 @@ RUN HOROVOD_WITH_TENSORFLOW=1 \
     'azureml-defaults==1.30.0' \
     'azureml-mlflow==1.30.0' \
     'azureml-telemetry==1.30.0' \
+    'libsvm==3.23.0' \
     'mpi4py==3.1.1'
 
 # install lightgbm with mpi
diff --git a/src/scripts/lightgbm_python/score.py b/src/scripts/lightgbm_python/score.py
index 9728a517..e896ef43 100644
--- a/src/scripts/lightgbm_python/score.py
+++ b/src/scripts/lightgbm_python/score.py
@@ -11,6 +11,7 @@
 from distutils.util import strtobool
 import lightgbm
 import numpy
+import time
 
 # Add the right path to PYTHONPATH
 # so that you can import from common.*
@@ -22,8 +23,13 @@
 
 # useful imports from common
 from common.metrics import MetricsLogger
-from common.io import input_file_path
+from common.io import input_file_path, InputDataLoader
 
+INPUT_DATA_LOADER = InputDataLoader(
+    allowed_loaders = ['numpy', 'libsvm', 'lightgbm'],
+    arg_prefix="data",
+    default_loader="lightgbm"
+)
 
 def get_arg_parser(parser=None):
     """Adds component/module arguments to a given argument parser.
@@ -44,6 +50,7 @@ def get_arg_parser(parser=None):
     group_i = parser.add_argument_group("Input Data")
     group_i.add_argument("--data",
         required=True, type=input_file_path, help="Inferencing data location (file path)")
+    INPUT_DATA_LOADER.get_arg_parser(group_i) # add data loading parameters
     group_i.add_argument("--model",
         required=False, type=input_file_path, help="Exported model location (file path)")
     group_i.add_argument("--output",
@@ -102,7 +109,9 @@ def run(args, unknown_args=[]):
 
     # record relevant parameters
     metrics_logger.log_parameters(
-        num_threads=args.num_threads
+        batch_size=args.data_batch_size,
+        data_loader=args.data_loader,
+        num_threads=args.num_threads,
     )
 
     # register logger for lightgbm logs
@@ -113,28 +122,36 @@ def run(args, unknown_args=[]):
         os.makedirs(args.output, exist_ok=True)
         args.output = os.path.join(args.output, "predictions.txt")
 
-    logger.info(f"Loading model from {args.model}")
-    booster = lightgbm.Booster(model_file=args.model)
-
     logger.info(f"Loading data for inferencing")
     with metrics_logger.log_time_block("time_data_loading"):
-        # NOTE: this is bad, but allows for libsvm format (not just numpy)
-        inference_data = lightgbm.Dataset(args.data, free_raw_data=False).construct()
-        inference_raw_data = inference_data.get_data()
+        inference_raw_data_batches, row_count, feature_count = INPUT_DATA_LOADER.load(args, args.data)
+
+    logger.info(f"Loading model from {args.model}")
+    booster = lightgbm.Booster(model_file=args.model)
 
     # capture data shape as property
     metrics_logger.set_properties(
-        inference_data_length = inference_data.num_data(),
-        inference_data_width = inference_data.num_feature()
+        inference_data_length = row_count,
+        inference_data_width = feature_count
     )
 
     logger.info(f"Running .predict()")
+    batch_run_times = [] # collect time for each batch
     with metrics_logger.log_time_block("time_inferencing"):
-        booster.predict(
-            data=inference_raw_data,
-            num_threads=args.num_threads,
-            predict_disable_shape_check=bool(args.predict_disable_shape_check)
-        )
+        for batch_x, _ in inference_raw_data_batches:
+            batch_start_time = time.time()
+            booster.predict(
+                data=batch_x,
+                num_threads=args.num_threads,
+                predict_disable_shape_check=bool(args.predict_disable_shape_check)
+            )
+            batch_run_times.append(time.time() - batch_start_time)
+
+    if len(batch_run_times) > 1:
+        batch_run_times = numpy.array(batch_run_times)
+        metrics_logger.log_metric("batch_time_inferencing_p50_usecs", numpy.percentile(batch_run_times, 50) * 1000000)
+        metrics_logger.log_metric("batch_time_inferencing_p90_usecs", numpy.percentile(batch_run_times, 90) * 1000000)
+        metrics_logger.log_metric("batch_time_inferencing_p99_usecs", numpy.percentile(batch_run_times, 99) * 1000000)
 
     # Important: close logging session before exiting
     metrics_logger.close()
diff --git a/src/scripts/lightgbm_python/score_spec.yaml b/src/scripts/lightgbm_python/score_spec.yaml
index 8348224e..3c493d57 100644
--- a/src/scripts/lightgbm_python/score_spec.yaml
+++ b/src/scripts/lightgbm_python/score_spec.yaml
@@ -18,6 +18,15 @@ inputs:
     type: Boolean
     description: "control whether or not LightGBM raises an error when you try to predict on data with a different number of features than the training data"
     optional: true
+  data_loader:
+    type: Enum
+    enum:
+      - lightgbm
+      - libsvm
+      - numpy
+  batch_size:
+    type: Integer
+    default: 0
   n_threads:
     type: Integer
     optional: true
@@ -33,6 +42,8 @@ command: >-
   python score.py
   --data {inputs.data}
   --model {inputs.model}
+  --data_loader {inputs.data_loader}
+  --data_batch_size {inputs.batch_size}
   [--num_threads {inputs.n_threads}]
   [--predict_disable_shape_check {inputs.predict_disable_shape_check}]
   [--verbose {inputs.verbose}]

From f78022da34cacfa984ce760983f03b8c7db42029 Mon Sep 17 00:00:00 2001
From: Jeff Omhover
Date: Mon, 18 Oct 2021 21:46:00 -0700
Subject: [PATCH 05/12] reduce data loader to a couple functions

---
 requirements.txt                              |   2 +-
 src/common/io.py                              | 127 ++++--------------
 .../dockers/lightgbm_cpu_mpi_pip.dockerfile   |   1 -
 src/scripts/lightgbm_python/score.py          |  48 +++++--
 4 files changed, 62 insertions(+), 116 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 8954f0da..e0ef678e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,4 @@ omegaconf==2.0.6
 treelite==1.3.0
 treelite_runtime==1.3.0
 mpi4py==3.1.1
-libsvm==3.23.0
\ No newline at end of file
+scikit-learn~=0.24.1
\ No newline at end of file
diff --git a/src/common/io.py b/src/common/io.py
index 260d683b..5dce858d 100644
--- a/src/common/io.py
+++ b/src/common/io.py
@@ -182,106 +182,33 @@ def __getitem__(self, idx):
         return self.x[self.batch_idx[idx]], (self.y[self.batch_idx[idx]] if self.y is not None else None)
 
 
-class InputDataLoader():
-    """Utility class to load input data with flexible options from argparse"""
-    # current list of supported loaders
-    SUPPORTED_LOADERS = ['lightgbm', 'numpy', 'libsvm']
-
-    # prefix used for all argparse
-    DEFAULT_ARG_PREFIX = "input_data"
-
-    def __init__(self,
-                 allowed_loaders=SUPPORTED_LOADERS,
-                 arg_prefix=DEFAULT_ARG_PREFIX,
-                 default_loader=None):
-        """Initialize data loader.
-        Args:
-            allowed_loaders (List[str]): list of supported loaders (can restrict to avoid incompatibilities)
-            arg_prefix (str): which prefix to use for all argparse
-            default_loader (str): name of default loader (if None, will use first in allowed_loaders)
-        """
-        self.allowed_loaders = allowed_loaders
-        self.arg_prefix = arg_prefix
-        self.default_loader = default_loader or allowed_loaders[0]
-        self.logger = logging.getLogger(__name__)
-
-    def get_arg_parser(self, parser=None):
-        """Adds arguments for this class
-        Args:
-            parser (argparse.ArgumentParser): an argument parser instance
-        Returns:
-            ArgumentParser: the argument parser instance
-        Notes:
-            if parser is None, creates a new parser instance
-        """
-        # add arguments that are specific to the script
-        if parser is None:
-            parser = argparse.ArgumentParser(__doc__)
-
-        parser.add_argument(f"--{self.arg_prefix}_loader",
-            required=False, type=str, default=self.default_loader, choices=self.allowed_loaders, help="use numpy for csv, libsvm for libsvm, or lightgbm for both")
-        parser.add_argument(f"--{self.arg_prefix}_batch_size",
-            required=False, type=int, default=0, help="size of batches (default: all data in 1 batch")
-
-        return parser
-
-    def _lightgbm_loader_load(self, path):
-        """Loads data using lightgbm construct().
-
-        Args:
-            path (str): path to data file
-        Returns:
-            lightgbm_data_reference, number_of_rows (int), number of cols (int)
-        """
-        self.logger.info(f"Loading {path} with lightgbm")
-        # importing at last minute intentionally
-        data = lightgbm_Dataset(path, free_raw_data=False).construct()
-        raw_data = data.get_data()
-
-        self.logger.info(f"Loaded {path} data has {data.num_data()} rows and {data.num_feature()} cols")
-        return raw_data, data.num_data(), data.num_feature()
-
-    def _numpy_loader_load(self, path, batch_size=0):
-        """Loads data using numpy (csv).
-
-        Args:
-            path (str): path to data file
-        Returns:
-            numpy_array, number_of_rows (int), number of cols (int)
-        """
-        self.logger.info(f"Loading {path} with numpy")
-        # importing at last minute intentionally
-        raw_data = np.loadtxt(path, delimiter=",")
-
-        self.logger.info(f"Loaded {path} data has {raw_data.shape[0]} rows and {raw_data.shape[1]} cols")
-
-        return DataBatch(x=raw_data, y=None, batch_size=batch_size), raw_data.shape[0], raw_data.shape[1]
-
-    def _libsvm_loader_load(self, path, batch_size=0):
-        """Loads data using libsvm.
-
-        Args:
-            path (str): path to data file
-        Returns:
-            (y, x), number_of_rows (int), number of cols (int)
-        """
-        self.logger.info(f"Loading {path} with libsvm")
-        # importing at last minute intentionally
-        from libsvm.svmutil import svm_read_problem
-
-        y, x = svm_read_problem(path, return_scipy=True)
-
-        self.logger.info(f"Loaded {path}, data (X) has {x.shape[0]} rows and {x.shape[1]} cols")
-        return DataBatch(x=x.toarray(), y=y, batch_size=batch_size), x.shape[0], x.shape[1]
-
-    def load(self, args, path):
-        """Loads data using the right loader"""
-        loader = getattr(args, f"{self.arg_prefix}_loader")
-        batch_size = getattr(args, f"{self.arg_prefix}_batch_size")
-        if loader == "lightgbm":
-            return self._lightgbm_loader_load(path)
-        if loader == "numpy":
-            return self._numpy_loader_load(path, batch_size=batch_size)
-        if loader == "libsvm":
-            return self._libsvm_loader_load(path, batch_size=batch_size)
-        raise NotImplementedError(f"Data loader '{loader}' is not implemented")
+def numpy_data_load(path, delimiter=","):
+    """Loads data using numpy (csv).
- - Args: - path (str): path to data file - Returns: - numpy_array, number_of_rows (int), number of cols (int) - """ - self.logger.info(f"Loading {path} with numpy") - # importing at last minute intentionally - raw_data = np.loadtxt(path, delimiter=",") +def numpy_data_load(path, delimiter=","): + """Loads data using numpy (csv). + + Args: + path (str): path to data file + Returns: + numpy_array, number_of_rows (int), number of cols (int) + """ + self.logger.info(f"Loading {path} with numpy") + # importing at last minute intentionally + raw_data = np.loadtxt(path, delimiter=delimiter) - self.logger.info(f"Loaded {path} data has {raw_data.shape[0]} rows and {raw_data.shape[1]} cols") + self.logger.info(f"Loaded {path} data has {raw_data.shape[0]} rows and {raw_data.shape[1]} cols") - return DataBatch(x=raw_data, y=None, batch_size=batch_size), raw_data.shape[0], raw_data.shape[1] + return raw_data, raw_data.shape[0], raw_data.shape[1] - def _libsvm_loader_load(self, path, batch_size=0): - """Loads data using libsvm. - - Args: - path (str): path to data file - Returns: - (y, x), number_of_rows (int), number of cols (int) - """ - self.logger.info(f"Loading {path} with libsvm") - # importing at last minute intentionally - from libsvm.svmutil import svm_read_problem - - y, x = svm_read_problem(path, return_scipy=True) - - self.logger.info(f"Loaded {path}, data (X) has {x.shape[0]} rows and {x.shape[1]} cols") - return DataBatch(x=x.toarray(), y=y, batch_size=batch_size), x.shape[0], x.shape[1] - - def load(self, args, path): - """Loads data using the right loader""" - loader = getattr(args, f"{self.arg_prefix}_loader") - batch_size = getattr(args, f"{self.arg_prefix}_batch_size") - if loader == "lightgbm": - return self._lightgbm_loader_load(path) - if loader == "numpy": - return self._numpy_loader_load(path, batch_size=batch_size) - if loader == "libsvm": - return self._libsvm_loader_load(path, batch_size=batch_size) - raise NotImplementedError(f"Data loader '{loader}' is not implemented") +def libsvm_data_load(path): + """Loads data using libsvm. 
+
+    Args:
+        path (str): path to data file
+    Returns:
+        (y, x), number_of_rows (int), number of cols (int)
+    """
+    # importing at last minute intentionally
+    from sklearn.datasets import load_svmlight_file
+
+    x, y = load_svmlight_file(path)
+
+    return (x,y), x.shape[0], x.shape[1]
diff --git a/src/scripts/lightgbm_python/dockers/lightgbm_cpu_mpi_pip.dockerfile b/src/scripts/lightgbm_python/dockers/lightgbm_cpu_mpi_pip.dockerfile
index b2897e40..2c664a50 100644
--- a/src/scripts/lightgbm_python/dockers/lightgbm_cpu_mpi_pip.dockerfile
+++ b/src/scripts/lightgbm_python/dockers/lightgbm_cpu_mpi_pip.dockerfile
@@ -19,7 +19,6 @@ RUN HOROVOD_WITH_TENSORFLOW=1 \
     'azureml-defaults==1.30.0' \
     'azureml-mlflow==1.30.0' \
     'azureml-telemetry==1.30.0' \
-    'libsvm==3.23.0' \
     'mpi4py==3.1.1'
 
 # install lightgbm with mpi
diff --git a/src/scripts/lightgbm_python/score.py b/src/scripts/lightgbm_python/score.py
index e896ef43..db5bd053 100644
--- a/src/scripts/lightgbm_python/score.py
+++ b/src/scripts/lightgbm_python/score.py
@@ -23,13 +23,7 @@
 
 # useful imports from common
 from common.metrics import MetricsLogger
-from common.io import input_file_path, InputDataLoader
-
-INPUT_DATA_LOADER = InputDataLoader(
-    allowed_loaders = ['numpy', 'libsvm', 'lightgbm'],
-    arg_prefix="data",
-    default_loader="lightgbm"
-)
+from common.io import input_file_path, DataBatch, libsvm_data_load, numpy_data_load
 
 def get_arg_parser(parser=None):
     """Adds component/module arguments to a given argument parser.
@@ -50,13 +44,16 @@ def get_arg_parser(parser=None):
     group_i = parser.add_argument_group("Input Data")
     group_i.add_argument("--data",
         required=True, type=input_file_path, help="Inferencing data location (file path)")
-    INPUT_DATA_LOADER.get_arg_parser(group_i) # add data loading parameters
+    group_i.add_argument(f"--data_loader",
+        required=False, type=str, default="lightgbm", choices=["lightgbm", "libsvm", "numpy"], help="use numpy for csv, libsvm for libsvm, or lightgbm for both")
     group_i.add_argument("--model",
         required=False, type=input_file_path, help="Exported model location (file path)")
     group_i.add_argument("--output",
         required=False, default=None, type=str, help="Inferencing output location (file path)")
 
     group_params = parser.add_argument_group("Scoring parameters")
+    group_params.add_argument(f"--batch_size",
+        required=False, type=int, default=0, help="size of batches (default: all data in 1 batch)")
     group_params.add_argument("--num_threads",
         required=False, default=1, type=int, help="number of threads")
     group_params.add_argument("--predict_disable_shape_check",
@@ -106,7 +106,7 @@ def run(args, unknown_args=[]):
 
     # record relevant parameters
     metrics_logger.log_parameters(
-        batch_size=args.data_batch_size,
+        batch_size=args.batch_size,
         data_loader=args.data_loader,
         num_threads=args.num_threads,
     )
@@ -119,9 +119,24 @@ def run(args, unknown_args=[]):
         os.makedirs(args.output, exist_ok=True)
         args.output = os.path.join(args.output, "predictions.txt")
 
+    if args.batch_size > 0 and args.data_loader == "lightgbm":
+        logger.warning("--data_loader lightgbm does not support --batch_size > 0 (currently)")
+        args.batch_size = 0
+
     logger.info(f"Loading data for inferencing")
     with metrics_logger.log_time_block("time_data_loading"):
-        inference_raw_data_batches, row_count, feature_count = INPUT_DATA_LOADER.load(args, args.data)
+        if args.data_loader == "lightgbm":
+            inference_data = lightgbm.Dataset(args.data, free_raw_data=False).construct()
+            inference_raw_data = inference_data.get_data()
+            row_count = inference_data.num_data()
+            feature_count = inference_data.num_feature()
+        elif args.data_loader == "libsvm":
+            inference_data, row_count, feature_count = libsvm_data_load(args.data)
+            inference_raw_data = inference_data[0] # (x,y) -> x
+        elif args.data_loader == "numpy":
+            inference_data, row_count, feature_count = numpy_data_load(args.data)
+        else:
+            raise NotImplementedError(f"--data_loader {args.data_loader} is not implemented.")
 
     logger.info(f"Loading model from {args.model}")
     booster = lightgbm.Booster(model_file=args.model)
@@ -138,14 +150,22 @@ def run(args, unknown_args=[]):
     logger.info(f"Running .predict()")
     batch_run_times = [] # collect time for each batch
     with metrics_logger.log_time_block("time_inferencing"):
-        for batch_x, _ in inference_raw_data_batches:
-            batch_start_time = time.time()
+        if args.batch_size > 0:
+            inference_batches = DataBatch(x=inference_raw_data, y=None, batch_size=args.batch_size)
+            for data_batch, _ in inference_batches:
+                batch_start_time = time.time()
+                booster.predict(
+                    data=data_batch,
+                    num_threads=args.num_threads,
+                    predict_disable_shape_check=bool(args.predict_disable_shape_check)
+                )
+                batch_run_times.append(time.time() - batch_start_time)
+        else:
             booster.predict(
-                data=batch_x,
+                data=inference_raw_data,
                 num_threads=args.num_threads,
                 predict_disable_shape_check=bool(args.predict_disable_shape_check)
             )
-            batch_run_times.append(time.time() - batch_start_time)
 
     if len(batch_run_times) > 1:
         batch_run_times = numpy.array(batch_run_times)

From f0b05ffe6d3efa21c91e8a9b6b3e7a4414b1099f Mon Sep 17 00:00:00 2001
From: Jeff Omhover
Date: Mon, 18 Oct 2021 21:57:36 -0700
Subject: [PATCH 06/12] add windows runsettings

---
 pipelines/azureml/pipelines/lightgbm_inferencing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipelines/azureml/pipelines/lightgbm_inferencing.py b/pipelines/azureml/pipelines/lightgbm_inferencing.py
index c68f5c10..71a61162 100644
--- a/pipelines/azureml/pipelines/lightgbm_inferencing.py
+++ b/pipelines/azureml/pipelines/lightgbm_inferencing.py
@@ -145,7 +145,7 @@ def lightgbm_inferencing_pipeline_function(benchmark_custom_properties, data, mo
                 verbose = False,
                 custom_properties = custom_properties
             )
-            self.apply_smart_runsettings(inferencing_step)
+            self.apply_smart_runsettings(inferencing_step, windows=(variant.os == "Windows"))
         else:
             raise NotImplementedError(f"framework {variant.framework} not implemented (yet)")

From 9cfc2312e3b5306d41e773b2abb9aef064ec97d2 Mon Sep 17 00:00:00 2001
From: Jeff Omhover
Date: Mon, 18 Oct 2021 21:57:58 -0700
Subject: [PATCH 07/12] add windows build for lightgbm (no mpi yet)

---
 .../dockers/lightgbm_cpu_pip_win.dockerfile | 29 +++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 src/scripts/lightgbm_python/dockers/lightgbm_cpu_pip_win.dockerfile

diff --git a/src/scripts/lightgbm_python/dockers/lightgbm_cpu_pip_win.dockerfile b/src/scripts/lightgbm_python/dockers/lightgbm_cpu_pip_win.dockerfile
new file mode 100644
index 00000000..15775598
--- /dev/null
+++ b/src/scripts/lightgbm_python/dockers/lightgbm_cpu_pip_win.dockerfile
@@ -0,0 +1,29 @@
+FROM mcr.microsoft.com/azureml/windows-servercore-1809
+
+ENV AZUREML_CONDA_ENVIRONMENT_PATH /azureml-envs/lightgbm
+
+# Create conda environment
+RUN conda create -p $AZUREML_CONDA_ENVIRONMENT_PATH \
+    python=3.8 pip=20.2.4
+
+# Prepend path to AzureML conda environment
+ENV PATH $AZUREML_CONDA_ENVIRONMENT_PATH/bin:$PATH
+
+# Install pip dependencies
+RUN HOROVOD_WITH_TENSORFLOW=1 \
+    pip install 'pandas>=1.1,<1.2' \
+                'numpy>=1.10,<1.20' \
+                'scipy~=1.5.0' \
+                'scikit-learn~=0.24.1' \
+                'azureml-core==1.30.0' \
+                'azureml-defaults==1.30.0' \
+                'azureml-mlflow==1.30.0' \
+                'azureml-telemetry==1.30.0'
+
+# install lightgbm
+RUN pip install --upgrade pip setuptools wheel && \
+    pip install 'cmake==3.21.0' && \
+    pip install 'lightgbm==3.2.1'
+
+# This is needed to locate libpython
+ENV LD_LIBRARY_PATH $AZUREML_CONDA_ENVIRONMENT_PATH/lib:$LD_LIBRARY_PATH

From f11faf22f9d1fceb4a041a25c831702a84ef1cbb Mon Sep 17 00:00:00 2001
From: Jeff Omhover
Date: Mon, 18 Oct 2021 22:34:33 -0700
Subject: [PATCH 08/12] add params to treelite

---
 .../experiments/benchmarks/lightgbm-inferencing.yaml | 12 ++++++++++++
 pipelines/azureml/pipelines/lightgbm_inferencing.py  |  3 +++
 2 files changed, 15 insertions(+)

diff --git a/pipelines/azureml/conf/experiments/benchmarks/lightgbm-inferencing.yaml b/pipelines/azureml/conf/experiments/benchmarks/lightgbm-inferencing.yaml
index 2e1a0ea6..8e710bdf 100644
--- a/pipelines/azureml/conf/experiments/benchmarks/lightgbm-inferencing.yaml
+++ b/pipelines/azureml/conf/experiments/benchmarks/lightgbm-inferencing.yaml
@@ -59,8 +59,20 @@ lightgbm_inferencing:
   variants:
     - framework: lightgbm_python
       build: dockers/lightgbm_cpu_mpi_pip.dockerfile
+      batch_size: 1
+      data_loader: "libsvm"
+      n_threads: 1
     - framework: lightgbm_python
      build: dockers/lightgbm_cpu_mpi_build.dockerfile
+      batch_size: 1
+      data_loader: "libsvm"
+      n_threads: 1
     - framework: lightgbm_python
       build: dockers/lightgbm_cpu_mpi_custom.dockerfile
+      batch_size: 1
+      data_loader: "libsvm"
+      n_threads: 1
     - framework: treelite_python
+      batch_size: 1
+      data_loader: "libsvm"
+      n_threads: 1
diff --git a/pipelines/azureml/pipelines/lightgbm_inferencing.py b/pipelines/azureml/pipelines/lightgbm_inferencing.py
index 71a61162..8c68f3eb 100644
--- a/pipelines/azureml/pipelines/lightgbm_inferencing.py
+++ b/pipelines/azureml/pipelines/lightgbm_inferencing.py
@@ -128,6 +128,9 @@ def lightgbm_inferencing_pipeline_function(benchmark_custom_properties, data, mo
             inferencing_step = treelite_score_module(
                 data = data,
                 compiled_model = treelite_compile_step.outputs.compiled_model,
+                data_loader = variant.data_loader,
+                batch_size = variant.batch_size,
+                n_threads = variant.n_threads,
                 verbose = False,
                 custom_properties = custom_properties
             )

From f9118b9704d5c3d486d3d225f1fddcd35ed205b0 Mon Sep 17 00:00:00 2001
From: Jeff Omhover
Date: Mon, 18 Oct 2021 22:35:16 -0700
Subject: [PATCH 09/12] add data loading to treelite

---
 src/scripts/lightgbm_python/score_spec.yaml |  2 +-
 src/scripts/treelite_python/conda_env.yaml  |  1 +
 src/scripts/treelite_python/score.py        | 53 +++++++++++++++++----
 src/scripts/treelite_python/score_spec.yaml | 14 ++++++
 4 files changed, 59 insertions(+), 11 deletions(-)

diff --git a/src/scripts/lightgbm_python/score_spec.yaml b/src/scripts/lightgbm_python/score_spec.yaml
index 3c493d57..5a4e73a0 100644
--- a/src/scripts/lightgbm_python/score_spec.yaml
+++ b/src/scripts/lightgbm_python/score_spec.yaml
@@ -43,7 +43,7 @@ command: >-
   --data {inputs.data}
   --model {inputs.model}
   --data_loader {inputs.data_loader}
-  --data_batch_size {inputs.batch_size}
+  --batch_size {inputs.batch_size}
   [--num_threads {inputs.n_threads}]
   [--predict_disable_shape_check {inputs.predict_disable_shape_check}]
   [--verbose {inputs.verbose}]
diff --git a/src/scripts/treelite_python/conda_env.yaml b/src/scripts/treelite_python/conda_env.yaml
index 10eb17c2..c2cc4254 100644
--- a/src/scripts/treelite_python/conda_env.yaml
+++ b/src/scripts/treelite_python/conda_env.yaml
@@ -9,3 +9,4 @@ dependencies:
   - azureml-mlflow==1.30.0
   - treelite==1.3.0
   - treelite_runtime==1.3.0
+  - scikit-learn~=0.24.1
diff --git a/src/scripts/treelite_python/score.py b/src/scripts/treelite_python/score.py
index 9d5e69f2..8d3ee0b3 100644
--- a/src/scripts/treelite_python/score.py
+++ b/src/scripts/treelite_python/score.py
@@ -8,6 +8,7 @@
 import sys
 import argparse
 import logging
+import time
 import numpy
 from distutils.util import strtobool
 import pandas as pd
@@ -23,7 +24,7 @@
 
 # useful imports from common
 from common.metrics import MetricsLogger
-from common.io import input_file_path
+from common.io import input_file_path, DataBatch, libsvm_data_load, numpy_data_load
 
 
 def get_arg_parser(parser=None):
@@ -45,12 +46,16 @@ def get_arg_parser(parser=None):
     group_i = parser.add_argument_group("Input Data")
     group_i.add_argument("--data",
         required=True, type=input_file_path, help="Inferencing data location (file path)")
+    group_i.add_argument(f"--data_loader",
+        required=False, type=str, default="libsvm", choices=["libsvm", "numpy"], help="use numpy for csv, libsvm for libsvm")
     group_i.add_argument("--so_path",
         required=False, default = "./mymodel.so" , help="full path to model so")
     group_i.add_argument("--output",
         required=False, default=None, type=str, help="Inferencing output location (file path)")
 
     group_params = parser.add_argument_group("Scoring parameters")
+    group_params.add_argument(f"--batch_size",
+        required=False, type=int, default=0, help="size of batches (default: all data in 1 batch)")
     group_params.add_argument("--nthreads",
         required=False, default=1, type=int, help="number of threads")
 
@@ -116,18 +121,46 @@ def run(args, unknown_args=[]):
 
     logger.info(f"Loading data for inferencing")
     with metrics_logger.log_time_block("time_data_loading"):
-        my_data = pd.read_csv(args.data).to_numpy()
-
-        predictor = treelite_runtime.Predictor(
-            args.so_path,
-            verbose=True,
-            nthread=args.nthreads
-        )
-        dmat = treelite_runtime.DMatrix(my_data)
+        if args.data_loader == "libsvm":
+            inference_data, row_count, feature_count = libsvm_data_load(args.data)
+            inference_raw_data = inference_data[0] # (x,y) -> x
+        elif args.data_loader == "numpy":
+            inference_data, row_count, feature_count = numpy_data_load(args.data)
+        else:
+            raise NotImplementedError(f"--data_loader {args.data_loader} is not implemented.")
+
+        inference_data_raw = treelite_runtime.DMatrix(inference_data)
+
+        logger.info(f"Loading model from {args.model}")
+        predictor = treelite_runtime.Predictor(
+            args.so_path,
+            verbose=True,
+            nthread=args.nthreads
+        )
+
+    # capture data shape as property
+    metrics_logger.set_properties(
+        inference_data_length = row_count,
+        inference_data_width = feature_count
+    )
 
     logger.info(f"Running .predict()")
+    batch_run_times = [] # collect time for each batch
     with metrics_logger.log_time_block("time_inferencing"):
-        predictor.predict(dmat)
+        if args.batch_size > 0:
+            inference_batches = DataBatch(x=inference_raw_data, y=None, batch_size=args.batch_size)
+            for data_batch, _ in inference_batches:
+                batch_start_time = time.time()
+                predictor.predict(data_batch)
+                batch_run_times.append(time.time() - batch_start_time)
+        else:
+            predictor.predict(inference_data_raw)
+
+    if len(batch_run_times) > 1:
+        batch_run_times = numpy.array(batch_run_times)
+        metrics_logger.log_metric("batch_time_inferencing_p50_usecs", numpy.percentile(batch_run_times, 50) * 1000000)
+        metrics_logger.log_metric("batch_time_inferencing_p90_usecs", numpy.percentile(batch_run_times, 90) * 1000000)
metrics_logger.log_metric("batch_time_inferencing_p99_usecs", numpy.percentile(batch_run_times, 99) * 1000000) # Important: close logging session before exiting metrics_logger.close() diff --git a/src/scripts/treelite_python/score_spec.yaml b/src/scripts/treelite_python/score_spec.yaml index d325c0a5..43805269 100644 --- a/src/scripts/treelite_python/score_spec.yaml +++ b/src/scripts/treelite_python/score_spec.yaml @@ -15,6 +15,17 @@ inputs: type: AnyDirectory description: directory to the model optional: false + data_loader: + type: Enum + enum: + - libsvm + - numpy + batch_size: + type: Integer + default: 0 + n_threads: + type: Integer + optional: true verbose: type: Boolean optional: true @@ -28,6 +39,9 @@ command: >- python score.py --data {inputs.data} --so_path {inputs.compiled_model} + --data_loader {inputs.data_loader} + --batch_size {inputs.batch_size} + [--nthreads {inputs.n_threads}] [--verbose {inputs.verbose}] [--custom_properties {inputs.custom_properties}] From 8ac4b0d8f7832254aec02ef045503b77594864ec Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Mon, 18 Oct 2021 22:38:17 -0700 Subject: [PATCH 10/12] use numpy --- .../conf/experiments/benchmarks/lightgbm-inferencing.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pipelines/azureml/conf/experiments/benchmarks/lightgbm-inferencing.yaml b/pipelines/azureml/conf/experiments/benchmarks/lightgbm-inferencing.yaml index 8e710bdf..a0e01c0a 100644 --- a/pipelines/azureml/conf/experiments/benchmarks/lightgbm-inferencing.yaml +++ b/pipelines/azureml/conf/experiments/benchmarks/lightgbm-inferencing.yaml @@ -60,19 +60,19 @@ lightgbm_inferencing: - framework: lightgbm_python build: dockers/lightgbm_cpu_mpi_pip.dockerfile batch_size: 1 - data_loader: "libsvm" + data_loader: "numpy" n_threads: 1 - framework: lightgbm_python build: dockers/lightgbm_cpu_mpi_build.dockerfile batch_size: 1 - data_loader: "libsvm" + data_loader: "numpy" n_threads: 1 - framework: lightgbm_python build: dockers/lightgbm_cpu_mpi_custom.dockerfile batch_size: 1 - data_loader: "libsvm" + data_loader: "numpy" n_threads: 1 - framework: treelite_python batch_size: 1 - data_loader: "libsvm" + data_loader: "numpy" n_threads: 1 From bce132fa02133f84a4fd1fbbf02506863c55a620 Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Mon, 18 Oct 2021 23:14:49 -0700 Subject: [PATCH 11/12] fix numpy loading --- src/common/io.py | 7 ++----- src/scripts/treelite_python/score.py | 4 ++-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/common/io.py b/src/common/io.py index 5dce858d..475b0706 100644 --- a/src/common/io.py +++ b/src/common/io.py @@ -1,8 +1,6 @@ import os import argparse import logging -import numpy as np -from lightgbm import Dataset as lightgbm_Dataset def input_file_path(path): """ Resolve input path from AzureML. 
@@ -190,11 +188,10 @@ def numpy_data_load(path, delimiter=","):
     Returns:
         numpy_array, number_of_rows (int), number of cols (int)
     """
-    self.logger.info(f"Loading {path} with numpy")
     # importing at last minute intentionally
-    raw_data = np.loadtxt(path, delimiter=delimiter)
+    import numpy as np
 
-    self.logger.info(f"Loaded {path} data has {raw_data.shape[0]} rows and {raw_data.shape[1]} cols")
+    raw_data = np.loadtxt(path, delimiter=delimiter)
 
     return raw_data, raw_data.shape[0], raw_data.shape[1]
diff --git a/src/scripts/treelite_python/score.py b/src/scripts/treelite_python/score.py
index 8d3ee0b3..520cb007 100644
--- a/src/scripts/treelite_python/score.py
+++ b/src/scripts/treelite_python/score.py
@@ -125,11 +125,11 @@ def run(args, unknown_args=[]):
         if args.data_loader == "libsvm":
             inference_data, row_count, feature_count = libsvm_data_load(args.data)
             inference_raw_data = inference_data[0] # (x,y) -> x
         elif args.data_loader == "numpy":
-            inference_data, row_count, feature_count = numpy_data_load(args.data)
+            inference_raw_data, row_count, feature_count = numpy_data_load(args.data)
         else:
             raise NotImplementedError(f"--data_loader {args.data_loader} is not implemented.")
 
-        inference_data_raw = treelite_runtime.DMatrix(inference_data)
+        inference_data_raw = treelite_runtime.DMatrix(inference_raw_data)
 
         logger.info(f"Loading model from {args.model}")
         predictor = treelite_runtime.Predictor(

From fdd35833615df1f378efaa22505ecf3914846cd6 Mon Sep 17 00:00:00 2001
From: Jeff Omhover
Date: Mon, 18 Oct 2021 23:16:35 -0700
Subject: [PATCH 12/12] fix numpy import

---
 src/common/io.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/common/io.py b/src/common/io.py
index 475b0706..a3201432 100644
--- a/src/common/io.py
+++ b/src/common/io.py
@@ -1,6 +1,7 @@
 import os
 import argparse
 import logging
+import numpy as np
 
 def input_file_path(path):
     """ Resolve input path from AzureML.
@@ -188,9 +189,6 @@ def numpy_data_load(path, delimiter=","):
     Returns:
         numpy_array, number_of_rows (int), number of cols (int)
     """
-    # importing at last minute intentionally
-    import numpy as np
-
     raw_data = np.loadtxt(path, delimiter=delimiter)
 
     return raw_data, raw_data.shape[0], raw_data.shape[1]
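
A minimal usage sketch of the loaders and DataBatch introduced by this series, mirroring what
src/scripts/lightgbm_python/score.py does after PATCH 12. This assumes the post-series state of
src/common/io.py; the CSV path and the booster call are placeholders, not part of the patches.

    # sketch only: load a CSV inferencing file and iterate it in single-row batches
    from common.io import DataBatch, numpy_data_load, libsvm_data_load

    # numpy_data_load returns (array, row_count, feature_count); use libsvm_data_load for svmlight files
    x, row_count, feature_count = numpy_data_load("inference_data.csv")

    # batch_size=1 matches the benchmark variants; batch_size=0 keeps all rows in one batch
    batches = DataBatch(x=x, y=None, batch_size=1)
    for batch_x, _ in batches:
        pass  # e.g. booster.predict(batch_x, num_threads=1) as in score.py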