This repository was archived by the owner on Apr 8, 2024. It is now read-only.
@@ -28,7 +28,7 @@ module_loader: # module loading params
 
 lightgbm_inferencing:
   # name of your particular benchmark
-  benchmark_name: "benchmark-inferencing-20211109.3" # need to be provided at runtime!
+  benchmark_name: "benchmark-inferencing-20211124.1" # need to be provided at runtime!
 
   tasks:
     - data:
@@ -82,11 +82,19 @@ lightgbm_inferencing:

   variants:
     - framework: lightgbm_python # v3.3.0 via pypi
+      num_threads: 1
 
     - framework: lightgbm_c_api # v3.3.0 with C API prediction
 
+    - framework: lightgbm_c_api # v3.3.0 with C API prediction
+      build: docker/lightgbm-custom/v330_patch_cpu_mpi_build.dockerfile
+
     - framework: lightgbm_c_api # v3.2.1 with C API prediction
       build: docker/lightgbm-v3.2.1/linux_cpu_mpi_build.dockerfile
 
+    - framework: lightgbm_c_api # v3.2.1 with C API prediction
+      build: docker/lightgbm-custom/v321_patch_cpu_mpi_build.dockerfile
+
     - framework: treelite_python # v1.3.0
+      num_threads: 1
       batch_size: 0 # use whole file as batch
9 changes: 9 additions & 0 deletions pipelines/azureml/conf/experiments/lightgbm-inferencing.yaml
@@ -43,12 +43,21 @@ lightgbm_inferencing:
   # list all inferencing frameworks and their builds
   variants:
     - framework: lightgbm_python # v3.3.0 via pypi
+      num_threads: 1
 
     - framework: lightgbm_c_api # v3.3.0 with C API prediction
 
+    - framework: lightgbm_c_api # v3.3.0 with C API prediction
+      build: docker/lightgbm-custom/v330_patch_cpu_mpi_build.dockerfile
+
     - framework: lightgbm_c_api # v3.2.1 with C API prediction
       build: docker/lightgbm-v3.2.1/linux_cpu_mpi_build.dockerfile
 
+    - framework: lightgbm_c_api # v3.2.1 with C API prediction
+      build: docker/lightgbm-custom/v321_patch_cpu_mpi_build.dockerfile
+
     - framework: treelite_python # v1.3.0
+      num_threads: 1
       batch_size: 0 # use whole file as batch
 
+
8 changes: 8 additions & 0 deletions pipelines/azureml/pipelines/lightgbm_inferencing.py
@@ -130,6 +130,8 @@ def lightgbm_inferencing_pipeline_function(benchmark_custom_properties, data, mo
         inferencing_step = treelite_score_module(
             data = data,
             compiled_model = treelite_compile_step.outputs.compiled_model,
+            num_threads = variant.num_threads,
+            batch_size = variant.batch_size,
             verbose = False,
             custom_properties = custom_properties
         )
@@ -140,6 +142,8 @@ def lightgbm_inferencing_pipeline_function(benchmark_custom_properties, data, mo
         inferencing_step = lightgbm_c_api_score_module(
             data = data,
             model = model,
+            num_threads = variant.num_threads,
+            # batch_size = variant.batch_size, # not supported yet
             predict_disable_shape_check = predict_disable_shape_check,
             verbose = False,
             custom_properties = custom_properties
@@ -151,6 +155,8 @@ def lightgbm_inferencing_pipeline_function(benchmark_custom_properties, data, mo
         inferencing_step = lightgbm_cli_score_module(
             data = data,
             model = model,
+            num_threads = variant.num_threads,
+            # batch_size = variant.batch_size, # not supported yet
             predict_disable_shape_check = predict_disable_shape_check,
             verbose = False,
             custom_properties = custom_properties
@@ -162,6 +168,8 @@ def lightgbm_inferencing_pipeline_function(benchmark_custom_properties, data, mo
         inferencing_step = lightgbm_python_score_module(
             data = data,
             model = model,
+            num_threads = variant.num_threads,
+            # batch_size = variant.batch_size, # not supported yet
             predict_disable_shape_check = predict_disable_shape_check,
             verbose = False,
             custom_properties = custom_properties
40 changes: 40 additions & 0 deletions src/common/io.py
@@ -8,6 +8,7 @@
 import os
 import argparse
 import logging
+import csv
 
 def input_file_path(path):
     """ Argparse type to resolve input path as single file from directory.
@@ -225,3 +226,42 @@ def run(self, input_path, output_path):
             self.split_by_append(input_files, output_path, self.number)
         else:
             raise NotImplementedError(f"Mode {self.mode} not implemented.")


+class CustomLightGBMDataBatchIterator():
+    def __init__(self, file_path, batch_size=0, file_format="csv", **kwargs):
+        self.file_path = file_path
+        self.batch_size = batch_size
+        self.file_format = file_format
+        self.reader_options = kwargs
+
+    def iter(self):
+        if self.file_format == "csv":
+            with open(self.file_path, "r") as i_file:
+                reader = csv.reader(i_file, **self.reader_options)
+
+                batch = []
+                if self.batch_size == 0:
+                    # use the entire file as a batch
+                    batch = [
+                        [
+                            float(col) for col in row # convert all values to float for lightgbm
+                        ] for row in reader
+                    ]
+                elif self.batch_size >= 1:
+                    # create batches
+                    for row in reader:
+                        batch.append(
+                            [ float(col) for col in row ] # convert all values to float for lightgbm
+                        )
+                        if len(batch) >= self.batch_size:
+                            yield batch
+                            batch = [] # reset batch
+                else:
+                    raise ValueError("batch_size must be >= 0")
+
+                # any remaining batch, or whole file
+                if len(batch) > 0:
+                    yield batch
+        else:
+            raise NotImplementedError(f"file_format={self.file_format} is not implemented yet.")
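
For orientation, a minimal sketch of how this iterator is meant to be consumed (file name and batch size are illustrative; the empty-batch guard mirrors the one used in the treelite scoring script below):

    from common.io import CustomLightGBMDataBatchIterator

    # hypothetical headerless CSV of numeric features
    iterator = CustomLightGBMDataBatchIterator("inferencing_data.csv", batch_size=1000)
    for batch in iterator.iter():
        if len(batch) == 0:
            break  # nothing left to score
        # each batch is a list of rows, each row a list of floats
        print(f"scoring a batch of {len(batch)} rows")
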
5 changes: 5 additions & 0 deletions src/common/tasks.py
@@ -24,10 +24,15 @@ class inferencing_task:

 @dataclass
 class inferencing_variants:
+    # framework
     framework: str = MISSING
     build: Optional[str] = None
     os: str = "Linux" # linux or windows, linux by default
+
+    # parameters
+    batch_size: int = 0 # use whole file as batch
+    num_threads: int = 1 # use only one thread
 
 @dataclass
 class data_generation_task:
     task: str = MISSING
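
The MISSING sentinel suggests these dataclasses serve as OmegaConf structured configs, so a YAML variant entry that omits batch_size or num_threads should pick up the defaults above. A minimal sketch under that assumption:

    from omegaconf import OmegaConf

    # build a schema from the dataclass, then overlay a user-provided variant entry
    schema = OmegaConf.structured(inferencing_variants)
    variant = OmegaConf.merge(schema, OmegaConf.create({"framework": "treelite_python"}))
    assert variant.batch_size == 0   # default: whole file as one batch
    assert variant.num_threads == 1  # default: single thread
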
4 changes: 2 additions & 2 deletions src/scripts/inferencing/lightgbm_c_api/spec.yaml
@@ -18,7 +18,7 @@ inputs:
     type: Boolean
     description: "control whether or not LightGBM raises an error when you try to predict on data with a different number of features than the training data"
     default: False
-  n_threads:
+  num_threads:
     type: Integer
     default: 1
   verbose:
@@ -37,7 +37,7 @@ command: >-
   python score.py
   --data {inputs.data}
   --model {inputs.model}
-  --num_threads {inputs.n_threads}
+  --num_threads {inputs.num_threads}
   --output {outputs.predictions}
   --predict_disable_shape_check {inputs.predict_disable_shape_check}
   --verbose {inputs.verbose}
6 changes: 3 additions & 3 deletions src/scripts/inferencing/lightgbm_cli/spec.yaml
@@ -18,9 +18,9 @@ inputs:
     type: Boolean
     description: "control whether or not LightGBM raises an error when you try to predict on data with a different number of features than the training data"
     optional: true
-  n_threads:
+  num_threads:
     type: Integer
-    optional: true
+    default: 1
   lightgbm_exec_path:
     type: String
     optional: true
@@ -37,7 +37,7 @@ command: >-
   python score.py
   --data {inputs.data}
   --model {inputs.model}
-  [--num_threads {inputs.n_threads}]
+  --num_threads {inputs.num_threads}
   [--lightgbm_exec_path {inputs.lightgbm_exec_path}]
   [--predict_disable_shape_check {inputs.predict_disable_shape_check}]
   [--verbose {inputs.verbose}]
6 changes: 3 additions & 3 deletions src/scripts/inferencing/lightgbm_python/spec.yaml
@@ -18,9 +18,9 @@ inputs:
     type: Boolean
     description: "control whether or not LightGBM raises an error when you try to predict on data with a different number of features than the training data"
     default: False
-  n_threads:
+  num_threads:
     type: Integer
-    optional: true
+    default: 1
   verbose:
     type: Boolean
     default: False
@@ -38,7 +38,7 @@ command: >-
   --data {inputs.data}
   --model {inputs.model}
   --output {outputs.predictions}
-  [--num_threads {inputs.n_threads}]
+  --num_threads {inputs.num_threads}
   --predict_disable_shape_check {inputs.predict_disable_shape_check}
   --verbose {inputs.verbose}
   [--custom_properties {inputs.custom_properties}]
1 change: 1 addition & 0 deletions src/scripts/inferencing/treelite_python/conda_env.yaml
@@ -12,3 +12,4 @@ dependencies:
   - treelite_runtime==2.1.0
   - pandas>=1.1,<1.2
   - numpy>=1.10,<1.20
+  - matplotlib==3.4.3
79 changes: 58 additions & 21 deletions src/scripts/inferencing/treelite_python/score.py
@@ -8,30 +8,30 @@
 import sys
 import argparse
 import logging
-import numpy
 import time
+import numpy as np
 from distutils.util import strtobool
 import pandas as pd
 import treelite, treelite_runtime
 
 # Add the right path to PYTHONPATH
 # so that you can import from common.*
 COMMON_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
 
 if COMMON_ROOT not in sys.path:
-    print(f"Adding {COMMON_ROOT} to PYTHONPATH")
+    logging.info(f"Adding {COMMON_ROOT} to PYTHONPATH")
     sys.path.append(str(COMMON_ROOT))
 
 # useful imports from common
 from common.components import RunnableScript
-from common.io import input_file_path
+from common.io import input_file_path, CustomLightGBMDataBatchIterator
 
 
 class TreeLightInferencingScript(RunnableScript):
     def __init__(self):
         super().__init__(
-            task = 'score',
+            task = "score",
             framework = 'treelite_python',
-            framework_version = treelite.__version__
+            framework_version = "PYTHON_API."+str(treelite.__version__)
         )
 
     @classmethod
@@ -61,6 +61,8 @@ def get_arg_parser(cls, parser=None):
         group_params = parser.add_argument_group("Scoring parameters")
         group_params.add_argument("--num_threads",
             required=False, default=1, type=int, help="number of threads")
+        group_params.add_argument("--batch_size",
+            required=False, default=0, type=int, help="size of batches for predict call")
 
         return parser
 
@@ -76,31 +78,66 @@ def run(self, args, logger, metrics_logger, unknown_args):
"""
# record relevant parameters
metrics_logger.log_parameters(
num_threads=args.num_threads
num_threads=args.num_threads,
batch_size=args.batch_size,
)

# make sure the output argument exists
if args.output:
# make sure the output argument exists
os.makedirs(args.output, exist_ok=True)

# and create your own file inside the output
args.output = os.path.join(args.output, "predictions.txt")

logger.info(f"Loading model from {args.so_path}")
predictor = treelite_runtime.Predictor(
args.so_path,
verbose=True,
nthread=args.num_threads
)

logger.info(f"Loading data for inferencing")
with metrics_logger.log_time_block("time_data_loading"):
my_data = pd.read_csv(args.data).to_numpy()

predictor = treelite_runtime.Predictor(
args.so_path,
verbose=True,
nthread=args.num_threads
)
dmat = treelite_runtime.DMatrix(my_data)
# accumulate predictions and latencies
predictions = []
time_inferencing_per_batch = []
batch_lengths = []

# loop through batches
for batch in CustomLightGBMDataBatchIterator(args.data, batch_size=args.batch_size, file_format="csv").iter():
if len(batch) == 0:
break
batch_lengths.append(len(batch))

# transform into dense matrix for treelite
batch_data = np.array(batch)
batch_dmat = treelite_runtime.DMatrix(batch_data)

# run prediction on batch
batch_start_time = time.monotonic()
predictions.extend(predictor.predict(batch_dmat))
time_inferencing_per_batch.append((time.monotonic() - batch_start_time)) # usecs

# log overall time
metrics_logger.log_metric("time_inferencing", sum(time_inferencing_per_batch))

# use helper to log latency with the right metric names
metrics_logger.log_inferencing_latencies(
time_inferencing_per_batch,
batch_length=batch_lengths,
factor_to_usecs=1000000.0 # values are in seconds
)

logger.info(f"Running .predict()")
with metrics_logger.log_time_block("time_inferencing"):
predictor.predict(dmat)
if args.output:
np.savetxt(
args.output,
predictions,
fmt='%f',
delimiter=',',
newline='\n',
header='',
footer='',
comments='# ',
encoding=None
)


def get_arg_parser(parser=None):
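
Stripped of the benchmark harness, the per-batch prediction pattern above reduces to this sketch (model path and data shape are illustrative, assuming treelite_runtime 2.1.0 as pinned in the conda environment):

    import time
    import numpy as np
    import treelite_runtime

    predictor = treelite_runtime.Predictor("./model.so", nthread=1)  # hypothetical compiled model
    batch = np.random.rand(1000, 40)  # 1000 rows, 40 features

    start = time.monotonic()
    predictions = predictor.predict(treelite_runtime.DMatrix(batch))
    elapsed_usecs = (time.monotonic() - start) * 1000000.0  # seconds -> microseconds
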
10 changes: 7 additions & 3 deletions src/scripts/inferencing/treelite_python/spec.yaml
@@ -15,9 +15,12 @@ inputs:
     type: AnyDirectory
     description: directory to the model
     optional: false
-  n_threads:
+  num_threads:
     type: Integer
-    optional: true
+    default: 1
+  batch_size:
+    type: Integer
+    default: 0 # default: use whole file as a batch
   verbose:
     type: Boolean
     default: False
@@ -31,7 +34,8 @@ command: >-
   python score.py
   --data {inputs.data}
   --so_path {inputs.compiled_model}
-  [--num_threads {inputs.n_threads}]
+  --num_threads {inputs.num_threads}
+  --batch_size {inputs.batch_size}
   --verbose {inputs.verbose}
   [--custom_properties {inputs.custom_properties}]
 