Skip to content
This repository was archived by the owner on Apr 8, 2024. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/common/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def open(self):
def close(self):
"""Close the MLFlow session."""
if MetricsLogger._initialized:
self._logger.info(f"Finalizing MLFLOW [session='{self._session_name}']")
self._logger.info(f"Finalizing MLFLOW [session='{self._session_name}', session_id='{mlflow.active_run().info.run_id}']")
mlflow.end_run()
MetricsLogger._initialized = False
else:
Expand Down
36 changes: 32 additions & 4 deletions src/scripts/data_processing/generate_data/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def get_arg_parser(cls, parser=None):
# add arguments that are specific to the script
group_params = parser.add_argument_group("Synthesis params")
group_params.add_argument(
"--type", required=True, type=str, choices=["classification", "regression"]
"--type", required=True, type=str, choices=["classification", "regression", "lambdarank"]
)
group_params.add_argument("--train_samples", required=True, type=int)
group_params.add_argument("--test_samples", required=True, type=int)
Expand All @@ -61,6 +61,8 @@ def get_arg_parser(cls, parser=None):
group_params.add_argument("--n_informative", required=True, type=int)
group_params.add_argument("--n_redundant", required=False, type=int)
group_params.add_argument("--random_state", required=False, default=None, type=int)
group_params.add_argument("--docs_per_query", required=False, default=20, type=int)
group_params.add_argument("--delimiter", required=False, default=',', type=str)

group_o = parser.add_argument_group("Outputs")
group_o.add_argument(
Expand All @@ -81,6 +83,12 @@ def get_arg_parser(cls, parser=None):
type=str,
help="Output data location (directory)",
)
group_o.add_argument(
"--output_header",
required=True,
type=str,
help="Output header location (directory)",
)

return parser

Expand All @@ -98,6 +106,7 @@ def run(self, args, logger, metrics_logger, unknown_args):
os.makedirs(args.output_train, exist_ok=True)
os.makedirs(args.output_test, exist_ok=True)
os.makedirs(args.output_inference, exist_ok=True)
os.makedirs(args.output_header, exist_ok=True)

metrics_logger.log_parameters(
type=args.type,
Expand Down Expand Up @@ -131,6 +140,19 @@ def run(self, args, logger, metrics_logger, unknown_args):
n_informative=args.n_informative,
random_state=args.random_state,
)
elif args.type == "lambdarank":
X, y = make_regression(
n_samples=total_samples,
n_features=args.n_features,
n_informative=args.n_informative,
random_state=args.random_state,
)
# add query column
query_col = [[i // args.docs_per_query] for i in range(total_samples)]
X = numpy.hstack((query_col, X))
# create 30 ranking labels
y = ((y - min(y))/(max(y)-min(y))*30).astype(int)

else:
raise NotImplementedError(f"--type {args.type} is not implemented.")

Expand All @@ -150,30 +172,36 @@ def run(self, args, logger, metrics_logger, unknown_args):
inference_data = X[args.train_samples + args.test_samples :]
logger.info(f"Inference data shape: {inference_data.shape}")

# create a header
header = [f'Column_{i}' for i in range(train_data.shape[1])]
if args.delimiter == 'tab':
args.delimiter = "\t"
# save as CSV
logger.info(f"Saving data...")
with metrics_logger.log_time_block("time_data_saving"):
numpy.savetxt(
os.path.join(args.output_train, "train.txt"),
train_data,
delimiter=",",
delimiter=args.delimiter,
newline="\n",
fmt="%1.3f",
)
numpy.savetxt(
os.path.join(args.output_test, "test.txt"),
test_data,
delimiter=",",
delimiter=args.delimiter,
newline="\n",
fmt="%1.3f",
)
numpy.savetxt(
os.path.join(args.output_inference, "inference.txt"),
inference_data,
delimiter=",",
delimiter=args.delimiter,
newline="\n",
fmt="%1.3f",
)
with open(os.path.join(args.output_header, "header.txt"), 'w') as hf:
hf.writelines(args.delimiter.join(header))


def get_arg_parser(parser=None):
Expand Down
15 changes: 15 additions & 0 deletions src/scripts/data_processing/generate_data/spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ inputs:
enum:
- regression
- classification
- lambdarank
train_samples:
type: Integer
description: Number of training samples
Expand Down Expand Up @@ -41,6 +42,15 @@ inputs:
type: Integer
description: random seed
optional: true
delimiter:
type: String
description: delimiter used between columns in the generated data files
optional: true
default: ","
docs_per_query:
type: Integer
description: number of documents generated per query (lambdarank only)
optional: true
verbose:
type: Boolean
default: False
Expand All @@ -56,6 +66,8 @@ outputs:
type: AnyDirectory
output_inference:
type: AnyDirectory
output_header:
type: AnyDirectory

command: >-
python generate.py
Expand All @@ -67,11 +79,14 @@ command: >-
--n_informative {inputs.n_informative}
[--n_redundant {inputs.n_redundant}]
[--random_state {inputs.random_state}]
[--docs_per_query {inputs.docs_per_query}]
--output_train {outputs.output_train}
--output_test {outputs.output_test}
--output_inference {outputs.output_inference}
--output_header {outputs.output_header}
--verbose {inputs.verbose}
[--custom_properties {inputs.custom_properties}]
[--delimiter {inputs.delimiter}]

environment:
conda:
Expand Down
31 changes: 26 additions & 5 deletions src/scripts/training/lightgbm_python/default.dockerfile
Original file line number Diff line number Diff line change
@@ -1,10 +1,29 @@
FROM mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210615.v1
LABEL lightgbmbenchmark.linux.cpu.mpi.pip.version="3.3.0/20211111.1"
FROM mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest
LABEL lightgbmbenchmark.linux.cpu.mpi.pip.version="3.3.1/20211210.1"

# https://github.com/microsoft/lightgbm-transform/blob/main/docs/Installation-Guide.rst
# Install CMake, gcc, g++, boost.
RUN apt-get update && apt-get -y upgrade && DEBIAN_FRONTEND="noninteractive" apt-get install -y libboost-all-dev gcc g++ wget cmake git curl libtinfo5

# Install LLVM with RTTI feature.
WORKDIR /root
RUN wget https://github.com/llvm/llvm-project/archive/refs/tags/llvmorg-3.5.1.tar.gz && tar zxvf llvmorg-3.5.1.tar.gz
WORKDIR /root/llvm-project-llvmorg-3.5.1/llvm
RUN mkdir build && cd build && cmake -DLLVM_REQUIRES_RTTI=1 .. && make -j4 && make install

# Install bond.
WORKDIR /root
RUN git clone --recursive https://github.com/microsoft/bond.git
RUN DEBIAN_FRONTEND="noninteractive" apt-get install -y clang zlib1g-dev
RUN curl -sSL https://get.haskellstack.org/ | sh
WORKDIR /root/bond
RUN mkdir build && cd build && cmake -DBOND_ENABLE_GRPC=FALSE .. && make -j4 && make install


# Those arguments will NOT be used by AzureML
# they are here just to allow for lightgbm-benchmark build to actually check
# dockerfiles in a PR against their actual branch
ARG lightgbm_version="3.3.0"
ARG lightgbm_version="3.3.1"

ENV AZUREML_CONDA_ENVIRONMENT_PATH /azureml-envs/lightgbm

Expand All @@ -26,12 +45,14 @@ RUN HOROVOD_WITH_TENSORFLOW=1 \
'azureml-defaults==1.35.0' \
'azureml-mlflow==1.35.0' \
'azureml-telemetry==1.35.0' \
'mpi4py==3.1.1'
'mpi4py==3.1.1' \
'omegaconf'

# install lightgbm with mpi
RUN pip install --upgrade pip setuptools wheel && \
pip install 'cmake==3.21.0' && \
pip install lightgbm==${lightgbm_version} --install-option=--mpi
pip install lightgbm==${lightgbm_version} --install-option=--mpi &&\
pip install lightgbm-transform

# This is needed for mpi to locate libpython
ENV LD_LIBRARY_PATH $AZUREML_CONDA_ENVIRONMENT_PATH/lib:$LD_LIBRARY_PATH
5 changes: 5 additions & 0 deletions src/scripts/training/lightgbm_python/spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ inputs:
type: AnyDirectory
description: directory to the testing data
optional: false
parser_config_file:
type: AnyDirectory
description: directory to the transform parser config
optional: true
construct:
type: Boolean
description: "Use lazy intialization during data loading phase, see https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Dataset.html#lightgbm.Dataset.construct"
Expand Down Expand Up @@ -144,6 +148,7 @@ launcher:
python train.py
--train {inputs.train}
--test {inputs.test}
[--parser_config_file {inputs.parser_config_file}]
--construct {inputs.construct}
--header {inputs.header}
[--label_column {inputs.label_column}]
Expand Down
1 change: 1 addition & 0 deletions src/scripts/training/lightgbm_python/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def get_arg_parser(cls, parser=None):
group_i.add_argument("--header", required=False, default=False, type=strtobool)
group_i.add_argument("--label_column", required=False, default="0", type=str)
group_i.add_argument("--group_column", required=False, default=None, type=str)
group_i.add_argument("--parser_config_file", required=False, default=None, type=str, help="transform parser config location (file path)")

group_o = parser.add_argument_group("Outputs")
group_o.add_argument("--export_model",
Expand Down