diff --git a/src/common/metrics.py b/src/common/metrics.py index a3ec9ff0..275822c0 100644 --- a/src/common/metrics.py +++ b/src/common/metrics.py @@ -48,7 +48,7 @@ def open(self): def close(self): """Close the MLFlow session.""" if MetricsLogger._initialized: - self._logger.info(f"Finalizing MLFLOW [session='{self._session_name}']") + self._logger.info(f"Finalizing MLFLOW [session='{self._session_name}', session_id='{mlflow.active_run().info.run_id}']") mlflow.end_run() MetricsLogger._initialized = False else: diff --git a/src/scripts/data_processing/generate_data/generate.py b/src/scripts/data_processing/generate_data/generate.py index 7eecaa89..1c52c01b 100644 --- a/src/scripts/data_processing/generate_data/generate.py +++ b/src/scripts/data_processing/generate_data/generate.py @@ -52,7 +52,7 @@ def get_arg_parser(cls, parser=None): # add arguments that are specific to the script group_params = parser.add_argument_group("Synthesis params") group_params.add_argument( - "--type", required=True, type=str, choices=["classification", "regression"] + "--type", required=True, type=str, choices=["classification", "regression", "lambdarank"] ) group_params.add_argument("--train_samples", required=True, type=int) group_params.add_argument("--test_samples", required=True, type=int) @@ -61,6 +61,8 @@ def get_arg_parser(cls, parser=None): group_params.add_argument("--n_informative", required=True, type=int) group_params.add_argument("--n_redundant", required=False, type=int) group_params.add_argument("--random_state", required=False, default=None, type=int) + group_params.add_argument("--docs_per_query", required=False, default=20, type=int) + group_params.add_argument("--delimiter", required=False, default=',', type=str) group_o = parser.add_argument_group("Outputs") group_o.add_argument( @@ -81,6 +83,12 @@ def get_arg_parser(cls, parser=None): type=str, help="Output data location (directory)", ) + group_o.add_argument( + "--output_header", + required=True, + type=str, + 
help="Output header location (directory)", + ) return parser @@ -98,6 +106,7 @@ def run(self, args, logger, metrics_logger, unknown_args): os.makedirs(args.output_train, exist_ok=True) os.makedirs(args.output_test, exist_ok=True) os.makedirs(args.output_inference, exist_ok=True) + os.makedirs(args.output_header, exist_ok=True) metrics_logger.log_parameters( type=args.type, @@ -131,6 +140,19 @@ def run(self, args, logger, metrics_logger, unknown_args): n_informative=args.n_informative, random_state=args.random_state, ) + elif args.type == "lambdarank": + X, y = make_regression( + n_samples=total_samples, + n_features=args.n_features, + n_informative=args.n_informative, + random_state=args.random_state, + ) + # add query column + query_col = [[i // args.docs_per_query] for i in range(total_samples)] + X = numpy.hstack((query_col, X)) + # create 30 ranking labels + y = ((y - min(y))/(max(y)-min(y))*30).astype(int) + else: raise NotImplementedError(f"--type {args.type} is not implemented.") @@ -150,30 +172,36 @@ def run(self, args, logger, metrics_logger, unknown_args): inference_data = X[args.train_samples + args.test_samples :] logger.info(f"Inference data shape: {inference_data.shape}") + # create a header + header = [f'Column_{i}' for i in range(train_data.shape[1])] + if args.delimiter == 'tab': + args.delimiter = "\t" # save as CSV logger.info(f"Saving data...") with metrics_logger.log_time_block("time_data_saving"): numpy.savetxt( os.path.join(args.output_train, "train.txt"), train_data, - delimiter=",", + delimiter=args.delimiter, newline="\n", fmt="%1.3f", ) numpy.savetxt( os.path.join(args.output_test, "test.txt"), test_data, - delimiter=",", + delimiter=args.delimiter, newline="\n", fmt="%1.3f", ) numpy.savetxt( os.path.join(args.output_inference, "inference.txt"), inference_data, - delimiter=",", + delimiter=args.delimiter, newline="\n", fmt="%1.3f", ) + with open(os.path.join(args.output_header, "header.txt"), 'w') as hf: + 
hf.writelines(args.delimiter.join(header)) def get_arg_parser(parser=None): diff --git a/src/scripts/data_processing/generate_data/spec.yaml b/src/scripts/data_processing/generate_data/spec.yaml index 3ca86b4f..8d7f7146 100644 --- a/src/scripts/data_processing/generate_data/spec.yaml +++ b/src/scripts/data_processing/generate_data/spec.yaml @@ -13,6 +13,7 @@ inputs: enum: - regression - classification + - lambdarank train_samples: type: Integer description: Number of training samples @@ -41,6 +42,15 @@ inputs: type: Integer description: random seed optional: true + delimiter: + type: String + description: delimiter + optional: true + default: "," + docs_per_query: + type: Integer + description: number of documents per query + optional: true verbose: type: Boolean default: False @@ -56,6 +66,8 @@ outputs: type: AnyDirectory output_inference: type: AnyDirectory + output_header: + type: AnyDirectory command: >- python generate.py @@ -67,11 +79,14 @@ command: >- --n_informative {inputs.n_informative} [--n_redundant {inputs.n_redundant}] [--random_state {inputs.random_state}] + [--docs_per_query {inputs.docs_per_query}] --output_train {outputs.output_train} --output_test {outputs.output_test} --output_inference {outputs.output_inference} + --output_header {outputs.output_header} --verbose {inputs.verbose} [--custom_properties {inputs.custom_properties}] + [--delimiter {inputs.delimiter}] environment: conda: diff --git a/src/scripts/training/lightgbm_python/default.dockerfile b/src/scripts/training/lightgbm_python/default.dockerfile index 080f2b56..16b19c8a 100644 --- a/src/scripts/training/lightgbm_python/default.dockerfile +++ b/src/scripts/training/lightgbm_python/default.dockerfile @@ -1,10 +1,29 @@ -FROM mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210615.v1 -LABEL lightgbmbenchmark.linux.cpu.mpi.pip.version="3.3.0/20211111.1" +FROM mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest +LABEL lightgbmbenchmark.linux.cpu.mpi.pip.version="3.3.1/20211210.1" + +# 
https://github.com/microsoft/lightgbm-transform/blob/main/docs/Installation-Guide.rst +# Install CMake, gcc, g++, boost. +RUN apt-get update && apt-get -y upgrade && DEBIAN_FRONTEND="noninteractive" apt-get install -y libboost-all-dev gcc g++ wget cmake git curl libtinfo5 + +# Install LLVM with RTTI feature. +WORKDIR /root +RUN wget https://github.com/llvm/llvm-project/archive/refs/tags/llvmorg-3.5.1.tar.gz && tar zxvf llvmorg-3.5.1.tar.gz +WORKDIR /root/llvm-project-llvmorg-3.5.1/llvm +RUN mkdir build && cd build && cmake -DLLVM_REQUIRES_RTTI=1 .. && make -j4 && make install + +# Install bond. +WORKDIR /root +RUN git clone --recursive https://github.com/microsoft/bond.git +RUN DEBIAN_FRONTEND="noninteractive" apt-get install -y clang zlib1g-dev +RUN curl -sSL https://get.haskellstack.org/ | sh +WORKDIR /root/bond +RUN mkdir build && cd build && cmake -DBOND_ENABLE_GRPC=FALSE .. && make -j4 && make install + # Those arguments will NOT be used by AzureML # they are here just to allow for lightgbm-benchmark build to actually check # dockerfiles in a PR against their actual branch -ARG lightgbm_version="3.3.0" +ARG lightgbm_version="3.3.1" ENV AZUREML_CONDA_ENVIRONMENT_PATH /azureml-envs/lightgbm @@ -26,12 +45,14 @@ RUN HOROVOD_WITH_TENSORFLOW=1 \ 'azureml-defaults==1.35.0' \ 'azureml-mlflow==1.35.0' \ 'azureml-telemetry==1.35.0' \ - 'mpi4py==3.1.1' + 'mpi4py==3.1.1' \ + 'omegaconf' # install lightgbm with mpi RUN pip install --upgrade pip setuptools wheel && \ pip install 'cmake==3.21.0' && \ - pip install lightgbm==${lightgbm_version} --install-option=--mpi + pip install lightgbm==${lightgbm_version} --install-option=--mpi &&\ + pip install lightgbm-transform # This is needed for mpi to locate libpython ENV LD_LIBRARY_PATH $AZUREML_CONDA_ENVIRONMENT_PATH/lib:$LD_LIBRARY_PATH diff --git a/src/scripts/training/lightgbm_python/spec.yaml b/src/scripts/training/lightgbm_python/spec.yaml index 61b94f65..ebcbe0e8 100644 --- a/src/scripts/training/lightgbm_python/spec.yaml 
+++ b/src/scripts/training/lightgbm_python/spec.yaml @@ -16,6 +16,10 @@ inputs: type: AnyDirectory description: directory to the testing data optional: false + parser_config_file: + type: AnyDirectory + description: directory to the transform parser config + optional: true construct: type: Boolean description: "Use lazy intialization during data loading phase, see https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Dataset.html#lightgbm.Dataset.construct" @@ -144,6 +148,7 @@ launcher: python train.py --train {inputs.train} --test {inputs.test} + [--parser_config_file {inputs.parser_config_file}] --construct {inputs.construct} --header {inputs.header} [--label_column {inputs.label_column}] diff --git a/src/scripts/training/lightgbm_python/train.py b/src/scripts/training/lightgbm_python/train.py index d6b96fef..73828d6e 100644 --- a/src/scripts/training/lightgbm_python/train.py +++ b/src/scripts/training/lightgbm_python/train.py @@ -62,6 +62,7 @@ def get_arg_parser(cls, parser=None): group_i.add_argument("--header", required=False, default=False, type=strtobool) group_i.add_argument("--label_column", required=False, default="0", type=str) group_i.add_argument("--group_column", required=False, default=None, type=str) + group_i.add_argument("--parser_config_file", required=False, default=None, type=str, help="transform parser config location (file path)") group_o = parser.add_argument_group("Outputs") group_o.add_argument("--export_model",