diff --git a/requirements.txt b/requirements.txt index fc710cc0..d2534489 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ psutil==5.8.0 # frameworks ray==1.9.2 lightgbm-ray==0.1.2 -lightgbm==3.3.0 +lightgbm==3.3.1 treelite==2.1.0 treelite_runtime==2.1.0 flaml==0.9.6 diff --git a/src/scripts/data_processing/generate_data/generate.py b/src/scripts/data_processing/generate_data/generate.py index 50af07ee..ba5445d7 100644 --- a/src/scripts/data_processing/generate_data/generate.py +++ b/src/scripts/data_processing/generate_data/generate.py @@ -252,6 +252,7 @@ def run(self, args, logger, metrics_logger, unknown_args): os.makedirs(args.output_train, exist_ok=True) os.makedirs(args.output_test, exist_ok=True) os.makedirs(args.output_inference, exist_ok=True) + os.makedirs(args.external_header, exist_ok=True) # transform delimiter diff --git a/src/scripts/data_processing/generate_data/spec.yaml b/src/scripts/data_processing/generate_data/spec.yaml index 33433b10..339a8a8c 100644 --- a/src/scripts/data_processing/generate_data/spec.yaml +++ b/src/scripts/data_processing/generate_data/spec.yaml @@ -111,7 +111,7 @@ outputs: type: AnyDirectory external_header: type: AnyDirectory - + command: >- python generate.py --type {inputs.learning_task} @@ -135,7 +135,7 @@ command: >- [--custom_properties {inputs.custom_properties}] [--docs_per_query {inputs.docs_per_query}] [--n_label_classes {inputs.n_label_classes}] - + environment: conda: # conda file path is resolved after additional includes diff --git a/src/scripts/training/lightgbm_python/default.dockerfile b/src/scripts/training/lightgbm_python/default.dockerfile index 1a1ae441..9bf0fcd5 100644 --- a/src/scripts/training/lightgbm_python/default.dockerfile +++ b/src/scripts/training/lightgbm_python/default.dockerfile @@ -1,10 +1,29 @@ FROM mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest -LABEL lightgbmbenchmark.linux.cpu.mpi.pip.version="3.3.0/20210118.1" +LABEL 
lightgbmbenchmark.linux.cpu.mpi.pip.version="3.3.1/20211210.1" + +# https://github.com/microsoft/lightgbm-transform/blob/main/docs/Installation-Guide.rst +# Install CMake, gcc, g++, boost. +RUN apt-get update && apt-get -y upgrade && DEBIAN_FRONTEND="noninteractive" apt-get install -y libboost-all-dev gcc g++ wget cmake git curl libtinfo5 + +# Install LLVM with RTTI feature. +WORKDIR /root +RUN wget https://github.com/llvm/llvm-project/archive/refs/tags/llvmorg-3.5.1.tar.gz && tar zxvf llvmorg-3.5.1.tar.gz +WORKDIR /root/llvm-project-llvmorg-3.5.1/llvm +RUN mkdir build && cd build && cmake -DLLVM_REQUIRES_RTTI=1 .. && make -j4 && make install + +# Install bond. +WORKDIR /root +RUN git clone --recursive https://github.com/microsoft/bond.git +RUN DEBIAN_FRONTEND="noninteractive" apt-get install -y clang zlib1g-dev +RUN curl -sSL https://get.haskellstack.org/ | sh +WORKDIR /root/bond +RUN mkdir build && cd build && cmake -DBOND_ENABLE_GRPC=FALSE .. && make -j4 && make install + # Those arguments will NOT be used by AzureML # they are here just to allow for lightgbm-benchmark build to actually check # dockerfiles in a PR against their actual branch -ARG lightgbm_version="3.3.0" +ARG lightgbm_version="3.3.1" ENV AZUREML_CONDA_ENVIRONMENT_PATH /azureml-envs/lightgbm @@ -27,7 +46,13 @@ RUN HOROVOD_WITH_TENSORFLOW=1 \ 'azureml-mlflow==1.35.0' \ 'azureml-telemetry==1.35.0' \ 'mpi4py==3.1.1' \ - lightgbm==${lightgbm_version} + 'omegaconf' + +# install lightgbm with mpi +RUN pip install --upgrade pip setuptools wheel && \ + pip install 'cmake==3.21.0' && \ + pip install lightgbm==${lightgbm_version} --install-option=--mpi &&\ + pip install lightgbm-transform==${lightgbm_version} # This is needed for mpi to locate libpython ENV LD_LIBRARY_PATH $AZUREML_CONDA_ENVIRONMENT_PATH/lib:$LD_LIBRARY_PATH diff --git a/src/scripts/training/lightgbm_python/spec.yaml b/src/scripts/training/lightgbm_python/spec.yaml index f8805df3..c3ee88fc 100644 --- 
a/src/scripts/training/lightgbm_python/spec.yaml +++ b/src/scripts/training/lightgbm_python/spec.yaml @@ -30,6 +30,10 @@ inputs: type: AnyDirectory description: Directory to the testing data optional: false + parser_config_file: + type: AnyDirectory + description: directory to the transform parser config + optional: true construct: type: Boolean description: "Use lazy intialization during data loading phase (both train and test datasets)" @@ -167,6 +171,7 @@ launcher: python train.py --train {inputs.train} --test {inputs.test} + [--parser_config_file {inputs.parser_config_file}] --construct {inputs.construct} --header {inputs.header} --label_column {inputs.label_column} diff --git a/src/scripts/training/lightgbm_python/train.py b/src/scripts/training/lightgbm_python/train.py index c2e3de2a..fbd6e31c 100644 --- a/src/scripts/training/lightgbm_python/train.py +++ b/src/scripts/training/lightgbm_python/train.py @@ -63,6 +63,7 @@ def get_arg_parser(cls, parser=None): group_i.add_argument("--header", required=False, default=False, type=strtobool) group_i.add_argument("--label_column", required=False, default="0", type=str) group_i.add_argument("--group_column", required=False, default=None, type=str) + group_i.add_argument("--parser_config_file", required=False, default=None, type=str, help="transform parser config location (file path)") group_o = parser.add_argument_group(f"Outputs [{__name__}:{cls.__name__}]") group_o.add_argument("--export_model",