Makefile: move the DeepSpeed GPU environments from DeepSpeed 0.8.3 to 0.9.2
and from the PyTorch 1.10.2 pin to torch 1.12.0+cu113. The torchvision and
torchaudio pins are no longer part of TORCH_PIP_DEEPSPEED_GPU.
NOTE(review): leading whitespace of the recipe context lines in this hunk was
reconstructed (assumed two tabs) - confirm against the target Makefile.

diff --git a/Makefile b/Makefile
index a2037123..b0b820ad 100644
--- a/Makefile
+++ b/Makefile
@@ -206,10 +206,10 @@ build-pytorch10-tf27-rocm50:
 		-t $(DOCKERHUB_REGISTRY)/$(ROCM50_TORCH_TF_ENVIRONMENT_NAME)-$(VERSION) \
 		.
 
-DEEPSPEED_VERSION := 0.8.3
-export GPU_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.10-deepspeed-$(DEEPSPEED_VERSION)$(GPU_SUFFIX)
-export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.10-gpt-neox-deepspeed$(GPU_SUFFIX)
-export TORCH_PIP_DEEPSPEED_GPU := torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio==0.10.2+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
+DEEPSPEED_VERSION := 0.9.2
+export GPU_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.12-deepspeed-$(DEEPSPEED_VERSION)$(GPU_SUFFIX)
+export GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME := $(CUDA_113_PREFIX)pytorch-1.12-gpt-neox-deepspeed$(GPU_SUFFIX)
+export TORCH_PIP_DEEPSPEED_GPU := torch==1.12.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
 export TORCH_TB_PROFILER_PIP := torch-tb-profiler==0.4.1
 
 # This builds deepspeed environment off of upstream microsoft/DeepSpeed.
@@ -239,8 +239,8 @@ build-gpt-neox-deepspeed-gpu: build-gpu-cuda-113-base
 		--build-arg TORCH_TB_PROFILER_PIP="$(TORCH_TB_PROFILER_PIP)" \
 		--build-arg TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0" \
 		--build-arg APEX_GIT="https://github.com/determined-ai/apex.git@3caf0f40c92e92b40051d3afff8568a24b8be28d" \
-		--build-arg "$(NCCL_BUILD_ARG)" \
-		--build-arg DEEPSPEED_PIP="git+https://github.com/determined-ai/deepspeed.git@eleuther_dai" \
+		--build-arg DET_BUILD_NCCL="" \
+		--build-arg DEEPSPEED_PIP="git+https://github.com/determined-ai/deepspeed.git@determined2#egg=deepspeed" \
 		-t $(DOCKERHUB_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
 		-t $(DOCKERHUB_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME)-$(VERSION) \
 		-t $(NGC_REGISTRY)/$(GPU_GPT_NEOX_DEEPSPEED_ENVIRONMENT_NAME)-$(SHORT_GIT_HASH) \
diff --git a/dockerfile_scripts/install_deepspeed.sh b/dockerfile_scripts/install_deepspeed.sh
index 623cd884..cdf63bda 100755
--- a/dockerfile_scripts/install_deepspeed.sh
+++ b/dockerfile_scripts/install_deepspeed.sh
@@ -3,7 +3,30 @@ set -e
 
 DEBIAN_FRONTEND=noninteractive apt-get install -y pdsh libaio-dev
 
-# Triton is needed to build deepspeed's sparse_attn operation.
-python -m pip install triton==1.0.0
-DS_BUILD_OPS=1 python -m pip install $DEEPSPEED_PIP --no-binary deepspeed
+# Not building sparse attn operation which depends on a very old version of triton
+DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 python -m pip install $DEEPSPEED_PIP --no-binary deepspeed
 python -m deepspeed.env_report
+
+if [[ "$DEEPSPEED_PIP" == *"determined2"* ]]; then
+    # Build gpt-neox and its dependencies only when DEEPSPEED_PIP points at the gpt-neox (determined2) branch of deepspeed.
+    # Triton is needed for flash attn
+    python -m pip install triton==2.0.0.dev20221202
+    # mpich is a dependency of gpt-neox; install noninteractively like the apt-get call at the top of this script.
+    DEBIAN_FRONTEND=noninteractive apt-get install -y mpich
+    # Need this to avoid `AttributeError: module 'distutils' has no attribute 'version'` when importing tensorboard. See https://github.com/pytorch/pytorch/issues/69894.
+    python -m pip install setuptools==59.5.0
+    # Install gpt-neox and dependencies. Clone into /gpt-neox explicitly so the absolute paths below do not depend on the working directory.
+    git clone -b determined2 https://github.com/determined-ai/gpt-neox.git /gpt-neox
+    python /gpt-neox/megatron/fused_kernels/setup.py install
+
+    # Exclude DeeperSpeed reinstall since the version in requirements is not pinned.
+    python -m pip install $(grep -ivE "DeeperSpeed" /gpt-neox/requirements/requirements.txt)
+    python -m pip install -r /gpt-neox/requirements/requirements-flashattention.txt
+
+    # Download sample data
+    gsutil cp -r gs://determined-ai-public-datasets/text_data /gpt-neox && mv /gpt-neox/text_data /gpt-neox/data
+
+    # Modify permissions to enable example to run in nonroot mode; re-add the sticky bit on /tmp, which mode 777 clears.
+    chmod -R 777 /gpt-neox
+    chmod -R 777 /tmp && chmod +t /tmp
+fi