From 1cdf8bc883392a2b059bc41ab36bf4a66644e1be Mon Sep 17 00:00:00 2001 From: Daran He Date: Thu, 27 Apr 2023 17:27:42 -0700 Subject: [PATCH] feat: Create tiny config for GPT-NeoX for quick tests The main zero1.yaml uses a config that exceeds the allocation limit of existing AWS machines. Here we create a tiny yaml to allow quick testing. Test: ``` det experiment create zero1_tiny.yaml . ``` Job: http://ec2-44-213-33-242.compute-1.amazonaws.com:8080/det/experiments/10/overview [ghstack-poisoned] --- examples/deepspeed/gpt_neox/README.md | 3 +- examples/deepspeed/gpt_neox/zero1_tiny.yaml | 47 +++++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 examples/deepspeed/gpt_neox/zero1_tiny.yaml diff --git a/examples/deepspeed/gpt_neox/README.md b/examples/deepspeed/gpt_neox/README.md index dcaaf4ea6fa..39abbd298c0 100644 --- a/examples/deepspeed/gpt_neox/README.md +++ b/examples/deepspeed/gpt_neox/README.md @@ -39,7 +39,8 @@ mounted at `/run/determined/workdir/shared_fs`. This is done by default for clu Once a cluster is available, run the following command: ``` -det experiment create zero1.yaml . +det experiment create zero1.yaml . # For full training +det experiment create zero1_tiny.yaml . # For quick tests ``` **Note:** You will need to run on GPUs that support fp16 training. 
diff --git a/examples/deepspeed/gpt_neox/zero1_tiny.yaml b/examples/deepspeed/gpt_neox/zero1_tiny.yaml
new file mode 100644
index 00000000000..ddd8d6e33a9
--- /dev/null
+++ b/examples/deepspeed/gpt_neox/zero1_tiny.yaml
@@ -0,0 +1,47 @@
+name: gpt-neox-zero1-2-7B-tiny  # "-tiny" suffix marks the quick-test config
+debug: false
+profiling:
+  enabled: false
+  begin_on_batch: 50
+  end_after_batch: 100
+  sync_timings: false
+hyperparameters:
+  search_world_size: false
+  conf_dir: /gpt-neox/configs
+  conf_file:
+    - 2-7B.yml
+    - determined_cluster.yml
+  overwrite_values:  # NOTE(review): assumed to hold only the four keys below
+    pipe_parallel_size: 2
+    model_parallel_size: 2
+    train_batch_size: 64
+    train_micro_batch_size_per_gpu: 2
+  wandb_group: null
+  wandb_team: null
+  user_script: null
+  eval_tasks: null
+environment:
+  environment_variables:
+    - NCCL_DEBUG=INFO
+    # You may need to modify this to match your network configuration.
+    - NCCL_SOCKET_IFNAME=ens,eth,ib
+  force_pull_image: true
+  image:
+    gpu: determinedai/gpt-neox:4850e79
+resources:
+  slots_per_trial: 16
+searcher:
+  name: single
+  metric: lm_loss
+  smaller_is_better: true  # lm_loss is a loss, so lower is better
+  max_length:
+    batches: 100  # short run for quick testing
+min_validation_period:
+  batches: 5000  # exceeds max_length; no mid-run validation expected
+max_restarts: 0
+entrypoint:
+  - python3
+  - -m
+  - determined.launch.deepspeed
+  - --trial
+  - gpt2_trial:GPT2Trial