diff --git a/examples/deepspeed/gpt_neox/README.md b/examples/deepspeed/gpt_neox/README.md index dcaaf4ea6fa..39abbd298c0 100644 --- a/examples/deepspeed/gpt_neox/README.md +++ b/examples/deepspeed/gpt_neox/README.md @@ -39,7 +39,8 @@ mounted at `/run/determined/workdir/shared_fs`. This is done by default for clu Once a cluster is available, run the following command: ``` -det experiment create zero1.yaml . +det experiment create zero1.yaml . # For full training +det experiment create zero1_tiny.yaml . # For quick tests ``` **Note:** You will need to run on GPUs that support fp16 training. diff --git a/examples/deepspeed/gpt_neox/zero1_tiny.yaml b/examples/deepspeed/gpt_neox/zero1_tiny.yaml new file mode 100644 index 00000000000..ddd8d6e33a9 --- /dev/null +++ b/examples/deepspeed/gpt_neox/zero1_tiny.yaml @@ -0,0 +1,47 @@ +name: gpt-neox-zero1-2-7B +debug: false +profiling: + enabled: false + begin_on_batch: 50 + end_after_batch: 100 + sync_timings: false +hyperparameters: + search_world_size: false + conf_dir: /gpt-neox/configs + conf_file: + - 2-7B.yml + - determined_cluster.yml + overwrite_values: + pipe_parallel_size: 2 + model_parallel_size: 2 + train_batch_size: 64 + train_micro_batch_size_per_gpu: 2 + wandb_group: null + wandb_team: null + user_script: null + eval_tasks: null +environment: + environment_variables: + - NCCL_DEBUG=INFO + # You may need to modify this to match your network configuration. + - NCCL_SOCKET_IFNAME=ens,eth,ib + force_pull_image: true + image: + gpu: determinedai/gpt-neox:4850e79 +resources: + slots_per_trial: 16 +searcher: + name: single + metric: lm_loss + smaller_is_better: false + max_length: + batches: 100 +min_validation_period: + batches: 5000 +max_restarts: 0 +entrypoint: + - python3 + - -m + - determined.launch.deepspeed + - --trial + - gpt2_trial:GPT2Trial