From 1cdf8bc883392a2b059bc41ab36bf4a66644e1be Mon Sep 17 00:00:00 2001
From: Daran He <daran.he@hpe.com>
Date: Thu, 27 Apr 2023 17:27:42 -0700
Subject: [PATCH] feat: Create tiny config for GPT-NeoX for quick tests

The main zero1.yaml uses a config that exceeds the allocation limit of existing AWS machines. Here we create a tiny yaml to allow quick testing.

Test:
```
det experiment create zero1_tiny.yaml .
```

Job: http://ec2-44-213-33-242.compute-1.amazonaws.com:8080/det/experiments/10/overview

[ghstack-poisoned]
---
 examples/deepspeed/gpt_neox/README.md       |  3 +-
 examples/deepspeed/gpt_neox/zero1_tiny.yaml | 47 +++++++++++++++++++++
 2 files changed, 49 insertions(+), 1 deletion(-)
 create mode 100644 examples/deepspeed/gpt_neox/zero1_tiny.yaml

diff --git a/examples/deepspeed/gpt_neox/README.md b/examples/deepspeed/gpt_neox/README.md
index dcaaf4ea6fa..39abbd298c0 100644
--- a/examples/deepspeed/gpt_neox/README.md
+++ b/examples/deepspeed/gpt_neox/README.md
@@ -39,7 +39,8 @@ mounted at `/run/determined/workdir/shared_fs`.  This is done by default for clu
 
 Once a cluster is available, run the following command: 
 ```
-det experiment create zero1.yaml .
+det experiment create zero1.yaml . # For full training
+det experiment create zero1_tiny.yaml . # For quick tests
 ```
 
 **Note:** You will need to run on GPUs that support fp16 training. 
diff --git a/examples/deepspeed/gpt_neox/zero1_tiny.yaml b/examples/deepspeed/gpt_neox/zero1_tiny.yaml
new file mode 100644
index 00000000000..ddd8d6e33a9
--- /dev/null
+++ b/examples/deepspeed/gpt_neox/zero1_tiny.yaml
@@ -0,0 +1,47 @@
+name: gpt-neox-zero1-2-7B-tiny  # distinct from the full zero1.yaml run
+debug: false
+profiling:
+  enabled: false
+  begin_on_batch: 50
+  end_after_batch: 100
+  sync_timings: false
+hyperparameters:
+  search_world_size: false
+  conf_dir: /gpt-neox/configs
+  conf_file:
+    - 2-7B.yml
+    - determined_cluster.yml
+  overwrite_values:
+    pipe_parallel_size: 2
+    model_parallel_size: 2
+    train_batch_size: 64
+    train_micro_batch_size_per_gpu: 2
+  wandb_group: null
+  wandb_team: null
+  user_script: null
+  eval_tasks: null
+environment:
+  environment_variables:
+    - NCCL_DEBUG=INFO
+    # You may need to modify this to match your network configuration.
+    - NCCL_SOCKET_IFNAME=ens,eth,ib
+  force_pull_image: true
+  image:
+    gpu: determinedai/gpt-neox:4850e79
+resources:
+  slots_per_trial: 16
+searcher:
+  name: single
+  metric: lm_loss
+  smaller_is_better: true  # lm_loss is a loss, so lower is better
+  max_length:
+    batches: 100
+min_validation_period:
+  batches: 5000  # NOTE(review): > max_length (100) — confirm intended
+max_restarts: 0
+entrypoint:
+  - python3
+  - -m
+  - determined.launch.deepspeed
+  - --trial
+  - gpt2_trial:GPT2Trial