Commit 89fbf59

feat: Create tiny config for GPT-NeoX for quick tests
The main zero1.yaml uses a config that exceeds the allocation limit of existing AWS machines. This commit adds a tiny YAML config to allow quick testing.

Test:
```
det experiment create zero1_tiny.yaml .
```

Job: http://ec2-44-213-33-242.compute-1.amazonaws.com:8080/det/experiments/10/overview
ghstack-source-id: 2b558da
Pull Request resolved: #6669
1 parent 4b9971e commit 89fbf59
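The tiny config works by overriding values from the base GPT-NeoX config (see `overwrite_values` in the new YAML below). A minimal sketch of how such an override merge behaves — illustrative only; `merge_overrides` is a hypothetical helper, not part of GPT-NeoX or Determined, and the actual merging happens inside the launcher:

```python
# Illustrative sketch: apply overwrite_values on top of a base config dict.
# merge_overrides is a hypothetical helper, not a real GPT-NeoX/Determined API.
def merge_overrides(base: dict, overrides: dict) -> dict:
    merged = dict(base)       # shallow copy of the base config
    merged.update(overrides)  # override values win on key conflicts
    return merged

# Hypothetical base values standing in for 2-7B.yml; only the overrides
# below are taken from the actual commit.
base = {"train_batch_size": 512, "pipe_parallel_size": 4, "seq_length": 2048}
overrides = {
    "pipe_parallel_size": 2,
    "model_parallel_size": 2,
    "train_batch_size": 64,
    "train_micro_batch_size_per_gpu": 2,
}
cfg = merge_overrides(base, overrides)
```

Keys not mentioned in `overwrite_values` (like `seq_length` here) fall through from the base config unchanged.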

2 files changed: 49 additions & 1 deletion

File tree

examples/deepspeed/gpt_neox/README.md

Lines changed: 2 additions & 1 deletion

@@ -39,7 +39,8 @@ mounted at `/run/determined/workdir/shared_fs`. This is done by default for clu

 Once a cluster is available, run the following command:
 ```
-det experiment create zero1.yaml .
+det experiment create zero1.yaml . # For full training
+det experiment create zero1_tiny.yaml . # For quick tests
 ```

 **Note:** You will need to run on GPUs that support fp16 training.
examples/deepspeed/gpt_neox/zero1_tiny.yaml

Lines changed: 47 additions & 0 deletions

@@ -0,0 +1,47 @@
+name: gpt-neox-zero1-2-7B
+debug: false
+profiling:
+  enabled: false
+  begin_on_batch: 50
+  end_after_batch: 100
+  sync_timings: false
+hyperparameters:
+  search_world_size: false
+  conf_dir: /gpt-neox/configs
+  conf_file:
+    - 2-7B.yml
+    - determined_cluster.yml
+  overwrite_values:
+    pipe_parallel_size: 2
+    model_parallel_size: 2
+    train_batch_size: 64
+    train_micro_batch_size_per_gpu: 2
+  wandb_group: null
+  wandb_team: null
+  user_script: null
+  eval_tasks: null
+environment:
+  environment_variables:
+    - NCCL_DEBUG=INFO
+    # You may need to modify this to match your network configuration.
+    - NCCL_SOCKET_IFNAME=ens,eth,ib
+  force_pull_image: true
+  image:
+    gpu: determinedai/gpt-neox:4850e79
+resources:
+  slots_per_trial: 16
+searcher:
+  name: single
+  metric: lm_loss
+  smaller_is_better: false
+  max_length:
+    batches: 100
+min_validation_period:
+  batches: 5000
+max_restarts: 0
+entrypoint:
+  - python3
+  - -m
+  - determined.launch.deepspeed
+  - --trial
+  - gpt2_trial:GPT2Trial
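The parallelism settings in the new config imply a gradient-accumulation schedule, since DeepSpeed requires `train_batch_size = train_micro_batch_size_per_gpu * gradient_accumulation_steps * data_parallel_size`. A quick sanity check of that arithmetic, assuming the standard DeepSpeed batch-size relation:

```python
# Sanity-check the batch-size arithmetic implied by zero1_tiny.yaml.
# Assumed relation (DeepSpeed): train_batch_size =
#   train_micro_batch_size_per_gpu * gradient_accumulation_steps * data_parallel_size
slots_per_trial = 16
pipe_parallel_size = 2
model_parallel_size = 2
train_batch_size = 64
train_micro_batch_size_per_gpu = 2

# GPUs not consumed by pipeline/model parallelism form the data-parallel group.
data_parallel_size = slots_per_trial // (pipe_parallel_size * model_parallel_size)

# Accumulation steps needed to reach the global batch size.
grad_accum_steps = train_batch_size // (
    train_micro_batch_size_per_gpu * data_parallel_size
)

print(data_parallel_size, grad_accum_steps)  # 4 8
```

With 16 slots split 2-way pipeline × 2-way model parallel, 4 data-parallel replicas remain, so each replica accumulates 8 micro-batches of 2 to reach the global batch of 64.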