diff --git a/Classification/resnet50/0_dist_ssh_key/README.md b/Classification/resnet50/0_dist_ssh_key/README.md new file mode 100644 index 0000000..856df74 --- /dev/null +++ b/Classification/resnet50/0_dist_ssh_key/README.md @@ -0,0 +1,83 @@ +# 使用 Ansible 将 SSH 公钥分发到多个目标主机 + +## 1. 创建变量文件并加密 + +创建一个包含密码的变量文件vars.yml: + +```yaml +all: + hosts: + 192.168.1.27: + ansible_user: myuser + ansible_password: mypassword + 192.168.1.28: + ansible_user: myuser + ansible_password: mypassword +``` + +然后使用Ansible Vault加密这个文件: + +```bash +ansible-vault encrypt vars.yml +``` + +注意: + +1. 执行 `ansible-vault` 的过程中需要设定一个密码,请记住或保存好这个密码 +2. `vars.yml`将被替换为加密后的文件 + +## 2. 创建主机清单文件 + +创建一个主机清单文件`inventory.ini`: + +```ini +[all] +node1 ansible_host=192.168.1.27 ansible_user=myuser +node2 ansible_host=192.168.1.28 ansible_user=myuser +``` + +注:需要根据情况修改 `ansible_user` 的值 + +## 3. 创建Playbook + +如果文件存在,这一步可以忽略。 + +创建一个Playbook distribute_ssh_key.yml: + +```yaml +--- +- name: Distribute SSH key + hosts: all + vars_files: + - vars.yml + tasks: + - name: Create .ssh directory if it doesn't exist + file: + path: /home/{{ ansible_user }}/.ssh + state: directory + mode: '0700' + owner: "{{ ansible_user }}" + group: "{{ ansible_user }}" + + - name: Copy the SSH key to the authorized_keys file + authorized_key: + user: "{{ ansible_user }}" + state: present + key: "{{ lookup('file', '/path/to/id_rsa.pub') }}" +``` + +注:`vars_files` 配置为 `vars.yml` + +## 4. 
运行Playbook + +使用以下命令运行Playbook,并解密变量文件: + +```bash +ansible-playbook -i inventory.ini distribute_ssh_key.yml --ask-vault-pass +``` +或者运行 + +```bash +./dist_ssh_key.sh +``` + diff --git a/Classification/resnet50/0_dist_ssh_key/dist_ssh_key.sh b/Classification/resnet50/0_dist_ssh_key/dist_ssh_key.sh new file mode 100755 index 0000000..7cb0f1d --- /dev/null +++ b/Classification/resnet50/0_dist_ssh_key/dist_ssh_key.sh @@ -0,0 +1 @@ +ansible-playbook -i inventory.ini distribute_ssh_key.yml --ask-vault-pass diff --git a/Classification/resnet50/0_dist_ssh_key/distribute_ssh_key.yml b/Classification/resnet50/0_dist_ssh_key/distribute_ssh_key.yml new file mode 100644 index 0000000..95147c2 --- /dev/null +++ b/Classification/resnet50/0_dist_ssh_key/distribute_ssh_key.yml @@ -0,0 +1,19 @@ +--- +- name: Distribute SSH key + hosts: all + vars_files: + - vars.yml + tasks: + - name: Create .ssh directory if it doesn't exist + file: + path: /home/{{ ansible_user }}/.ssh + state: directory + mode: '0700' + owner: "{{ ansible_user }}" + group: "{{ ansible_user }}" + + - name: Copy the SSH key to the authorized_keys file + authorized_key: + user: "{{ ansible_user }}" + state: present + key: "{{ lookup('file', '/home/xiexuan/.ssh/id_rsa.pub') }}" diff --git a/Classification/resnet50/0_dist_ssh_key/inventory.ini b/Classification/resnet50/0_dist_ssh_key/inventory.ini new file mode 100644 index 0000000..894b65d --- /dev/null +++ b/Classification/resnet50/0_dist_ssh_key/inventory.ini @@ -0,0 +1,3 @@ +[all] +of27 ansible_host=192.168.1.27 ansible_user=xiexuan +of28 ansible_host=192.168.1.28 ansible_user=xiexuan diff --git a/Classification/resnet50/0_dist_ssh_key/vars.yml b/Classification/resnet50/0_dist_ssh_key/vars.yml new file mode 100644 index 0000000..49c7dbf --- /dev/null +++ b/Classification/resnet50/0_dist_ssh_key/vars.yml @@ -0,0 +1,8 @@ +all: + hosts: + 192.168.1.27: + ansible_user: myuser + ansible_password: mypassword + 192.168.1.28: + ansible_user: myuser + ansible_password: 
mypassword diff --git a/Classification/resnet50/1_get_docker_image/README.md b/Classification/resnet50/1_get_docker_image/README.md new file mode 100644 index 0000000..3f6282a --- /dev/null +++ b/Classification/resnet50/1_get_docker_image/README.md @@ -0,0 +1,61 @@ +# 拉取或导入镜像 + +## 拉取镜像 + +适用于直接从 dockerhub 拉取镜像。 + +用法: `./pull.sh [镜像标签]` + +参数说明: + +- 镜像标签 (可选) : 要拉取的Docker镜像标签,例如 alpine:latest。如果未提供,则使用playbook中的默认值。 + +示例: + +- 默认使用: + +```bash +./pull.sh +``` + +- 指定镜像标签: + + ```bash +./pull.sh alpine:latest + ``` + +## 导入镜像 + +适用于本地共享目录有已经保存镜像的tar文件,使用 `docker load` 导入。 + +用法: `./load.sh [镜像文件路径] [镜像标签] [强制导入]` + +参数说明: + +- 镜像文件路径 (可选) : 要导入的Docker镜像tar文件路径,默认为 `/share_nfs/k85/oneflow.0.9.1.dev20240203-cuda11.8.tar` +- 镜像标签 (可选) : 导入后设置的Docker镜像标签,默认为 `oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8` +- 强制导入 (可选) : 是否强制导入镜像(true 或 false),默认为 false + +示例: + +- 默认使用: + + ```bash + ./load.sh + ``` + +- 指定镜像文件路径和标签: + +```bash +./load.sh /path/to/shared/abc.tar myrepo/myimage:latest +``` + +- 强制导入镜像: + +```bash +./load.sh /path/to/shared/abc.tar myrepo/myimage:latest true +``` + + + + diff --git a/Classification/resnet50/1_get_docker_image/load.sh b/Classification/resnet50/1_get_docker_image/load.sh new file mode 100755 index 0000000..5df8bcd --- /dev/null +++ b/Classification/resnet50/1_get_docker_image/load.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +if [ -n "$1" ]; then + docker_image_path=$1 +else + docker_image_path="/share_nfs/k85/oneflow.0.9.1.dev20240203-cuda11.8.tar" +fi + +if [ -n "$2" ]; then + docker_image_tag=$2 +else + docker_image_tag="oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8" +fi + +if [ -n "$3" ]; then + force_load=$3 +else + force_load=false +fi + +ansible-playbook \ + -i ../inventory.ini \ + load_and_tag_docker_image.yml \ + -e "docker_image_path=$docker_image_path" \ + -e "docker_image_tag=$docker_image_tag" \ + -e "force_load=$force_load" diff --git a/Classification/resnet50/1_get_docker_image/load_and_tag_docker_image.yml 
b/Classification/resnet50/1_get_docker_image/load_and_tag_docker_image.yml new file mode 100644 index 0000000..5a2f92d --- /dev/null +++ b/Classification/resnet50/1_get_docker_image/load_and_tag_docker_image.yml @@ -0,0 +1,28 @@ +--- +- name: Load and tag Docker image + hosts: all + vars: + docker_image_path: "/share_nfs/k85/oneflow.0.9.1.dev20240203-cuda11.8.tar" + docker_image_tag: "oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8" + force_load: false + + tasks: + - name: Check if Docker image with the specified tag already exists + command: "docker images -q {{ docker_image_tag }}" + register: image_id + changed_when: false + when: not force_load + + - name: Load Docker image from tar file + command: "docker load -i {{ docker_image_path }}" + when: force_load or image_id.stdout == "" + register: load_output + + - name: Get image ID from load output + set_fact: + loaded_image_id: "{{ load_output.stdout_lines[-1] | regex_search('sha256:[0-9a-f]+') }}" + when: force_load or image_id.stdout == "" + + - name: Tag the loaded Docker image + command: "docker tag {{ loaded_image_id }} {{ docker_image_tag }}" + when: force_load or image_id.stdout == "" diff --git a/Classification/resnet50/1_get_docker_image/pull.sh b/Classification/resnet50/1_get_docker_image/pull.sh new file mode 100755 index 0000000..8787fea --- /dev/null +++ b/Classification/resnet50/1_get_docker_image/pull.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +if [ -n "$1" ]; then + ansible-playbook -i ../inventory.ini pull_docker_image.yml -e "docker_image=$1" +else + ansible-playbook -i ../inventory.ini pull_docker_image.yml +fi diff --git a/Classification/resnet50/1_get_docker_image/pull_docker_image.yml b/Classification/resnet50/1_get_docker_image/pull_docker_image.yml new file mode 100644 index 0000000..d086e6a --- /dev/null +++ b/Classification/resnet50/1_get_docker_image/pull_docker_image.yml @@ -0,0 +1,17 @@ +--- +- name: Pull specified Docker image + hosts: all + vars: + docker_image: 
"oneflowinc/oneflow:0.9.1.dev20240203-cuda11.8" + + tasks: + - name: Check if the Docker image is already present + command: "docker images -q {{ docker_image }}" + register: docker_image_id + changed_when: false + + - name: Pull Docker image if not present + docker_image: + name: "{{ docker_image }}" + source: pull + when: docker_image_id.stdout == "" diff --git a/Classification/resnet50/Readme.md b/Classification/resnet50/Readme.md new file mode 100644 index 0000000..b19a080 --- /dev/null +++ b/Classification/resnet50/Readme.md @@ -0,0 +1,225 @@ +# 千卡 0.85 + +[toc] + +## 文件目录结构 +``` +├── ansible_workspace # 主节点上的工作目录 +│   ├── inventory.ini # 用来配置节点信息 +│   ├── set_docker.sh # 在各节点上创建docker,并且配置好docker内环境 +│   ├── profile.sh # 根据节点数启动profile +│   ├── train.sh # 根据节点数启动训练 +│   └── update_tools.sh # 将主节点的tools文件夹复制到各个子节点 +├── tools # 在各个节点使用的文件 +│ ├── args_train_ddp_graph_resnet50.sh # 接受模型训练参数并启动训练 +│ ├── models.tar.gz # 模型,为防止git网络问题,建议先下载放在共享目录下 +│ ├── extract.py # 提取log中train阶段的throughput的平均值 +│ ├── prepare_docker.sh # 用于配置docker内环境 +│ ├── profile.sh # 根据节点数在本机启动profile +│ └── train.sh # 根据节点数在本机启动训练 +└── Readme.md +``` + +需求:有NVLink,以及 shared_nfs + +以下供参考 + +## 第一步: 配置环境 + +### 1.1 所有节点配置SSH Key,并设置authorized_keys + +(怎么自动化) + +需要一个共享的存储空间,如:`/shared_nfs/k85`,在一个文件夹下准备好 + +- authorized_keys : 在主节点运行 + + ```bash + #!/bin/bash + + # 设置 SSH 目录路径 + SSH_DIR="$HOME/.ssh" + + # 检查 SSH 目录是否存在,如果不存在则创建 + if [ ! -d "$SSH_DIR" ]; then + mkdir -p "$SSH_DIR" + echo "Created directory: $SSH_DIR" + fi + + # 设置密钥文件路径 + KEY_PATH="$SSH_DIR/id_rsa" + + # 生成 SSH 密钥对 + ssh-keygen -t rsa -b 2048 -f "$KEY_PATH" -N "" -q + + # 创建 authorized_keys 文件 + cat $SSH_DIR/id_rsa.pub > $SSH_DIR/authorized_keys + + # 将 authorized_keys 文件拷贝到共享目录 + cp $SSH_DIR/authorized_keys shared_nfs/k85 + ``` + +- 在子节点运行 + + ```bash + #!/bin/bash + + # 设置 SSH 目录路径 + SSH_DIR="$HOME/.ssh" + + # 检查 SSH 目录是否存在,如果不存在则创建 + if [ ! 
-d "$SSH_DIR" ]; then + mkdir -p "$SSH_DIR" + echo "Created directory: $SSH_DIR" + fi + + # 设置密钥文件路径 + KEY_PATH="$SSH_DIR/id_rsa" + + # 生成 SSH 密钥对 + ssh-keygen -t rsa -b 2048 -f "$KEY_PATH" -N "" -q + + # 将 authorized_keys 文件拷贝到 .ssh 目录 + cp /shared_nfs/k85/authorized_keys $SSH_DIR + ``` + +### 1.2 主节点安装 Ansible,并配置节点ip +示例文件:./ansible_workspace/inventory.ini +```ini +[hosts] +of27 ansible_host=192.168.1.27 ansible_ssh_common_args='-o StrictHostKeyChecking=no' +of25 ansible_host=192.168.1.25 ansible_ssh_common_args='-o StrictHostKeyChecking=no' +of26 ansible_host=192.168.1.26 ansible_ssh_common_args='-o StrictHostKeyChecking=no' +of28 ansible_host=192.168.1.28 ansible_ssh_common_args='-o StrictHostKeyChecking=no' +``` + +### 1.3 共享目录中拷贝镜像、数据集、models脚本 +主要为设置docker内环境的脚本 和 启动docker内训练的脚本 +设置docker内环境脚本(./tools/prepare_docker.sh)如下: +```Bash +#!/bin/bash +# 将tools视为共享目录 +pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple +python3 -m pip install --upgrade pip +python3 -m pip install --pre oneflow -f https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/master/cu121 + + +cd /workspace +cp tools/models.tar.gz ./ +tar -xvf models.tar.gz +pip install -r models/dev-requirements.txt +pip install -r models/Vision/classification/image/resnet50/requirements.txt + +# 将需要使用到的脚本拷到对应文件夹下 +cp tools/args_train_ddp_graph_resnet50.sh models/Vision/classification/image/resnet50/examples/ +cp tools/train.sh models/Vision/classification/image/resnet50/ +cp tools/profile.sh models/Vision/classification/image/resnet50/ +``` +启动docker内训练的脚本(./tools/train.sh)如下: +```Bash +# 根据使用的节点数,来判断本机是否开始训练 +NUM_NODES=${1:-1} + +if [ "$NODE_RANK" -lt "$NUM_NODES" ]; then + bash examples/args_train_ddp_graph_resnet50.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 192 50 true python3 graph gpu 100 false '' 1 +else + echo do nothing +fi +``` +启动docker内profile(./tools/profile.sh)如下: +```Bash +# 根据使用的节点数,来判断是否在本地开始profile +NUM_NODES=${1:-1} + +if [ 
"$NODE_RANK" -lt "$NUM_NODES" ]; then + # 在启动训练时添加nsys启动路径,即可进行profile + bash examples/args_train_ddp_graph_resnet50.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 192 50 true python3 graph gpu 100 false '/usr/local/cuda/bin/nsys' 1 +else + echo do nothing +fi +``` +args_train_ddp_graph_resnet50.sh文件参考自OneAutoTest仓库[args_train_ddp_graph.sh](https://github.com/Oneflow-Inc/OneAutoTest/blob/main/ResNet50/args_train_ddp_graph.sh),其中包含使用nsys启动的选项 +### 1.4 使用ansible 在所有节点执行 docker load, docker tag命令 +根据上文中inventory.ini文件依次在节点上创建docker,并将NODE_RANK写入docker的环境变量内,脚本(./ansible_workspace/set_docker.sh)内容如下: +```Bash +set -ex +if [ $# -ne 1 ]; then + echo "Usage: $0 filename" + exit 1 +fi +host_file="$1" +num_hosts=$(wc -l < "$host_file") +docker_name="cd_test_new" + +mapfile -t lines < "$host_file" + +for (( i=1; i<${#lines[@]}; i++ )); do + line="${lines[$i]}" + host_name=$(echo "$line" | awk '{print $1}') + # 根据inventory.ini文件中节点顺序,将NODE_RANK写入docker的环境变量中 + ansible $host_name -i $host_file -m shell -a "docker run -itd -e NODE_RANK=$((i-1)) -v /data/dataset/ImageNet:/data/dataset/ImageNet -v /data/home/chende/tools:/workspace/tools --network host --gpus all --shm-size=16g --ulimit memlock=-1 --ulimit core=0 --ulimit stack=67108864 --privileged --ipc host --cap-add=IPC_LOCK --name $docker_name nvcr.io/nvidia/pytorch:24.03-py3 bash" +done +# 在docker内运行环境设置的脚本 +ansible hosts -i "$host_file" -m shell -a "docker exec $docker_name bash -c 'bash /workspace/tools/prepare_docker.sh'" +``` +使用方式: +```Bash +bash set_docker.sh inventory.ini +``` + +## 第二步:进行测试 + +### 2.1 自动测试与日志搜集 + +编写一个测试命令脚本文件(./ansible_workspace/train.sh) +```Bash +#!/bin/bash +set -ex +if [ $# -ne 1 ]; then + echo "Usage: $0 num_nodes" + exit 1 +fi +NUM_NODES="$1" +docker_name="cd_test_new" +ansible hosts -i inventory.ini -m shell -a "docker exec $docker_name bash -c 'cd /workspace/models/Vision/classification/image/resnet50 && bash train.sh $NUM_NODES'" +``` + +- 需要一个参数: 节点数, +- 
运行该命令能够自动启动相应数量的节点运行。 +- 运行结束后收集日志到主节点。 +- 保存日志的目录可以以:`prefix_节点数_日期时间_surfix` 命名,前缀和后缀可以自定义 + +### 2.2 自动日志解析 + +可以使用2.1节提供的命令运行多次,比如: + +```bash +train.sh 1 +train.sh 2 +train.sh 4 +train.sh 8 +train.sh 16 +``` + +完成后应该保存了多个日志目录,需要编写一个日志处理脚本,从这些日志目录中提取性能数据并制成 markdown 格式的表格 + +注:不需要完整训练,训练稳定后获取到数据就可以了。 + +### 2.3 自动 nsys 性能测试 + +需要编写一个能够运行 nsys 的性能测试脚本文件(./ansible_workspace/profile.sh),和2.1的脚本类似,只是启动时需要调用nsys,我们需要搜集这些信息分析,然后进行优化。这个脚本文件。 +```Bash +#!/bin/bash +set -ex +if [ $# -ne 1 ]; then + echo "Usage: $0 num_nodes" + exit 1 +fi +NUM_NODES="$1" +docker_name="cd_test_new" +ansible hosts -i inventory.ini -m shell -a "docker exec $docker_name bash -c 'cd /workspace/models/Vision/classification/image/resnet50 && bash profile.sh $NUM_NODES'" +``` +- 需要一个参数: 节点数, +- 运行该命令能够自动启动相应数量的节点运行。 +- 运行结束后收集日志和nsys相关文件到主节点。 +- 保存日志的目录可以以:`prefix_节点数_日期时间_surfix` 命名,前缀和后缀可以自定义 \ No newline at end of file diff --git a/Classification/resnet50/ansible_workspace/profile.sh b/Classification/resnet50/ansible_workspace/profile.sh new file mode 100644 index 0000000..db6abfb --- /dev/null +++ b/Classification/resnet50/ansible_workspace/profile.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -ex +if [ $# -ne 1 ]; then + echo "Usage: $0 num_nodes" + exit 1 +fi +NUM_NODES="$1" +docker_name="cd_test_new" +ansible hosts -i inventory.ini -m shell -a "docker exec $docker_name bash -c 'cd /workspace/models/Vision/classification/image/resnet50 && bash profile.sh $NUM_NODES'" \ No newline at end of file diff --git a/Classification/resnet50/ansible_workspace/set_docker.sh b/Classification/resnet50/ansible_workspace/set_docker.sh new file mode 100755 index 0000000..7a55f49 --- /dev/null +++ b/Classification/resnet50/ansible_workspace/set_docker.sh @@ -0,0 +1,18 @@ +#!/bin/bash +set -ex +if [ $# -ne 1 ]; then + echo "Usage: $0 filename" + exit 1 +fi +host_file="$1" +num_hosts=$(wc -l < "$host_file") +docker_name="cd_test_new" + +mapfile -t lines < "$host_file" + +for (( i=1; i<${#lines[@]}; i++ )); do + 
line="${lines[$i]}" + host_name=$(echo "$line" | awk '{print $1}') + ansible $host_name -i $host_file -m shell -a "docker run -itd -e NODE_RANK=$((i-1)) -v /data/dataset/ImageNet:/data/dataset/ImageNet -v /data/home/chende/tools:/workspace/tools --network host --gpus all --shm-size=16g --ulimit memlock=-1 --ulimit core=0 --ulimit stack=67108864 --privileged --ipc host --cap-add=IPC_LOCK --name $docker_name nvcr.io/nvidia/pytorch:24.03-py3 bash" +done +ansible hosts -i "$host_file" -m shell -a "docker exec $docker_name bash -c 'bash /workspace/tools/prepare_docker.sh'" \ No newline at end of file diff --git a/Classification/resnet50/ansible_workspace/train.sh b/Classification/resnet50/ansible_workspace/train.sh new file mode 100644 index 0000000..350bf78 --- /dev/null +++ b/Classification/resnet50/ansible_workspace/train.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -ex +if [ $# -ne 1 ]; then + echo "Usage: $0 num_nodes" + exit 1 +fi +NUM_NODES="$1" +docker_name="cd_test_new" +ansible hosts -i inventory.ini -m shell -a "docker exec $docker_name bash -c 'cd /workspace/models/Vision/classification/image/resnet50 && bash train.sh $NUM_NODES'" \ No newline at end of file diff --git a/Classification/resnet50/ansible_workspace/update_tools.sh b/Classification/resnet50/ansible_workspace/update_tools.sh new file mode 100755 index 0000000..cda6deb --- /dev/null +++ b/Classification/resnet50/ansible_workspace/update_tools.sh @@ -0,0 +1,3 @@ +#!/bin/bash +echo "/data/home/chende/tools" +ansible hosts -i inventory.ini -m copy -a "src=/data/home/chende/tools dest=/data/home/chende/ mode=0755" \ No newline at end of file diff --git a/Classification/resnet50/inventory.ini b/Classification/resnet50/inventory.ini new file mode 100644 index 0000000..01027fd --- /dev/null +++ b/Classification/resnet50/inventory.ini @@ -0,0 +1,3 @@ +[all] +of27 ansible_host=192.168.1.27 ansible_ssh_common_args='-o StrictHostKeyChecking=no' +of28 ansible_host=192.168.1.28 ansible_ssh_common_args='-o 
StrictHostKeyChecking=no' diff --git a/Classification/resnet50/tools/args_train_ddp_graph_resnet50.sh b/Classification/resnet50/tools/args_train_ddp_graph_resnet50.sh new file mode 100755 index 0000000..b4fbd36 --- /dev/null +++ b/Classification/resnet50/tools/args_train_ddp_graph_resnet50.sh @@ -0,0 +1,144 @@ +rm -rf core.* + +set -ex + + +# bash examples/args_train_ddp_graph.sh ${NUM_NODES} ${DEVICE_NUM_PER_NODE} ${NODE_RANK} ${MASTER_ADDR} +# ${OFRECORD_PATH} ${TRAIN_BATCH_SIZE} ${EPOCH} ${USE_FP16} ${PYTHON_BIN} ${RUN_TYPE} ${DEBUG_AND_NCCL} ${NSYS_BIN} ${RUN_COMMIT} + +# bash examples/args_train_ddp_graph.sh 1 8 0 127.0.0.1 /dataset/79846248 192 50 false python3 ddp false '' 1 + +NUM_NODES=${1:-1} +DEVICE_NUM_PER_NODE=${2:-8} +NODE_RANK=${3:-0} +MASTER_ADDR=${4:-"127.0.0.1"} +OFRECORD_PATH=${5:-"/dataset/imagenet/ofrecord"} +TRAIN_BATCH_SIZE=${6:-192} +EPOCH=${7:-50} +USE_FP16=${8:-false} +PYTHON_BIN=${9:-"python3"} +RUN_TYPE=${10:-"ddp"} # graph+fp16 +DECODE_TYPE=${11:-"cpu"} +PRINT_INTERVAL=${12:-100} +DEBUG_AND_NCCL=${13:-false} +NSYS_BIN=${14:-""} +RUN_COMMIT=${15:-"master"} +ACC=${16:-1} +VAL_BATCH_SIZE=${17:-50} + + +SRC_DIR=$(realpath $(dirname $0)/..) 
+ +AMP_OR="FP32" +if $USE_FP16; then + AMP_OR="FP16" +fi + +TRAN_MODEL="resnet50" +RUN_TIME=$(date "+%Y%m%d_%H%M%S%N") +LOG_FOLDER=${SRC_DIR}/test_logs/$HOSTNAME/${NUM_NODES}n${DEVICE_NUM_PER_NODE}g +mkdir -p $LOG_FOLDER +LOG_FILENAME=$LOG_FOLDER/${TRAN_MODEL}_${RUN_TYPE}_DC${DECODE_TYPE}_${AMP_OR}_mb${TRAIN_BATCH_SIZE}_gb$((${TRAIN_BATCH_SIZE}*${NUM_NODES}*${DEVICE_NUM_PER_NODE}*${ACC}))_acc${ACC}_${NUM_NODES}n${DEVICE_NUM_PER_NODE}g_${RUN_COMMIT}_${RUN_TIME} + + +export PYTHONUNBUFFERED=1 +echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED +#export ONEFLOW_COMM_NET_IB_ENABLE=True +export NCCL_LAUNCH_MODE=GROUP +echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE +echo DEBUG_AND_NCCL=$DEBUG_AND_NCCL +if $DEBUG_AND_NCCL; then + export ONEFLOW_DEBUG_MODE=1 + echo ONEFLOW_DEBUG_MODE=$ONEFLOW_DEBUG_MODE + export NCCL_DEBUG=INFO + echo NCCL_DEBUG=$NCCL_DEBUG +fi + +#export ONEFLOW_KERNEL_ENABLE_CUDA_GRAPH=1 +#export ONEFLOW_THREAD_ENABLE_LOCAL_MESSAGE_QUEUE=1 +#export ONEFLOW_KERNEL_DISABLE_BLOB_ACCESS_CHECKER=1 +#export ONEFLOW_ACTOR_ENABLE_LIGHT_ACTOR=1 +#export ONEFLOW_STREAM_REUSE_CUDA_EVENT=1 + +#export ONEFLOW_STREAM_CUDA_EVENT_FLAG_BLOCKING_SYNC=true +#export ONEFLOW_VM_WORKLOAD_ON_SCHEDULER_THREAD=1 + +LEARNING_RATE=$(echo | awk "{print $NUM_NODES*$DEVICE_NUM_PER_NODE*$TRAIN_BATCH_SIZE*$ACC/1000}") +MOM=0.875 +OFRECORD_PART_NUM=256 + +EXIT_NUM=-1 + +if [ ${EPOCH} -lt 10 ];then + EXIT_NUM=300 +fi +CMD="" + +if [[ ! 
-z "${NSYS_BIN}" ]]; then + export ONEFLOW_PROFILER_KERNEL_PROFILE_KERNEL_FORWARD_RANGE=1 + export ONEFLOW_DEBUG_MODE=1 + # CMD+="${NSYS_BIN} profile --stats true -t nvtx --output ${LOG_FILENAME} " + export CUDNN_LOGINFO_DBG=1 + export CUDNN_LOGDEST_DBG=${SRC_DIR}/cudnn.log + CMD+="${NSYS_BIN} profile --stats true --output ${LOG_FILENAME} " + EXIT_NUM=30 +fi + + +CMD+="${PYTHON_BIN} -m oneflow.distributed.launch " + +CMD+="--nproc_per_node ${DEVICE_NUM_PER_NODE} " +CMD+="--nnodes ${NUM_NODES} " +CMD+="--node_rank ${NODE_RANK} " +CMD+="--master_addr ${MASTER_ADDR} " +CMD+="--master_port 12345 " +CMD+="${SRC_DIR}/train.py " +CMD+="--ofrecord-path ${OFRECORD_PATH} " +CMD+="--ofrecord-part-num ${OFRECORD_PART_NUM} " +CMD+="--num-devices-per-node ${DEVICE_NUM_PER_NODE} " +CMD+="--lr ${LEARNING_RATE} " +CMD+="--momentum ${MOM} " +CMD+="--num-epochs ${EPOCH} " +CMD+="--train-batch-size ${TRAIN_BATCH_SIZE} " +CMD+="--train-global-batch-size $((${TRAIN_BATCH_SIZE}*${NUM_NODES}*${DEVICE_NUM_PER_NODE}*${ACC})) " +CMD+="--val-batch-size ${VAL_BATCH_SIZE} " +CMD+="--val-global-batch-size $((${VAL_BATCH_SIZE}*${NUM_NODES}*${DEVICE_NUM_PER_NODE}*${ACC})) " +CMD+="--print-interval ${PRINT_INTERVAL} " +#CMD+="--synthetic-data " + +if $USE_FP16; then + echo USE_FP16=$USE_FP16 + CMD+="--use-fp16 --channel-last " +fi + +if [ $EXIT_NUM != -1 ]; then + CMD+="--skip-eval " +fi +if [ $RUN_TYPE == 'ddp' ]; then + CMD+="--ddp " +else + CMD+="--scale-grad --graph " + CMD+="--fuse-bn-relu " + CMD+="--fuse-bn-add-relu " +fi + +if [ $DECODE_TYPE == 'gpu' ]; then + CMD+="--use-gpu-decode " +fi + +echo "Run cmd ${CMD}" + +$CMD 2>&1 | tee ${LOG_FILENAME}.log + +echo "Writing log to ${LOG_FILENAME}.log" + +if [[ ! 
-z "${NSYS_BIN}" ]]; then + rm ${LOG_FOLDER}/*.sqlite + mkdir -p ${LOG_FILENAME} + #rm -rf ./log/$HOSTNAME/oneflow.* + cp ./log/$HOSTNAME/* ${LOG_FILENAME}/ + mv ${SRC_DIR}/cudnn.log ${LOG_FILENAME}/cudnn.log +fi + +rm -rf ./log/$HOSTNAME +echo "done" diff --git a/Classification/resnet50/tools/extract.py b/Classification/resnet50/tools/extract.py new file mode 100644 index 0000000..477903d --- /dev/null +++ b/Classification/resnet50/tools/extract.py @@ -0,0 +1,27 @@ +import sys +import re + +# 文件路径 +file_path = sys.argv[1] + +# 存储 train 模式下的 throughput +print(file_path) +train_throughputs = [] + +# 正则表达式模式匹配 train 模式下的 throughput +pattern = re.compile(r'\[train\][^|]*?throughput:\s(\d+\.\d+)') + +# 读取文件并提取需要的信息 +with open(file_path, 'r') as file: + for line in file: + matches = pattern.findall(line) + for match in matches: + throughput = float(match) + train_throughputs.append(throughput) + +# 计算平均 throughput +if train_throughputs: + average_throughput = sum(train_throughputs) / len(train_throughputs) + print(f'The average throughput for [train] mode is: {average_throughput:.6f}') +else: + print('No [train] mode throughputs found.') diff --git a/Classification/resnet50/tools/prepare_docker.sh b/Classification/resnet50/tools/prepare_docker.sh new file mode 100755 index 0000000..84147f5 --- /dev/null +++ b/Classification/resnet50/tools/prepare_docker.sh @@ -0,0 +1,14 @@ +#!/bin/bash +pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple +python3 -m pip install --upgrade pip +python3 -m pip install --pre oneflow -f https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/master/cu121 + +cd /workspace +cp tools/models.tar.gz ./ +tar -xvf models.tar.gz +pip install -r models/dev-requirements.txt +pip install -r models/Vision/classification/image/resnet50/requirements.txt + +cp tools/args_train_ddp_graph_resnet50.sh models/Vision/classification/image/resnet50/examples/ +cp tools/train.sh models/Vision/classification/image/resnet50/ +cp 
tools/profile.sh models/Vision/classification/image/resnet50/ \ No newline at end of file diff --git a/Classification/resnet50/tools/profile.sh b/Classification/resnet50/tools/profile.sh new file mode 100755 index 0000000..79e0d0a --- /dev/null +++ b/Classification/resnet50/tools/profile.sh @@ -0,0 +1,7 @@ +NUM_NODES=${1:-1} + +if [ "$NODE_RANK" -lt "$NUM_NODES" ]; then + bash examples/args_train_ddp_graph_resnet50.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 128 50 true python3 graph gpu 100 false '/usr/local/cuda/bin/nsys' 1 +else + echo do nothing +fi \ No newline at end of file diff --git a/Classification/resnet50/tools/train.sh b/Classification/resnet50/tools/train.sh new file mode 100755 index 0000000..5758c05 --- /dev/null +++ b/Classification/resnet50/tools/train.sh @@ -0,0 +1,7 @@ +NUM_NODES=${1:-1} + +if [ "$NODE_RANK" -lt "$NUM_NODES" ]; then + bash examples/args_train_ddp_graph_resnet50.sh "$NUM_NODES" 8 "$NODE_RANK" 192.168.1.27 /data/dataset/ImageNet/ofrecord 128 50 true python3 graph gpu 100 false '' 1 +else + echo do nothing +fi \ No newline at end of file