diff --git a/Jenkinsfile b/Jenkinsfile
new file mode 100644
index 000000000..88e12687b
--- /dev/null
+++ b/Jenkinsfile
@@ -0,0 +1,282 @@
+// Build the agent label expression for a ROCm test node of the given kind.
+def rocmnode(name) {
+    return 'rocmtest && miopen && ' + name
+}
+
+// Print the node name, OS release, kernel version and /opt contents to the log.
+def show_node_info() {
+    sh """
+        echo "NODE_NAME = \$NODE_NAME"
+        lsb_release -sd
+        uname -r
+        ls /opt/ -la
+    """
+}
+
+// Run `command` with stdout redirected to tmp.txt and return true when it
+// printed anything. returnStatus keeps a non-zero exit code (e.g. grep finding
+// no match) from failing the step.
+def runShell(String command){
+    sh returnStatus: true, script: "${command} > tmp.txt"
+    def output = readFile(file: "tmp.txt")
+    echo "tmp.txt contents: $output"
+    return (output != "")
+}
+
+// Compose the image reference "<CK_DOCKERHUB>:ait_rocm<ROCMVERSION>".
+def getDockerImageName(){
+    def img
+    img = "${env.CK_DOCKERHUB}:ait_rocm${params.ROCMVERSION}"
+    return img
+}
+
+// Pull the CI image and return [image handle, image name]. A failed pull
+// aborts the build through error(); a cancelled build is rethrown unchanged so
+// that callers catching FlowInterruptedException still see it.
+def getDockerImage(Map conf=[:]){
+    env.DOCKER_BUILDKIT=1
+    def prefixpath = conf.get("prefixpath", "/opt/rocm") // prefix:/opt/rocm
+    def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg ROCMVERSION='${params.ROCMVERSION}' "
+    echo "Docker Args: ${dockerArgs}"
+    def image = getDockerImageName()
+    //Check if image exists
+    def retimage
+    try
+    {
+        echo "Pulling image: ${image}"
+        retimage = docker.image("${image}")
+        retimage.pull()
+    }
+    catch(org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e)
+    {
+        throw e
+    }
+    catch(Exception ex)
+    {
+        error "Unable to locate image: ${image}"
+    }
+    return [retimage, image]
+}
+
+// Echo and run conf.cmd through sh. The default command exports ROCM_PATH and
+// ROC_USE_FGS_KERNARG and prints the installed torch version.
+def build_ait(Map conf=[:]){
+
+    def build_cmd = """
+        export ROCM_PATH=/opt/rocm
+        export ROC_USE_FGS_KERNARG=0
+        python3 -c "import torch; print(torch.__version__)"
+    """
+
+    def cmd = conf.get("cmd", """
+        ${build_cmd}
+        """)
+
+    echo cmd
+    sh cmd
+}
+
+// Check out the sources, pull the CI image, verify through clinfo that a GPU is
+// visible, then run the example test scripts in the container and archive and
+// stash their logs. Returns the pulled image handle.
+def Run_Step(Map conf=[:]){
+    show_node_info()
+
+    env.HSA_ENABLE_SDMA=0
+    checkout scm
+
+    def image = getDockerImageName()
+
+    // Jenkins is complaining about the render group
+    def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
+    if (conf.get("enforce_xnack_on", false)) {
+        dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
+    }
+    def variant = env.STAGE_NAME
+    def retimage
+
+    gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'AITemplate') {
+        try {
+            (retimage, image) = getDockerImage(conf)
+            withDockerContainer(image: image, args: dockerOpts) {
+                timeout(time: 5, unit: 'MINUTES'){
+                    sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log'
+                    if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){
+                        throw new Exception ("GPU not found")
+                    }
+                    else{
+                        echo "GPU is OK"
+                    }
+                }
+            }
+        }
+        catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
+            echo "The job was cancelled or aborted"
+            throw e
+        }
+
+        withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
+            timeout(time: 24, unit: 'HOURS')
+            {
+                build_ait(conf)
+                dir("examples"){
+                    if (params.RUN_FULL_QA){
+                        sh "./run_qa.sh $HF_TOKEN ${env.BRANCH_NAME} ${NODE_NAME} ${params.ROCMVERSION}"
+                    }
+                    else{
+                        sh "./run_tests.sh $HF_TOKEN ${env.BRANCH_NAME} ${NODE_NAME} ${params.ROCMVERSION}"
+                    }
+                }
+                dir("examples/01_resnet-50"){
+                    archiveArtifacts "01_resnet50.log"
+                    stash includes: "01_resnet50.log", name: "01_resnet50.log"
+                }
+                dir("examples/03_bert"){
+                    archiveArtifacts "03_bert.log"
+                    stash includes: "03_bert.log", name: "03_bert.log"
+                }
+                dir("examples/04_vit"){
+                    archiveArtifacts "04_vit.log"
+                    stash includes: "04_vit.log", name: "04_vit.log"
+                }
+                dir("examples/05_stable_diffusion/"){
+                    archiveArtifacts "05_sdiff.log"
+                    stash includes: "05_sdiff.log", name: "05_sdiff.log"
+                }
+            }
+        }
+    }
+    return retimage
+}
+
+// Run Run_Step, log and rethrow any failure, and reboot the node afterwards
+// unless conf.no_reboot is set.
+def Run_Step_and_Reboot(Map conf=[:]){
+    try{
+        Run_Step(conf)
+    }
+    catch(e){
+        echo "throwing error exception while building CK"
+        echo 'Exception occurred: ' + e.toString()
+        throw e
+    }
+    finally{
+        if (!conf.get("no_reboot", false)) {
+            reboot()
+        }
+    }
+}
+
+// Pull the CI image, unstash the benchmark logs into examples/ and run
+// process_results.py on them inside the container.
+def process_results(Map conf=[:]){
+    env.HSA_ENABLE_SDMA=0
+    checkout scm
+    def image = getDockerImageName()
+
+    // Jenkins is complaining about the render group
+    def dockerOpts="--cap-add=SYS_PTRACE --security-opt seccomp=unconfined"
+    if (conf.get("enforce_xnack_on", false)) {
+        dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
+    }
+
+    def variant = env.STAGE_NAME
+    def retimage
+
+    gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'AITemplate') {
+        try {
+            (retimage, image) = getDockerImage(conf)
+        }
+        catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){
+            echo "The job was cancelled or aborted"
+            throw e
+        }
+    }
+
+    withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {
+        timeout(time: 1, unit: 'HOURS'){
+            try{
+                dir("examples"){
+                    // clean up any old logs, then unstash perf files to master
+                    sh "rm -rf *.log"
+                    unstash "01_resnet50.log"
+                    unstash "03_bert.log"
+                    unstash "04_vit.log"
+                    unstash "05_sdiff.log"
+                    sh "python3 process_results.py"
+                }
+            }
+            catch(e){
+                echo "throwing error exception while processing performance test results"
+                echo 'Exception occurred: ' + e.toString()
+                throw e
+            }
+        }
+    }
+}
+
+//launch amd-develop branch daily at 17:00 UT in FULL_QA mode
+CRON_SETTINGS = BRANCH_NAME == "amd-develop" ? '''0 17 * * * % RUN_FULL_QA=true''' : ""
+
+pipeline {
+    agent none
+    triggers {
+        parameterizedCron(CRON_SETTINGS)
+    }
+    options {
+        parallelsAlwaysFailFast()
+    }
+    parameters {
+        string(
+            name: 'ROCMVERSION',
+            defaultValue: '5.4.3',
+            description: 'Specify which ROCM version to use: 5.4.3 (default).')
+        booleanParam(
+            name: "RUN_FULL_QA",
+            defaultValue: false,
+            description: "Select whether to run small set of performance tests (default) or full QA")
+    }
+    environment{
+        dbuser = "${dbuser}"
+        dbpassword = "${dbpassword}"
+        dbsship = "${dbsship}"
+        dbsshport = "${dbsshport}"
+        dbsshuser = "${dbsshuser}"
+        dbsshpassword = "${dbsshpassword}"
+        status_wrapper_creds = "${status_wrapper_creds}"
+        HF_TOKEN = "${HF_TOKEN}"
+        DOCKER_BUILDKIT = "1"
+    }
+    stages{
+        stage("Build AITemplate")
+        {
+            parallel
+            {
+                stage("Build AIT and Run Tests")
+                {
+                    agent{ label rocmnode("gfx908 || gfx90a") }
+                    steps{
+                        Run_Step_and_Reboot(no_reboot:true, prefixpath: '/usr/local')
+                    }
+                }
+            }
+        }
+        stage("Process Performance Test Results")
+        {
+            when {
+                beforeAgent true
+                expression { params.RUN_FULL_QA.toBoolean() }
+            }
+            parallel
+            {
+                stage("Process results"){
+                    agent { label 'mici' }
+                    steps{
+                        process_results()
+                    }
+                }
+            }
+        }
+    }
+}
+
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 8146b506c..1d430c396 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -15,7 +15,7 @@
 # ROCM Docker Image for AITemplate
 FROM ubuntu:20.04
 
-ARG ROCMVERSION=5.3
+ARG ROCMVERSION=5.4.3
 
 RUN set -xe
 
@@ -44,9 +44,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
     libpthread-stubs0-dev \
     llvm-amdgpu \
     pkg-config \
-    python \
     python3 \
-    python-dev \
     python3-dev \
     python3-pip \
     software-properties-common \
@@ -97,7 +95,20 @@ RUN bash /Install/install_test_dep.sh
 RUN bash /Install/install_doc_dep.sh
 
 # Install Pytorch
-RUN pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm5.1.1
+RUN pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm5.4.2
+
+# Install some useful python packages
+RUN pip3 install --upgrade pip
+
+RUN pip3 install transformers click sympy recordtype parameterized einops jinja2
+RUN pip3 install diffusers==0.11.1 accelerate
+
+# Install packages for processing the performance results
+RUN pip3 install sqlalchemy==1.4.46
+RUN pip3 install pymysql pandas setuptools-rust sshtunnel
+
+# Install lint packages
+RUN pip3 install ufmt==2.0.1 click==8.1.3 black==22.12.0 flake8==5.0.4
 
 # for detection
 RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata
@@ -115,3 +126,12 @@ ADD ./static /AITemplate/static
 ADD ./licenses /AITemplate/licenses
 ADD ./docker/install/install_ait.sh /AITemplate/
 RUN bash /AITemplate/install_ait.sh
+
+# Create a folder for Hugging Face cache
+RUN mkdir /.aitemplate && chmod a+rw /.aitemplate
+RUN mkdir /.cache && chmod a+rw /.cache
+WORKDIR "/.cache"
+RUN mkdir huggingface && chmod a+rw huggingface
+WORKDIR "/.cache/huggingface"
+RUN mkdir hub && chmod a+rw hub
+WORKDIR /
diff --git a/examples/process_results.py b/examples/process_results.py
index 19f013637..01b5852fd 100644
--- a/examples/process_results.py
+++ b/examples/process_results.py
@@ -1,8 +1,8 @@
 #!/usr/bin/env python3
-import glob,os, io, argparse, datetime
-#import numpy as np
+import glob, os, io, argparse, datetime
 import sqlalchemy
 from sqlalchemy.types import NVARCHAR, Float, Integer
+from sqlalchemy import text
 import pymysql
 import pandas as pd
 from sshtunnel import SSHTunnelForwarder
@@ -79,8 +79,8 @@ def parse_logfile(files):
     return res
 
 def get_baseline(table, connection):
-    query = '''SELECT * from '''+table+''' WHERE Datetime = (SELECT MAX(Datetime) FROM '''+table+''' where git_branch='amd-develop' );'''
-    return pd.read_sql_query(query, connection)
+    query = text('''SELECT * from '''+table+''' WHERE Datetime = (SELECT MIN(Datetime) FROM '''+table+''' where Test64 IS NOT NULL );''')
+    return pd.read_sql(query, connection)
 
 def store_new_test_result(table_name, test_results, testlist, node_id, branch_name, commit, gpu_arch, compute_units, ngpus, rocm_vers, compiler_vers, connection):
     params=[str(node_id),str(branch_name),str(commit),str(gpu_arch),compute_units,ngpus,str(rocm_vers),str(compiler_vers),str(datetime.datetime.now())]
@@ -93,7 +93,7 @@ def store_new_test_result(table_name, test_results, testlist, node_id, branch_na
 
 def compare_test_to_baseline(baseline,test,testlist):
     regression=0
-    if not baseline.empty:
+    if not len(baseline)==0:
         base=baseline[testlist].to_numpy(dtype='float')
         base_list=base[0]
         ave_perf=0
@@ -120,6 +120,7 @@ def main():
     testlist=[]
     #parse the test parameters from the logfile
     for filename in files:
+        print("processing file: ",filename)
         branch_name, commit, node_id, gpu_arch, compute_units, ngpus, rocm_vers, compiler_vers = get_log_params(filename)
 
     print("Branch name:",branch_name)
@@ -132,38 +133,45 @@ def main():
 
     #parse results, get the Tflops value for "Best Perf" kernels
     results=parse_logfile(files)
+    for i in range(1,len(results)+1):
+        testlist.append("Test%i"%i)
+    table_name="ait_performance"
+
     print("Number of tests:",len(results))
     sql_hostname = '127.0.0.1'
+    sql_port = 3306
     sql_username = os.environ["dbuser"]
     sql_password = os.environ["dbpassword"]
-    sql_main_database = 'sys'
-    sql_port = 3306
-    hostname = os.uname()[1]
-    if hostname == 'jwr-amd-132':
+    host = os.uname()[1]
+
+    if host == 'jwr-amd-132':
+        print("connecting to local database")
+        sql_main_database = 'sys'
         sqlEngine = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}/{3}'.
             format(sql_username, sql_password, sql_hostname, sql_main_database))
         conn = sqlEngine.connect()
+        baseline = get_baseline(table_name,conn)
+        store_new_test_result(table_name, results, testlist, node_id, branch_name, commit, gpu_arch, compute_units, ngpus, rocm_vers, compiler_vers, conn)
+        conn.close()
     else:
+        print("connecting to remote database")
+        sql_main_database = "miopen_perf"
         ssh_host = os.environ["dbsship"]
         ssh_user = os.environ["dbsshuser"]
         ssh_port = int(os.environ["dbsshport"])
         ssh_pass = os.environ["dbsshpassword"]
         with SSHTunnelForwarder(
-                (ssh_host, ssh_port),
-                ssh_username=ssh_user,
-                ssh_password=ssh_pass,
-                remote_bind_address=(sql_hostname, sql_port)) as tunnel:
-            sqlEngine = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}:{3}/{4}'.
-                format(sql_username, sql_password, sql_hostname, tunnel.local_bind_port, sql_main_database))
-            conn = sqlEngine.connect()
-    #save gemm performance tests:
-    for i in range(1,len(results)+1):
-        testlist.append("Test%i"%i)
-    table_name="ait_performance"
-
-    baseline = get_baseline(table_name,conn)
-    store_new_test_result(table_name, results, testlist, node_id, branch_name, commit, gpu_arch, compute_units, ngpus, rocm_vers, compiler_vers, conn)
-    conn.close()
+            (ssh_host, ssh_port),
+            ssh_username=ssh_user,
+            ssh_password=ssh_pass,
+            remote_bind_address=(sql_hostname, sql_port)) as tunnel:
+
+            sqlEngine = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}:{3}/{4}'.
+                format(sql_username, sql_password, sql_hostname, tunnel.local_bind_port, sql_main_database))
+            conn = sqlEngine.connect()
+            baseline = get_baseline(table_name,conn)
+            store_new_test_result(table_name, results, testlist, node_id, branch_name, commit, gpu_arch, compute_units, ngpus, rocm_vers, compiler_vers, conn)
+            conn.close()
 
     #compare the results to the baseline if baseline exists
     regression=0
diff --git a/examples/run_qa.sh b/examples/run_qa.sh
new file mode 100755
index 000000000..5979ecfac
--- /dev/null
+++ b/examples/run_qa.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+#
+# this is a script to run tests during ROCM CI
+# input arguments:
+#   $1 - Hugging Face token
+#   $2 - git branch name (written to the log headers)
+#   $3 - host name (written to the log headers)
+#
+# NOTE(review): every benchmark is piped through tee and neither errexit nor
+# pipefail is set, so a failing python process does not fail this script - confirm intended.
+
+export HF_TOKEN="$1"
+export GIT_BRANCH="$2"
+export hostname="$3"
+export TRANSFORMERS_CACHE=/.cache/huggingface/hub
+
+
+# Start a fresh log file $1 holding the host name ($2), the rocminfo GPU name and
+# compute unit lines, the git branch ($3), the current commit and the compiler dir.
+function print_log_header(){
+    rm -f "$1"
+    echo "hostname: " "$2" &> "$1"
+    echo -n "GPU_arch: " >> "$1"; rocminfo | grep "Name:" | grep "gfx" >> "$1"
+    rocminfo | grep "Compute Unit:" >> "$1"
+    echo "git_branch: " "$3" >> "$1"
+    git show --summary | grep commit >> "$1"
+    /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> "$1"
+}
+
+echo "Running RESNET50 tests"
+cd 01_resnet-50
+print_log_header 01_resnet50.log "$hostname" "$GIT_BRANCH"
+HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py 2>&1 | tee -a 01_resnet50.log
+
+echo "Running BERT tests"
+cd ../03_bert
+print_log_header 03_bert.log "$hostname" "$GIT_BRANCH"
+for sq in 64 128 384 512 1024
+do
+    HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --seq-length "$sq" 2>&1 | tee -a 03_bert.log
+done
+
+export NUM_BUILDERS=$(($(nproc)/2))
+echo "Running VIT tests"
+cd ../04_vit
+print_log_header 04_vit.log "$hostname" "$GIT_BRANCH"
+HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py 2>&1 | tee -a 04_vit.log
+# repeat the benchmark over a range of batch sizes
+for BATCH_SIZE in 1 2 4 8 16 32 64 128 256
+do
+    HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --batch-size "$BATCH_SIZE" 2>&1 | tee -a 04_vit.log
+done
+export NUM_BUILDERS=$(($(nproc)/4))
+echo "Running Stable Diffusion tests"
+cd ../05_stable_diffusion
+print_log_header 05_sdiff.log "$hostname" "$GIT_BRANCH"
+HIP_VISIBLE_DEVICES=0 python3 compile.py --token "$HF_TOKEN" 2>&1 | tee -a 05_sdiff.log
+HIP_VISIBLE_DEVICES=0 python3 demo.py --token "$HF_TOKEN" --benchmark 1 2>&1 | tee -a 05_sdiff.log
diff --git a/examples/run_tests.sh b/examples/run_tests.sh
new file mode 100755
index 000000000..d51bef013
--- /dev/null
+++ b/examples/run_tests.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+#
+# this is a script to run tests during ROCM CI
+# input arguments:
+#   $1 - Hugging Face token
+#   $2 - git branch name (written to the log headers)
+#   $3 - host name (written to the log headers)
+#
+# NOTE(review): every benchmark is piped through tee and neither errexit nor
+# pipefail is set, so a failing python process does not fail this script - confirm intended.
+
+export HF_TOKEN="$1"
+export GIT_BRANCH="$2"
+export hostname="$3"
+export TRANSFORMERS_CACHE=/.cache/huggingface/hub
+
+
+# Start a fresh log file $1 holding the host name ($2), the rocminfo GPU name and
+# compute unit lines, the git branch ($3), the current commit and the compiler dir.
+function print_log_header(){
+    rm -f "$1"
+    echo "hostname: " "$2" &> "$1"
+    echo -n "GPU_arch: " >> "$1"; rocminfo | grep "Name:" | grep "gfx" >> "$1"
+    rocminfo | grep "Compute Unit:" >> "$1"
+    echo "git_branch: " "$3" >> "$1"
+    git show --summary | grep commit >> "$1"
+    /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> "$1"
+}
+
+echo "Running RESNET50 tests"
+cd 01_resnet-50
+print_log_header 01_resnet50.log "$hostname" "$GIT_BRANCH"
+HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py 2>&1 | tee -a 01_resnet50.log
+
+echo "Running BERT tests"
+cd ../03_bert
+print_log_header 03_bert.log "$hostname" "$GIT_BRANCH"
+HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py --seq-length 64 2>&1 | tee -a 03_bert.log
+
+export NUM_BUILDERS=$(($(nproc)/2))
+echo "Running VIT tests"
+cd ../04_vit
+print_log_header 04_vit.log "$hostname" "$GIT_BRANCH"
+HIP_VISIBLE_DEVICES=0 python3 benchmark_ait.py 2>&1 | tee -a 04_vit.log
+
+export NUM_BUILDERS=$(($(nproc)/4))
+echo "Running Stable Diffusion tests"
+cd ../05_stable_diffusion
+print_log_header 05_sdiff.log "$hostname" "$GIT_BRANCH"
+HIP_VISIBLE_DEVICES=0 python3 compile.py --token "$HF_TOKEN" 2>&1 | tee -a 05_sdiff.log
+HIP_VISIBLE_DEVICES=0 python3 demo.py --token "$HF_TOKEN" --benchmark 1 2>&1 | tee -a 05_sdiff.log