diff --git a/.circleci/config.yml b/.circleci/config.yml index 19c2d377a..0193fc253 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -18,45 +18,91 @@ setup_env: &setup_env - run: name: Setup environment command: | - python3.8 --version - python3.8 -m pip install --upgrade pip - cd python - python3.8 setup.py bdist_wheel - sudo python3.8 -m pip install --no-input dist/*.whl - cd .. - python3.8 -m pip install pytest - python3.8 -m pip install torch - python3.8 -m pip install numpy - python3.8 -m pip install jinja2 - python3.8 -m pip install recordtype - python3.8 -m pip install parameterized - python3.8 -m pip install einops - git submodule sync - git submodule update --init - echo 'export PYTHONPATH=$PWD/python:$PYTHONPATH' >> $BASH_ENV - echo 'export PATH=/usr/local/cuda-11.4/bin:$PATH' >> $BASH_ENV - echo 'export CI_FLAG=CIRCLECI' >> $BASH_ENV - echo 'export CACHE_DIR=$PWD/tests/ci_profile_cache' >> $BASH_ENV + for i in {1..3}; do + sudo update-alternatives --set cuda /usr/local/cuda-11.4 + echo 'export PATH=/usr/local/cuda/bin:$PATH' >> $BASH_ENV && + source "$BASH_ENV" + python3.8 --version && + python3.8 -m pip install --upgrade pip && + cd /home/circleci/project/python && + python3.8 setup.py bdist_wheel && + sudo python3.8 -m pip install --no-input dist/*.whl && + cd /home/circleci/project && + python3.8 -m pip install 'cuda-python<12.0.0' && + python3.8 -m pip install pytest && + python3.8 -m pip install torch && + python3.8 -m pip install numpy && + python3.8 -m pip install jinja2 && + python3.8 -m pip install sympy && + python3.8 -m pip install recordtype && + python3.8 -m pip install parameterized && + python3.8 -m pip install einops && + git submodule sync && + git submodule update --init && + echo 'export PYTHONPATH=$PWD/python:$PYTHONPATH' >> $BASH_ENV && + echo 'export CI_FLAG=CIRCLECI' >> $BASH_ENV && + echo 'export CACHE_DIR=$PWD/tests/ci_profile_cache' >> $BASH_ENV && + echo 'export LOGLEVEL=DEBUG' >> $BASH_ENV && + break || sleep 5; 
+ done + + +setup_fx2ait_env: &setup_fx2ait_env + - run: + name: Setup fx2ait environment + command: | + for i in {1..3}; do + wget https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz + tar -xvf cudnn-*-archive.tar.xz + sudo cp cudnn-*-archive/include/cudnn*.h /usr/local/cuda/include + sudo cp -P cudnn-*-archive/lib/libcudnn* /usr/local/cuda/lib64 + sudo chmod a+r /usr/local/cuda/include/cudnn*.h /usr/local/cuda/lib64/libcudnn* + python3.8 -m pip install --ignore-installed --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118 + pushd fx2ait + python3.8 setup.py develop --user + popd + break || sleep 5; + done basic_tests: &basic_tests - run: name: Run tests + no_output_timeout: 20m command: | set -e TEST_FILES=$(circleci tests glob "tests/unittest/**/test_*.py" | grep -v benchmark | circleci tests split --split-by=timings) - mkdir test-results - python3.8 -m pytest $TEST_FILES --junitxml=test-results/junit.xml --verbose --continue-on-collection-errors -rA + mkdir ~/test-results + python3.8 -m pytest $TEST_FILES -o junit_family=xunit1 --junitxml=~/test-results/junit.xml --verbose --continue-on-collection-errors -rA +fx2ait_tests: &fx2ait_tests + - run: + name: Run fx2ait tests + command: | + source $BASH_ENV + mkdir -p ~/test-fx2ait-results + TEST_FILES=$(circleci tests glob "fx2ait/fx2ait/test/test_*.py" "fx2ait/fx2ait/test/converters/**/test_*.py") + python3.8 -m pytest $TEST_FILES -o junit_family=xunit1 --junitxml=~/test-fx2ait-results/junit.xml --verbose --continue-on-collection-errors -rA # Define a job to be invoked later in a workflow. 
# See: https://circleci.com/docs/2.0/configuration-reference/#jobs jobs: + fx2ait-test: + machine: + image: linux-cuda-11:default + resource_class: gpu.nvidia.medium + steps: + - checkout + - <<: *setup_env + - <<: *setup_fx2ait_env + - <<: *fx2ait_tests + - store_test_results: + path: ~/test-fx2ait-results + build-and-test: machine: - image: ubuntu-2004-cuda-11.4:202110-01 + image: linux-cuda-11:default # Check T101565170 for multi-gpu use cases. resource_class: gpu.nvidia.medium - parallelism: 10 # Checkout the code as the first step. This is a dedicated CircleCI step. @@ -69,7 +115,7 @@ jobs: - <<: *setup_env - <<: *basic_tests - store_test_results: - path: test-results + path: ~/test-results # Invoke jobs via workflows # See: https://circleci.com/docs/2.0/configuration-reference/#workflows @@ -77,4 +123,5 @@ workflows: unittest: # This is the name of the workflow, feel free to change it to better match your workflow. # Inside the workflow, you define the jobs you want to run. jobs: + - fx2ait-test - build-and-test diff --git a/.flake8 b/.flake8 index 71a5883ed..9ef66bc0d 100644 --- a/.flake8 +++ b/.flake8 @@ -7,111 +7,111 @@ ignore = # Found in https://github.com/psf/black/issues/429 # Line too long. B950, - # Indentation is not a multiple of four. - E111, + # Indentation is not a multiple of four. + E111, # Expected an indented block (comment). - E115, + E115, # Over-indented. E117, - # Continuation line under-indented for hanging indent. + # Continuation line under-indented for hanging indent. E121, - # Continuation line missing indentation or outdented. + # Continuation line missing indentation or outdented. E122, - # Closing bracket does not match indentation of opening bracket's line. + # Closing bracket does not match indentation of opening bracket's line. E123, - # Closing bracket does not match visual indentation. + # Closing bracket does not match visual indentation. E124, - # Continuation line with same indent as next logical line. 
+ # Continuation line with same indent as next logical line. E125, - # Continuation line over-indented for hanging indent. + # Continuation line over-indented for hanging indent. E126, - # Continuation line over-indented for visual indent. + # Continuation line over-indented for visual indent. E127, - # Continuation line under-indented for visual indent. + # Continuation line under-indented for visual indent. E128, - # Visually indented line with same indent as next logical line. + # Visually indented line with same indent as next logical line. E129, - # Continuation line unaligned for hanging indent. + # Continuation line unaligned for hanging indent. E131, - # Whitespace after '('. + # Whitespace after '('. E201, - # Whitespace before ')'. + # Whitespace before ')'. E202, - # Whitespace before ':'. + # Whitespace before ':'. E203, - # Multiple spaces before operator. + # Multiple spaces before operator. E221, - # Multiple spaces after operator. + # Multiple spaces after operator. E222, - # Missing whitespace around operator. + # Missing whitespace around operator. E225, - # Missing whitespace around arithmetic operator. + # Missing whitespace around arithmetic operator. E226, - # Missing whitespace around bitwise or shift operator. + # Missing whitespace around bitwise or shift operator. E227, - # Missing whitespace after ',', ';', or ':'. + # Missing whitespace after ',', ';', or ':'. E231, - # Multiple spaces after ','. + # Multiple spaces after ','. E241, - # Unexpected spaces around keyword / parameter equals. + # Unexpected spaces around keyword / parameter equals. E251, - # Missing whitespace around parameter equals. + # Missing whitespace around parameter equals. E252, - # At least two spaces before inline comment. - E261, + # At least two spaces before inline comment. + E261, # Inline comment should start with '# '. - E262, + E262, # Block comment should start with '# '. E265, - # Multiple spaces after keyword. + # Multiple spaces after keyword. 
E271, - # Multiple spaces before keyword. + # Multiple spaces before keyword. E272, - # Expected 1 blank line, found 0. + # Expected 1 blank line, found 0. E301, - # Expected 2 blank lines, found 0. + # Expected 2 blank lines, found 0. E302, - # Too many blank lines (3). + # Too many blank lines (3). E303, - # Expected 2 blank lines after end of function or class. + # Expected 2 blank lines after end of function or class. E305, - # Expected 1 blank line before a nested definition. + # Expected 1 blank line before a nested definition. E306, - # Line too long (82 > 79 characters). + # Line too long (82 > 79 characters). E501, - # The backslash is redundant between brackets. + # The backslash is redundant between brackets. E502, - # Multiple statements on one line (colon). + # Multiple statements on one line (colon). E701, - # Multiple statements on one line (semicolon). + # Multiple statements on one line (semicolon). E702, - # Statement ends with a semicolon. + # Statement ends with a semicolon. E703, - # Multiple statements on one line (def). + # Multiple statements on one line (def). E704, - # Trailing whitespace. + # Trailing whitespace. W291, - # No newline at end of file. + # No newline at end of file. W292, - # Blank line contains whitespace. + # Blank line contains whitespace. W293, - # Blank line at end of file. + # Blank line at end of file. W391, - # Line break occurred after a binary operator. - W504, + # Line break occurred after a binary operator. + W504, # Too opinionated. # Block comment should start with '# '. E265, - # Too many leading '#' for block comment. + # Too many leading '#' for block comment. E266, - # Module level import not at top of file. (Use cases like demandimport https://fburl.com/demandimport require statements before imports) - E402, + # Module level import not at top of file. (Use cases like demandimport https://fburl.com/demandimport require statements before imports) + E402, # Do not use bare except, specify exception instead. 
(Duplicate of B001) - E722, + E722, # (Duplicate of B003) - P207, + P207, # (Duplicate of C403) P208, # Line break occurred before a binary operator. - W503 + W503 diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml new file mode 100644 index 000000000..3ebf6640d --- /dev/null +++ b/.github/workflows/docs.yaml @@ -0,0 +1,34 @@ +name: Docs + +on: + push: + branches: + - main + + pull_request: + branches: + - main +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9"] + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python3.9 -m pip install --upgrade pip + python3.9 -m pip install numpy autodocsumm 'sphinx<6' sphinx_rtd_theme sphinx_gallery sphinxcontrib-inlinesyntaxhighlight sphinx_toolbox + cd python + python setup.py develop + cd .. + - name: Build documents with Sphinx + run: | + cd docs + make html + cd .. 
diff --git a/.github/workflows/docs.yml b/.github/workflows/pages.yaml similarity index 80% rename from .github/workflows/docs.yml rename to .github/workflows/pages.yaml index 208bd1f77..815cfd887 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/pages.yaml @@ -1,5 +1,5 @@ # Simple workflow for deploying static content to GitHub Pages -name: Documentation +name: Deploy docs to Pages on: # Runs on pushes targeting the default branch @@ -39,15 +39,8 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip - pip install autodocsumm - pip install sphinx_rtd_theme - pip install sphinx_gallery - pip install sphinxcontrib-inlinesyntaxhighlight - pip install sphinx_toolbox - pip install numpy - pip install jinja2 - pip install torch + python3.9 -m pip install --upgrade pip + python3.9 -m pip install numpy autodocsumm 'sphinx<6' sphinx_rtd_theme sphinx_gallery sphinxcontrib-inlinesyntaxhighlight sphinx_toolbox jinja2 torch cd python python setup.py develop cd .. 
diff --git a/.github/workflows/lint.yml b/.github/workflows/pylint.yaml similarity index 85% rename from .github/workflows/lint.yml rename to .github/workflows/pylint.yaml index dbd4beb83..91f0018eb 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/pylint.yaml @@ -23,9 +23,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install ufmt - pip install click - pip install flake8 + pip install ufmt==2.0.1 click==8.1.3 black==22.12.0 flake8==5.0.4 - name: Analyzing the code with flake8 run: | echo "::add-matcher::tests/lint/flake8_problem_matcher.json" @@ -38,4 +36,5 @@ jobs: - name: Check Meta copyright header run: | python tests/lint/check_meta_header.py --path=./tests --fixit=False - python tests/lint/check_meta_header.py --path=./python --fixit=False \ No newline at end of file + python tests/lint/check_meta_header.py --path=./python --fixit=False + python tests/lint/check_meta_header.py --path=./fx2ait --fixit=False diff --git a/.github/workflows/ait_ci.yml b/.github/workflows/rocm_ci.yml similarity index 93% rename from .github/workflows/ait_ci.yml rename to .github/workflows/rocm_ci.yml index 0f598865f..61c93d643 100644 --- a/.github/workflows/ait_ci.yml +++ b/.github/workflows/rocm_ci.yml @@ -1,10 +1,12 @@ -name: AITemplate_ci +name: ROCM_CI -on: - push: +on: + pull_request: + types: [labeled, synchronize, reopened] jobs: build: + if: contains(github.event.label.name, 'rocm') runs-on: rocm steps: @@ -33,7 +35,7 @@ jobs: rocm-smi rocminfo | grep "gfx" export GIT_BRANCH=${GITHUB_BASE_REF:-${GITHUB_REF#refs/heads/}} - git clone --recursive -b $GIT_BRANCH https://github.com/ROCmSoftwarePlatform/AITemplate.git + git clone --recursive -b $GIT_BRANCH https://github.com/facebookincubator/AITemplate.git cd AITemplate DOCKER_BUILDKIT=1 ./docker/build.sh rocm docker run --network=host --device=/dev/kfd --device=/dev/dri --ipc=host --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v $HOME:/dockerx/ 
ait:latest @@ -124,8 +126,9 @@ jobs: git show --summary | grep commit >> sdiff.log /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> sdiff.log # profiling - HIP_VISIBLE_DEVICES=0,1 python3 compile.py --token ${{ secrets.HF_TOKEN }} 2>&1 | tee -a sdiff.log - HIP_VISIBLE_DEVICES=0 python3 demo.py --token ${{ secrets.HF_TOKEN }} --benchmark 1 2>&1 | tee -a sdiff.log + python3 scripts/download_pipeline.py --token ${{ secrets.HF_TOKEN }} 2>&1 | tee -a sdiff.log + HIP_VISIBLE_DEVICES=0,1 python3 scripts/compile.py 2>&1 | tee -a sdiff.log + HIP_VISIBLE_DEVICES=0 python3 scripts/demo.py --benchmark 1 2>&1 | tee -a sdiff.log - name: Archive logs uses: actions/upload-artifact@v3 with: @@ -140,4 +143,3 @@ jobs: export dbuser=${{ secrets.DBUSER }} export dbpassword=${{ secrets.DBPASSWORD }} python3 process_results.py - diff --git a/.gitignore b/.gitignore index f3bbc0889..8897298b9 100644 --- a/.gitignore +++ b/.gitignore @@ -136,6 +136,9 @@ tags # macOS dir files .DS_Store +# PyCharm files +.idea + # vscode .vscode diff --git a/.gitmodules b/.gitmodules index a82a39064..1272127de 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ [submodule "3rdparty/cutlass"] path = 3rdparty/cutlass - url = https://github.com/AITemplate/cutlass.git + url = https://github.com/facebookincubator/cutlass-fork.git [submodule "3rdparty/cub"] path = 3rdparty/cub url = https://github.com/NVIDIA/cub.git @@ -8,3 +8,6 @@ path = 3rdparty/composable_kernel url = https://github.com/ROCmSoftwarePlatform/composable_kernel.git branch = develop +[submodule "3rdparty/picojson"] + path = 3rdparty/picojson + url = https://github.com/kazuho/picojson.git diff --git a/3rdparty/composable_kernel b/3rdparty/composable_kernel index 52abc2f37..78eb3f0b4 160000 --- a/3rdparty/composable_kernel +++ b/3rdparty/composable_kernel @@ -1 +1 @@ -Subproject commit 52abc2f37112d49f85f31aa343a14bd92a83b07c +Subproject commit 78eb3f0b46aafc52c6d19a07b9dc5bd19b8e7807 diff --git a/3rdparty/cutlass 
b/3rdparty/cutlass index f434be22a..a9d9b8049 160000 --- a/3rdparty/cutlass +++ b/3rdparty/cutlass @@ -1 +1 @@ -Subproject commit f434be22a6270f9f000712286f92545ccca045b7 +Subproject commit a9d9b80493e20086732f51f90f10f99ae50ae5ed diff --git a/3rdparty/picojson b/3rdparty/picojson new file mode 160000 index 000000000..111c9be51 --- /dev/null +++ b/3rdparty/picojson @@ -0,0 +1 @@ +Subproject commit 111c9be5188f7350c2eac9ddaedd8cca3d7bf394 diff --git a/README.md b/README.md index 38330592d..907e9d3bc 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,16 @@ # AITemplate [![License](https://img.shields.io/badge/License-Apache_2.0-brightgreen.svg)](https://github.com/facebookincubator/AITemplate/blob/main/LICENSE) | -[![Documentation](https://github.com/facebookincubator/AITemplate/actions/workflows/docs.yml/badge.svg)](https://facebookincubator.github.io/AITemplate) | +[![Documentation](https://github.com/facebookincubator/AITemplate/actions/workflows/docs.yaml/badge.svg)](https://facebookincubator.github.io/AITemplate) | [![CircleCI](https://circleci.com/gh/facebookincubator/AITemplate.svg?style=svg)](https://app.circleci.com/pipelines/github/facebookincubator/AITemplate) - - +[![Deploy docs to Pages](https://github.com/facebookincubator/AITemplate/actions/workflows/pages.yaml/badge.svg)](https://github.com/facebookincubator/AITemplate/actions/workflows/pages.yaml) AITemplate (AIT) is a Python framework that transforms deep neural networks into CUDA (NVIDIA GPU) / HIP (AMD GPU) C++ code for lightning-fast inference serving. AITemplate highlights include: - High performance: close to roofline fp16 TensorCore (NVIDIA GPU) / MatrixCore (AMD GPU) performance on major models, including ResNet, MaskRCNN, BERT, VisionTransformer, Stable Diffusion, etc. -- Unified, open, and flexible. Seamless fp16 deep neural network models for NVIDIA GPU or AMD GPU. Fully open source, Lego-style easy extendable high-performance primitives for new model support. 
Supports a significantly more comprehensive range of fusions than existing solutions for both GPU platforms. +- Unified, open, and flexible. Seamless fp16 deep neural network models for NVIDIA GPU or AMD GPU. Fully open source, Lego-style easily extendable high-performance primitives for new model support. Supports a significantly more comprehensive range of fusions than existing solutions for both GPU platforms. + ## More about AITemplate @@ -24,32 +24,48 @@ AITemplate provides unique advanced horizontal fusion. AITemplate can fuse paral ### Vertical Fusion -AITemplate provides strong vertical fusion. AITemplate can fuse a large range of operations into TensorCore/MatrixCore operations, such as elementwise operations, reduction operations, and layout permutation operations. AITemplate also provides back-to-back style TensorCore / MatrixCore operation fusion. +AITemplate provides strong vertical fusion. AITemplate can fuse a large range of operations into TensorCore/MatrixCore operations, such as elementwise operations, reductions, and layout permutations. AITemplate also provides back-to-back style TensorCore / MatrixCore operation fusion. ### Memory Fusion AITemplate provides innovative memory fusions. AITemplate can fuse GEMM, LayerNorm, and other operators, followed by memory operations such as concatenation, split, and slice into a single operator. ### Working w/wo PyTorch + The AITemplate-generated Python runtime can take PyTorch tensors as inputs and outputs without an extra copy. For environments without PyTorch, the AITemplate Python/C++ runtime is self-contained. ### Extensions without suffering AITemplate provides a straightforward approach for making an extension in codegen. To add a new operator or a new fused kernel into AITemplate, most of the time one only needs to add two Python files: one for a graph node definition and another for the backend codegen. The CUDA/HIP kernel in a text header file can be directly utilized in the codegen. 
+ +## FX2AIT + +FX2AIT is a Python-based tool that converts PyTorch models into AITemplate (AIT) engine for lightning-fast inference serving. Using FX2AIT's built-in AITLowerer, partial AIT acceleration can be achieved for models with unsupported operators in AITemplate. + +Key features of FX2AIT include: + +* Easy Conversion: FX2AIT requires only a PyTorch model and input for conversion, generating an "AITModule" output for inference serving. +* Expanded Support: AITemplate does not support all PyTorch operators. FX2AIT's AITLowerer offers a solution for partial AIT conversion for models with unsupported operators. Check the `fx2ait/fx2ait/example/03_lowering_split` for more information. + +More info can be found from https://github.com/facebookincubator/AITemplate/tree/main/fx2ait. + + ## Installation -**Hardware requirement:** +**Hardware requirements:** - **NVIDIA**: AIT is only tested on SM80+ GPUs (Ampere etc). Not all kernels work with old SM75/SM70 (T4/V100) GPUs. - **AMD**: AIT is only tested on CDNA2 (MI-210/250) GPUs. There may be compiler issues for old CDNA1 (MI-100) GPUs. -## Clone the code +### Clone the code + When cloning the code, please use the following command to also clone the submodules: ``` git clone --recursive https://github.com/facebookincubator/AITemplate ``` ### Docker Image + We highly recommend using AITemplate with Docker to avoid accidentally using a wrong version of NVCC or HIPCC. - CUDA: `./docker/build.sh cuda` - ROCM: `DOCKER_BUILDKIT=1 ./docker/build.sh rocm` @@ -57,6 +73,7 @@ We highly recommend using AITemplate with Docker to avoid accidentally using a w This will build a docker image with tag `ait:latest`. ### From Source + The following command will create a Python wheel for AITemplate. Please ensure you have correct CUDA/ROCm compiler installed. 
- CUDA: CUDA 11.6 - ROCm: We tested on ROCm 5.2.3 with a customized build HIPCC with the command in docker/Dockerfile.rocm#L87-L96 @@ -83,45 +100,48 @@ There are a few tutorials for onboarding: ## Examples & Performance -AITemplate provides the following model templates & reference performance data on A100/MI-250 + +AITemplate provides the following model templates & reference performance data on A100/MI-250: - [01_ResNet-50](examples/01_resnet-50/) with PyTorch Image Models (TIMM) - [02_MaskRCNN-FPN](examples/02_detectron2/) with Detectron2 -- [03_BERT](examples/03_bert/) with HuggingFace Transformer +- [03_BERT](examples/03_bert/) with Hugging Face Transformer - [04_Vision Transformer](examples/04_vit/) with PyTorch Image Models (TIMM) -- [05_Stable Diffusion](examples/05_stable_diffusion/) with HuggingFace Diffusers +- [05_Stable Diffusion](examples/05_stable_diffusion/) with Hugging Face Diffusers ## Release -AITemplate has a 90 days release cycle. -In the next one or two releases, we will focus on: -- Deprecating FlashAttention: Unify CUDA Attention computation to Composable Kernel (AMD GPU) style back-to-back fusion to improve performance and increase flexibility for NVIDIA GPU Transformer users. -- Remove kernel profiling requirement. -- GEMM + LayerNorm fusion, GEMM + GEMM fusion, Conv + Conv fusion. -- Better dynamic shape support: Focus on the dynamic sequence in Transformers. -- More model templates: Provide model templates with control flow and containers. +All current development updates can be seen in the AITemplate repository. Releases are not on a set schedule and will only be tagged for significant feature releases. + +Mid-term plan: +- Better dynamic shape support: Focus on the dynamic sequence in Transformers. Add symbolic shape support. - More automatic graph passes: Relief manual rewrite models to obtain the best performance. -- Enable more fusions on AMD backend. +- Quantization: fp8/int8/int4. +- Sparsity pruning for Gemm. 
+- PT2 integration: Aten2AIT is under active development. -Some ongoing/potential work that won't appear in the next short-term release: -- Automatic Pytorch-FX, ONNX, Open-XLA and other format model conversion. -- Quantized model (int8/fp8/int4) support. +Long-term plan: +- Automatic ONNX, Open-XLA and other format model conversion. - Composable Kernel CPU extension on AVX2/AVX-512 for AMD Epyc CPU. ## Contributing + Check our [contributing guide](CONTRIBUTING.md) to learn about how to contribute to the project. ## The Team -AITemplate is co-created by Meta engineers: [Bing Xu](https://github.com/antinucleon), [Ying Zhang](https://github.com/ipiszy), [Hao Lu](https://github.com/hlu1), [Yang Chen](https://github.com/chenyang78), and [Terry Chen](https://github.com/terrychenism), with major contributions coming from more talented engineers. A non-exhaustive list to mention is Mike Iovine, Mu-Chu Lee, Scott Wolchok, Oleg Khabinov, Shirong Wu, Huaming Li, Hui Guo, Zhijing Li, Max Podkorytov. We also want to thank the discussions with Andrew Tulloch, Yinghai Lu, Lu Fang. +AITemplate is currently maintained by Meta engineers: [Ying Zhang](https://github.com/ipiszy), [Yang Chen](https://github.com/chenyang78), [Terry Chen](https://github.com/terrychenism), [Mu-Chu Lee](https://github.com/muchulee8), [Max Podkorytov](https://github.com/tenpercent), [Adnan Akhundov](https://github.com/aakhundov). -AITemplate is currently maintained by Meta engineers: [Ying Zhang](https://github.com/ipiszy), [Hao Lu](https://github.com/hlu1), [Yang Chen](https://github.com/chenyang78), [Terry Chen](https://github.com/terrychenism), [Mike Iovine](https://github.com/mikeiovine), [Mu-Chu Lee](https://github.com/muchulee8) and [Bing Xu](https://github.com/antinucleon). 
+AITemplate is co-created by Meta engineers: [Bing Xu](https://github.com/antinucleon), [Ying Zhang](https://github.com/ipiszy), [Hao Lu](https://github.com/hlu1), [Yang Chen](https://github.com/chenyang78), and [Terry Chen](https://github.com/terrychenism), with major contributions coming from more talented engineers. A non-exhaustive list to mention is Mike Iovine, Mu-Chu Lee, Scott Wolchok, Oleg Khabinov, Shirong Wu, Huaming Li, Hui Guo, Zhijing Li, Max Podkorytov. We also want to thank Andrew Tulloch, Yinghai Lu, Lu Fang for the valuable discussions. +FX2AIT and Aten2AIT are co-created and maintained by Meta engineers: [Wei Wei](https://github.com/frank-wei), [Shirong Wu](https://github.com/wushirong) and [Zhijing Li](https://github.com/tissue3). -## Acknowledgement -AITemplate team works deeply with NVIDIA [CUTLASS](https://github.com/NVIDIA/cutlass) Team (Led by Andrew Kerr, Haicheng Wu) and AMD [Composable Kernel](https://github.com/ROCmSoftwarePlatform/composable_kernel) Team (Led by Chao Liu, Jing Zhang). We co-designed many advanced GPU optimizations specialized for each platform, and nothing is possible without our close collaboration. +## Acknowledgements + +AITemplate team works deeply with NVIDIA [CUTLASS](https://github.com/NVIDIA/cutlass) Team (led by Andrew Kerr, Haicheng Wu) and AMD [Composable Kernel](https://github.com/ROCmSoftwarePlatform/composable_kernel) Team (led by Chao Liu, Jing Zhang). We co-designed many advanced GPU optimizations specialized for each platform, and nothing is possible without our close collaboration. ## License + AITemplate is licensed under the [Apache 2.0 License](https://github.com/facebookincubator/AITemplate/blob/main/LICENSE). diff --git a/default.nix b/default.nix new file mode 100644 index 000000000..d521651e9 --- /dev/null +++ b/default.nix @@ -0,0 +1,50 @@ +{ pkgs ? 
import <nixpkgs> {
# # ROCM Docker Image for AITemplate -FROM ubuntu:20.04 - -ARG ROCMVERSION=5.3 - -RUN set -xe - -ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/ -# Add rocm repository -RUN apt-get update -RUN apt-get install -y wget gnupg -RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - -RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list" -RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add - -RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/apt/sources.list" - -# Install dependencies -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ - apt-utils \ - build-essential \ - cmake-data \ - cmake \ - curl \ - git \ - hip-rocclr \ - jq \ - libelf-dev \ - libncurses5-dev \ - libnuma-dev \ - libpthread-stubs0-dev \ - llvm-amdgpu \ - pkg-config \ - python \ - python3 \ - python-dev \ - python3-dev \ - python3-pip \ - software-properties-common \ - rocm-dev \ - rocm-device-libs \ - rocm-cmake \ - rocm-libs \ - vim \ - zlib1g-dev \ - openssh-server \ - clang-format-10 \ - kmod && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -# Setup ubsan environment to printstacktrace -RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer -ENV UBSAN_OPTIONS=print_stacktrace=1 - -# Install an init system -RUN wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb -RUN dpkg -i dumb-init_*.deb && rm dumb-init_*.deb - -ARG PREFIX=/opt/rocm +FROM docker.io/rocm/pytorch:rocm6.0_ubuntu22.04_py3.9_pytorch_2.0.1 # Setup ubsan environment to printstacktrace ENV UBSAN_OPTIONS=print_stacktrace=1 ENV LC_ALL=C.UTF-8 ENV LANG=C.UTF-8 -ADD ./docker/install/rocm_dev-requirements.txt dev-requirements.txt RUN groupadd -f render -# Install the new rocm-cmake version -RUN git clone -b master 
https://github.com/RadeonOpenCompute/rocm-cmake.git && \ - cd rocm-cmake && mkdir build && cd build && \ - cmake .. && cmake --build . && cmake --build . --target install - WORKDIR / -ADD ./docker/install/ /Install +RUN git clone -b merge_upstream --recursive https://github.com/ROCmSoftwarePlatform/AITemplate.git + +WORKDIR /AITemplate # necessary package -RUN bash /Install/install_basic_dep.sh +RUN bash ./docker/install/install_basic_dep.sh # for test -RUN bash /Install/install_test_dep.sh +RUN bash ./docker/install/install_test_dep.sh # for docs -RUN bash /Install/install_doc_dep.sh - -# Install Pytorch -RUN pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm5.1.1 +RUN bash ./docker/install/install_doc_dep.sh # for detection RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata -RUN bash /Install/install_detection_deps.sh +RUN bash ./docker/install/install_detection_deps.sh -# Copy AITemplate to Docker -RUN mkdir /AITemplate -ADD ./COMMIT_INFO /AITemplate/COMMIT_INFO -ADD ./python /AITemplate/python -ADD ./3rdparty /AITemplate/3rdparty -ADD ./examples /AITemplate/examples -ADD ./tests /AITemplate/tests -ADD ./docs /AITemplate/docs -ADD ./static /AITemplate/static -ADD ./licenses /AITemplate/licenses -ADD ./docker/install/install_ait.sh /AITemplate/ -RUN bash /AITemplate/install_ait.sh +RUN bash ./docker/install/install_ait.sh diff --git a/docker/install/install_basic_dep.sh b/docker/install/install_basic_dep.sh index 801ef53ef..18f37f628 100644 --- a/docker/install/install_basic_dep.sh +++ b/docker/install/install_basic_dep.sh @@ -1,4 +1,5 @@ #!/bin/bash +apt install -y time pip3 install numpy pip3 install jinja2 diff --git a/docker/install/install_detection_deps.sh b/docker/install/install_detection_deps.sh index 47238cd3c..e8b91f9d5 100644 --- a/docker/install/install_detection_deps.sh +++ b/docker/install/install_detection_deps.sh @@ -5,5 +5,5 @@ pip3 install yacs pip3 install opencv-python pip3 
install tqdm pip3 install timm -pip3 install transformers -pip3 install diffusers +pip3 install transformers==4.25.0 +pip3 install diffusers==0.24.0 \ No newline at end of file diff --git a/docs/image/gpu_grid_block.png b/docs/image/gpu_grid_block.png new file mode 100644 index 000000000..a486a5bf9 Binary files /dev/null and b/docs/image/gpu_grid_block.png differ diff --git a/docs/image/pack_size_1.png b/docs/image/pack_size_1.png new file mode 100644 index 000000000..b07bb5ff4 Binary files /dev/null and b/docs/image/pack_size_1.png differ diff --git a/docs/image/pack_size_2.png b/docs/image/pack_size_2.png new file mode 100644 index 000000000..6769b0691 Binary files /dev/null and b/docs/image/pack_size_2.png differ diff --git a/docs/image/pack_size_4.png b/docs/image/pack_size_4.png new file mode 100644 index 000000000..40deaa502 Binary files /dev/null and b/docs/image/pack_size_4.png differ diff --git a/docs/image/pack_size_8.png b/docs/image/pack_size_8.png new file mode 100644 index 000000000..804187fd9 Binary files /dev/null and b/docs/image/pack_size_8.png differ diff --git a/docs/image/softmax.png b/docs/image/softmax.png new file mode 100644 index 000000000..e2fc3c523 Binary files /dev/null and b/docs/image/softmax.png differ diff --git a/docs/image/vs_oneflow.png b/docs/image/vs_oneflow.png new file mode 100644 index 000000000..495bdd426 Binary files /dev/null and b/docs/image/vs_oneflow.png differ diff --git a/docs/source/arch/philosophy.rst b/docs/source/arch/philosophy.rst index 2eefb8f5d..d1ac35db4 100644 --- a/docs/source/arch/philosophy.rst +++ b/docs/source/arch/philosophy.rst @@ -5,12 +5,17 @@ Design Philosophy KISS (Keep it simple and stupid) -------------------------------- -AITemplate avoids deep IR lowering stacks to reduce the system's complexity. A highly modularized, multiple backend codegen system written in pure Python directly attacks the pain point in high-performance GPU inference. 
+AITemplate avoids deep IR lowering stacks to reduce the system's complexity. +A highly modularized, multiple backend codegen system written in pure Python directly attacks the pain point in high-performance GPU inference. Pragmatism ---------- -AITemplate provides a PyTorch-style frontend to enable engineers to manually match the PyTorch model & weights to AITemplate for optimization. Using it is less painful than debugging different lowering IR stacks, especially for complex models such as MaskRCNN. +AITemplate provides a PyTorch-style frontend to enable engineers to manually match the PyTorch model & weights to AITemplate for optimization. +Using it is less painful than debugging different lowering IR stacks, especially for complex models such as MaskRCNN. - -We believe most of the neural network workload can be decoupled. For example, most of the network can be decoupled into Encoder, Decoder, and Decoder logics. For encoder and decoder, it is a computation bounded problem. For decoder logic, it may involve more control flows. By using divide and conquer, we left the decoder logic part to C++ or Python rather than build a unified language / IR stack to play as the silver bullet. \ No newline at end of file +We believe most of the neural network workload can be decoupled. +For example, most of the network can be decoupled into Encoder, Decoder, and Decoder logics. +For encoder and decoder, it is a computation-bounded problem. +For decoder logic, it may involve more control flows. +By using divide and conquer, we left the decoder logic part to C++ or Python rather than build a unified language / IR stack as a silver bullet. 
diff --git a/docs/source/conf.py b/docs/source/conf.py index bf239d5d1..51fbf50db 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -18,11 +18,11 @@ # -- Project information ----------------------------------------------------- project = "AITemplate" -copyright = "2022, Meta Platforms" +copyright = "2022-2023, Meta Platforms" author = "Meta Platforms" # The full version, including alpha/beta/rc tags -release = "0.1" +release = "0.2" # -- General configuration --------------------------------------------------- diff --git a/docs/source/debughints.rst b/docs/source/debughints.rst index 074254a75..0bd07d3c1 100644 --- a/docs/source/debughints.rst +++ b/docs/source/debughints.rst @@ -1,14 +1,15 @@ Debug Hints =========== -AITemplate is a new project under active development. We have a rich test set to avoid bugs but don't be surprised if there is anything unexpected. +AITemplate is a new project under active development. +We have a rich test set to avoid bugs but don't be surprised if there is anything unexpected. -Here are some helpful tips when we learned during the development AITemplate: +Here are some helpful tips we learned during the development of AITemplate: -1. Once the codegen for op which requires profiling is changed, remember to delete old profilers (usually located at workdir), and flush the cache by either deleting ~/.aitemplate or setting environment variable FLUSH_PROFILE_CACHE=1 +1. Once the codegen for op which requires profiling is changed, remember to delete old profilers (usually located at workdir), and flush the cache by either deleting `~/.aitemplate` or setting the environment variable `FLUSH_PROFILE_CACHE=1`. -2. Check the pseudo code/visualization generated by each optimization pass if some optimization is harmful. +2. Check the pseudo code/visualization generated by each optimization pass if some optimization behaves in unexpected way. 3. 
Always do the numerical test, from small to large, to make sure the entire model is correct. -4. Try to make the new fusion subgraph work in a manual way, then try to add an automatic pass to rewrite the graph with the fused subgraph. \ No newline at end of file +4. Try to make the new fusion subgraph work in a manual way, then try to add an automatic pass to rewrite the graph with the fused subgraph. diff --git a/docs/source/index.rst b/docs/source/index.rst index 775d33792..9dbcdcc9a 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,11 +1,11 @@ AITemplate Documentation -====================================== +======================== AITemplate (AIT) is a Python framework that transforms deep neural networks into CUDA (NVIDIA GPU) / HIP (AMD GPU) C++ code for lightning-fast inference serving. AITemplate highlights include: * High performance: close to roofline fp16 TensorCore (NVIDIA GPU) / MatrixCore (AMD GPU) performance on major models, including ResNet, MaskRCNN, BERT, VisionTransformer, Stable Diffusion, etc. -* Unified, open, and flexible. Seamless fp16 deep neural network models for NVIDIA GPU or AMD GPU. Fully open source, Lego-style easy extendable high-performance primitives for new model support. Supports a significantly more comprehensive range of fusions than existing solutions for both GPU platforms. +* Unified, open, and flexible. Seamless fp16 deep neural network models for NVIDIA GPU or AMD GPU. Fully open source, Lego-style easily extendable high-performance primitives for new model support. Supports a significantly more comprehensive range of fusions than existing solutions for both GPU platforms. .. toctree:: diff --git a/docs/source/install/index.rst b/docs/source/install/index.rst index 48244cfa7..6e684ab03 100644 --- a/docs/source/install/index.rst +++ b/docs/source/install/index.rst @@ -7,11 +7,11 @@ Using Docker The easiest way to get started is to use Docker. 
Using docker is able to avoid performance regression caused by incorrect version of NVCC and HIPCC. To use docker, we provide a bash script to build the docker image. -- CUDA: +- CUDA: .. code-block:: bash ./docker/build.sh cuda -- ROCM: +- ROCM: .. code-block:: bash DOCKER_BUILDKIT=1 ./docker/build.sh rocm @@ -31,13 +31,13 @@ To launch the docker container docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add=video --ipc=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined ait:latest -AITemplate will be installed in as a Python package to Python 3.8. There will be also a copy of source code and examples at `/AITemplate` +AITemplate will be installed as a Python package in Python 3.8. There will be also a copy of the source code and examples at `/AITemplate`. -Install as standard Python package ----------------------------------- +Installing as a Standard Python Package +--------------------------------------- -Before start installing AITemplate, first make sure you have correct hardware and software environment. +Before installing AITemplate, first make sure you have correct hardware and software environment. - Hardware - NVIDIA: AIT is only tested on SM80+ GPUs (Ampere etc). @@ -52,24 +52,23 @@ Before start installing AITemplate, first make sure you have correct hardware an - AMD: ROCm 5.2, with HIPCC 10736 (commit `b0f4678b9058a4ae00200dfb1de0da5f2ea84dcb`) .. warning:: - - Incorrect compiler version will lead performance regression. - - Instruction for building HIPCC 10736 can be founded in `docker/Dockerfile.rocm` + - Incorrect compiler version may lead to performance regression. + - Instruction for building HIPCC 10736 can be founded in `docker/Dockerfile.rocm`. -When clone the code, please use the following command to clone the submodules: -``` -git clone --recursive https://github.com/facebookincubator/AITemplate -``` +When cloning the code, please use the following command to clone the submodules: + + .. 
code-block:: bash + + git clone --recursive https://github.com/facebookincubator/AITemplate .. warning:: - Please check all submodules are cloned correctly before go to next step. + Please check that all submodules are cloned correctly before the next step. -Then build Python wheel package and install. +Then build the Python wheel package and install it: .. code-block:: bash cd python python setup.py bdist_wheel - pip install dist/aitemplate-0.0.1-py3-none-any.whl - - + pip install dist/aitemplate-*.whl diff --git a/docs/source/reference/env.rst b/docs/source/reference/env.rst index 1342becf6..392999b33 100644 --- a/docs/source/reference/env.rst +++ b/docs/source/reference/env.rst @@ -1,13 +1,40 @@ Environment Variables ===================== -AITemplate uses environment variables to control the behavior of codegen and profiling. All the environment variables used in AITemplate are listed here. +AITemplate uses environment variables to control the behavior of codegen and profiling. +The environment variables used in AITemplate are listed here. Codegen ------- **NUM_BUILDERS**: The number of CPU jobs running in parallel during codegen. It controls both the profiler codegen and the final .so codegen. It's set to 12 in NIGHTLY jobs. Internally, it's set to 12 for normal tests and 24 for heavy tests. By default, the builder uses all the available CPUs for building. -**RECOMPILE**: If set to "0", it skips compilation for the .so and reuses the previously compiled ones. It is used to speed up local testing. The default value is "1" to always recompile. +**AIT_RECOMPILE**: If set to "0", it skips compilation for the .so and reuses the previously compiled ones. It is used to speed up local testing. The default value is "1" to always recompile. + +**AIT_NDEBUG**: If set to "1", compile with `NDEBUG`, disabling debug assertions. Recommended for production builds. "1" by default. 
+ +**AIT_COMPILER_OPT**: The optimization level for a compiler, which is directly passed to the host compiler command line. AITemplate host code may be very light in certain cases, so there is nothing to optimize for a host compiler. Thus, there is no need to make host compiler perform time costly optimizations. It may be very useful to use "-O0" value for debugging GPU kernels. "-O3" by default. + +**AIT_TIME_COMPILATION**: If set to "1", time each make command at the compilation time. This helps us to do compilation time analysis. Requires to install `time `_ package. + +**AIT_MULTISTREAM_MODE**: Controls multi-stream mode. Default mode is "0". +* If set to "0", then no multistreaming is used. +* If set to "1", then a simple multistreaming is used (iteratively track a wavefront of independent operators and execute ones). + +**AIT_MULTISTREAM_EXTRA_STREAMS**: Specifies the number of additional streams used. Default value is "4". + +**AIT_MULTISTREAM_MAX_MEM_PARALLEL_OPS**: Maximum number of parallel operators used in memory planning for simple multi-stream mode. Default value is "99999999" (basically, unlimited). + +**AIT_USE_CMAKE_COMPILATION**: (An experimental feature) If set to "1", then `cmake` will used instead of `make`. This allows to build AITemplate using MSVC Compiler + MSBuild on Windows, and it works for linux as well. This builder does not support many features (such as caching) yet. But it allows to generate a cmake project that can be loaded to a modern IDE. Default value is "0". + +**AIT_ENABLE_STANDALONE**: Enable standalone test and benchmark executable generation. Default value is "0" (disabled). If set to "1", this will generate a "test" executable that may be used to run standalone tests and benchmarks. This standalone executable is also well suited for running through debuggers and/or profiling tools, as it does not pull in python and pytorch as dependencies, unlike most python unit tests. 
+ +**AIT_ENABLE_PTXAS_INFO**: Set this to "1" to enable the generation and logging of verbose ( tuning-relevant ) information about CUDA ptx assembly code produced by the CUDA compiler nvcc. Intermediate ptx files, annotated with C++ source info will be written to the build directory. In addition, this flag enables warnings about CUDA register spilling and resource usage. + +**AIT_CUDA_DEBUG_LEVEL**: Configure level of CUDA debug information. Defaults to no debug info. This may either be a string with options passed to nvcc ( for example "-g -G" or "-lineinfo" ) or a CUDA debug level from "0" (default, no debug info), "1" ( "-lineinfo" ) include source code line information. Ideal for profiling with ncu/nsight-compute, "2" full debug information (**warning**: this disables all optimizations, regardless of other settings) + +**AIT_ENABLE_CUDA_SOURCE_NAVIGATION_FIX**: (Only supported by FBCUDA target so far): When this flag is enabled by setting it to "1" (it is disabled by default), every *.cu file in build dirs into a corresponding *.cu.h file and create a *.cu file which just includes this file. This fixes code navigation issues in some IDE's which don't treat .cu files as C++ files and disable code navigation. + +**AIT_ENABLE_INCLUDE_FROM_SOURCETREE**: (Only supported by FBCUDA target so far) When this flag is enabled by setting it to "1" (it is disabled by default), the target will create an in-place build which tries to directly reference the include paths within the AITemplate source tree. This helps to iterate faster during native Kernel/Operator development and debugging. Profiling --------- @@ -22,16 +49,24 @@ Profiling **HIP_VISIBLE_DEVICES**: This one is from ROCm itself. It's used to set the number of GPU devices available for profiling. Set to "0,1,2,3,4,5,6,7" to speed up profiling. For benchmarking, it's useful to set to a particular device to lower noise. 
**FORCE_PROFILE**: If set to "1", it will do profiling regardless of in_ci_env and disable_profiler_codegen. For non-NIGHTLY CI, we do not do profiling, and we could use FORCE_PROFILE=1 in these CI to do runs with codegen, compile, and profile.
+ +**AIT_PLOT_SHORTEN_TENSOR_NAMES**: If set to "1", shorten too long tensor names for a plot of a model graph, thus making a plot much easier to analyze visually. "0" by default. + +**AIT_USE_FAST_MATH**: If set to "0", no fast math option will be used for the device code generation. Default value is "1". diff --git a/docs/source/runtime/cxx_design.rst b/docs/source/runtime/cxx_design.rst index 5ef18f889..d4608409f 100644 --- a/docs/source/runtime/cxx_design.rst +++ b/docs/source/runtime/cxx_design.rst @@ -1,29 +1,30 @@ -================== +================ C++ Runtime Note -================== +================ `Model` v.s. `ModelContainer` -============================== +============================= -These are the two main classes involved in the C++ runtime implementation. +These are the two main classes involved in the C++ runtime implementation: -* The bulk of the runtime implementation is in `Model`. -* `ModelContainer` stores a set of shared constants and a collection of `Model`s. Almost all functions in `model_interface.h` forward to a method on `ModelContainer`. When `Run` is invoked, `ModelContainer` looks for an available `Model`, or blocks until one is available (see the section on asynchronous predictions). It then forwards the run request to the runtime. +* The bulk of the runtime implementation is in the `Model` class. +* The `ModelContainer` class stores a set of shared constants and a collection of `Model` instances. Almost all functions in `model_interface.h` forward to a method in `ModelContainer`. When `Run` is invoked, `ModelContainer` looks for an available `Model`, or blocks until one becomes available (see the section on asynchronous predictions). It then forwards the run request to the runtime. Code Structure ============== Some important files: -1. `include/model_interface.h`: The interface that we expose in the compiled .so +1. `include/model_interface.h`: The interface that we expose in the compiled `.so`. 2. 
`include/model_container.h`: The bulk of the `ModelContainer` implementation. Some files are generated at compile time. These include: -* `model-generated.h`: The implementation for `Model`. -* `model_container_base.cu`: A small part of the implementation for `ModelContainer` needs to be codegened. So `ModelContainer` inherits from `ModelContainerBase`, and `ModelContainerBase`'s implementation lives in this file. See `model_container.h` for more details. +* `model-generated.h`: The implementation of the `Model`. +* `model_container_base.cu`: A small part of the implementation for `ModelContainer` that needs to be generated. `ModelContainer` inherits from `ModelContainerBase`, and `ModelContainerBase`'s implementation lives in this file. See `model_container.h` for more details. -All codegen templates can be found in `backend/main_templates.py`. The codegen implementation is in `backend/codegen.py`. - -Note that many of the headers in this directory rely on generated code and thus cannot be `#include`d in external projects. The exception is `model_interface.h`. +All codegen templates can be found in `backend/main_templates.py`. +The codegen implementation is in `backend/codegen.py`. +Note that many of the headers in this directory rely on generated code and thus cannot be `#include` -d in external projects. +`model_interface.h` is an exception. diff --git a/docs/source/runtime/py_design.rst b/docs/source/runtime/py_design.rst index c143123de..55093b8df 100644 --- a/docs/source/runtime/py_design.rst +++ b/docs/source/runtime/py_design.rst @@ -1,6 +1,6 @@ -===================== +=================== Python Runtime Note -===================== +=================== Python `Model` ============== @@ -16,7 +16,7 @@ This class represents a contiguous blob of memory that AIT will use as a tensor. * `shape: List[int]`: The shape of the tensor. * `dtype: str`: The tensor's dtype; one of `"float32", "float16", "int32", "int64"`. 
Note that most ops only support float16 at this stage. -If using AITemplate with PyTorch, `AITData`s can be constructed with the `torch_to_ait_data` utility: +When using AITemplate with PyTorch, `AITData` can be constructed with the `torch_to_ait_data` utility: .. code-block:: python @@ -30,14 +30,14 @@ If PyTorch is not available, `Model` provides a set of functions for copying, al `run` ----- -`run` takes a set of inputs and outputs as `AITData`s. Both arguments can be passed as either an ordered list or a dictionary (mapping name to tensor). +`run` takes inputs and outputs as collections of `AITData` instances. Both arguments can be passed as either an ordered list or a dictionary (mapping name to tensor). .. code-block:: python # Arguments as a dictionary module.run( {"input0": in0_ait, "input1": in1_ait}, - {"output0": out0_ait, "output1": out0_ait}, + {"output0": out0_ait, "output1": out1_ait}, ) # Arguments as an ordered list. Note that you might need to query @@ -45,8 +45,8 @@ If PyTorch is not available, `Model` provides a set of functions for copying, al input_name_to_idx = module.get_input_name_to_index_map() output_name_to_idx = module.get_output_name_to_index_map() - inputs = [None for i in range(len(input_name_to_idx))] - outputs = [None for i in range(len(input_name_to_idx))] + inputs = [None] * len(input_name_to_idx) + outputs = [None] * len(output_name_to_idx) for name in input_name_to_idx: inputs[input_name_to_idx[name]] = ait_inputs[name] @@ -55,9 +55,9 @@ If PyTorch is not available, `Model` provides a set of functions for copying, al outputs[output_name_to_idx[name]] = ait_outputs[name] module.run(inputs, outputs) - -One important caveat is that the output must be its **maximum** size. This is because of dynamic shapes - the size of the output may vary, but its shape is not inferred until inference time. 
The maximum shape can be queried with the `get_output_maximum_shape()`: + +One important caveat is that the output must have the **maximum** possible size. This is because of dynamic shapes: the size of the output may vary, but its shape is not inferred until inference time. The maximum shape can be queried with the `get_output_maximum_shape()`: .. code-block:: python @@ -67,7 +67,7 @@ One important caveat is that the output must be its **maximum** size. This is be max_shape = module.get_output_maximum_shape("output") -`Model.run` returns a dictionary of output `AITData`s with (possibly dynamic) shapes that the runtime inferred. +`Model.run` returns a dictionary of output `AITData` instances with (possibly dynamic) shapes that inferred in the runtime. Nullptr Inputs/Outputs ---------------------- @@ -102,7 +102,7 @@ Constants are read-only and *shared* with all runtimes in the `ModelContainer`. `run_with_tensors` ------------------ -`run_with_tensors` is a convenience method with the same interface as `run`, except it can take lists of `torch.Tensor`s: +`run_with_tensors` is a convenience method with the same interface as `run`, except it can take lists (or dicts) of `torch.Tensor` instances: .. code-block:: python @@ -115,9 +115,14 @@ Constants are read-only and *shared* with all runtimes in the `ModelContainer`. Streams and Asynchronous Predictions ------------------------------------ -A pointer to a stream can optionally be passed to `run`. If none is given, the prediction happens on the default stream 0. If the `sync` argument is set to `True`, the stream is synchronized before `run()` returns. `sync` is `True` by default. +A pointer to a stream can optionally be passed to `run`. +If none is given, the prediction happens on the default stream 0. +If the `sync` argument is set to `True`, the stream is synchronized before `run()` returns. +`sync` is `True` by default. -Multiple predictions can happen at the same time (on the same or different streams). 
Under the hood, there is a fixed-size pool of runtime objects. When all the runtimes are used, `run()` blocks until one is available. +Multiple predictions can happen at the same time (on the same or different streams). +Under the hood, there is a fixed-size pool of runtime objects. +When all the runtimes are used, `run()` blocks until one becomes available. The size of this pool can be configured with the `num_runtimes` option in `Model`'s constructor. CUDA Graph diff --git a/docs/source/tutorial/how_to_add_op.rst b/docs/source/tutorial/how_to_add_op.rst index 160745336..988f5375e 100644 --- a/docs/source/tutorial/how_to_add_op.rst +++ b/docs/source/tutorial/how_to_add_op.rst @@ -1,17 +1,17 @@ How to add an operator to the AIT codegen -========================================= +========================================= This tutorial will demonstrate how to add a new operator to the AIT codegen. -Full source code can be founded at `examples/07_how_to_run_pt_model/how_to_run_pt_model.py` +Full source code can be found at `examples/07_how_to_run_pt_model/how_to_run_pt_model.py`. 0. Prerequisites ------------------ +---------------- -We need to import necessary Python modules +We need to import necessary Python modules: .. code-block:: python - + from typing import Any, Dict, List import jinja2 @@ -26,9 +26,9 @@ We need to import necessary Python modules 1. Define the operator graph node ----------------------------------- +--------------------------------- -Graph node is usually defined at `aitemplate/compiler/ops`. +Graph nodes are usually defined at `aitemplate/compiler/ops`. .. code-block:: python @@ -72,15 +72,15 @@ Graph node is usually defined at `aitemplate/compiler/ops`. .. note:: - `_attrs` in Operator is the most important data structure for codegen. - - `_attrs["op"]` is the identity of operator category, which is used to find the corresponding codegen function in backend, must be **unique**. 
+ - `_attrs["op"]` is the identity of operator category, which is used to find the corresponding codegen function in the backend; must be **unique**. 2. Define the necessary templates for Codegen ----------------------------------------------- +--------------------------------------------- In AIT, there are 4 important templates for codegen: - `FUNC_TEMPLATE`: the template for generating the function body of the operator, and invoke GPU kernel in the body. -- `FUNC_SIGNATURE_TEMPLATE`: the template for generating the function signature of the operator. The signature defined name, and arguments of the function. +- `FUNC_SIGNATURE_TEMPLATE`: the template for generating the function signature of the operator. The signature defines the name and arguments of the function. - `FUNC_CALL_TEMPLATE`: the template for generating the function call of the operator. The call will be used during inference to invoke the GPU kernel with given arguments. - `FUNC_DECL`: the template for forward declaration of the operator function. This is usually an alias of `FUNC_SIGNATURE_TEMPLATE`. @@ -128,7 +128,7 @@ In AIT, there are 4 important templates for codegen: ) 3. Create the GPU kernels --------------------------- +------------------------- In this example we use a simplest add one kernel. The kernel can be written by hand (as what programmer is expected to do), or generated by other tools. @@ -166,10 +166,10 @@ In this example we use a simplest add one kernel. The kernel can be written by h ) 4. Define the codegen function -------------------------------- +------------------------------ -The codegen function is the function that render the templates we defined into valid C++ code string. -The codegen function will take `func_attrs` from graph node, and fill into the jinja2 template. +The codegen function is the function that renders the templates we defined into valid C++ code string. +The codegen function will take `func_attrs` from the graph node, and fill in the jinja2 template. .. 
code-block:: python @@ -213,10 +213,10 @@ The codegen function will take `func_attrs` from graph node, and fill into the j ).strip() ) -5.1 Register the codegen function to CUDA backend ---------------------------------------------------- +5.1 Register the codegen function in CUDA backend +------------------------------------------------- -CUDA backend functions is usually defined at `aitemplate/backend/cuda/`. +CUDA backend functions are usually defined at `aitemplate/backend/cuda/`. .. code-block:: python @@ -240,10 +240,9 @@ CUDA backend functions is usually defined at `aitemplate/backend/cuda/`. return gen_function_call(func_attrs, indent, is_cuda=True) 5.2 (Optional) Register the codegen function to ROCm backend --------------------------------------------------------------- - -ROCm backend functions is usually defined at `aitemplate/backend/rocm/`. +------------------------------------------------------------ +ROCm backend functions are usually defined at `aitemplate/backend/rocm/`. .. code-block:: python @@ -269,7 +268,7 @@ ROCm backend functions is usually defined at `aitemplate/backend/rocm/`. 6. Compile and verify the results with PyTorch ------------------------------------------------- +---------------------------------------------- .. code-block:: python @@ -299,4 +298,3 @@ ROCm backend functions is usually defined at `aitemplate/backend/rocm/`. outputs = {"Y": y} module.run_with_tensors(inputs, outputs) print(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2)) - diff --git a/docs/source/tutorial/how_to_infer_pt.rst b/docs/source/tutorial/how_to_infer_pt.rst index 67891c46a..8b0535ce0 100644 --- a/docs/source/tutorial/how_to_infer_pt.rst +++ b/docs/source/tutorial/how_to_infer_pt.rst @@ -1,17 +1,15 @@ How to inference a PyTorch model with AIT -========================================== +========================================= This tutorial will demonstrate how to inference a PyTorch model with AIT. 
-Full source code can be founded at `examples/07_how_to_run_pt_model/how_to_run_pt_model.py` +Full source code can be found at `examples/07_how_to_run_pt_model/how_to_run_pt_model.py`. 0. Prerequisites ------------------ +---------------- -We need to import necessary Python modules +We need to import necessary Python modules: .. code-block:: python - - from collections import OrderedDict import torch @@ -23,9 +21,9 @@ We need to import necessary Python modules 1. Define a PyTorch module ---------------------------- +-------------------------- -Here we define a PyTorch model which is commonly seen in Transformers. +Here we define a PyTorch model which is commonly seen in Transformers: .. code-block:: python @@ -46,7 +44,7 @@ Here we define a PyTorch model which is commonly seen in Transformers. return hidden_states 2. Define an AIT module ------------------------- +----------------------- We can define a similar AIT module as follows: @@ -69,22 +67,23 @@ We can define a similar AIT module as follows: .. warning:: The `nn.Module` API in AIT looks similar to PyTorch, but it is not the same. - The fundamental difference is that AIT module is a container to build graph, while PyTorch module is a container to store parameters for eager. - Which means, each AIT module's `forward` method can be only called once, and the graph is built during the first call. If you want to share parameters, needs to call `compiler.ops` instead. The `compiler.ops` is similar to `functional` in PyTorch. + The fundamental difference is that AIT module is a container to build a graph, while PyTorch module is a container to store parameters for eager. + Which means, each AIT module's `forward` method can be only called once, and the graph is built during the first call. + If you want to share parameters, you need to use the `compiler.ops` instead. The `compiler.ops` is similar to `functional` in PyTorch. + + AITemplate supports automatic fusion of linear followed by other operators. 
However in many cases, especially for quick iterations, we use manual `specialization` to specify the fused operator. For example, `specialization="fast_gelu"` will fuse linear with the `fast_gelu` operator. - AITemplate supports automatically fusion on linear followed by other operators. However in many case especially for quick iterations, we use manual `specialization` to specify the fused operator. For example, `specialization="fast_gelu"` will fuse linear with `fast_gelu` operator. - 3. Define a helper function to map PyTorch parameters to AIT parameters -------------------------------------------------------------------------- +----------------------------------------------------------------------- -In AIT, all names must follow C variable naming standard because the name will be used in codegen process. +In AIT, all names must follow the C variable naming standard, because the names will be used in the codegen process. .. code-block:: python def map_pt_params(ait_model, pt_model): ait_model.name_parameter_tensor() pt_params = dict(pt_model.named_parameters()) - mapped_pt_params = OrderedDict() + mapped_pt_params = {} for name, _ in ait_model.named_parameters(): ait_name = name.replace(".", "_") assert name in pt_params @@ -93,12 +92,12 @@ In AIT, all names must follow C variable naming standard because the name will b .. warning:: - - Different to PyTorch, it is required to call ait_model **.name_parameter_tensor()** method to provide each parameter a name with direct map to PyTorch. - - Because all names in AIT must follow C variable naming standard, you can easier replace `.` to `_` or use a regular expression to make sure the name in valid. - - For network with conv + bn subgraph, we currently haven't provide automatic pass to fold it. Refer our ResNet and Detectron2 examples to see how we handle CNN layout transform and BatchNorm folding. 
+ - Different from PyTorch, it is required to call ait_model **.name_parameter_tensor()** method to provide each parameter with a name with a direct map to PyTorch. + - Because all names in AIT must follow the C variable naming standard, you can easily replace `.` by `_` or use a regular expression to make sure the name is valid. + - For networks with conv + bn subgraph, we currently don't provide an automatic pass to fold it. Please refer to our ResNet and Detectron2 examples to see how we handle CNN layout transform and BatchNorm folding. 4. Create PyTorch module, inputs/outputs ------------------------------------------ +---------------------------------------- .. code-block:: python @@ -115,7 +115,7 @@ In AIT, all names must follow C variable naming standard because the name will b y_pt = pt_model(x) 5. Create AIT module, inputs/outputs -------------------------------------- +------------------------------------ .. code-block:: python @@ -139,12 +139,12 @@ In AIT, all names must follow C variable naming standard because the name will b .. warning:: - Similar to MetaTensor, LazyTensor and a lot of other lazy evaluation frameworks, AIT's Tensor records the computation graph, and the graph is built when the Tensor is compiled. - - For input tensor, it is required to set the attribute **is_input=True** - - For output tensor, it is required to set the attribute **Y._attrs["is_output"] = True** - - For input and output tensors, it is better to provide **name** attributes to use in runtime + - For input tensor, it is required to set the attribute **is_input=True**. + - For output tensor, it is required to set the attribute **Y._attrs["is_output"] = True**. + - For input and output tensors, it is better to provide the **name** attributes to use in runtime. -6. Compile AIT module in to runtime, and do verification --------------------------------------------------------- +6. 
Compile AIT module into runtime and do verification +------------------------------------------------------ .. code-block:: python @@ -180,9 +179,9 @@ In AIT, all names must follow C variable naming standard because the name will b print(f"PyTorch eager time: {pt_t} ms/iter") -In this example, AIT will automatically fuse GELU and elementwise add into TensorCore/MatrixCore gemm operation. On RTX-3080 for this example, AIT is about 1.15X fast than PyTorch Eager in this example. +In this example, AIT will automatically fuse GELU and elementwise addition into the TensorCore/MatrixCore gemm operation. On RTX-3080, AIT is about 1.15X faster than PyTorch Eager in this example. .. note:: - - In this example, we fold parameters (weights) into AIT runtime, which the final dynamic library will contains parameters. - - If during compile we don't provide parameters, for example the total parameters size is greater than 2GB, we can always call `set_constant` function in runtime. Check runtime API for details. \ No newline at end of file + - In this example, we fold the parameters (`weights`) into AIT runtime. The final dynamic library will contain them as parameters. + - If during the compile time we don't provide the parameters (for example, because the total parameters size is greater than 2GB), we can always call `set_constant` function in the runtime. Please check the runtime API for the details. diff --git a/docs/source/tutorial/how_to_visualize.rst b/docs/source/tutorial/how_to_visualize.rst index 5af7c89a5..1b6856699 100644 --- a/docs/source/tutorial/how_to_visualize.rst +++ b/docs/source/tutorial/how_to_visualize.rst @@ -1,5 +1,5 @@ How to visualize an AIT model -============================== +============================= Visualization is important for understanding the behavior of a model optimization. In AIT, we modify the codegen a little bit, from generating CUDA/HIP C++ code to HTML/Javascript code, then we can generate a visualization of the model. 
The following code will generate a visualization of our first example. 1. Define the AIT Model ------------------------- +----------------------- .. code-block:: python @@ -71,15 +71,15 @@ The following code will generate a visualization of our first example. graph = apply_optimizations(output_tensor) 3. Generate visualization --------------------------- +------------------------- .. code-block:: python # Plot the graph - plot_graph(graph, file_path="ait_model.html", network_name="ait_sample_net") + plot_graph(graph, file_path="ait_model.html") The visualization will be generated in the "ait_model.html" file. This file can be opened in Chrome without any web server. .. raw:: html - \ No newline at end of file + diff --git a/docs/static/ait_model.html b/docs/static/ait_model.html index 18c56089d..3f414b67b 100644 --- a/docs/static/ait_model.html +++ b/docs/static/ait_model.html @@ -4,7 +4,7 @@ - ait_sample_net + ait_model