From 9d59e8d26879a17c2bd0c3abb86b2f3d80e8ca92 Mon Sep 17 00:00:00 2001 From: Nicolas Vasilache Date: Thu, 14 Jan 2021 11:05:13 +0000 Subject: [PATCH] [DoNotCommit] Add support for building Codegen example with an existing MLIR Prerequisites: ============== First, `export MLIR_SOURCE_DIR=...` ``` (mkdir -p ${MLIR_SOURCE_DIR}/../build && \ cd ${MLIR_SOURCE_DIR}/../build && \ cmake -G Ninja ../llvm -DLLVM_ENABLE_PROJECTS="mlir" -DBUILD_SHARED_LIBS=ON -DLLVM_BUILD_LLVM_DYLIB=1 -DMLIR_LINK_MLIR_DYLIB=1 -DLLVM_BUILD_EXAMPLES=OFF -DLLVM_TARGETS_TO_BUILD="X86" \ -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ && \ cmake --build . --target MLIR check-mlir) ``` Codegen: ======== ``` MLIR_DIR=${MLIR_SOURCE_DIR}/../build cmake -GNinja -DCMAKE_CXX_COMPILER=clang++-11 -DCMAKE_C_COMPILER=clang-11 \ -DMLIR_SOURCE=${MLIR_SOURCE_DIR} -DUSE_MKL=OFF -DMLIR_BUILD=${MLIR_SOURCE_DIR}/../build/lib -B build ./Codegen/matmul && \ cmake --build build ``` Benchmark: ========== ``` rm -f build/matmul_* && cmake --build build --target matmul-compile; \ for f in $(find build/ -maxdepth 1 -executable -type f | sort --version-sort); do $f; done; \ ls *out | sort --version-sort | xargs tail -n 1 ``` Results (on my machine, peak ~96GFlops/s DP): ============================================= ==> matmul_18x32x96_mlir_perf.out <== 32.44 GFLOPS ==> matmul_24x64x96_mlir_perf.out <== 33.86 GFLOPS ==> matmul_24x64x512_mlir_perf.out <== 40.66 GFLOPS ==> matmul_48x64x128_mlir_perf.out <== 42.69 GFLOPS ==> matmul_192x64x128_mlir_perf.out <== 41.60 GFLOPS ==> matmul_192x128x128_mlir_perf.out <== 36.87 GFLOPS ==> matmul_192x256x256_mlir_perf.out <== 34.32 GFLOPS ==> matmul_384x256x256_mlir_perf.out <== 35.13 GFLOPS ==> matmul_480x512x256_mlir_perf.out <== 30.80 GFLOPS ==> matmul_1020x1152x1152_mlir_perf.out <== 12.49 GFLOPS ==> matmul_1024x1024x1024_mlir_perf.out <== 35.26 GFLOPS ==> matmul_2304x2304x2560_mlir_perf.out <== 24.42 GFLOPS Notes: ====== 
1. ODM numbers were using F32; good register/tile sizes still need to be explored for F64. 2. Fixed some issues preventing AVX512; a few more things may be needed regarding compiler flags. 3. There seem to be some core MLIR regressions: manually trying different tile sizes can create code that segfaults. 4. MLIR OSS lacks hoistings that were used internally; linalg on tensors is a better abstraction for this but is still WIP. 5. MLIR OSS lacks full/partial splitting + outlining strategies that were used internally. --- .gitignore | 4 +- Codegen/CMakeLists.txt | 8 ++- Codegen/matmul/CMakeLists.txt | 6 ++- Codegen/matmul/matmul-compile/CMakeLists.txt | 3 ++ .../matmul/matmul-compile/matmul-compile.cpp | 53 ++++++++++++++----- FILE_NAME | 1 + 6 files changed, 58 insertions(+), 17 deletions(-) create mode 100644 FILE_NAME diff --git a/.gitignore b/.gitignore index ad6c7ac..c17287f 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,6 @@ build __pycache__ - +*.ll +*.out +*/*.out diff --git a/Codegen/CMakeLists.txt b/Codegen/CMakeLists.txt index dc34150..243154b 100644 --- a/Codegen/CMakeLists.txt +++ b/Codegen/CMakeLists.txt @@ -3,8 +3,12 @@ project(codegen C CXX) include(ExternalProject) -set(MLIR_SOURCE ../../external/llvm-project/llvm) -set(MLIR_BUILD ${CMAKE_BINARY_DIR}/mlir) +if (NOT DEFINED MLIR_SOURCE) + set(MLIR_SOURCE ../../external/llvm-project/llvm) +endif() +if (NOT DEFINED MLIR_BUILD) + set(MLIR_BUILD ${CMAKE_BINARY_DIR}/mlir) +endif() set(MLIR_INSTALL ${CMAKE_BINARY_DIR}/mlir-install) ExternalProject_Add(mlir PREFIX ${CMAKE_BINARY_DIR}/mlir diff --git a/Codegen/matmul/CMakeLists.txt b/Codegen/matmul/CMakeLists.txt index 82dda67..45a038a 100644 --- a/Codegen/matmul/CMakeLists.txt +++ b/Codegen/matmul/CMakeLists.txt @@ -8,7 +8,9 @@ function(compile_mlir mlir_prefix) set(OBJ ${CMAKE_BINARY_DIR}/${mlir_prefix}.o) add_custom_command(OUTPUT ${OBJ} COMMAND ${CMAKE_BINARY_DIR}/matmul-compile/matmul-compile ${CMAKE_CURRENT_LIST_DIR}/mlir/${mlir_prefix}.mlir - COMMAND
${CMAKE_CXX_COMPILER} -O3 ${CMAKE_CURRENT_LIST_DIR}/${mlir_prefix}.ll -c -o ${OBJ} + COMMAND ${CMAKE_CXX_COMPILER} -O3 ${CMAKE_CURRENT_LIST_DIR}/${mlir_prefix}.ll + -mllvm -enable-matrix -mllvm -matrix-allow-contract -mllvm -matrix-default-layout=row-major + -c -o ${OBJ} WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} DEPENDS matmul-compile ) @@ -50,7 +52,7 @@ foreach(MATRIX_SIZE ${MATRIX_SIZES}) target_link_directories(${MATMUL} PRIVATE ${MKL_DIR}/lib/intel64) target_link_libraries(${MATMUL} PRIVATE mkl_intel_ilp64 mkl_gnu_thread mkl_core gomp) else() - target_compile_definitions(${MATMUL} PRIVATE FILE_NAME=${MATMUL}_mlir_perf.out) + # target_compile_definitions(${MATMUL} PRIVATE FILE_NAME=${MATMUL}_mlir_perf.out) endif() target_link_libraries(${MATMUL} PRIVATE m) list(APPEND ALL_TARGETS ${MATMUL}) diff --git a/Codegen/matmul/matmul-compile/CMakeLists.txt b/Codegen/matmul/matmul-compile/CMakeLists.txt index 807eb6c..9b793b7 100644 --- a/Codegen/matmul/matmul-compile/CMakeLists.txt +++ b/Codegen/matmul/matmul-compile/CMakeLists.txt @@ -18,6 +18,9 @@ include_directories(${MLIR_INCLUDE_DIRS}) get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS) message(STATUS "${dialect_libs}") +if (DEFINED MLIR_BUILD) + link_directories(${MLIR_BUILD}) +endif() set(LIBS ${dialect_libs} ${conversion_libs} diff --git a/Codegen/matmul/matmul-compile/matmul-compile.cpp b/Codegen/matmul/matmul-compile/matmul-compile.cpp index 729ea3c..ccd77dc 100644 --- a/Codegen/matmul/matmul-compile/matmul-compile.cpp +++ b/Codegen/matmul/matmul-compile/matmul-compile.cpp @@ -66,6 +66,16 @@ struct LinalgCodegenPass : public PassWrapper { } // namespace void LinalgCodegenPass::runOnFunction() { + MLIRContext *ctx = getFunction().getContext(); + SmallVector attrs; + attrs.push_back(ArrayAttr::get({StringAttr::get("prefer-vector-width", ctx), + StringAttr::get("512", ctx)}, + ctx)); + 
attrs.push_back(ArrayAttr::get({StringAttr::get("target-cpu", ctx), + StringAttr::get("skylake-avx512", ctx)}, + ctx)); + getFunction().setAttr("passthrough", ArrayAttr::get(attrs, ctx)); + std::string vectorizeContractionTo("outerproduct"); std::string splitVectorTransfersTo("vector-transfers"); bool registerPromoteFullTile{true}; @@ -88,24 +98,24 @@ void LinalgCodegenPass::runOnFunction() { // Small and medium codegen if (M < 1000) { LinalgTilingOptions tilingOptions; - llvm::SmallVector tileSizes{6, 32, 16}; + llvm::SmallVector tileSizes{6, 16, 16}; if (!tileSizes.empty()) tilingOptions = tilingOptions.setTileSizes(tileSizes); - LinalgTilingOptions registerTilingOptions; - llvm::SmallVector registerTileSizes{2, 4, 8}; - if (!registerTileSizes.empty()) - registerTilingOptions = registerTilingOptions.setTileSizes(registerTileSizes); + // LinalgTilingOptions registerTilingOptions; + // llvm::SmallVector registerTileSizes{2, 4, 8}; + // if (!registerTileSizes.empty()) + // registerTilingOptions = registerTilingOptions.setTileSizes(registerTileSizes); CodegenStrategy strategy; strategy.tile(tilingOptions) .promote(LinalgPromotionOptions() .setAlignment(16) .setUseFullTileBuffersByDefault(true)) - .tile(registerTilingOptions) - .promote(LinalgPromotionOptions() - .setAlignment(16) - .setUseFullTileBuffersByDefault(registerPromoteFullTile)) + // .tile(registerTilingOptions) + // .promote(LinalgPromotionOptions() + // .setAlignment(16) + // .setUseFullTileBuffersByDefault(registerPromoteFullTile)) .vectorize() .setVectorTransformsOptions( vector::VectorTransformsOptions() @@ -125,7 +135,7 @@ void LinalgCodegenPass::runOnFunction() { CodegenStrategy strategyCaches; strategyCaches .tile(LinalgTilingOptions() - .setTileSizes({192, 256, 256}) + .setTileSizes({128, 128, 256}) .setInterchange({0, 2, 1})) .promote(LinalgPromotionOptions() .setOperandsToPromote({0, 1}) @@ -138,7 +148,18 @@ void LinalgCodegenPass::runOnFunction() { { CodegenStrategy strategyRegisters; 
strategyRegisters - .tile(LinalgTilingOptions().setTileSizes({4, 32})) + .tile(LinalgTilingOptions().setTileSizes({4, 16})) + .vectorize() + .setVectorTransferToSCFOptions( + VectorTransferToSCFOptions().setUnroll(unrollVectorTransfers)); + + strategyRegisters.transform(getFunction()); + } + + { + CodegenStrategy strategyRegisters; + strategyRegisters + .tile(LinalgTilingOptions().setTileSizes({4, 16})) .vectorize() .setVectorTransferToSCFOptions( VectorTransferToSCFOptions().setUnroll(unrollVectorTransfers)); @@ -150,7 +171,7 @@ void LinalgCodegenPass::runOnFunction() { { CodegenStrategy strategyRegisters; strategyRegisters - .tile(LinalgTilingOptions().setTileSizes({6, 32, 16})) + .tile(LinalgTilingOptions().setTileSizes({8, 16, 8})) .promote(LinalgPromotionOptions() .setUseFullTileBuffersByDefault(registerPromoteFullTile) .setAlignment(128)) @@ -165,6 +186,8 @@ void LinalgCodegenPass::runOnFunction() { strategyRegisters.transform(getFunction()); } } + + getFunction().dump(); } std::unique_ptr> createLinalgCodegenPass(int M, int N, int K) { @@ -213,16 +236,22 @@ static void get_dimensions(const std::string filename, int &M, int &N, int &K) { Error compile(Options &options, mlir::DialectRegistry ®istry) { MLIRContext context; registry.loadAll(&context); + llvm::errs() << "Read file: " << options.inputFile; OwningModuleRef moduleRef = parseSourceFile(options.inputFile, &context); if (!moduleRef) return make_string_error(Twine("could not open ") + options.inputFile); ModuleOp module = *moduleRef; PassManager pm(module.getContext(), OpPassManager::Nesting::Implicit); + // context.disableMultithreading(); + // pm.enableIRPrinting([](Pass *, Operation*){return true; }, [](Pass *, Operation*){return true; }); + int M, N, K; get_dimensions(options.inputFile, M, N, K); + pm.addPass(createCanonicalizerPass()); pm.addPass(createLinalgCodegenPass(M, N, K)); + // Lower to LLVM pm.addPass(createConvertVectorToSCFPass()); pm.addPass(createLowerAffinePass()); diff --git 
a/FILE_NAME b/FILE_NAME new file mode 100644 index 0000000..7c6869d --- /dev/null +++ b/FILE_NAME @@ -0,0 +1 @@ +32.65 GFLOPS