From 9d59e8d26879a17c2bd0c3abb86b2f3d80e8ca92 Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache <nicolas.vasilache@gmail.com>
Date: Thu, 14 Jan 2021 11:05:13 +0000
Subject: [PATCH] [DoNotCommit] Add support for building Codegen example with
 an existing MLIR

Prerequisites:
==============

First, `export MLIR_SOURCE_DIR=...`

```
(mkdir -p ${MLIR_SOURCE_DIR}/../build && \
cd ${MLIR_SOURCE_DIR}/../build && \
cmake -G Ninja ../llvm -DLLVM_ENABLE_PROJECTS="mlir"  -DBUILD_SHARED_LIBS=ON -DLLVM_BUILD_LLVM_DYLIB=1 -DMLIR_LINK_MLIR_DYLIB=1  -DLLVM_BUILD_EXAMPLES=OFF  -DLLVM_TARGETS_TO_BUILD="X86" \
 -DCMAKE_BUILD_TYPE=Release    -DLLVM_ENABLE_ASSERTIONS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ && \
cmake --build . --target MLIR check-mlir)
```

Codegen:
========

```
MLIR_DIR=${MLIR_SOURCE_DIR}/../build    cmake -GNinja -DCMAKE_CXX_COMPILER=clang++-11 -DCMAKE_C_COMPILER=clang-11 \
-DMLIR_SOURCE=${MLIR_SOURCE_DIR} -DUSE_MKL=OFF -DMLIR_BUILD=${MLIR_SOURCE_DIR}/../build/lib -B build ./Codegen/matmul && \
cmake --build build
```

Benchmark:
==========

```
rm -f build/matmul_* && cmake --build build --target matmul-compile; \
for f in $(find build/ -maxdepth 1 -executable -type f | sort --version-sort); do $f; done; \
ls *out | sort --version-sort | xargs tail -n 1
```

Results (on my machine, peak ~96 GFLOPS double precision):
==========================================================

==> matmul_18x32x96_mlir_perf.out <==
32.44 GFLOPS

==> matmul_24x64x96_mlir_perf.out <==
33.86 GFLOPS

==> matmul_24x64x512_mlir_perf.out <==
40.66 GFLOPS

==> matmul_48x64x128_mlir_perf.out <==
42.69 GFLOPS

==> matmul_192x64x128_mlir_perf.out <==
41.60 GFLOPS

==> matmul_192x128x128_mlir_perf.out <==
36.87 GFLOPS

==> matmul_192x256x256_mlir_perf.out <==
34.32 GFLOPS

==> matmul_384x256x256_mlir_perf.out <==
35.13 GFLOPS

==> matmul_480x512x256_mlir_perf.out <==
30.80 GFLOPS

==> matmul_1020x1152x1152_mlir_perf.out <==
12.49 GFLOPS

==> matmul_1024x1024x1024_mlir_perf.out <==
35.26 GFLOPS

==> matmul_2304x2304x2560_mlir_perf.out <==
24.42 GFLOPS

Notes:
======

1. ODM numbers were using F32; good register/tile sizes still need to be explored for F64.
2. Fixed some issues preventing AVX512; a few more things may be needed regarding compiler flags.
3. There seem to be some core MLIR regressions: manually trying different tile sizes can create code that segfaults.
4. MLIR OSS lacks the hoistings that were used internally; linalg on tensors is a better abstraction for this but is still WIP.
5. MLIR OSS lacks full/partial splitting + outlining strategies that were used internally.
---
 .gitignore                                    |  4 +-
 Codegen/CMakeLists.txt                        |  8 ++-
 Codegen/matmul/CMakeLists.txt                 |  6 ++-
 Codegen/matmul/matmul-compile/CMakeLists.txt  |  3 ++
 .../matmul/matmul-compile/matmul-compile.cpp  | 53 ++++++++++++++-----
 FILE_NAME                                     |  1 +
 6 files changed, 58 insertions(+), 17 deletions(-)
 create mode 100644 FILE_NAME

diff --git a/.gitignore b/.gitignore
index ad6c7ac..c17287f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,6 @@
 
 build
 __pycache__
-
+*.ll
+*.out
+*/*.out
diff --git a/Codegen/CMakeLists.txt b/Codegen/CMakeLists.txt
index dc34150..243154b 100644
--- a/Codegen/CMakeLists.txt
+++ b/Codegen/CMakeLists.txt
@@ -3,8 +3,12 @@ project(codegen C CXX)
 
 include(ExternalProject)
 
-set(MLIR_SOURCE ../../external/llvm-project/llvm)
-set(MLIR_BUILD ${CMAKE_BINARY_DIR}/mlir)
+if (NOT DEFINED MLIR_SOURCE)
+  set(MLIR_SOURCE ../../external/llvm-project/llvm)
+endif()
+if (NOT DEFINED MLIR_BUILD)
+  set(MLIR_BUILD ${CMAKE_BINARY_DIR}/mlir)
+endif()
 set(MLIR_INSTALL ${CMAKE_BINARY_DIR}/mlir-install)
 ExternalProject_Add(mlir
   PREFIX ${CMAKE_BINARY_DIR}/mlir
diff --git a/Codegen/matmul/CMakeLists.txt b/Codegen/matmul/CMakeLists.txt
index 82dda67..45a038a 100644
--- a/Codegen/matmul/CMakeLists.txt
+++ b/Codegen/matmul/CMakeLists.txt
@@ -8,7 +8,9 @@ function(compile_mlir mlir_prefix)
   set(OBJ ${CMAKE_BINARY_DIR}/${mlir_prefix}.o)
   add_custom_command(OUTPUT ${OBJ}
     COMMAND ${CMAKE_BINARY_DIR}/matmul-compile/matmul-compile ${CMAKE_CURRENT_LIST_DIR}/mlir/${mlir_prefix}.mlir
-    COMMAND ${CMAKE_CXX_COMPILER} -O3 ${CMAKE_CURRENT_LIST_DIR}/${mlir_prefix}.ll -c -o ${OBJ}
+    COMMAND ${CMAKE_CXX_COMPILER} -O3 ${CMAKE_CURRENT_LIST_DIR}/${mlir_prefix}.ll
+            -mllvm -enable-matrix -mllvm -matrix-allow-contract -mllvm -matrix-default-layout=row-major 
+            -c -o ${OBJ}
     WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
     DEPENDS matmul-compile
   )
@@ -50,7 +52,7 @@ foreach(MATRIX_SIZE ${MATRIX_SIZES})
     target_link_directories(${MATMUL} PRIVATE ${MKL_DIR}/lib/intel64)
     target_link_libraries(${MATMUL} PRIVATE mkl_intel_ilp64 mkl_gnu_thread mkl_core gomp)
   else()
-    target_compile_definitions(${MATMUL} PRIVATE FILE_NAME=${MATMUL}_mlir_perf.out)
+    # target_compile_definitions(${MATMUL} PRIVATE FILE_NAME=${MATMUL}_mlir_perf.out)
   endif()
   target_link_libraries(${MATMUL} PRIVATE m)
   list(APPEND ALL_TARGETS ${MATMUL})
diff --git a/Codegen/matmul/matmul-compile/CMakeLists.txt b/Codegen/matmul/matmul-compile/CMakeLists.txt
index 807eb6c..9b793b7 100644
--- a/Codegen/matmul/matmul-compile/CMakeLists.txt
+++ b/Codegen/matmul/matmul-compile/CMakeLists.txt
@@ -18,6 +18,9 @@ include_directories(${MLIR_INCLUDE_DIRS})
 get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
 get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS)
 message(STATUS "${dialect_libs}")
+if (DEFINED MLIR_BUILD)
+  link_directories(${MLIR_BUILD})
+endif()
 set(LIBS
   ${dialect_libs}
   ${conversion_libs}
diff --git a/Codegen/matmul/matmul-compile/matmul-compile.cpp b/Codegen/matmul/matmul-compile/matmul-compile.cpp
index 729ea3c..ccd77dc 100644
--- a/Codegen/matmul/matmul-compile/matmul-compile.cpp
+++ b/Codegen/matmul/matmul-compile/matmul-compile.cpp
@@ -66,6 +66,16 @@ struct LinalgCodegenPass : public PassWrapper<LinalgCodegenPass, FunctionPass> {
 }  // namespace
 
 void LinalgCodegenPass::runOnFunction() {
+  MLIRContext *ctx = getFunction().getContext();
+  SmallVector<Attribute, 4> attrs;
+  attrs.push_back(ArrayAttr::get({StringAttr::get("prefer-vector-width", ctx),
+                                  StringAttr::get("512", ctx)}, 
+                                  ctx));
+  attrs.push_back(ArrayAttr::get({StringAttr::get("target-cpu", ctx),
+                                  StringAttr::get("skylake-avx512", ctx)}, 
+                                  ctx));
+  getFunction().setAttr("passthrough", ArrayAttr::get(attrs, ctx));
+
   std::string vectorizeContractionTo("outerproduct");
   std::string splitVectorTransfersTo("vector-transfers");
   bool registerPromoteFullTile{true};
@@ -88,24 +98,24 @@ void LinalgCodegenPass::runOnFunction() {
   // Small and medium codegen
   if (M < 1000) {
     LinalgTilingOptions tilingOptions;
-    llvm::SmallVector<int64_t, 4> tileSizes{6, 32, 16};
+    llvm::SmallVector<int64_t, 4> tileSizes{6, 16, 16};
     if (!tileSizes.empty())
       tilingOptions = tilingOptions.setTileSizes(tileSizes);
 
-    LinalgTilingOptions registerTilingOptions;
-    llvm::SmallVector<int64_t, 4> registerTileSizes{2, 4, 8};
-    if (!registerTileSizes.empty())
-      registerTilingOptions = registerTilingOptions.setTileSizes(registerTileSizes);
+    // LinalgTilingOptions registerTilingOptions;
+    // llvm::SmallVector<int64_t, 4> registerTileSizes{2, 4, 8};
+    // if (!registerTileSizes.empty())
+    //   registerTilingOptions = registerTilingOptions.setTileSizes(registerTileSizes);
 
     CodegenStrategy strategy;
     strategy.tile<MatmulOp>(tilingOptions)
         .promote<MatmulOp>(LinalgPromotionOptions()
 			      .setAlignment(16)
 			      .setUseFullTileBuffersByDefault(true))
-        .tile<MatmulOp>(registerTilingOptions)
-        .promote<MatmulOp>(LinalgPromotionOptions()
-                              .setAlignment(16)
-                              .setUseFullTileBuffersByDefault(registerPromoteFullTile))
+        // .tile<MatmulOp>(registerTilingOptions)
+        // .promote<MatmulOp>(LinalgPromotionOptions()
+        //                       .setAlignment(16)
+        //                       .setUseFullTileBuffersByDefault(registerPromoteFullTile))
         .vectorize<MatmulOp>()
         .setVectorTransformsOptions(
             vector::VectorTransformsOptions()
@@ -125,7 +135,7 @@ void LinalgCodegenPass::runOnFunction() {
       CodegenStrategy strategyCaches;
       strategyCaches
         .tile<MatmulOp>(LinalgTilingOptions()
-			.setTileSizes({192, 256, 256})
+			.setTileSizes({128, 128, 256})
 			.setInterchange({0, 2, 1}))
         .promote<MatmulOp>(LinalgPromotionOptions()
 			   .setOperandsToPromote({0, 1})
@@ -138,7 +148,18 @@ void LinalgCodegenPass::runOnFunction() {
     {
       CodegenStrategy strategyRegisters;
       strategyRegisters
-        .tile<CopyOp>(LinalgTilingOptions().setTileSizes({4, 32}))
+        .tile<FillOp>(LinalgTilingOptions().setTileSizes({4, 16}))
+        .vectorize<FillOp>()
+        .setVectorTransferToSCFOptions(
+            VectorTransferToSCFOptions().setUnroll(unrollVectorTransfers));
+
+      strategyRegisters.transform(getFunction());
+    }
+
+    {
+      CodegenStrategy strategyRegisters;
+      strategyRegisters
+        .tile<CopyOp>(LinalgTilingOptions().setTileSizes({4, 16}))
         .vectorize<CopyOp>()
         .setVectorTransferToSCFOptions(
             VectorTransferToSCFOptions().setUnroll(unrollVectorTransfers));
@@ -150,7 +171,7 @@ void LinalgCodegenPass::runOnFunction() {
     {
       CodegenStrategy strategyRegisters;
       strategyRegisters
-        .tile<CopyOp>(LinalgTilingOptions().setTileSizes({6, 32, 16}))
+        .tile<MatmulOp>(LinalgTilingOptions().setTileSizes({8, 16, 8}))
         .promote<MatmulOp>(LinalgPromotionOptions()
                              .setUseFullTileBuffersByDefault(registerPromoteFullTile)
 			   .setAlignment(128))
@@ -165,6 +186,8 @@ void LinalgCodegenPass::runOnFunction() {
       strategyRegisters.transform(getFunction());
     }
   }
+
+  getFunction().dump();
 }
 
 std::unique_ptr<OperationPass<FuncOp>> createLinalgCodegenPass(int M, int N, int K) {
@@ -213,16 +236,22 @@ static void get_dimensions(const std::string filename, int &M, int &N, int &K) {
 Error compile(Options &options, mlir::DialectRegistry &registry) {
   MLIRContext context;
   registry.loadAll(&context);
+  llvm::errs() << "Read file: " << options.inputFile;
   OwningModuleRef moduleRef = parseSourceFile(options.inputFile, &context);
   if (!moduleRef)
     return make_string_error(Twine("could not open ") + options.inputFile);
 
   ModuleOp module = *moduleRef;
   PassManager pm(module.getContext(), OpPassManager::Nesting::Implicit);
+  // context.disableMultithreading();
+  // pm.enableIRPrinting([](Pass *, Operation*){return true; }, [](Pass *, Operation*){return true; });
+
   int M, N, K;
   get_dimensions(options.inputFile, M, N, K);
+  pm.addPass(createCanonicalizerPass());
   pm.addPass(createLinalgCodegenPass(M, N, K));
 
+
   // Lower to LLVM
   pm.addPass(createConvertVectorToSCFPass());
   pm.addPass(createLowerAffinePass());
diff --git a/FILE_NAME b/FILE_NAME
new file mode 100644
index 0000000..7c6869d
--- /dev/null
+++ b/FILE_NAME
@@ -0,0 +1 @@
+32.65 GFLOPS