Matrix-Multiplication/Main.cpp at main · Kalith-Ismaik/Matrix-Multiplication · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
// BENCHMARK DRIVER FOR 2D MATRIX-MATRIX MULTIPLICATION: CUSTOM CUDA KERNEL VS CUBLAS

#include <iostream>
#include <chrono>
#include <cmath>
#include <vector>

#include "MatOper.h"
#include "GEMM.h"

using namespace std;
using namespace chrono;

// Benchmark entry point.
//
// Allocates two input matrices and two result matrices, runs a few warm-up
// iterations, then times MatrixMultiplierLauncher and MatrixMultiplierCuBLAS
// over NRuns iterations each. Prints the average / standard deviation of the
// measured wall-clock times (as computed by StatCalculator) and finally calls
// ErrCalculator on the two result matrices from the last run.
//
// Returns 0 on success, 1 if the configured matrix dimensions are incompatible.
//
// NOTE(review): the timings are host wall-clock intervals around each call.
// They are only meaningful if the two multiplier functions return after the
// GPU work has completed, and they presumably include whatever host<->device
// transfers those functions perform — confirm against GEMM.h.
int main() {

    int WRuns = 3;        // Warm-up runs executed (untimed) before the benchmark
    int NRuns = 10;       // Timed benchmark runs

    int SizeAX = 4096;    // Rows in matrix A
    int SizeAY = 4096;    // Columns in matrix A
    int SizeBX = 4096;    // Rows in matrix B
    int SizeBY = 16384;   // Columns in matrix B

    // A (SizeAX x SizeAY) * B (SizeBX x SizeBY) is only defined when the inner
    // dimensions agree. Check before allocating anything so a bad edit of the
    // constants above fails loudly instead of reading out of bounds.
    if (SizeAY != SizeBX) {
        std::cerr << "Dimension mismatch: matrix A has " << SizeAY
                  << " columns but matrix B has " << SizeBX << " rows" << std::endl;
        return 1;
    }

    // Monotonic clock for interval measurement, and a floating-point
    // millisecond duration so sub-millisecond resolution is not truncated.
    using Clock  = std::chrono::steady_clock;
    using MilliF = std::chrono::duration<float, std::milli>;

    // Allocate the 2D arrays for the inputs and for both results
    // (the result of A * B has SizeAX rows and SizeBY columns).
    float** matrixA = ArrayAllocator(SizeAX, SizeAY);
    float** matrixB = ArrayAllocator(SizeBX, SizeBY);

    float** matrixR1 = ArrayAllocator(SizeAX, SizeBY);
    float** matrixR2 = ArrayAllocator(SizeAX, SizeBY);

    // Fill the input matrices with dummy data.
    InitializeMatrixA(matrixA, SizeAX, SizeAY);
    InitializeMatrixB(matrixB, SizeBX, SizeBY);

    // Warm-up: same work as a benchmark iteration, but untimed.
    for (int i = 0; i < WRuns; ++i) {

        InitializeMatrixR(matrixR1, SizeAX, SizeBY);
        InitializeMatrixR(matrixR2, SizeAX, SizeBY);

        // Custom GPU implementation (described by the author as a naive kernel
        // with BFP quantization), result in matrixR1.
        MatrixMultiplierLauncher(matrixA, matrixB, matrixR1, SizeAX, SizeAY, SizeBX, SizeBY);
        // cuBLAS-based implementation, result in matrixR2.
        MatrixMultiplierCuBLAS(matrixA, matrixB, matrixR2, SizeAX, SizeAY, SizeBX, SizeBY);

        // std::endl kept on progress lines so they appear immediately.
        std::cout << "Warmup run " << i + 1 << " completed" << std::endl;

    }

    // Per-run elapsed times in milliseconds; vectors release themselves on exit.
    std::vector<float> CUBFP_times(NRuns);
    std::vector<float> CUBLAS_times(NRuns);

    for (int i = 0; i < NRuns; ++i) {

        // Reset both result matrices outside the timed regions.
        InitializeMatrixR(matrixR1, SizeAX, SizeBY);
        InitializeMatrixR(matrixR2, SizeAX, SizeBY);

        // Time the custom GPU implementation.
        const auto t1 = Clock::now();
        MatrixMultiplierLauncher(matrixA, matrixB, matrixR1, SizeAX, SizeAY, SizeBX, SizeBY);
        const auto t2 = Clock::now();
        CUBFP_times[i] = MilliF(t2 - t1).count();

        // Time the cuBLAS implementation.
        const auto t3 = Clock::now();
        MatrixMultiplierCuBLAS(matrixA, matrixB, matrixR2, SizeAX, SizeAY, SizeBX, SizeBY);
        const auto t4 = Clock::now();
        CUBLAS_times[i] = MilliF(t4 - t3).count();

        std::cout << "Benchmark run " << i + 1 << " completed" << std::endl;

    }

    // Print final statistics.
    std::cout << "\nFinal Statistics over " << NRuns << " successful benchmark runs:" << '\n';

    float cubfp_avg = 0.0f;
    float cubfp_std = 0.0f;
    StatCalculator(CUBFP_times.data(), cubfp_avg, cubfp_std, NRuns);
    std::cout << "\nNaive BFP Matrix Multiplication Implementation with BF16 data handling:" << '\n';
    std::cout << "  Average: " << cubfp_avg << " ms" << '\n';
    std::cout << "  Std Dev: " << cubfp_std << " ms" << '\n';

    float cublas_avg = 0.0f;
    float cublas_std = 0.0f;
    StatCalculator(CUBLAS_times.data(), cublas_avg, cublas_std, NRuns);
    std::cout << "\nCublas Matrix Multiplication Implementation with Float32 data handling:" << '\n';
    std::cout << "  Average: " << cublas_avg << " ms" << '\n';
    std::cout << "  Std Dev: " << cublas_std << " ms" << '\n';

    // Compare the two result matrices from the last benchmark run.
    // The header is flushed before the call so it precedes whatever
    // ErrCalculator reports, regardless of how that function writes output.
    std::cout << "Error between Optimized Floating Point (cuBLAS GPU) matrix multiplication vs Naive Block Floating Point (GPU) matrix multiplication with 16 bit mantissa: " << std::endl;
    ErrCalculator(matrixR1, matrixR2, SizeAX, SizeBY);

    // Release the 2D arrays (row count is passed to the deallocator).
    ArrayDeAllocator(matrixA, SizeAX);
    ArrayDeAllocator(matrixB, SizeBX);

    ArrayDeAllocator(matrixR1, SizeAX);
    ArrayDeAllocator(matrixR2, SizeAX);

    return 0;

}