From fdb32c352eee0c499177b620cef2565775b04dec Mon Sep 17 00:00:00 2001 From: Elman Jahangiri Date: Mon, 19 Jan 2026 13:36:26 +0100 Subject: [PATCH 1/2] Changes - implemented the new method matrixMultDenseDenseMM, which uses different iteration strategies when dealing with transposes, trying to achieve a speedup - added one test class for the correctness of the implementation - added one test class to check whether the running time has improved --- .../runtime/matrix/data/LibMatrixMult.java | 140 ++++++++++++++++++ .../MatrixMultTransposedPerformanceTest.java | 91 ++++++++++++ .../matrixmult/MatrixMultTransposedTest.java | 72 +++++++++ 3 files changed, 303 insertions(+) create mode 100644 src/test/java/org/apache/sysds/test/component/matrixmult/MatrixMultTransposedPerformanceTest.java create mode 100644 src/test/java/org/apache/sysds/test/component/matrixmult/MatrixMultTransposedTest.java diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java index 5753fbbadbe..4438899f375 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java @@ -1026,6 +1026,146 @@ public static void matrixMultWuMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV // optimized matrix mult implementation // ////////////////////////////////////////// + public static void matrixMultDenseDenseMM(DenseBlock a, DenseBlock b, DenseBlock c, boolean transA, boolean transB, int n, int cd, int rl, int ru, int cl, int cu) { + // C = A %*% B + if (!transA && !transB) { + matrixMultDenseDenseMM(a, b, c, n, cd, rl, ru, cl, cu); + return; + } + // C = t(A) %*% B + if (transA && !transB) { + multDenseDenseTransA(a, b, c, n, cd, rl, ru, cl, cu); + return; + } + // C = A %*% t(B) + if (!transA && transB) { + multDenseDenseTransB(a, b, c, n, cd, rl, ru, cl, cu); + return; + } + // C = t(A) %*% t(B) + if (transA && transB) { 
multDenseDenseTransATransB(a, b, c, n, cd, rl, ru, cl, cu); + return; + } + } + + private static void multDenseDenseTransA(DenseBlock a, DenseBlock b, DenseBlock c, int n, int cd, int rl, int ru, int cl, int cu) { + final int blocksizeI = 32; + final int blocksizeK = 24; + + for (int bi = rl; bi < ru; bi += blocksizeI) { + int bimin = Math.min(ru, bi + blocksizeI); + for (int bk = 0; bk < cd; bk += blocksizeK) { + int bkmin = Math.min(cd, bk + blocksizeK); + + int k = bk; + for (; k < bkmin - 3; k += 4) { + if (b.isContiguous()) { + double[] bvals = b.values(0); + int bix0 = b.pos(k, cl), bix1 = b.pos(k+1, cl); + int bix2 = b.pos(k+2, cl), bix3 = b.pos(k+3, cl); + + for (int i = bi; i < bimin; i++) { + double[] cvals = c.values(i); + int cix = c.pos(i, cl); + double val0 = a.values(k)[a.pos(k) + i]; + double val1 = a.values(k+1)[a.pos(k+1) + i]; + double val2 = a.values(k+2)[a.pos(k+2) + i]; + double val3 = a.values(k+3)[a.pos(k+3) + i]; + + vectMultiplyAdd4(val0, val1, val2, val3, + bvals, cvals, + bix0, bix1, bix2, bix3, cix, cu - cl); + } + } else { + for (int i = bi; i < bimin; i++) { + double[] cvals = c.values(i); + int cix = c.pos(i, cl); + double val0 = a.values(k)[a.pos(k) + i]; + if(val0!=0) vectMultiplyAdd(val0, b.values(k), cvals, b.pos(k, cl), cix, cu - cl); + double val1 = a.values(k+1)[a.pos(k+1) + i]; + if(val1!=0) vectMultiplyAdd(val1, b.values(k+1), cvals, b.pos(k+1, cl), cix, cu - cl); + double val2 = a.values(k+2)[a.pos(k+2) + i]; + if(val2!=0) vectMultiplyAdd(val2, b.values(k+2), cvals, b.pos(k+2, cl), cix, cu - cl); + double val3 = a.values(k+3)[a.pos(k+3) + i]; + if(val3!=0) vectMultiplyAdd(val3, b.values(k+3), cvals, b.pos(k+3, cl), cix, cu - cl); + } + } + } + for (; k < bkmin; k++) { + double[] bvals = b.values(k); + int bix = b.pos(k, cl); + double[] avals = a.values(k); + int apos = a.pos(k); + for (int i = bi; i < bimin; i++) { + double val = avals[apos + i]; + if (val != 0) { + vectMultiplyAdd(val, bvals, c.values(i), bix, c.pos(i, 
cl), cu - cl); + } + } + } + } + } + } + + private static void multDenseDenseTransB(DenseBlock a, DenseBlock b, DenseBlock c, int n, int cd, int rl, int ru, int cl, int cu) { + final int blocksizeK = 24; + double[] bufB = new double[blocksizeK * (cu - cl)]; + + for (int bk = 0; bk < cd; bk += blocksizeK) { + int bkmin = Math.min(cd, bk + blocksizeK); + int bklen = bkmin - bk; + + + for (int j = cl; j < cu; j++) { + double[] bvals = b.values(j); + int bpos = b.pos(j); + + for (int k = 0; k < bklen; k++) { + bufB[k * (cu-cl) + (j-cl)] = bvals[bpos + bk + k]; + } + } + + for (int i = rl; i < ru; i++) { + double[] avals = a.values(i); + int apos = a.pos(i); + double[] cvals = c.values(i); + int cix = c.pos(i, cl); + + for (int k = 0; k < bklen; k++) { + double val = avals[apos + bk + k]; + if (val != 0) { + int bufIx = k * (cu-cl); + vectMultiplyAdd(val, bufB, cvals, bufIx, cix, cu - cl); + } + } + } + } + } + + private static void multDenseDenseTransATransB(DenseBlock a, DenseBlock b, DenseBlock c, int n, int cd, int rl, int ru, int cl, int cu) { + double[] d_row = new double[ru]; + + for (int j = cl; j < cu; j++) { + java.util.Arrays.fill(d_row, rl, ru, 0); + + for (int k = 0; k < cd; k++) { + double valB = b.get(j, k); + if (valB != 0) { + double[] avals = a.values(k); + int apos = a.pos(k); + vectMultiplyAdd(valB, avals, d_row, apos + rl, rl, ru - rl); + } + } + + for (int i = rl; i < ru; i++) { + double[] cvals = c.values(i); + int cix = c.pos(i); + cvals[cix + j] += d_row[i]; + } + } + } + private static void matrixMultDenseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean tm2, boolean pm2, int rl, int ru, int cl, int cu) { DenseBlock a = m1.getDenseBlock(); DenseBlock b = m2.getDenseBlock(); diff --git a/src/test/java/org/apache/sysds/test/component/matrixmult/MatrixMultTransposedPerformanceTest.java b/src/test/java/org/apache/sysds/test/component/matrixmult/MatrixMultTransposedPerformanceTest.java new file mode 100644 index 00000000000..c9899f907aa 
--- /dev/null +++ b/src/test/java/org/apache/sysds/test/component/matrixmult/MatrixMultTransposedPerformanceTest.java @@ -0,0 +1,91 @@ +package org.apache.sysds.test.component.matrixmult; + +import org.apache.sysds.runtime.matrix.data.LibMatrixMult; +import org.apache.sysds.runtime.matrix.data.LibMatrixReorg; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.junit.Assert; +import org.junit.Test; + +public class MatrixMultTransposedPerformanceTest { + private final int m = 500; + private final int n = 500; + private final int k = 500; + + + @Test + public void testPerf_1_NoTransA_TransB() { + System.out.println("Case: C = A %*% t(B)"); + runTest(false, true); + System.out.println(); + } + + @Test + public void testPerf_2_TransA_NoTransB() { + System.out.println("Case: C = t(A) %*% B"); + runTest(true, false); + System.out.println(); + } + + @Test + public void testPerf_3_TransA_TransB() { + System.out.println("Case: C = t(A) %*% t(B)"); + runTest(true, true); + } + + private void runTest(boolean tA, boolean tB) { + int REP = 50; + + // setup Dimensions + int rowsA = tA ? k : m; + int colsA = tA ? m : k; + int rowsB = tB ? n : k; + int colsB = tB ? 
k : n; + + MatrixBlock A = MatrixBlock.randOperations(rowsA, colsA, 1.0, -1, 1, "uniform", 7); + MatrixBlock B = MatrixBlock.randOperations(rowsB, colsB, 1.0, -1, 1, "uniform", 3); + MatrixBlock C = new MatrixBlock(m, n, false); + C.allocateDenseBlock(); + + + for(int i=0; i<50; i++) { + runOldMethod(A, B, tA, tB); + runNewKernel(A, B, C, tA, tB); + } + + + long startTimeOld = System.nanoTime(); + for(int i = 0; i < REP; i++) { + runOldMethod(A, B, tA, tB); + } + double avgTimeOld = (System.nanoTime() - startTimeOld) / 1e6 / REP; + + + double startTimeNew = System.nanoTime(); + for(int i = 0; i < REP; i++) { + runNewKernel(A, B, C, tA, tB); + } + double avgTimeNew = (System.nanoTime() - startTimeNew) / 1e6 / REP; + + System.out.printf("Old Method: %.3f ms | New Kernel: %.3f ms", avgTimeOld, avgTimeNew); + + Assert.assertTrue(avgTimeNew < avgTimeOld); + } + + private void runNewKernel(MatrixBlock A, MatrixBlock B, MatrixBlock C, boolean tA, boolean tB) { + C.reset(); + + LibMatrixMult.matrixMultDenseDenseMM(A.getDenseBlock(), B.getDenseBlock(), C.getDenseBlock(), tA, tB, m, k, 0, m, 0, n); + } + + private void runOldMethod(MatrixBlock A, MatrixBlock B, boolean tA, boolean tB) { + // do transpose if needed + MatrixBlock A_in = tA ? LibMatrixReorg.transpose(A) : A; + MatrixBlock B_in = tB ? 
LibMatrixReorg.transpose(B) : B; + + MatrixBlock C = new MatrixBlock(m, n, false); + C.allocateDenseBlock(); + + LibMatrixMult.matrixMultDenseDenseMM(A_in.getDenseBlock(), B_in.getDenseBlock(), C.getDenseBlock(), false, + false, m, k, 0, m, 0, n); + } +} diff --git a/src/test/java/org/apache/sysds/test/component/matrixmult/MatrixMultTransposedTest.java b/src/test/java/org/apache/sysds/test/component/matrixmult/MatrixMultTransposedTest.java new file mode 100644 index 00000000000..9607b21a301 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/component/matrixmult/MatrixMultTransposedTest.java @@ -0,0 +1,72 @@ +package org.apache.sysds.test.component.matrixmult; + +import org.apache.sysds.runtime.data.DenseBlock; +import org.apache.sysds.runtime.matrix.data.LibMatrixMult; +import org.apache.sysds.runtime.matrix.data.LibMatrixReorg; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.sysds.test.TestUtils; +import org.junit.Test; + +import java.util.Random; + +public class MatrixMultTransposedTest { + + // run multiple random scenarios + @Test + public void testCase_noTransA_TransB() { + for(int i=0; i<10; i++) { + runTest(false, true); + } + } + + @Test + public void testCase_TransA_NoTransB() { + for(int i=0; i<10; i++) { + runTest(true, false); + } + } + + @Test + public void testCase_TransA_TransB() { + for(int i=0; i<10; i++) { + runTest(true, true); + } + } + + private void runTest(boolean tA, boolean tB) { + Random rand = new Random(); + + // generate random dimensions between 1 and 300 + int m = rand.nextInt(300) + 1; + int n = rand.nextInt(300) + 1; + int k = rand.nextInt(300) + 1; + + + int rowsA = tA ? k : m; + int colsA = tA ? m : k; + int rowsB = tB ? n : k; + int colsB = tB ? 
k : n; + + MatrixBlock ma = MatrixBlock.randOperations(rowsA, colsA, 1.0, -1, 1, "uniform", 7); + MatrixBlock mb = MatrixBlock.randOperations(rowsB, colsB, 1.0, -1, 1, "uniform", 3); + + MatrixBlock mc = new MatrixBlock(m, n, false); + mc.allocateDenseBlock(); + + DenseBlock a = ma.getDenseBlock(); + DenseBlock b = mb.getDenseBlock(); + DenseBlock c = mc.getDenseBlock(); + + LibMatrixMult.matrixMultDenseDenseMM(a, b, c, tA, tB, n, k, 0, m, 0, n); + + mc.recomputeNonZeros(); + + // calc true result with existing methods + MatrixBlock ma_in = tA ? LibMatrixReorg.transpose(ma) : ma; + MatrixBlock mb_in = tB ? LibMatrixReorg.transpose(mb) : mb; + MatrixBlock expected = LibMatrixMult.matrixMult(ma_in, mb_in); + + // compare results + TestUtils.compareMatrices(expected, mc, 1e-8); + } +} From 3288a7fa4a06382b1fe0741310dc33f67086bca6 Mon Sep 17 00:00:00 2001 From: Elman Jahangiri Date: Thu, 22 Jan 2026 00:03:37 +0100 Subject: [PATCH 2/2] [SYSTEMDS-3168] Optimize matrix multiplication for transposed inputs This adds a new method to the dense-dense matrix multiplication kernels to improve performance for operations involving transposed inputs. Previously, this was done by performing a transpose and then calling a densedenseMM method. The optimizations target three specific transpose scenarios using 100x100 dense matrices as the benchmark, but also work for other matrix dimensions. 
--- .../runtime/matrix/data/LibMatrixMult.java | 148 ++++++++++-------- .../MatrixMultTransposedPerformanceTest.java | 8 +- 2 files changed, 87 insertions(+), 69 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java index 4438899f375..87c9827470a 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java @@ -44,17 +44,8 @@ import org.apache.sysds.lops.WeightedSquaredLoss.WeightsType; import org.apache.sysds.lops.WeightedUnaryMM.WUMMType; import org.apache.sysds.runtime.DMLRuntimeException; -import org.apache.sysds.runtime.data.DenseBlock; -import org.apache.sysds.runtime.data.DenseBlockFP64DEDUP; -import org.apache.sysds.runtime.data.DenseBlockFactory; -import org.apache.sysds.runtime.data.SparseBlock; +import org.apache.sysds.runtime.data.*; import org.apache.sysds.runtime.data.SparseBlock.Type; -import org.apache.sysds.runtime.data.SparseBlockCSR; -import org.apache.sysds.runtime.data.SparseBlockFactory; -import org.apache.sysds.runtime.data.SparseBlockMCSR; -import org.apache.sysds.runtime.data.SparseRow; -import org.apache.sysds.runtime.data.SparseRowScalar; -import org.apache.sysds.runtime.data.SparseRowVector; import org.apache.sysds.runtime.functionobjects.SwapIndex; import org.apache.sysds.runtime.functionobjects.ValueFunction; import org.apache.sysds.runtime.matrix.operators.ReorgOperator; @@ -1050,57 +1041,70 @@ public static void matrixMultDenseDenseMM(DenseBlock a, DenseBlock b, DenseBlock } private static void multDenseDenseTransA(DenseBlock a, DenseBlock b, DenseBlock c, int n, int cd, int rl, int ru, int cl, int cu) { + // process matrices in small blocks for caching final int blocksizeI = 32; final int blocksizeK = 24; + final int blocksizeJ = 1024; + // iterate over block of C rows for (int bi = rl; bi < ru; bi += blocksizeI) { int bimin 
= Math.min(ru, bi + blocksizeI); + + // iterate over blocks of common dimension k for (int bk = 0; bk < cd; bk += blocksizeK) { int bkmin = Math.min(cd, bk + blocksizeK); - int k = bk; - for (; k < bkmin - 3; k += 4) { + // iterate over blocks of C columns + for (int bj = cl; bj < cu; bj += blocksizeJ) { + int bjmin = Math.min(cu, bj + blocksizeJ); + int lenJ = bjmin - bj; + + // if B is a single contiguous array, we skip checks inside the loop if (b.isContiguous()) { double[] bvals = b.values(0); - int bix0 = b.pos(k, cl), bix1 = b.pos(k+1, cl); - int bix2 = b.pos(k+2, cl), bix3 = b.pos(k+3, cl); - - for (int i = bi; i < bimin; i++) { - double[] cvals = c.values(i); - int cix = c.pos(i, cl); - double val0 = a.values(k)[a.pos(k) + i]; - double val1 = a.values(k+1)[a.pos(k+1) + i]; - double val2 = a.values(k+2)[a.pos(k+2) + i]; - double val3 = a.values(k+3)[a.pos(k+3) + i]; - - vectMultiplyAdd4(val0, val1, val2, val3, - bvals, cvals, - bix0, bix1, bix2, bix3, cix, cu - cl); + + int k = bk; + // process 4 rows of A at the same time + for (; k < bkmin - 3; k += 4) { + int bix0 = b.pos(k, bj); + int bix1 = b.pos(k+1, bj); + int bix2 = b.pos(k+2, bj); + int bix3 = b.pos(k+3, bj); + + for (int i = bi; i < bimin; i++) { + // grab 4 values from A + double val0 = a.values(k)[a.pos(k) + i]; + double val1 = a.values(k+1)[a.pos(k+1) + i]; + double val2 = a.values(k+2)[a.pos(k+2) + i]; + double val3 = a.values(k+3)[a.pos(k+3) + i]; + + double[] cvals = c.values(i); + int cix = c.pos(i, bj); + + vectMultiplyAdd4(val0, val1, val2, val3, + bvals, cvals, + bix0, bix1, bix2, bix3, cix, lenJ); + } } - } else { - for (int i = bi; i < bimin; i++) { - double[] cvals = c.values(i); - int cix = c.pos(i, cl); - double val0 = a.values(k)[a.pos(k) + i]; - if(val0!=0) vectMultiplyAdd(val0, b.values(k), cvals, b.pos(k, cl), cix, cu - cl); - double val1 = a.values(k+1)[a.pos(k+1) + i]; - if(val1!=0) vectMultiplyAdd(val1, b.values(k+1), cvals, b.pos(k+1, cl), cix, cu - cl); - double val2 = 
a.values(k+2)[a.pos(k+2) + i]; - if(val2!=0) vectMultiplyAdd(val2, b.values(k+2), cvals, b.pos(k+2, cl), cix, cu - cl); - double val3 = a.values(k+3)[a.pos(k+3) + i]; - if(val3!=0) vectMultiplyAdd(val3, b.values(k+3), cvals, b.pos(k+3, cl), cix, cu - cl); + // for the remaining rows + for (; k < bkmin; k++) { + int bix = b.pos(k, bj); + for (int i = bi; i < bimin; i++) { + double val = a.values(k)[a.pos(k) + i]; + if (val != 0) { + vectMultiplyAdd(val, bvals, c.values(i), bix, c.pos(i, bj), lenJ); + } + } } - } - } - for (; k < bkmin; k++) { - double[] bvals = b.values(k); - int bix = b.pos(k, cl); - double[] avals = a.values(k); - int apos = a.pos(k); - for (int i = bi; i < bimin; i++) { - double val = avals[apos + i]; - if (val != 0) { - vectMultiplyAdd(val, bvals, c.values(i), bix, c.pos(i, cl), cu - cl); + } else { + for (int k = bk; k < bkmin; k++) { + for (int i = bi; i < bimin; i++) { + double val = a.values(k)[a.pos(k) + i]; + if (val != 0) { + vectMultiplyAdd(val, b.values(k), c.values(i), + b.pos(k, bj), c.pos(i, bj), lenJ); + } + } } } } @@ -1109,6 +1113,7 @@ private static void multDenseDenseTransA(DenseBlock a, DenseBlock b, DenseBlock } private static void multDenseDenseTransB(DenseBlock a, DenseBlock b, DenseBlock c, int n, int cd, int rl, int ru, int cl, int cu) { + // copy small blocks of B into buffer bufB final int blocksizeK = 24; double[] bufB = new double[blocksizeK * (cu - cl)]; @@ -1116,7 +1121,7 @@ private static void multDenseDenseTransB(DenseBlock a, DenseBlock b, DenseBlock int bkmin = Math.min(cd, bk + blocksizeK); int bklen = bkmin - bk; - + // put B into buffer while transposing for (int j = cl; j < cu; j++) { double[] bvals = b.values(j); int bpos = b.pos(j); @@ -1126,6 +1131,7 @@ private static void multDenseDenseTransB(DenseBlock a, DenseBlock b, DenseBlock } } + // perform matrix multiplication with buffer for (int i = rl; i < ru; i++) { double[] avals = a.values(i); int apos = a.pos(i); @@ -1144,28 +1150,40 @@ private static void 
multDenseDenseTransB(DenseBlock a, DenseBlock b, DenseBlock } private static void multDenseDenseTransATransB(DenseBlock a, DenseBlock b, DenseBlock c, int n, int cd, int rl, int ru, int cl, int cu) { - double[] d_row = new double[ru]; + // transpose B into temp Block B + // use C = t(A) * B from above as helper method - for (int j = cl; j < cu; j++) { - java.util.Arrays.fill(d_row, rl, ru, 0); + // allocate Block for transposing B + int tB_rows = cd; + int tB_cols = cu - cl; - for (int k = 0; k < cd; k++) { - double valB = b.get(j, k); - if (valB != 0) { - double[] avals = a.values(k); - int apos = a.pos(k); - vectMultiplyAdd(valB, avals, d_row, apos + rl, rl, ru - rl); - } - } + DenseBlock tB_block = new DenseBlockFP64(new int[] {tB_rows, tB_cols}); + double[] tB = tB_block.values(0); - for (int i = rl; i < ru; i++) { - double[] cvals = c.values(i); - int cix = c.pos(i); - cvals[cix + j] += d_row[i]; + // perform transpose from B to tB_block + final int BLOCK = 128; + for (int bi = cl; bi < cu; bi += BLOCK) { + int bimin = Math.min(cu, bi + BLOCK); + for (int bk = 0; bk < cd; bk += BLOCK) { + int bkmin = Math.min(cd, bk + BLOCK); + + for (int j = bi; j < bimin; j++) { + double[] b_vals = b.values(j); + int b_pos = b.pos(j); + + int tB_col_idx = (j - cl); + + for (int k = bk; k < bkmin; k++) { + tB[k * tB_cols + tB_col_idx] = b_vals[b_pos + k]; + } + } } } + // reuse our existing method + multDenseDenseTransA(a, tB_block, c, n, cd, rl, ru, 0, tB_cols); } + private static void matrixMultDenseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean tm2, boolean pm2, int rl, int ru, int cl, int cu) { DenseBlock a = m1.getDenseBlock(); DenseBlock b = m2.getDenseBlock(); diff --git a/src/test/java/org/apache/sysds/test/component/matrixmult/MatrixMultTransposedPerformanceTest.java b/src/test/java/org/apache/sysds/test/component/matrixmult/MatrixMultTransposedPerformanceTest.java index c9899f907aa..787934f56bb 100644 --- 
a/src/test/java/org/apache/sysds/test/component/matrixmult/MatrixMultTransposedPerformanceTest.java +++ b/src/test/java/org/apache/sysds/test/component/matrixmult/MatrixMultTransposedPerformanceTest.java @@ -7,9 +7,9 @@ import org.junit.Test; public class MatrixMultTransposedPerformanceTest { - private final int m = 500; - private final int n = 500; - private final int k = 500; + private final int m = 100; + private final int n = 100; + private final int k = 100; @Test @@ -33,7 +33,7 @@ public void testPerf_3_TransA_TransB() { } private void runTest(boolean tA, boolean tB) { - int REP = 50; + int REP = 5000; // setup Dimensions int rowsA = tA ? k : m;