diff --git a/.github/workflows/build-cron.yml b/.github/workflows/build-cron.yml
index 4d5d99e9c1f..730b276685e 100644
--- a/.github/workflows/build-cron.yml
+++ b/.github/workflows/build-cron.yml
@@ -54,7 +54,7 @@ jobs:
]
steps:
- name: Checkout Repository
- uses: actions/checkout@v5
+ uses: actions/checkout@v6
- name: Setup Java ${{ matrix.java }} ${{ matrix.javadist }}
uses: actions/setup-java@v5
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 6df5bd6db28..d401263bb7d 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -68,7 +68,7 @@ jobs:
]
steps:
- name: Checkout Repository
- uses: actions/checkout@v5
+ uses: actions/checkout@v6
- name: Setup Java ${{ matrix.java }} ${{ matrix.javadist }}
uses: actions/setup-java@v5
diff --git a/.github/workflows/cleanup-transient-artifacts.yml b/.github/workflows/cleanup-transient-artifacts.yml
index 6dd55e75bed..d249064cf34 100644
--- a/.github/workflows/cleanup-transient-artifacts.yml
+++ b/.github/workflows/cleanup-transient-artifacts.yml
@@ -38,7 +38,7 @@ jobs:
if: ${{ github.event.workflow_run.conclusion == 'success' }}
steps:
- name: Checkout Repository
- uses: actions/checkout@v5
+ uses: actions/checkout@v6
- name: Delete Artifacts
run: |
diff --git a/.github/workflows/docker-cd.yml b/.github/workflows/docker-cd.yml
index 938c8e232fa..a848e5fedf4 100644
--- a/.github/workflows/docker-cd.yml
+++ b/.github/workflows/docker-cd.yml
@@ -38,7 +38,7 @@ jobs:
steps:
- name: Checkout
- uses: actions/checkout@v5
+ uses: actions/checkout@v6
# https://github.com/docker/metadata-action
- name: Configure Docker metadata
diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml
index d2c63327e93..b14a3ae9885 100644
--- a/.github/workflows/docker-release.yml
+++ b/.github/workflows/docker-release.yml
@@ -40,7 +40,7 @@ jobs:
steps:
- name: Checkout
- uses: actions/checkout@v5
+ uses: actions/checkout@v6
- run: git checkout ${{ github.event.inputs.branch_or_tag }}
# https://github.com/docker/metadata-action
diff --git a/.github/workflows/docker-testImage.yml b/.github/workflows/docker-testImage.yml
index b05b09e02b3..6e1dad91bb5 100644
--- a/.github/workflows/docker-testImage.yml
+++ b/.github/workflows/docker-testImage.yml
@@ -36,7 +36,7 @@ jobs:
steps:
- name: Checkout
- uses: actions/checkout@v5
+ uses: actions/checkout@v6
# https://github.com/docker/metadata-action
- name: Configure Docker metadata
diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml
index 283a8ce5e57..75b74228f67 100644
--- a/.github/workflows/documentation.yml
+++ b/.github/workflows/documentation.yml
@@ -47,7 +47,7 @@ jobs:
name: Java
steps:
- name: Checkout Repository
- uses: actions/checkout@v5
+ uses: actions/checkout@v6
- name: Setup Java ${{ matrix.java }} ${{ matrix.javadist }}
uses: actions/setup-java@v5
@@ -64,7 +64,7 @@ jobs:
name: Python
steps:
- name: Checkout Repository
- uses: actions/checkout@v5
+ uses: actions/checkout@v6
- name: Setup Python
uses: actions/setup-python@v6
@@ -73,7 +73,7 @@ jobs:
architecture: 'x64'
- name: Cache Pip Dependencies
- uses: actions/cache@v4
+ uses: actions/cache@v5
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-docs-${{ hashFiles('src/main/python/docs/requires-docs.txt') }}
diff --git a/.github/workflows/javaCodestyle.yml b/.github/workflows/javaCodestyle.yml
new file mode 100644
index 00000000000..50c970023c1
--- /dev/null
+++ b/.github/workflows/javaCodestyle.yml
@@ -0,0 +1,60 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+name: Java Codestyle
+
+on:
+ push:
+ paths-ignore:
+ - 'docs/**'
+ - '*.md'
+ - '*.html'
+ - 'src/main/python/**'
+ - 'dev/**'
+ branches:
+ - main
+ pull_request:
+ paths-ignore:
+ - 'docs/**'
+ - '*.md'
+ - '*.html'
+ - 'src/main/python/**'
+ - 'dev/**'
+ branches:
+ - main
+
+jobs:
+ java_codestyle:
+ name: Java Checkstyle
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout Repository
+ uses: actions/checkout@v6
+
+ - name: Setup Java 17 adopt
+ uses: actions/setup-java@v5
+ with:
+ distribution: adopt
+ java-version: '17'
+ cache: 'maven'
+
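+ # checkstyle is skipped by default via the checkstyle.skip property in pom.xml; override it here to enforce the check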
+ - name: Run Checkstyle
+ run: mvn -ntp -B -Dcheckstyle.skip=false checkstyle:check
diff --git a/.github/workflows/javaTests.yml b/.github/workflows/javaTests.yml
index d7797e5f4a3..b4f4ce2e11d 100644
--- a/.github/workflows/javaTests.yml
+++ b/.github/workflows/javaTests.yml
@@ -90,7 +90,7 @@ jobs:
name: ${{ matrix.tests }}
steps:
- name: Checkout Repository
- uses: actions/checkout@v5
+ uses: actions/checkout@v6
- name: ${{ matrix.tests }}
uses: ./.github/action/
@@ -106,7 +106,7 @@ jobs:
echo "ARTIFACT_NAME=$ARTIFACT_NAME" >> $GITHUB_ENV
- name: Save Java Test Coverage as Artifact
- uses: actions/upload-artifact@v5
+ uses: actions/upload-artifact@v6
with:
name: ${{ env.ARTIFACT_NAME }}
path: target/jacoco.exec
@@ -126,10 +126,10 @@ jobs:
javadist: ['adopt']
steps:
- name: Checkout Repository
- uses: actions/checkout@v5
+ uses: actions/checkout@v6
- name: Cache Maven Dependencies
- uses: actions/cache@v4
+ uses: actions/cache@v5
with:
path: ~/.m2/repository
key: ${{ runner.os }}-maven-test-${{ hashFiles('**/pom.xml') }}
@@ -137,7 +137,7 @@ jobs:
${{ runner.os }}-maven-test-
- name: Download all Jacoco Artifacts
- uses: actions/download-artifact@v6
+ uses: actions/download-artifact@v7
with:
path: target
@@ -151,7 +151,7 @@ jobs:
run: mvn jacoco:report
- name: Upload coverage to Codecov
- uses: codecov/codecov-action@v5.5.1
+ uses: codecov/codecov-action@v5.5.2
if: github.repository_owner == 'apache'
with:
fail_ci_if_error: false
@@ -160,7 +160,7 @@ jobs:
- name: Upload Jacoco Report Artifact PR
if: (github.repository_owner == 'apache') && (github.ref_name != 'main')
- uses: actions/upload-artifact@v5
+ uses: actions/upload-artifact@v6
with:
name: Java Code Coverage (Jacoco)
path: target/site/jacoco
@@ -168,7 +168,7 @@ jobs:
- name: Upload Jacoco Report Artifact Main
if: (github.repository_owner == 'apache') && (github.ref_name == 'main')
- uses: actions/upload-artifact@v5
+ uses: actions/upload-artifact@v6
with:
name: Java Code Coverage (Jacoco)
path: target/site/jacoco
@@ -176,9 +176,8 @@ jobs:
- name: Upload Jacoco Report Artifact Fork
if: (github.repository_owner != 'apache')
- uses: actions/upload-artifact@v5
+ uses: actions/upload-artifact@v6
with:
name: Java Code Coverage (Jacoco)
path: target/site/jacoco
retention-days: 3
-
diff --git a/.github/workflows/license.yml b/.github/workflows/license.yml
index 88e1418670f..4f7b02ee42f 100644
--- a/.github/workflows/license.yml
+++ b/.github/workflows/license.yml
@@ -54,7 +54,7 @@ jobs:
steps:
- name: Checkout Repository
- uses: actions/checkout@v5
+ uses: actions/checkout@v6
- name: Setup Java ${{ matrix.java }} ${{ matrix.javadist }}
uses: actions/setup-java@v5
diff --git a/.github/workflows/monitoringUITests.yml b/.github/workflows/monitoringUITests.yml
index b4a3179b6a7..9389b394828 100644
--- a/.github/workflows/monitoringUITests.yml
+++ b/.github/workflows/monitoringUITests.yml
@@ -52,7 +52,7 @@ jobs:
node-version: ["lts/*"]
steps:
- - uses: actions/checkout@v5
+ - uses: actions/checkout@v6
- name: Build the application, with Node.js ${{ matrix.node-version }}
uses: actions/setup-node@v6
with:
diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index fcd8bf8c849..ea8c9f485e2 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -60,7 +60,7 @@ jobs:
name: ${{ matrix.os }} Java ${{ matrix.java }} ${{ matrix.javadist }} Python ${{ matrix.python-version }}/ ${{ matrix.test_mode}}
steps:
- name: Checkout Repository
- uses: actions/checkout@v5
+ uses: actions/checkout@v6
- name: Setup Java ${{ matrix.java }} ${{ matrix.javadist }}
uses: actions/setup-java@v5
@@ -70,13 +70,13 @@ jobs:
cache: 'maven'
- name: Cache Pip Dependencies
- uses: actions/cache@v4
+ uses: actions/cache@v5
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('src/main/python/setup.py') }}
- name: Cache Datasets
- uses: actions/cache@v4
+ uses: actions/cache@v5
with:
path: |
src/main/python/systemds/examples/tutorials/mnist
@@ -84,7 +84,7 @@ jobs:
key: ${{ runner.os }}-mnist-${{ hashFiles('src/main/python/systemds/examples/tutorials/mnist.py') }}-${{ hashFiles('src/main/python/systemds/examples/tutorials/adult.py') }}
- name: Cache Deb Dependencies
- uses: actions/cache@v4
+ uses: actions/cache@v5
with:
path: /var/cache/apt/archives
key: ${{ runner.os }}-${{ hashFiles('.github/workflows/python.yml') }}
@@ -142,11 +142,11 @@ jobs:
export PATH=$SYSTEMDS_ROOT/bin:$PATH
cd src/main/python
./tests/federated/runFedTest.sh
-
+
- name: Cache Torch Hub
if: ${{ matrix.test_mode == 'scuro' }}
id: torch-cache
- uses: actions/cache@v4
+ uses: actions/cache@v5
with:
path: .torch
key: ${{ runner.os }}-torch-${{ hashFiles('requirements.txt') }}
@@ -158,6 +158,7 @@
env:
TORCH_HOME: ${{ github.workspace }}/.torch
run: |
+ df -h
( while true; do echo "."; sleep 25; done ) &
KA=$!
pip install --upgrade pip wheel setuptools
@@ -172,7 +173,9 @@
gensim \
opt-einsum \
nltk \
- fvcore
+ fvcore \
+ scikit-optimize \
+ flair
kill $KA
cd src/main/python
python -m unittest discover -s tests/scuro -p 'test_*.py' -v
diff --git a/.github/workflows/pythonFormatting.yml b/.github/workflows/pythonFormatting.yml
index 532878da1e6..cbdd5e84578 100644
--- a/.github/workflows/pythonFormatting.yml
+++ b/.github/workflows/pythonFormatting.yml
@@ -38,7 +38,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout Repository
- uses: actions/checkout@v5
+ uses: actions/checkout@v6
- name: Setup Python
uses: actions/setup-python@v6
diff --git a/.github/workflows/release-scripts.yml b/.github/workflows/release-scripts.yml
index 95536f668d9..b804a3db507 100644
--- a/.github/workflows/release-scripts.yml
+++ b/.github/workflows/release-scripts.yml
@@ -42,7 +42,7 @@ jobs:
steps:
# Java setup docs:
# https://github.com/actions/setup-java/blob/main/docs/advanced-usage.md#installing-custom-java-package-type
- - uses: actions/checkout@v5
+ - uses: actions/checkout@v6
- name: Set up JDK 17
uses: actions/setup-java@v5
with:
@@ -54,7 +54,7 @@ jobs:
- run: printf "JAVA_HOME = $JAVA_HOME \n"
- name: Cache local Maven repository
- uses: actions/cache@v4
+ uses: actions/cache@v5
with:
path: ~/.m2/repository
key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
diff --git a/.gitignore b/.gitignore
index d2fcdb9a4de..5de697a37e3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -146,7 +146,7 @@ src/test/scripts/functions/pipelines/intermediates/classification/*
venv
venv/*
-
+.venv
# resource optimization
scripts/resource/output
*.pem
diff --git a/dev/checkstyle/checkstyle.xml b/dev/checkstyle/checkstyle.xml
new file mode 100644
index 00000000000..96a381cd4c9
--- /dev/null
+++ b/dev/checkstyle/checkstyle.xml
@@ -0,0 +1,74 @@
diff --git a/dev/checkstyle/suppressions-xpath.xml b/dev/checkstyle/suppressions-xpath.xml
new file mode 100644
index 00000000000..8843856a78d
--- /dev/null
+++ b/dev/checkstyle/suppressions-xpath.xml
@@ -0,0 +1,41 @@
diff --git a/dev/checkstyle/suppressions.xml b/dev/checkstyle/suppressions.xml
new file mode 100644
index 00000000000..1daf743a34e
--- /dev/null
+++ b/dev/checkstyle/suppressions.xml
@@ -0,0 +1,314 @@
diff --git a/pom.xml b/pom.xml
index 0eb7248f617..88c10511936 100644
--- a/pom.xml
+++ b/pom.xml
@@ -64,6 +64,7 @@
2.0.3
3.2.0
3.0.0
+		<maven-checkstyle-plugin.version>3.3.1</maven-checkstyle-plugin.version>
3.0.0
3.5.0
3.2.0
@@ -87,6 +88,7 @@
1C
2
false
+		<checkstyle.skip>true</checkstyle.skip>
false
**
false
@@ -583,6 +585,35 @@
+			<plugin>
+				<groupId>org.apache.maven.plugins</groupId>
+				<artifactId>maven-checkstyle-plugin</artifactId>
+				<version>${maven-checkstyle-plugin.version}</version>
+				<executions>
+					<execution>
+						<id>checkstyle</id>
+						<phase>test</phase>
+						<goals>
+							<goal>check</goal>
+						</goals>
+					</execution>
+				</executions>
+				<configuration>
+					<configLocation>dev/checkstyle/checkstyle.xml</configLocation>
+					<skip>${checkstyle.skip}</skip>
+					<consoleOutput>true</consoleOutput>
+					<failsOnError>true</failsOnError>
+					<outputFile>${project.build.directory}/checkstyle-result.xml</outputFile>
+					<outputFileFormat>xml</outputFileFormat>
+					<includeTestSourceDirectory>true</includeTestSourceDirectory>
+				</configuration>
+			</plugin>
+
org.apache.maven.plugins
diff --git a/scripts/builtin/scaleRobust.dml b/scripts/builtin/scaleRobust.dml
new file mode 100644
index 00000000000..ce309fabe54
--- /dev/null
+++ b/scripts/builtin/scaleRobust.dml
@@ -0,0 +1,60 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Robust scaling using median and IQR (Interquartile Range)
+# Resistant to outliers by centering with the median and scaling with IQR.
+#
+# INPUT:
+# -------------------------------------------------------------------------------------
+# X Input feature matrix of shape n-by-m
+# -------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# -------------------------------------------------------------------------------------
+# Y Scaled output matrix of shape n-by-m
+# med Column medians (Q2) of shape 1-by-m
+# q1 Column lower quartiles (Q1) of shape 1-by-m
+# q3 Column upper quartiles (Q3) of shape 1-by-m
+# -------------------------------------------------------------------------------------
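+#
+# The applied transformation (implemented in scaleRobustApply) is, per column j:
+#   Y[,j] = (X[,j] - med[1,j]) / (q3[1,j] - q1[1,j])
+# where a zero or NaN IQR is replaced by 1 to avoid division by zero.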
+
+m_scaleRobust = function(Matrix[Double] X)
+ return (Matrix[Double] Y, Matrix[Double] med, Matrix[Double] q1, Matrix[Double] q3)
+{
+ n = nrow(X)
+ m = ncol(X)
+
+ med = matrix(0.0, rows=1, cols=m)
+ q1 = matrix(0.0, rows=1, cols=m)
+ q3 = matrix(0.0, rows=1, cols=m)
+
+ # Define quantile probabilities once, outside the loop
+ q_probs = as.matrix(list(0.25, 0.5, 0.75));
+
+ # Loop over columns to compute quantiles
+ parfor (j in 1:m) {
+ q = quantile(X[,j], q_probs)
+ med[1,j] = q[2,1]
+ q1[1,j] = q[1,1]
+ q3[1,j] = q[3,1]
+ }
+
+ Y = scaleRobustApply(X, med, q1, q3);
+}
diff --git a/scripts/builtin/scaleRobustApply.dml b/scripts/builtin/scaleRobustApply.dml
new file mode 100644
index 00000000000..11461731b34
--- /dev/null
+++ b/scripts/builtin/scaleRobustApply.dml
@@ -0,0 +1,48 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Apply robust scaling using precomputed medians and IQRs
+#
+# INPUT:
+# -------------------------------------------------------------------------------------
+# X Input feature matrix of shape n-by-m
+# med Column medians (Q2) of shape 1-by-m
+# q1 Column lower quartiles (Q1) of shape 1-by-m
+# q3 Column upper quartiles (Q3) of shape 1-by-m
+# -------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# -------------------------------------------------------------------------------------
+# Y Scaled output matrix of shape n-by-m
+# -------------------------------------------------------------------------------------
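+#
+# Typical usage (illustrative): fit the statistics on training data with scaleRobust,
+# then re-apply them to new data:
+#   [Ytr, med, q1, q3] = scaleRobust(Xtrain);
+#   Yte = scaleRobustApply(Xtest, med, q1, q3);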
+
+m_scaleRobustApply = function(Matrix[Double] X, Matrix[Double] med, Matrix[Double] q1, Matrix[Double] q3)
+ return (Matrix[Double] Y)
+{
+ iqr = q3 - q1
+
+ # Ensure robust scaling is safe by replacing invalid IQRs
+ iqr = replace(target=iqr, pattern=0, replacement=1)
+ iqr = replace(target=iqr, pattern=NaN, replacement=1)
+
+ # Apply robust transformation
+ Y = (X - med) / iqr
+}
diff --git a/scripts/staging/ssb/README.md b/scripts/staging/ssb/README.md
new file mode 100644
index 00000000000..ef9f09afeaf
--- /dev/null
+++ b/scripts/staging/ssb/README.md
@@ -0,0 +1,525 @@
+
+
+
+# Star Schema Benchmark (SSB) for SystemDS
+
+This README documents the SSB DML queries under `scripts/ssb/queries/` and the runner scripts under `scripts/ssb/shell/` that execute and benchmark them. It is focused on what is implemented today, how to run it, and how to interpret the outputs for performance analysis.
+
+---
+
+## Table of Contents
+
+1. Project Layout
+2. Quick Start
+3. Data Location (`--input-dir` and DML `input_dir`)
+4. Single-Engine Runner (`scripts/ssb/shell/run_ssb.sh`)
+5. Multi-Engine Performance Runner (`scripts/ssb/shell/run_all_perf.sh`)
+6. Outputs and Examples
+7. Adding/Editing Queries
+8. Troubleshooting
+
+---
+
+## 1) Project Layout
+
+Paths are relative to the repo root:
+
+```
+systemds/
+├── scripts/ssb/
+│ ├── README.md # This guide
+│ ├── queries/ # DML queries (q1_1.dml ... q4_3.dml)
+│ │ ├── q1_1.dml - q1_3.dml # Flight 1
+│ │ ├── q2_1.dml - q2_3.dml # Flight 2
+│ │ ├── q3_1.dml - q3_4.dml # Flight 3
+│ │ └── q4_1.dml - q4_3.dml # Flight 4
+│ ├── shell/
+│ │ ├── run_ssb.sh # Single-engine (SystemDS) runner
+│ │ ├── run_all_perf.sh # Multi-engine performance benchmark
+│ │ └── ssbOutputData/ # Results (created on first run)
+│ │ ├── QueryData/ # Per-query outputs from run_ssb.sh
+│ │ └── PerformanceData/ # Multi-engine outputs from run_all_perf.sh
+│ └── sql/ # SQL versions + `ssb.duckdb` for DuckDB
+```
+
+Note: The SSB raw data directory is not committed. You must point the runners to your generated data with `--input-dir`.
+
+---
+
+## 2) Quick Start
+
+Set up SystemDS and run the SSB queries.
+
+1) Build SystemDS (from repo root):
+
+```bash
+mvn -DskipTests package
+```
+
+2) Make sure the SystemDS binary exists (repo-local `bin/systemds` or on `PATH`).
+
+3) Make runner scripts executable:
+
+```bash
+chmod +x scripts/ssb/shell/run_ssb.sh scripts/ssb/shell/run_all_perf.sh
+```
+
+4) Provide SSB data (from dbgen) in a directory, e.g. `/path/to/ssb-data`.
+
+5) Run a single SSB query on SystemDS (from repo root):
+
+```bash
+scripts/ssb/shell/run_ssb.sh q1.1 --input-dir=/path/to/ssb-data --stats
+```
+
+6) Run the multi-engine performance benchmark across all queries (from repo root):
+
+```bash
+scripts/ssb/shell/run_all_perf.sh --input-dir=/path/to/ssb-data --stats --repeats=5
+```
+
+If `--input-dir` is omitted, the scripts default to `./data/` under the repo root.
+
+---
+
+## 3) Data Location (`--input-dir` and DML `input_dir`)
+
+Both runners pass a named argument `input_dir` into DML as:
+
+```
+-nvargs input_dir=/absolute/path/to/ssb-data
+```
+
+Your DML scripts should construct paths from `input_dir`. Example:
+
+```dml
+dates = read(input_dir + "/date.tbl", data_type="frame", format="csv", sep="|", header=FALSE)
+lineorder = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", sep="|", header=FALSE)
+```
+
+Expected base files in `input_dir`: `customer.tbl`, `supplier.tbl`, `part.tbl`, `date.tbl` and `lineorder*.tbl` (fact table name can vary by scale). The runners validate that `--input-dir` exists before executing.
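+
+A quick sanity check of the data directory before running (illustrative; the exact fact-table file name may differ at other scale factors):
+
+```bash
+ls /path/to/ssb-data/*.tbl
+# expected files: customer.tbl  date.tbl  lineorder.tbl  part.tbl  supplier.tbl
+```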
+
+---
+
+## 4) Single-Engine Runner (`scripts/ssb/shell/run_ssb.sh`)
+
+Runs SSB DML queries with SystemDS and saves results per query.
+
+- Usage:
+ - `scripts/ssb/shell/run_ssb.sh` — run all SSB queries
+ - `scripts/ssb/shell/run_ssb.sh q1.1 q2.3` — run specific queries
+ - `scripts/ssb/shell/run_ssb.sh --stats` — include SystemDS internal statistics
+ - `scripts/ssb/shell/run_ssb.sh --input-dir=/path/to/data` — set data dir
+ - `scripts/ssb/shell/run_ssb.sh --output-dir=/tmp/out` — set output dir
+
+- Query names: You can use dotted form (`q1.1`); the runner maps to `q1_1.dml` internally.
+
+- Functionality:
+ - Single-threaded execution via auto-generated `conf/single_thread.xml`.
+ - DML `input_dir` forwarding with `-nvargs`.
+ - Pre-check for data directory; clear errors if missing.
+ - Runtime error detection by scanning for “An Error Occurred : …”.
+ - Optional `--stats` to capture SystemDS internal statistics in JSON.
+ - Per-query outputs in TXT, CSV, and JSON.
+ - `run.json` with run-level metadata and per-query status/results.
+ - Clear end-of-run summary and, for table results, a “DETAILED QUERY RESULTS” section.
+ - Exit code is non-zero if any query failed (handy for CI).
+
+- Output layout:
+ - Base directory: `--output-dir` (default: `scripts/ssb/shell/ssbOutputData/QueryData`)
+ - Each run: `ssb_run_<timestamp>/`
+ - `txt/<query>.txt` — human-readable result
+ - `csv/<query>.csv` — scalar or table as CSV
+ - `json/<query>.json` — per-query JSON
+ - `run.json` — full metadata and results for the run
+
+- Example console output (abridged):
+
+```
+[1/13] Running: q1_1.dml
+...
+=========================================
+SSB benchmark completed!
+Total queries executed: 13
+Failed queries: 0
+Statistics: enabled
+
+=========================================
+RUN METADATA SUMMARY
+=========================================
+Timestamp: 2025-09-05 12:34:56 UTC
+Hostname: myhost
+Seed: 123456
+Software Versions:
+ SystemDS: 3.4.0-SNAPSHOT
+ JDK: 21.0.2
+System Resources:
+ CPU: Apple M2
+ RAM: 16GB
+Data Build Info:
+ SSB Data: customer:300000 part:200000 supplier:2000 lineorder:6001215
+=========================================
+
+===================================================
+QUERIES SUMMARY
+===================================================
+No. Query Result Status
+---------------------------------------------------
+1 q1.1 12 rows (see below) ✓ Success
+2 q1.2 1 ✓ Success
+...
+===================================================
+
+=========================================
+DETAILED QUERY RESULTS
+=========================================
+[1] Results for q1.1:
+----------------------------------------
+1992|ASIA|12345.67
+1993|ASIA|23456.78
+...
+----------------------------------------
+```
+
+---
+
+## 5) Multi-Engine Performance Runner (`scripts/ssb/shell/run_all_perf.sh`)
+
+Runs SSB queries across SystemDS, PostgreSQL, and DuckDB with repeated timings and statistical analysis.
+
+- Usage:
+ - `scripts/ssb/shell/run_all_perf.sh` — run all queries on available engines
+ - `scripts/ssb/shell/run_all_perf.sh q1.1 q2.3` — run specific queries
+ - `scripts/ssb/shell/run_all_perf.sh --warmup=2 --repeats=10` — control sampling
+ - `scripts/ssb/shell/run_all_perf.sh --stats` — include core/internal engine timings
+ - `scripts/ssb/shell/run_all_perf.sh --layout=wide|stacked` — control terminal layout
+ - `scripts/ssb/shell/run_all_perf.sh --input-dir=... --output-dir=...` — set paths
+
+- Query names: dotted form (`q1.1`) is accepted; mapped internally to `q1_1.dml`.
+
+- Engine prerequisites:
+ - PostgreSQL:
+ - Install `psql` CLI and ensure a PostgreSQL server is running.
+ - Default connection in the script: `POSTGRES_DB=ssb`, `POSTGRES_USER=$(whoami)`, `POSTGRES_HOST=localhost`.
+ - Create the `ssb` database and load the standard SSB tables and data (schema not included in this repo). The SQL queries under `scripts/ssb/sql/` expect the canonical SSB schema and data.
+ - The runner verifies connectivity; if it cannot connect or tables are missing, PostgreSQL results are skipped.
+ - DuckDB:
+ - Install the DuckDB CLI (`duckdb`).
+ - The runner looks for the database at `scripts/ssb/sql/ssb.duckdb`. Ensure it contains SSB tables and data.
+ - If the CLI is missing or the DB file cannot be opened, DuckDB results are skipped.
+ - SystemDS is required; the other engines are optional. Missing engines are reported and skipped gracefully (quick availability checks are sketched below).
+
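+Quick availability checks for the optional engines (illustrative commands, assuming the default locations described above):
+
+```bash
+# PostgreSQL: connect to the ssb database and list its tables
+psql -d ssb -c '\dt'
+# DuckDB: open the bundled database file and list its tables
+duckdb scripts/ssb/sql/ssb.duckdb -c 'SHOW TABLES;'
+```
+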
+- Functionality:
+ - Single-threaded execution for fairness (SystemDS config; SQL engines via settings).
+ - Pre-flight data-dir check and SystemDS test-run with runtime-error detection.
+ - Warmups and repeated measurements using `/usr/bin/time -p` (ms resolution).
+ - Statistics per engine: mean, population stdev, p95, and CV%.
+ - “Shell” vs “Core” time: SystemDS core from `-stats`, PostgreSQL core via EXPLAIN ANALYZE, DuckDB core via JSON profiling.
+ - Environment verification: gracefully skips PostgreSQL or DuckDB if not available.
+ - Terminal-aware output: wide table with grid or stacked multi-line layout.
+ - Results to CSV and JSON with rich metadata (system info, versions, run config).
+
+- Layouts (display formats):
+ - Auto selection: `--layout=auto` (default). Chooses `wide` if terminal is wide enough, else `stacked`.
+ - Wide layout: `--layout=wide`. Prints a grid with columns for each engine and a `Fastest` column. Three header rows show labels for `mean`, `±/CV`, and `p95`.
+ - Stacked layout: `--layout=stacked` or `--stacked`. Prints a compact, multi-line block per query (best for narrow terminals).
+ - Dynamic scaling: The wide layout scales column widths to fit the terminal; if still too narrow, it falls back to stacked.
+ - Row semantics: Row 1 = mean (ms); Row 2 = `±stdev/CV%`; Row 3 = `p95 (ms)`.
+ - Fastest: The runner highlights the engine with the lowest mean per query.
+
+- Output layout:
+ - Base directory: `--output-dir` (default: `scripts/ssb/shell/ssbOutputData/PerformanceData`)
+ - Files per run (timestamped basename):
+ - `ssb_results_<timestamp>.csv`
+ - `ssb_results_<timestamp>.json`
+
+- Example console output (abridged, wide layout):
+
+```
+==================================================================================
+ MULTI-ENGINE PERFORMANCE BENCHMARK METADATA
+==================================================================================
+Timestamp: 2025-09-05 12:34:56 UTC
+Hostname: myhost
+Seed: 123456
+Software Versions:
+ SystemDS: 3.4.0-SNAPSHOT
+ JDK: 21.0.2
+ PostgreSQL: psql (PostgreSQL) 14.11
+ DuckDB: v0.10.3
+System Resources:
+ CPU: Apple M2
+ RAM: 16GB
+Data Build Info:
+ SSB Data: customer:300000 part:200000 supplier:2000 lineorder:6001215
+Run Configuration:
+ Statistics: enabled
+ Queries: 13 selected
+ Warmup Runs: 1
+ Repeat Runs: 5
+
++--------+--------------+--------------+--------------+----------------+--------------+----------------+----------+
+| Query | SysDS Shell | SysDS Core | PostgreSQL | PostgreSQL Core| DuckDB | DuckDB Core | Fastest |
+| | mean | mean | mean | mean | mean | mean | |
+| | ±/CV | ±/CV | ±/CV | ±/CV | ±/CV | ±/CV | |
+| | p95 | p95 | p95 | p95 | p95 | p95 | |
++--------+--------------+--------------+--------------+----------------+--------------+----------------+----------+
+| q1_1 | 1824.0 | 1210.0 | 2410.0 | 2250.0 | 980.0 | 910.0 | DuckDB |
+| | ±10.2/0.6% | ±8.6/0.7% | ±15.1/0.6% | ±14.0/0.6% | ±5.4/0.6% | ±5.0/0.5% | |
+| | p95:1840.0 | p95:1225.0 | p95:2435.0 | p95:2274.0 | p95:989.0 | p95:919.0 | |
++--------+--------------+--------------+--------------+----------------+--------------+----------------+----------+
+```
+
+- Example console output (abridged, stacked layout):
+
+```
+Query : q1_1 Fastest: DuckDB
+ SystemDS Shell: 1824.0
+ ±10.2ms/0.6%
+ p95:1840.0ms
+ SystemDS Core: 1210.0
+ ±8.6ms/0.7%
+ p95:1225.0ms
+ PostgreSQL: 2410.0
+ ±15.1ms/0.6%
+ p95:2435.0ms
+ PostgreSQL Core:2250.0
+ ±14.0ms/0.6%
+ p95:2274.0ms
+ DuckDB: 980.0
+ ±5.4ms/0.6%
+ p95:989.0ms
+ DuckDB Core: 910.0
+ ±5.0ms/0.5%
+ p95:919.0ms
+--------------------------------------------------------------------------------
+```
+
+---
+
+## 6) Outputs and Examples
+
+Where to find results and how to read them.
+
+- SystemDS-only runner (`scripts/ssb/shell/run_ssb.sh`):
+ - Directory: `scripts/ssb/shell/ssbOutputData/QueryData/ssb_run_<timestamp>/`
+ - Files: `txt/<query>.txt`, `csv/<query>.csv`, `json/<query>.json`, and `run.json`
+ - `run.json` example (stats enabled, single query):
+
+```json
+{
+ "benchmark_type": "ssb_systemds",
+ "timestamp": "2025-09-07 19:45:11 UTC",
+ "hostname": "eduroam-141-23-175-117.wlan.tu-berlin.de",
+ "seed": 849958376,
+ "software_versions": {
+ "systemds": "3.4.0-SNAPSHOT",
+ "jdk": "17.0.15"
+ },
+ "system_resources": {
+ "cpu": "Apple M1 Pro",
+ "ram": "16GB"
+ },
+ "data_build_info": {
+ "customer": "30000",
+ "part": "200000",
+ "supplier": "2000",
+ "date": "2557",
+ "lineorder": "8217"
+ },
+ "run_configuration": {
+ "statistics_enabled": true,
+ "queries_selected": 1,
+ "queries_executed": 1,
+ "queries_failed": 0
+ },
+ "results": [
+ {
+ "query": "q1_1",
+ "result": "687752409 ",
+ "stats": [
+ "SystemDS Statistics:",
+ "Total elapsed time: 1.557 sec.",
+ "Total compilation time: 0.410 sec.",
+ "Total execution time: 1.147 sec.",
+ "Cache hits (Mem/Li/WB/FS/HDFS): 11054/0/0/0/2.",
+ "Cache writes (Li/WB/FS/HDFS): 0/26/3/0.",
+ "Cache times (ACQr/m, RLS, EXP): 0.166/0.001/0.060/0.000 sec.",
+ "HOP DAGs recompiled (PRED, SB): 0/175.",
+ "HOP DAGs recompile time: 0.063 sec.",
+ "Functions recompiled: 2.",
+ "Functions recompile time: 0.016 sec.",
+ "Total JIT compile time: 1.385 sec.",
+ "Total JVM GC count: 1.",
+ "Total JVM GC time: 0.026 sec.",
+ "Heavy hitter instructions:",
+ " # Instruction Time(s) Count",
+ " 1 m_raJoin 0.940 1",
+ " 2 ucumk+ 0.363 3",
+ " 3 - 0.219 1345",
+ " 4 nrow 0.166 7",
+ " 5 ctable 0.086 2",
+ " 6 * 0.078 1",
+ " 7 parallelBinarySearch 0.069 1",
+ " 8 ba+* 0.049 5",
+ " 9 rightIndex 0.016 8611",
+ " 10 leftIndex 0.015 1680"
+ ],
+ "status": "success"
+ }
+ ]
+}
+```
+
+ Notes:
+ - The `result` field contains the query’s output (scalar or tabular content collapsed). When `--stats` is used, `stats` contains the full SystemDS statistics block line-by-line.
+ - For failed queries, an `error_message` string is included and `status` is set to `"error"`.
+
+- Multi-engine runner (`scripts/ssb/shell/run_all_perf.sh`):
+ - Directory: `scripts/ssb/shell/ssbOutputData/PerformanceData/`
+ - Files per run: `ssb_results_<timestamp>.csv` and `ssb_results_<timestamp>.json`
+ - CSV contains display strings and raw numeric stats (mean/stdev/p95) for each engine; JSON contains the same plus metadata and fastest-engine per query.
+ - `ssb_results_*.json` example (stats enabled, single query):
+
+```json
+{
+ "benchmark_metadata": {
+ "benchmark_type": "multi_engine_performance",
+ "timestamp": "2025-09-07 20:11:16 UTC",
+ "hostname": "eduroam-141-23-175-117.wlan.tu-berlin.de",
+ "seed": 578860764,
+ "software_versions": {
+ "systemds": "3.4.0-SNAPSHOT",
+ "jdk": "17.0.15",
+ "postgresql": "psql (PostgreSQL) 17.5",
+ "duckdb": "v1.3.2 (Ossivalis) 0b83e5d2f6"
+ },
+ "system_resources": {
+ "cpu": "Apple M1 Pro",
+ "ram": "16GB"
+ },
+ "data_build_info": {
+ "customer": "30000",
+ "part": "200000",
+ "supplier": "2000",
+ "date": "2557",
+ "lineorder": "8217"
+ },
+ "run_configuration": {
+ "statistics_enabled": true,
+ "queries_selected": 1,
+ "warmup_runs": 1,
+ "repeat_runs": 5
+ }
+ },
+ "results": [
+ {
+ "query": "q1_1",
+ "systemds": {
+ "shell": {
+ "display": "2186.0 (±95.6ms/4.4%, p95:2250.0ms)",
+ "mean_ms": 2186.0,
+ "stdev_ms": 95.6,
+ "p95_ms": 2250.0
+ },
+ "core": {
+ "display": "1151.2 (±115.3ms/10.0%, p95:1334.0ms)",
+ "mean_ms": 1151.2,
+ "stdev_ms": 115.3,
+ "p95_ms": 1334.0
+ },
+ "status": "success",
+ "error_message": null
+ },
+ "postgresql": {
+ "display": "26.0 (±4.9ms/18.8%, p95:30.0ms)",
+ "mean_ms": 26.0,
+ "stdev_ms": 4.9,
+ "p95_ms": 30.0
+ },
+ "postgresql_core": {
+ "display": "3.8 (±1.4ms/36.8%, p95:5.7ms)",
+ "mean_ms": 3.8,
+ "stdev_ms": 1.4,
+ "p95_ms": 5.7
+ },
+ "duckdb": {
+ "display": "30.0 (±0.0ms/0.0%, p95:30.0ms)",
+ "mean_ms": 30.0,
+ "stdev_ms": 0.0,
+ "p95_ms": 30.0
+ },
+ "duckdb_core": {
+ "display": "1.1 (±0.1ms/9.1%, p95:1.3ms)",
+ "mean_ms": 1.1,
+ "stdev_ms": 0.1,
+ "p95_ms": 1.3
+ },
+ "fastest_engine": "PostgreSQL"
+ }
+ ]
+}
+```
+
+ Differences at a glance:
+ - Single-engine `run.json` focuses on query output (`result`) and, when enabled, the SystemDS `stats` array. Status and error handling are per-query.
+ - Multi-engine results JSON focuses on timing statistics for each engine (`shell` vs `core` for SystemDS; `postgresql`/`postgresql_core`; `duckdb`/`duckdb_core`) along with a `fastest_engine` field. It does not include the query’s actual result values.
+
+---
+
+## 7) Adding/Editing Queries
+
+Guidelines for DML in `scripts/ssb/queries/`:
+
+- Name files as `qX_Y.dml` (e.g., `q1_1.dml`). The runners accept `q1.1` on the CLI and map it for you.
+- Always derive paths from the `input_dir` named argument (see Section 3 and the skeleton below).
+- Keep I/O separate from compute where possible (helps early error detection).
+- Add a short header comment with original SQL and intent.
+
+Example header:
+
+```dml
+/*
+ SQL: SELECT ...
+ Description: Revenue per month by supplier region
+*/
+```
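+
+A minimal query skeleton following these conventions (illustrative only; adapt the reads, filters, and aggregation to the query at hand):
+
+```dml
+# -- PARAMETER HANDLING --
+input_dir = ifdef($input_dir, "./data");
+
+# -- READING INPUT FILES --
+lineorder = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+
+# -- COMPUTE (filters, joins, aggregation) --
+# ...
+
+# -- OUTPUT --
+print("RESULT: ...");
+```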
+
+---
+
+## 8) Troubleshooting
+
+- Missing data directory: pass `--input-dir=/path/to/ssb-data` and ensure `*.tbl` files exist.
+- SystemDS not found: build (`mvn -DskipTests package`) and use `./bin/systemds` or ensure `systemds` is on PATH.
+- Query fails with runtime error: the runners mark `status: "error"` and include a short `error_message` in JSON outputs. See console snippet for context.
+- macOS cache dropping: OS caches cannot be dropped as they can on Linux; the multi-engine runner mitigates this with warmup runs and repeated measurements, and reports p95/CV.
+
+If something looks off, attach the relevant `run.json` or `ssb_results_*.json` when filing issues.
+
+- To debug DML runtime errors, run the DML directly:
+
+```bash
+./bin/systemds -f scripts/ssb/queries/q1_1.dml -nvargs input_dir=/path/to/data
+```
+
+- When `--stats` is enabled, SystemDS internal "core" timing is extracted and reported separately (useful for separating JVM/startup overhead from core computation). All of these metrics appear in the generated CSVs and JSON entries.
+
+- Permission errors: `chmod +x scripts/ssb/shell/*.sh`.
diff --git a/scripts/staging/ssb/queries/q1_1.dml b/scripts/staging/ssb/queries/q1_1.dml
new file mode 100644
index 00000000000..eac47b099ee
--- /dev/null
+++ b/scripts/staging/ssb/queries/q1_1.dml
@@ -0,0 +1,91 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/* DML-script implementing the ssb query Q1.1 in SystemDS.
+SELECT SUM(lo_extendedprice * lo_discount) AS REVENUE
+FROM lineorder, dates
+WHERE
+ lo_orderdate = d_datekey
+ AND d_year = 1993
+ AND lo_discount BETWEEN 1 AND 3
+ AND lo_quantity < 25;
+
+Usage:
+./bin/systemds scripts/ssb/queries/q1_1.dml -nvargs input_dir="/path/to/data"
+./bin/systemds scripts/ssb/queries/q1_1.dml -nvargs input_dir="/Users/ghafekalsaho/Desktop/data"
+or with explicit -f flag:
+./bin/systemds -f scripts/ssb/queries/q1_1.dml -nvargs input_dir="/path/to/data"
+
+Parameters:
+input_dir - Path to input directory containing the table files (e.g., ./data)
+*/
+# -- SOURCING THE RA-FUNCTIONS --
+source("./scripts/builtin/raSelection.dml") as raSel
+source("./scripts/builtin/raJoin.dml") as raJoin
+
+# -- PARAMETER HANDLING --
+input_dir = ifdef($input_dir, "./data");
+print("Loading tables from directory: " + input_dir);
+
+# -- READING INPUT FILES --
+# CSV TABLES
+date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+
+
+# -- PREPARING --
+# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : YEAR
+date_csv_min = cbind(date_csv[, 1], date_csv[, 5]);
+date_matrix_min = as.matrix(date_csv_min);
+
+# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-6 : LO_ORDERDATE |
+# COL-9 : LO_QUANTITY | COL-10 : LO_EXTPRICE | COL-12 : LO_DISCOUNT
+lineorder_csv_min = cbind(lineorder_csv[, 6], lineorder_csv[, 9], lineorder_csv[, 10], lineorder_csv[, 12]);
+lineorder_matrix_min = as.matrix(lineorder_csv_min);
+
+
+# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION --
+d_year_filt = raSel::m_raSelection(date_matrix_min, col=2, op="==", val=1993); # D_YEAR = '1993'
+
+# LO_QUANTITY < 25
+lo_quan_filt = raSel::m_raSelection(lineorder_matrix_min, col=2, op="<", val=25);
+
+# LO_DISCOUNT BETWEEN 1 AND 3
+lo_quan_disc_filt = raSel::m_raSelection(lo_quan_filt, col=4, op=">=", val=1);
+lo_quan_disc_filt = raSel::m_raSelection(lo_quan_disc_filt, col=4, op="<=", val=3);
+
+
+# -- JOIN TABLES WITH RA-JOIN FUNCTION --
+# JOINING FILTERED LINEORDER TABLE WITH FILTERED DATE TABLE WHERE LO_ORDERDATE = D_DATEKEY
+joined_matrix = raJoin::m_raJoin(A=lo_quan_disc_filt, colA=1, B=d_year_filt, colB=1, method="sort-merge");
+#print("LO-DATE JOINED.");
+
+
+# -- AGGREGATION --
+lo_extprice = joined_matrix[, 3]; #LO_EXTPRICE : 3 COLUMN OF JOINED-MATRIX
+lo_disc = joined_matrix[, 4]; #LO_DISCOUNT : 4 COLUMN OF JOINED-MATRIX
+revenue = sum(lo_extprice * lo_disc);
+
+print("REVENUE: " + as.integer(revenue));
+
+#print("Q1.1 finished.\n");
+
+
diff --git a/scripts/staging/ssb/queries/q1_2.dml b/scripts/staging/ssb/queries/q1_2.dml
new file mode 100644
index 00000000000..781f108a512
--- /dev/null
+++ b/scripts/staging/ssb/queries/q1_2.dml
@@ -0,0 +1,114 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+/*DML-script implementing the ssb query Q1.2 in SystemDS.
+SELECT SUM(lo_extendedprice * lo_discount) AS REVENUE
+FROM lineorder, dates
+WHERE
+ lo_orderdate = d_datekey
+ AND d_yearmonth = 'Jan1994'
+ AND lo_discount BETWEEN 4 AND 6
+ AND lo_quantity BETWEEN 26 AND 35;
+
+Usage:
+./bin/systemds scripts/ssb/queries/q1_2.dml -nvargs input_dir="/path/to/data"
+./bin/systemds scripts/ssb/queries/q1_2.dml -nvargs input_dir="/Users/ghafekalsaho/Desktop/data"
+or with explicit -f flag:
+./bin/systemds -f scripts/ssb/queries/q1_2.dml -nvargs input_dir="/path/to/data"
+
+Parameters:
+input_dir - Path to input directory containing the table files (e.g., ./data)
+*/
+
+# -- SOURCING THE RA-FUNCTIONS --
+source("./scripts/builtin/raSelection.dml") as raSel
+source("./scripts/builtin/raJoin.dml") as raJoin
+
+# -- PARAMETER HANDLING --
+input_dir = ifdef($input_dir, "./data");
+print("Loading tables from directory: " + input_dir);
+
+# -- READING INPUT FILES --
+# CSV TABLES
+date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+
+# -- PREPARING --
+# Optimized approach: Single-pass filtering with direct matrix construction
+# Convert date key column to numeric matrix for proper handling
+date_keys_matrix = as.matrix(date_csv[, 1]);
+
+# Count Jan1994 rows first to pre-allocate matrix efficiently
+date_nrows = nrow(date_csv);
+jan1994_count = 0;
+for (i in 1:date_nrows) {
+ yearmonth_val = as.scalar(date_csv[i, 7]);
+ if (yearmonth_val == "Jan1994") {
+ jan1994_count = jan1994_count + 1;
+ }
+}
+
+# Pre-allocate final matrix and fill in single pass
+date_filtered = matrix(0, jan1994_count, 2);
+filtered_idx = 0;
+for (i in 1:date_nrows) {
+ yearmonth_val = as.scalar(date_csv[i, 7]);
+ if (yearmonth_val == "Jan1994") {
+ filtered_idx = filtered_idx + 1;
+ date_filtered[filtered_idx, 1] = as.scalar(date_keys_matrix[i, 1]); # date_key
+ date_filtered[filtered_idx, 2] = 1; # encoded value for Jan1994
+ }
+}
+
+# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-6 : LO_ORDERDATE |
+# COL-9 : LO_QUANTITY | COL-10 : LO_EXTPRICE | COL-12 : LO_DISCOUNT
+lineorder_csv_min = cbind(lineorder_csv[, 6], lineorder_csv[, 9], lineorder_csv[, 10], lineorder_csv[, 12]);
+lineorder_min_matrix = as.matrix(lineorder_csv_min);
+
+
+# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION --
+# We already filtered for D_YEARMONTH = 'Jan1994', so d_year_filt is our filtered date data
+d_year_filt = date_filtered;
+
+# LO_QUANTITY BETWEEN 26 AND 35
+lo_quan_filt = raSel::m_raSelection(lineorder_min_matrix, col=2, op=">=", val=26);
+lo_quan_filt = raSel::m_raSelection(lo_quan_filt, col=2, op="<=", val=35);
+
+# LO_DISCOUNT BETWEEN 4 AND 6
+lo_quan_disc_filt = raSel::m_raSelection(lo_quan_filt, col=4, op=">=", val=4);
+lo_quan_disc_filt = raSel::m_raSelection(lo_quan_disc_filt, col=4, op="<=", val=6);
+
+
+# -- JOIN TABLES WITH RA-JOIN FUNCTION --
+# JOINING FILTERED LINEORDER TABLE WITH FILTERED DATE TABLE WHERE LO_ORDERDATE = D_DATEKEY
+joined_matrix = raJoin::m_raJoin(A=lo_quan_disc_filt, colA=1, B=d_year_filt, colB=1, method="sort-merge");
+#print("LO-DATE JOINED.");
+
+
+# -- AGGREGATION --
+lo_extprice = joined_matrix[, 3]; #LO_EXTPRICE : 3 COLUMN OF JOINED-MATRIX
+lo_disc = joined_matrix[, 4]; #LO_DISCOUNT : 4 COLUMN OF JOINED-MATRIX
+revenue = sum(lo_extprice * lo_disc);
+
+print("REVENUE: " + as.integer(revenue));
+
+#print("Q1.2 finished.\n");
\ No newline at end of file
diff --git a/scripts/staging/ssb/queries/q1_3.dml b/scripts/staging/ssb/queries/q1_3.dml
new file mode 100644
index 00000000000..cd9ba0b8746
--- /dev/null
+++ b/scripts/staging/ssb/queries/q1_3.dml
@@ -0,0 +1,115 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+/*DML-script implementing the ssb query Q1.3 in SystemDS.
+SELECT SUM(lo_extendedprice * lo_discount) AS REVENUE
+FROM lineorder, dates
+WHERE
+ lo_orderdate = d_datekey
+ AND d_weeknuminyear = 6
+ AND d_year = 1994
+ AND lo_discount BETWEEN 5 AND 7
+ AND lo_quantity BETWEEN 26 AND 35;
+
+Usage:
+./bin/systemds scripts/ssb/queries/q1_3.dml -nvargs input_dir="/path/to/data"
+./bin/systemds scripts/ssb/queries/q1_3.dml -nvargs input_dir="/Users/ghafekalsaho/Desktop/data"
+or with explicit -f flag:
+./bin/systemds -f scripts/ssb/queries/q1_3.dml -nvargs input_dir="/path/to/data"
+
+Parameters:
+input_dir - Path to input directory containing the table files (e.g., ./data)
+*/
+
+
+# -- SOURCING THE RA-FUNCTIONS --
+source("./scripts/builtin/raSelection.dml") as raSel
+source("./scripts/builtin/raJoin.dml") as raJoin
+
+# -- PARAMETER HANDLING --
+input_dir = ifdef($input_dir, "./data");
+
+# -- READING INPUT FILES --
+# CSV TABLES
+date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+
+# -- PREPARING --
+# Optimized approach: Two-pass filtering with direct matrix construction
+# Convert date columns to numeric matrices for proper handling
+date_keys_matrix = as.matrix(date_csv[, 1]); # date_key
+date_year_matrix = as.matrix(date_csv[, 5]); # d_year
+date_weeknum_matrix = as.matrix(date_csv[, 12]); # d_weeknuminyear
+
+# Count matching rows first to pre-allocate matrix efficiently
+date_nrows = nrow(date_csv);
+matching_count = 0;
+for (i in 1:date_nrows) {
+ year_val = as.scalar(date_year_matrix[i, 1]);
+ weeknum_val = as.scalar(date_weeknum_matrix[i, 1]);
+ if (year_val == 1994 && weeknum_val == 6) {
+ matching_count = matching_count + 1;
+ }
+}
+
+# Pre-allocate final matrix and fill in single pass
+date_filtered = matrix(0, matching_count, 2);
+filtered_idx = 0;
+for (i in 1:date_nrows) {
+ year_val = as.scalar(date_year_matrix[i, 1]);
+ weeknum_val = as.scalar(date_weeknum_matrix[i, 1]);
+ if (year_val == 1994 && weeknum_val == 6) {
+ filtered_idx = filtered_idx + 1;
+ date_filtered[filtered_idx, 1] = as.scalar(date_keys_matrix[i, 1]); # date_key
+ date_filtered[filtered_idx, 2] = 1; # encoded value for matching criteria
+ }
+}
+
+# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-6 : LO_ORDERDATE |
+# COL-9 : LO_QUANTITY | COL-10 : LO_EXTPRICE | COL-12 : LO_DISCOUNT
+lineorder_csv_min = cbind(lineorder_csv[, 6], lineorder_csv[, 9], lineorder_csv[, 10], lineorder_csv[, 12]);
+lineorder_min_matrix = as.matrix(lineorder_csv_min);
+
+# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION --
+# We already filtered for D_YEAR = 1994 AND D_WEEKNUMINYEAR = 6, so date_filtered is our filtered date data
+d_year_filt = date_filtered;
+
+# LO_QUANTITY BETWEEN 26 AND 35
+lo_quan_filt = raSel::m_raSelection(lineorder_min_matrix, col=2, op=">=", val=26);
+lo_quan_filt = raSel::m_raSelection(lo_quan_filt, col=2, op="<=", val=35);
+
+# LO_DISCOUNT BETWEEN 5 AND 7 (FIXED: was incorrectly >=6)
+lo_quan_disc_filt = raSel::m_raSelection(lo_quan_filt, col=4, op=">=", val=5);
+lo_quan_disc_filt = raSel::m_raSelection(lo_quan_disc_filt, col=4, op="<=", val=7);
+
+
+# -- JOIN TABLES WITH RA-JOIN FUNCTION --
+# JOINING FILTERED LINEORDER TABLE WITH FILTERED DATE TABLE WHERE LO_ORDERDATE = D_DATEKEY
+joined_matrix = raJoin::m_raJoin(A=lo_quan_disc_filt, colA=1, B=d_year_filt, colB=1, method="sort-merge");
+
+
+# -- AGGREGATION --
+lo_extprice = joined_matrix[, 3]; #LO_EXTPRICE : 3 COLUMN OF JOINED-MATRIX
+lo_disc = joined_matrix[, 4]; #LO_DISCOUNT : 4 COLUMN OF JOINED-MATRIX
+revenue = sum(lo_extprice * lo_disc);
+
+print("REVENUE: " + as.integer(revenue));
\ No newline at end of file
diff --git a/scripts/staging/ssb/queries/q2_1.dml b/scripts/staging/ssb/queries/q2_1.dml
new file mode 100644
index 00000000000..e35d66f9f31
--- /dev/null
+++ b/scripts/staging/ssb/queries/q2_1.dml
@@ -0,0 +1,325 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+/*DML-script implementing the ssb query Q2.1 in SystemDS.
+SELECT SUM(lo_revenue), d_year, p_brand
+FROM lineorder, dates, part, supplier
+WHERE
+ lo_orderdate = d_datekey
+ AND lo_partkey = p_partkey
+ AND lo_suppkey = s_suppkey
+ AND p_category = 'MFGR#12'
+ AND s_region = 'AMERICA'
+GROUP BY d_year, p_brand
+ORDER BY p_brand;
+
+Usage:
+./bin/systemds scripts/ssb/queries/q2_1.dml -nvargs input_dir="/path/to/data"
+./bin/systemds scripts/ssb/queries/q2_1.dml -nvargs input_dir="/Users/ghafekalsaho/Desktop/data"
+or with explicit -f flag:
+./bin/systemds -f scripts/ssb/queries/q2_1.dml -nvargs input_dir="/path/to/data"
+
+Parameters:
+input_dir - Path to input directory containing the table files (e.g., ./data)
+*/
+
+# -- SOURCING THE RA-FUNCTIONS --
+source("./scripts/builtin/raSelection.dml") as raSel
+source("./scripts/builtin/raJoin.dml") as raJoin
+source("./scripts/builtin/raGroupby.dml") as raGrp
+
+# -- PARAMETER HANDLING --
+input_dir = ifdef($input_dir, "./data");
+
+# -- READING INPUT FILES --
+# CSV TABLES
+date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+part_csv = read(input_dir + "/part.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+
+# -- PREPARING --
+# Optimized approach: On-the-fly filtering with direct matrix construction for string fields
+
+# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : D_YEAR
+date_csv_min = cbind(date_csv[, 1], date_csv[, 5]);
+date_matrix_min = as.matrix(date_csv_min);
+
+# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-4 : LO_PARTKEY | COL-5 : LO_SUPPKEY |
+# COL-6 : LO_ORDERDATE | COL-13 : LO_REVENUE
+lineorder_csv_min = cbind(lineorder_csv[, 4], lineorder_csv[, 5], lineorder_csv[, 6], lineorder_csv[, 13]);
+lineorder_matrix_min = as.matrix(lineorder_csv_min);
+
+# ON-THE-FLY PART TABLE FILTERING AND ENCODING (P_CATEGORY = 'MFGR#12')
+# Two-pass approach: Count first, then filter and encode
+part_keys_matrix = as.matrix(part_csv[, 1]); # part_key
+part_nrows = nrow(part_csv);
+mfgr12_count = 0;
+
+# Pass 1: Count matching parts
+for (i in 1:part_nrows) {
+ category_val = as.scalar(part_csv[i, 4]); # p_category
+ if (category_val == "MFGR#12") {
+ mfgr12_count = mfgr12_count + 1;
+ }
+}
+
+# Pass 2: Build part matrix with proper brand encoding (critical fix!)
+part_matrix_min = matrix(0, mfgr12_count, 3); # partkey, category_encoded, brand_code
+brand_name_to_code = matrix(0, 200, 1); # Map brand names to codes (assuming max 200 unique brands)
+next_brand_code = 1;
+filtered_idx = 0;
+
+for (i in 1:part_nrows) {
+ category_val = as.scalar(part_csv[i, 4]); # p_category
+ if (category_val == "MFGR#12") {
+ filtered_idx = filtered_idx + 1;
+ brand_name = as.scalar(part_csv[i, 5]); # p_brand1
+
+ # Find existing brand code or create new one
+ brand_code = 0;
+
+ # Simple hash-like approach: use first few characters to create a simple numeric code
+ # This avoids string comparison issues while ensuring same brand gets same code
+ brand_hash = 0;
+ if (brand_name == "MFGR#121") brand_hash = 121;
+ else if (brand_name == "MFGR#122") brand_hash = 122;
+ else if (brand_name == "MFGR#123") brand_hash = 123;
+ else if (brand_name == "MFGR#124") brand_hash = 124;
+ else if (brand_name == "MFGR#125") brand_hash = 125;
+ else if (brand_name == "MFGR#127") brand_hash = 127;
+ else if (brand_name == "MFGR#128") brand_hash = 128;
+ else if (brand_name == "MFGR#129") brand_hash = 129;
+ else if (brand_name == "MFGR#1211") brand_hash = 1211;
+ else if (brand_name == "MFGR#1212") brand_hash = 1212;
+ else if (brand_name == "MFGR#1213") brand_hash = 1213;
+ else if (brand_name == "MFGR#1214") brand_hash = 1214;
+ else if (brand_name == "MFGR#1215") brand_hash = 1215;
+ else if (brand_name == "MFGR#1216") brand_hash = 1216;
+ else if (brand_name == "MFGR#1217") brand_hash = 1217;
+ else if (brand_name == "MFGR#1218") brand_hash = 1218;
+ else if (brand_name == "MFGR#1219") brand_hash = 1219;
+ else if (brand_name == "MFGR#1220") brand_hash = 1220;
+ else if (brand_name == "MFGR#1221") brand_hash = 1221;
+ else if (brand_name == "MFGR#1222") brand_hash = 1222;
+ else if (brand_name == "MFGR#1224") brand_hash = 1224;
+ else if (brand_name == "MFGR#1225") brand_hash = 1225;
+ else if (brand_name == "MFGR#1226") brand_hash = 1226;
+ else if (brand_name == "MFGR#1228") brand_hash = 1228;
+ else if (brand_name == "MFGR#1229") brand_hash = 1229;
+ else if (brand_name == "MFGR#1230") brand_hash = 1230;
+ else if (brand_name == "MFGR#1231") brand_hash = 1231;
+ else if (brand_name == "MFGR#1232") brand_hash = 1232;
+ else if (brand_name == "MFGR#1233") brand_hash = 1233;
+ else if (brand_name == "MFGR#1234") brand_hash = 1234;
+ else if (brand_name == "MFGR#1235") brand_hash = 1235;
+ else if (brand_name == "MFGR#1236") brand_hash = 1236;
+ else if (brand_name == "MFGR#1237") brand_hash = 1237;
+ else if (brand_name == "MFGR#1238") brand_hash = 1238;
+ else if (brand_name == "MFGR#1240") brand_hash = 1240;
+ else brand_hash = next_brand_code; # fallback for unknown brands
+
+ brand_code = brand_hash;
+
+ part_matrix_min[filtered_idx, 1] = as.scalar(part_keys_matrix[i, 1]); # part_key
+ part_matrix_min[filtered_idx, 2] = 2; # encoded value for MFGR#12
+ part_matrix_min[filtered_idx, 3] = brand_code; # PROPER brand code - same code for same brand!
+ }
+}
+
+# ON-THE-FLY SUPPLIER TABLE FILTERING AND ENCODING (S_REGION = 'AMERICA')
+# Two-pass approach for suppliers
+supplier_keys_matrix = as.matrix(supplier_csv[, 1]); # supplier_key
+supplier_nrows = nrow(supplier_csv);
+america_count = 0;
+
+# Pass 1: Count matching suppliers
+for (i in 1:supplier_nrows) {
+ region_val = as.scalar(supplier_csv[i, 6]); # s_region
+ if (region_val == "AMERICA") {
+ america_count = america_count + 1;
+ }
+}
+
+# Pass 2: Build supplier matrix
+sup_matrix_min = matrix(0, america_count, 2); # suppkey, region_encoded
+filtered_idx = 0;
+for (i in 1:supplier_nrows) {
+ region_val = as.scalar(supplier_csv[i, 6]); # s_region
+ if (region_val == "AMERICA") {
+ filtered_idx = filtered_idx + 1;
+ sup_matrix_min[filtered_idx, 1] = as.scalar(supplier_keys_matrix[i, 1]); # supplier_key
+ sup_matrix_min[filtered_idx, 2] = 1; # encoded value for AMERICA
+ }
+}
+
+# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION --
+# We already filtered for P_CATEGORY = 'MFGR#12' and S_REGION = 'AMERICA' during matrix construction
+# P_CATEGORY = 'MFGR#12' : 2 (Our encoded value)
+p_cat_filt = raSel::m_raSelection(part_matrix_min, col=2, op="==", val=2);
+
+# S_REGION = 'AMERICA' : 1 (Our encoded value)
+s_reg_filt = raSel::m_raSelection(sup_matrix_min, col=2, op="==", val=1);
+
+# -- JOIN TABLES WITH RA-JOIN FUNCTION --
+# JOINING MINIMIZED LINEORDER TABLE WITH FILTERED PART TABLE WHERE LO_PARTKEY = P_PARTKEY
+lo_part = raJoin::m_raJoin(A=lineorder_matrix_min, colA=1, B=p_cat_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ SUPPLIER WHERE LO_SUPPKEY = S_SUPPKEY
+lo_part_sup = raJoin::m_raJoin(A=lo_part, colA=2, B=s_reg_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ DATE WHERE LO_ORDERDATE = D_DATEKEY
+joined_matrix = raJoin::m_raJoin(A=lo_part_sup, colA=3, B=date_matrix_min, colB=1, method="sort-merge");
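+# After the three joins each row is laid out as [lineorder cols | part cols | supplier cols | date cols];
+# the column offsets used below rely on this ordering.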
+
+# -- GROUP-BY & AGGREGATION --
+# LO_REVENUE : COLUMN 4 OF LINEORDER-MIN-MATRIX
+revenue = joined_matrix[, 4];
+# D_YEAR : COLUMN 2 OF DATE-MIN-MATRIX
+d_year = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(part_matrix_min) + ncol(sup_matrix_min) + 2)];
+# P_BRAND : COLUMN 3 OF PART-MIN-MATRIX
+p_brand = joined_matrix[,(ncol(lineorder_matrix_min) + 3)];
+
+max_p_brand = max(p_brand);
+p_brand_scale_f = ceil(max_p_brand) + 1;
+
+combined_key = d_year * p_brand_scale_f + p_brand;
+
+group_input = cbind(revenue, combined_key);
+agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop");
+
+gr_key = agg_result[, 1];
+revenue = rowSums(agg_result[, 2:ncol(agg_result)]);
+
+p_brand = round(gr_key %% p_brand_scale_f);
+d_year = round((gr_key - p_brand) / p_brand_scale_f);
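+
+# Sanity sketch of the key round trip (illustrative only; the ex_* scalars are not used below):
+# because p_brand_scale_f exceeds every brand code, encoding a (year, brand) pair and decoding
+# it again recovers both fields exactly.
+ex_key = 1994 * p_brand_scale_f + max_p_brand;
+ex_brand = round(ex_key %% p_brand_scale_f);             # == max_p_brand
+ex_year = round((ex_key - ex_brand) / p_brand_scale_f);  # == 1994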
+
+result = cbind(revenue, d_year, p_brand);
+
+result_ordered = order(target=result, by=3, decreasing=FALSE, index.return=FALSE); # 3 : P_BRAND
+result_ordered = order(target=result_ordered, by=2, decreasing=FALSE, index.return=FALSE); # 2 : D_YEAR
+
+print("Processing " + nrow(result_ordered) + " result rows...");
+
+# Decode brand codes back to brand names directly at print time (no intermediate string frame).
+print("Q2.1 Results with brand names:");
+
+# Output results with the inline code-to-name lookup below
+for (i in 1:nrow(result_ordered)) {
+ revenue_val = as.scalar(result_ordered[i, 1]);
+ year_val = as.scalar(result_ordered[i, 2]);
+ brand_code = as.scalar(result_ordered[i, 3]);
+
+ # Map brand code back to brand name
+ brand_name = "UNKNOWN";
+
+ # Reverse mapping from code to name
+ if (brand_code == 121) brand_name = "MFGR#121";
+ else if (brand_code == 122) brand_name = "MFGR#122";
+ else if (brand_code == 123) brand_name = "MFGR#123";
+ else if (brand_code == 124) brand_name = "MFGR#124";
+ else if (brand_code == 125) brand_name = "MFGR#125";
+ else if (brand_code == 127) brand_name = "MFGR#127";
+ else if (brand_code == 128) brand_name = "MFGR#128";
+ else if (brand_code == 129) brand_name = "MFGR#129";
+ else if (brand_code == 1211) brand_name = "MFGR#1211";
+ else if (brand_code == 1212) brand_name = "MFGR#1212";
+ else if (brand_code == 1213) brand_name = "MFGR#1213";
+ else if (brand_code == 1214) brand_name = "MFGR#1214";
+ else if (brand_code == 1215) brand_name = "MFGR#1215";
+ else if (brand_code == 1216) brand_name = "MFGR#1216";
+ else if (brand_code == 1217) brand_name = "MFGR#1217";
+ else if (brand_code == 1218) brand_name = "MFGR#1218";
+ else if (brand_code == 1219) brand_name = "MFGR#1219";
+ else if (brand_code == 1220) brand_name = "MFGR#1220";
+ else if (brand_code == 1221) brand_name = "MFGR#1221";
+ else if (brand_code == 1222) brand_name = "MFGR#1222";
+ else if (brand_code == 1224) brand_name = "MFGR#1224";
+ else if (brand_code == 1225) brand_name = "MFGR#1225";
+ else if (brand_code == 1226) brand_name = "MFGR#1226";
+ else if (brand_code == 1228) brand_name = "MFGR#1228";
+ else if (brand_code == 1229) brand_name = "MFGR#1229";
+ else if (brand_code == 1230) brand_name = "MFGR#1230";
+ else if (brand_code == 1231) brand_name = "MFGR#1231";
+ else if (brand_code == 1232) brand_name = "MFGR#1232";
+ else if (brand_code == 1233) brand_name = "MFGR#1233";
+ else if (brand_code == 1234) brand_name = "MFGR#1234";
+ else if (brand_code == 1235) brand_name = "MFGR#1235";
+ else if (brand_code == 1236) brand_name = "MFGR#1236";
+ else if (brand_code == 1237) brand_name = "MFGR#1237";
+ else if (brand_code == 1238) brand_name = "MFGR#1238";
+ else if (brand_code == 1240) brand_name = "MFGR#1240";
+
+ # Output in exact previous format
+ print(revenue_val + ".000 " + year_val + ".000 " + brand_name);
+}
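+
+# Possible simplification (not applied, to keep the mapping explicit): in this query the brand
+# code is exactly the numeric suffix of the brand string, so both reverse-lookup chains could be
+# collapsed to:
+#   brand_name = "MFGR#" + as.integer(brand_code);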
+
+# Frame format output
+print("");
+print("# FRAME: nrow = " + nrow(result_ordered) + ", ncol = 3");
+print("# C1 C2 C3");
+print("# INT32 INT32 STRING");
+
+for (i in 1:nrow(result_ordered)) {
+ revenue_val = as.scalar(result_ordered[i, 1]);
+ year_val = as.scalar(result_ordered[i, 2]);
+ brand_code = as.scalar(result_ordered[i, 3]);
+
+ # Same brand code mapping for frame output
+ brand_name = "UNKNOWN";
+
+ if (brand_code == 121) brand_name = "MFGR#121";
+ else if (brand_code == 122) brand_name = "MFGR#122";
+ else if (brand_code == 123) brand_name = "MFGR#123";
+ else if (brand_code == 124) brand_name = "MFGR#124";
+ else if (brand_code == 125) brand_name = "MFGR#125";
+ else if (brand_code == 127) brand_name = "MFGR#127";
+ else if (brand_code == 128) brand_name = "MFGR#128";
+ else if (brand_code == 129) brand_name = "MFGR#129";
+ else if (brand_code == 1211) brand_name = "MFGR#1211";
+ else if (brand_code == 1212) brand_name = "MFGR#1212";
+ else if (brand_code == 1213) brand_name = "MFGR#1213";
+ else if (brand_code == 1214) brand_name = "MFGR#1214";
+ else if (brand_code == 1215) brand_name = "MFGR#1215";
+ else if (brand_code == 1216) brand_name = "MFGR#1216";
+ else if (brand_code == 1217) brand_name = "MFGR#1217";
+ else if (brand_code == 1218) brand_name = "MFGR#1218";
+ else if (brand_code == 1219) brand_name = "MFGR#1219";
+ else if (brand_code == 1220) brand_name = "MFGR#1220";
+ else if (brand_code == 1221) brand_name = "MFGR#1221";
+ else if (brand_code == 1222) brand_name = "MFGR#1222";
+ else if (brand_code == 1224) brand_name = "MFGR#1224";
+ else if (brand_code == 1225) brand_name = "MFGR#1225";
+ else if (brand_code == 1226) brand_name = "MFGR#1226";
+ else if (brand_code == 1228) brand_name = "MFGR#1228";
+ else if (brand_code == 1229) brand_name = "MFGR#1229";
+ else if (brand_code == 1230) brand_name = "MFGR#1230";
+ else if (brand_code == 1231) brand_name = "MFGR#1231";
+ else if (brand_code == 1232) brand_name = "MFGR#1232";
+ else if (brand_code == 1233) brand_name = "MFGR#1233";
+ else if (brand_code == 1234) brand_name = "MFGR#1234";
+ else if (brand_code == 1235) brand_name = "MFGR#1235";
+ else if (brand_code == 1236) brand_name = "MFGR#1236";
+ else if (brand_code == 1237) brand_name = "MFGR#1237";
+ else if (brand_code == 1238) brand_name = "MFGR#1238";
+ else if (brand_code == 1240) brand_name = "MFGR#1240";
+
+ print(as.integer(revenue_val) + " " + as.integer(year_val) + " " + brand_name);
+}
\ No newline at end of file
diff --git a/scripts/staging/ssb/queries/q2_2.dml b/scripts/staging/ssb/queries/q2_2.dml
new file mode 100644
index 00000000000..5b477979785
--- /dev/null
+++ b/scripts/staging/ssb/queries/q2_2.dml
@@ -0,0 +1,246 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+/* DML script implementing the SSB query Q2.2 in SystemDS.
+SELECT SUM(lo_revenue), d_year, p_brand
+FROM lineorder, dates, part, supplier
+WHERE
+ lo_orderdate = d_datekey
+ AND lo_partkey = p_partkey
+ AND lo_suppkey = s_suppkey
+ AND p_brand BETWEEN 'MFGR#2221' AND 'MFGR#2228'
+ AND s_region = 'ASIA'
+GROUP BY d_year, p_brand
+ORDER BY d_year, p_brand;
+
+Usage:
+./bin/systemds scripts/ssb/queries/q2_2.dml -nvargs input_dir="/path/to/data"
+./bin/systemds scripts/ssb/queries/q2_2.dml -nvargs input_dir="/Users/ghafekalsaho/Desktop/data"
+or with explicit -f flag:
+./bin/systemds -f scripts/ssb/queries/q2_2.dml -nvargs input_dir="/path/to/data"
+
+Parameters:
+input_dir - Path to input directory containing the table files (e.g., ./data)
+*/
+
+# -- SOURCING THE RA-FUNCTIONS --
+source("./scripts/builtin/raSelection.dml") as raSel
+source("./scripts/builtin/raJoin.dml") as raJoin
+source("./scripts/builtin/raGroupby.dml") as raGrp
+
+# -- PARAMETER HANDLING --
+input_dir = ifdef($input_dir, "./data");
+
+# -- READING INPUT FILES --
+# CSV TABLES
+date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+part_csv = read(input_dir + "/part.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+
+# -- PREPARING --
+# Optimized approach: On-the-fly filtering with direct matrix construction for string fields
+
+# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : D_YEAR
+date_csv_min = cbind(date_csv[, 1], date_csv[, 5]);
+date_matrix_min = as.matrix(date_csv_min);
+
+# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-4 : LO_PARTKEY | COL-5 : LO_SUPPKEY |
+# COL-6 : LO_ORDERDATE | COL-13 : LO_REVENUE
+lineorder_csv_min = cbind(lineorder_csv[, 4], lineorder_csv[, 5], lineorder_csv[, 6], lineorder_csv[, 13]);
+lineorder_matrix_min = as.matrix(lineorder_csv_min);
+
+# ON-THE-FLY PART TABLE FILTERING AND ENCODING (P_BRAND BETWEEN 'MFGR#2221' AND 'MFGR#2228')
+# Two-pass approach: Count first, then filter and encode
+part_keys_matrix = as.matrix(part_csv[, 1]); # part_key
+part_nrows = nrow(part_csv);
+valid_brands_count = 0;
+
+# Pass 1: Count matching parts (brands between MFGR#2221 and MFGR#2228)
+for (i in 1:part_nrows) {
+ brand_val = as.scalar(part_csv[i, 5]); # p_brand
+ if (brand_val >= "MFGR#2221" & brand_val <= "MFGR#2228") {
+ valid_brands_count = valid_brands_count + 1;
+ }
+}
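+
+# Note: the range predicate is evaluated lexicographically on the brand strings; for the standard
+# SSB brand naming this should select exactly the brands MFGR#2221 .. MFGR#2228, since they share
+# the same length and prefix and no other brand string falls inside the range.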
+
+# Pass 2: Build part matrix with proper brand encoding
+part_matrix_min = matrix(0, valid_brands_count, 2); # partkey, brand_code
+filtered_idx = 0;
+
+for (i in 1:part_nrows) {
+ brand_val = as.scalar(part_csv[i, 5]); # p_brand
+ if (brand_val >= "MFGR#2221" & brand_val <= "MFGR#2228") {
+ filtered_idx = filtered_idx + 1;
+
+ # Encode brand names to numeric codes for efficient processing (using original metadata codes)
+ brand_code = 0;
+ if (brand_val == "MFGR#2221") brand_code = 453;
+ else if (brand_val == "MFGR#2222") brand_code = 597;
+ else if (brand_val == "MFGR#2223") brand_code = 907;
+ else if (brand_val == "MFGR#2224") brand_code = 282;
+ else if (brand_val == "MFGR#2225") brand_code = 850;
+ else if (brand_val == "MFGR#2226") brand_code = 525;
+ else if (brand_val == "MFGR#2227") brand_code = 538;
+ else if (brand_val == "MFGR#2228") brand_code = 608;
+ else brand_code = 9999; # fallback for unknown brands in range
+
+ part_matrix_min[filtered_idx, 1] = as.scalar(part_keys_matrix[i, 1]); # part_key
+ part_matrix_min[filtered_idx, 2] = brand_code; # brand code
+ }
+}
+
+# ON-THE-FLY SUPPLIER TABLE FILTERING AND ENCODING (S_REGION = 'ASIA')
+# Two-pass approach for suppliers
+supplier_keys_matrix = as.matrix(supplier_csv[, 1]); # supplier_key
+supplier_nrows = nrow(supplier_csv);
+asia_count = 0;
+
+# Pass 1: Count matching suppliers
+for (i in 1:supplier_nrows) {
+ region_val = as.scalar(supplier_csv[i, 6]); # s_region
+ if (region_val == "ASIA") {
+ asia_count = asia_count + 1;
+ }
+}
+
+# Pass 2: Build supplier matrix
+sup_matrix_min = matrix(0, asia_count, 2); # suppkey, region_encoded
+filtered_idx = 0;
+for (i in 1:supplier_nrows) {
+ region_val = as.scalar(supplier_csv[i, 6]); # s_region
+ if (region_val == "ASIA") {
+ filtered_idx = filtered_idx + 1;
+ sup_matrix_min[filtered_idx, 1] = as.scalar(supplier_keys_matrix[i, 1]); # supplier_key
+ sup_matrix_min[filtered_idx, 2] = 5; # encoded value for ASIA
+ }
+}
+
+# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION --
+# Parts were already restricted to brands between MFGR#2221 and MFGR#2228 during matrix
+# construction, so no further RA selection is needed for the part table.
+p_brand_filt = part_matrix_min;
+
+# S_REGION = 'ASIA' : 5 (Our encoded value)
+s_reg_filt = raSel::m_raSelection(sup_matrix_min, col=2, op="==", val=5);
+
+# -- JOIN TABLES WITH RA-JOIN FUNCTION --
+# JOINING MINIMIZED LINEORDER TABLE WITH FILTERED PART TABLE WHERE LO_PARTKEY = P_PARTKEY
+lo_part = raJoin::m_raJoin(A=lineorder_matrix_min, colA=1, B=p_brand_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ SUPPLIER WHERE LO_SUPPKEY = S_SUPPKEY
+lo_part_sup = raJoin::m_raJoin(A=lo_part, colA=2, B=s_reg_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ DATE WHERE LO_ORDERDATE = D_DATEKEY
+joined_matrix = raJoin::m_raJoin(A=lo_part_sup, colA=3, B=date_matrix_min, colB=1, method="sort-merge");
+
+# -- GROUP-BY & AGGREGATION --
+# LO_REVENUE : COLUMN 4 OF LINEORDER-MIN-MATRIX
+revenue = joined_matrix[, 4];
+# D_YEAR : COLUMN 2 OF DATE-MIN-MATRIX
+d_year = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(part_matrix_min) + ncol(sup_matrix_min) + 2)];
+# P_BRAND : COLUMN 2 OF PART-MIN-MATRIX
+p_brand = joined_matrix[,(ncol(lineorder_matrix_min) + 2)];
+
+max_p_brand = max(p_brand);
+p_brand_scale_f = ceil(max_p_brand) + 1;
+
+combined_key = d_year * p_brand_scale_f + p_brand;
+
+group_input = cbind(revenue, combined_key);
+agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop");
+
+gr_key = agg_result[, 1];
+revenue = rowSums(agg_result[, 2:ncol(agg_result)]);
+
+p_brand = round(gr_key %% p_brand_scale_f);
+d_year = round((gr_key - p_brand) / p_brand_scale_f);
+
+result = cbind(revenue, d_year, p_brand);
+
+result_ordered = order(target=result, by=3, decreasing=FALSE, index.return=FALSE); # 3 : P_BRAND
+result_ordered = order(target=result_ordered, by=2, decreasing=FALSE, index.return=FALSE); # D_YEAR
+
+print("Processing " + nrow(result_ordered) + " result rows...");
+
+# Output results with brand codes (matching original format)
+print("Q2.2 Results with brand codes:");
+
+for (i in 1:nrow(result_ordered)) {
+ revenue_val = as.scalar(result_ordered[i, 1]);
+ year_val = as.scalar(result_ordered[i, 2]);
+ brand_code = as.scalar(result_ordered[i, 3]);
+
+ # Output in original format with brand codes
+ print(revenue_val + ".000 " + year_val + ".000 " + brand_code + ".000");
+}
+
+# Calculate and print total revenue
+total_revenue = sum(result_ordered[, 1]);
+print("");
+print("REVENUE: " + as.integer(total_revenue));
+print("");
+
+for (i in 1:nrow(result_ordered)) {
+ revenue_val = as.scalar(result_ordered[i, 1]);
+ year_val = as.scalar(result_ordered[i, 2]);
+ brand_code = as.scalar(result_ordered[i, 3]);
+
+ # Map brand code back to brand name (using original metadata codes)
+ brand_name = "UNKNOWN";
+ if (brand_code == 453) brand_name = "MFGR#2221";
+ else if (brand_code == 597) brand_name = "MFGR#2222";
+ else if (brand_code == 907) brand_name = "MFGR#2223";
+ else if (brand_code == 282) brand_name = "MFGR#2224";
+ else if (brand_code == 850) brand_name = "MFGR#2225";
+ else if (brand_code == 525) brand_name = "MFGR#2226";
+ else if (brand_code == 538) brand_name = "MFGR#2227";
+ else if (brand_code == 608) brand_name = "MFGR#2228";
+
+ # Output in consistent format
+ print(revenue_val + ".000 " + year_val + ".000 " + brand_name);
+}
+
+# Frame format output
+print("");
+print("# FRAME: nrow = " + nrow(result_ordered) + ", ncol = 3");
+print("# C1 C2 C3");
+print("# INT32 INT32 STRING");
+
+for (i in 1:nrow(result_ordered)) {
+ revenue_val = as.scalar(result_ordered[i, 1]);
+ year_val = as.scalar(result_ordered[i, 2]);
+ brand_code = as.scalar(result_ordered[i, 3]);
+
+ # Same brand code mapping for frame output (using original metadata codes)
+ brand_name = "UNKNOWN";
+ if (brand_code == 453) brand_name = "MFGR#2221";
+ else if (brand_code == 597) brand_name = "MFGR#2222";
+ else if (brand_code == 907) brand_name = "MFGR#2223";
+ else if (brand_code == 282) brand_name = "MFGR#2224";
+ else if (brand_code == 850) brand_name = "MFGR#2225";
+ else if (brand_code == 525) brand_name = "MFGR#2226";
+ else if (brand_code == 538) brand_name = "MFGR#2227";
+ else if (brand_code == 608) brand_name = "MFGR#2228";
+
+ print(as.integer(revenue_val) + " " + as.integer(year_val) + " " + brand_name);
+}
diff --git a/scripts/staging/ssb/queries/q2_3.dml b/scripts/staging/ssb/queries/q2_3.dml
new file mode 100644
index 00000000000..8657079e8a1
--- /dev/null
+++ b/scripts/staging/ssb/queries/q2_3.dml
@@ -0,0 +1,221 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+/* DML script implementing the SSB query Q2.3 in SystemDS.
+SELECT SUM(lo_revenue), d_year, p_brand
+FROM lineorder, dates, part, supplier
+WHERE
+ lo_orderdate = d_datekey
+ AND lo_partkey = p_partkey
+ AND lo_suppkey = s_suppkey
+ AND p_brand = 'MFGR#2239'
+ AND s_region = 'EUROPE'
+GROUP BY d_year, p_brand
+ORDER BY d_year, p_brand;
+
+Usage:
+./bin/systemds scripts/ssb/queries/q2_3.dml -nvargs input_dir="/path/to/data"
+./bin/systemds scripts/ssb/queries/q2_3.dml -nvargs input_dir="/Users/ghafekalsaho/Desktop/data"
+or with explicit -f flag:
+./bin/systemds -f scripts/ssb/queries/q2_3.dml -nvargs input_dir="/path/to/data"
+
+Parameters:
+input_dir - Path to input directory containing the table files (e.g., ./data)
+*/
+
+# -- SOURCING THE RA-FUNCTIONS --
+source("./scripts/builtin/raSelection.dml") as raSel
+source("./scripts/builtin/raJoin.dml") as raJoin
+source("./scripts/builtin/raGroupby.dml") as raGrp
+
+# -- PARAMETER HANDLING --
+input_dir = ifdef($input_dir, "./data");
+
+# -- READING INPUT FILES --
+# CSV TABLES
+date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+part_csv = read(input_dir + "/part.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+
+
+# -- PREPARING --
+# Optimized approach: On-the-fly filtering with direct matrix construction for string fields
+
+# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : D_YEAR
+date_csv_min = cbind(date_csv[, 1], date_csv[, 5]);
+date_matrix_min = as.matrix(date_csv_min);
+
+# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-4 : LO_PARTKEY | COL-5 : LO_SUPPKEY |
+# COL-6 : LO_ORDERDATE | COL-13 : LO_REVENUE
+lineorder_csv_min = cbind(lineorder_csv[, 4], lineorder_csv[, 5], lineorder_csv[, 6], lineorder_csv[, 13]);
+lineorder_matrix_min = as.matrix(lineorder_csv_min);
+
+# ON-THE-FLY PART TABLE FILTERING AND ENCODING (P_BRAND = 'MFGR#2239')
+# Two-pass approach: Count first, then filter and encode
+part_keys_matrix = as.matrix(part_csv[, 1]); # part_key
+part_nrows = nrow(part_csv);
+mfgr2239_count = 0;
+
+# Pass 1: Count matching parts (brand = MFGR#2239)
+for (i in 1:part_nrows) {
+ brand_val = as.scalar(part_csv[i, 5]); # p_brand
+ if (brand_val == "MFGR#2239") {
+ mfgr2239_count = mfgr2239_count + 1;
+ }
+}
+
+# Pass 2: Build part matrix with proper brand encoding (using original metadata code)
+part_matrix_min = matrix(0, mfgr2239_count, 2); # partkey, brand_code
+filtered_idx = 0;
+
+for (i in 1:part_nrows) {
+ brand_val = as.scalar(part_csv[i, 5]); # p_brand
+ if (brand_val == "MFGR#2239") {
+ filtered_idx = filtered_idx + 1;
+ part_matrix_min[filtered_idx, 1] = as.scalar(part_keys_matrix[i, 1]); # part_key
+ part_matrix_min[filtered_idx, 2] = 381; # encoded value for MFGR#2239 (from original metadata)
+ }
+}
+
+# ON-THE-FLY SUPPLIER TABLE FILTERING AND ENCODING (S_REGION = 'EUROPE')
+# Two-pass approach for suppliers
+supplier_keys_matrix = as.matrix(supplier_csv[, 1]); # supplier_key
+supplier_nrows = nrow(supplier_csv);
+europe_count = 0;
+
+# Pass 1: Count matching suppliers
+for (i in 1:supplier_nrows) {
+ region_val = as.scalar(supplier_csv[i, 6]); # s_region
+ if (region_val == "EUROPE") {
+ europe_count = europe_count + 1;
+ }
+}
+
+# Pass 2: Build supplier matrix
+sup_matrix_min = matrix(0, europe_count, 2); # suppkey, region_encoded
+filtered_idx = 0;
+for (i in 1:supplier_nrows) {
+ region_val = as.scalar(supplier_csv[i, 6]); # s_region
+ if (region_val == "EUROPE") {
+ filtered_idx = filtered_idx + 1;
+ sup_matrix_min[filtered_idx, 1] = as.scalar(supplier_keys_matrix[i, 1]); # supplier_key
+ sup_matrix_min[filtered_idx, 2] = 4; # encoded value for EUROPE (from original metadata)
+ }
+}
+
+# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION --
+# We already filtered during matrix construction, but we can use RA selection for consistency
+# P_BRAND = 'MFGR#2239' : 381 (Our encoded value)
+p_brand_filt = raSel::m_raSelection(part_matrix_min, col=2, op="==", val=381);
+
+# S_REGION = 'EUROPE' : 4 (Our encoded value)
+s_reg_filt = raSel::m_raSelection(sup_matrix_min, col=2, op="==", val=4);
+
+
+# -- JOIN TABLES WITH RA-JOIN FUNCTION --
+# JOINING MINIMIZED LINEORDER TABLE WITH FILTERED PART TABLE WHERE LO_PARTKEY = P_PARTKEY
+lo_part = raJoin::m_raJoin(A=lineorder_matrix_min, colA=1, B=p_brand_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ SUPPLIER WHERE LO_SUPPKEY = S_SUPPKEY
+lo_part_sup = raJoin::m_raJoin(A=lo_part, colA=2, B=s_reg_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ DATE WHERE LO_ORDERDATE = D_DATEKEY
+joined_matrix = raJoin::m_raJoin(A=lo_part_sup, colA=3, B=date_matrix_min, colB=1, method="sort-merge");
+
+# -- GROUP-BY & AGGREGATION --
+# LO_REVENUE : COLUMN 4 OF LINEORDER-MIN-MATRIX
+revenue = joined_matrix[, 4];
+# D_YEAR : COLUMN 2 OF DATE-MIN-MATRIX
+d_year = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(part_matrix_min) + ncol(sup_matrix_min) + 2)];
+# P_BRAND : COLUMN 2 OF PART-MIN-MATRIX
+p_brand = joined_matrix[,(ncol(lineorder_matrix_min) + 2)];
+
+max_p_brand = max(p_brand);
+p_brand_scale_f = ceil(max_p_brand) + 1;
+
+combined_key = d_year * p_brand_scale_f + p_brand;
+
+group_input = cbind(revenue, combined_key);
+agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop");
+
+gr_key = agg_result[, 1];
+revenue = rowSums(agg_result[, 2:ncol(agg_result)]);
+
+p_brand = round(gr_key %% p_brand_scale_f);
+d_year = round((gr_key - p_brand) / p_brand_scale_f);
+
+result = cbind(revenue, d_year, p_brand);
+
+result_ordered = order(target=result, by=3, decreasing=FALSE, index.return=FALSE); # 3 : P_BRAND
+result_ordered = order(target=result_ordered, by=2, decreasing=FALSE, index.return=FALSE); # D_YEAR
+
+print("Processing " + nrow(result_ordered) + " result rows...");
+
+# Output results with brand codes (matching original format)
+print("Q2.3 Results with brand codes:");
+
+for (i in 1:nrow(result_ordered)) {
+ revenue_val = as.scalar(result_ordered[i, 1]);
+ year_val = as.scalar(result_ordered[i, 2]);
+ brand_code = as.scalar(result_ordered[i, 3]);
+
+ # Output in original format with brand codes
+ print(revenue_val + ".000 " + year_val + ".000 " + brand_code + ".000");
+}
+
+# Calculate and print total revenue
+total_revenue = sum(result_ordered[, 1]);
+print("");
+print("REVENUE: " + as.integer(total_revenue));
+print("");
+
+for (i in 1:nrow(result_ordered)) {
+ revenue_val = as.scalar(result_ordered[i, 1]);
+ year_val = as.scalar(result_ordered[i, 2]);
+ brand_code = as.scalar(result_ordered[i, 3]);
+
+ # Map brand code back to brand name (using original metadata code)
+ brand_name = "UNKNOWN";
+ if (brand_code == 381) brand_name = "MFGR#2239";
+
+ # Output in consistent format
+ print(revenue_val + ".000 " + year_val + ".000 " + brand_name);
+}
+
+# Frame format output
+print("");
+print("# FRAME: nrow = " + nrow(result_ordered) + ", ncol = 3");
+print("# C1 C2 C3");
+print("# INT32 INT32 STRING");
+
+for (i in 1:nrow(result_ordered)) {
+ revenue_val = as.scalar(result_ordered[i, 1]);
+ year_val = as.scalar(result_ordered[i, 2]);
+ brand_code = as.scalar(result_ordered[i, 3]);
+
+ # Same brand code mapping for frame output
+ brand_name = "UNKNOWN";
+ if (brand_code == 381) brand_name = "MFGR#2239";
+
+ print(as.integer(revenue_val) + " " + as.integer(year_val) + " " + brand_name);
+}
diff --git a/scripts/staging/ssb/queries/q3_1.dml b/scripts/staging/ssb/queries/q3_1.dml
new file mode 100644
index 00000000000..c4d2b376709
--- /dev/null
+++ b/scripts/staging/ssb/queries/q3_1.dml
@@ -0,0 +1,293 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+/* DML script implementing the SSB query Q3.1 in SystemDS.
+SELECT
+ c_nation,
+ s_nation,
+ d_year,
+ SUM(lo_revenue) AS REVENUE
+FROM customer, lineorder, supplier, dates
+WHERE
+ lo_custkey = c_custkey
+ AND lo_suppkey = s_suppkey
+ AND lo_orderdate = d_datekey
+ AND c_region = 'ASIA'
+ AND s_region = 'ASIA'
+ AND d_year >= 1992
+ AND d_year <= 1997
+GROUP BY c_nation, s_nation, d_year
+ORDER BY d_year ASC, REVENUE DESC;
+
+Usage:
+./bin/systemds scripts/ssb/queries/q3_1.dml -nvargs input_dir="/path/to/data"
+./bin/systemds scripts/ssb/queries/q3_1.dml -nvargs input_dir="/Users/ghafekalsaho/Desktop/data"
+or with explicit -f flag:
+./bin/systemds -f scripts/ssb/queries/q3_1.dml -nvargs input_dir="/path/to/data"
+
+Parameters:
+input_dir - Path to input directory containing the table files (e.g., ./data)
+*/
+
+# -- SOURCING THE RA-FUNCTIONS --
+source("./scripts/builtin/raSelection.dml") as raSel
+source("./scripts/builtin/raJoin.dml") as raJoin
+source("./scripts/builtin/raGroupby.dml") as raGrp
+
+# -- PARAMETER HANDLING --
+input_dir = ifdef($input_dir, "./data");
+
+
+# -- READING INPUT FILES --
+# CSV TABLES
+date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+customer_csv = read(input_dir + "/customer.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+
+
+# -- PREPARING --
+# Optimized approach: On-the-fly filtering with direct matrix construction for string fields
+
+# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : D_YEAR
+date_csv_min = cbind(date_csv[, 1], date_csv[, 5]);
+date_matrix_min = as.matrix(date_csv_min);
+
+# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-3 : LO_CUSTKEY | COL-5 : LO_SUPPKEY |
+# COL-6 : LO_ORDERDATE | COL-13 : LO_REVENUE
+lineorder_csv_min = cbind(lineorder_csv[, 3], lineorder_csv[, 5], lineorder_csv[, 6], lineorder_csv[, 13]);
+lineorder_matrix_min = as.matrix(lineorder_csv_min);
+
+# ON-THE-FLY CUSTOMER TABLE FILTERING AND ENCODING (C_REGION = 'ASIA')
+# Two-pass approach: Count first, then filter and encode
+customer_keys_matrix = as.matrix(customer_csv[, 1]); # customer_key
+customer_nrows = nrow(customer_csv);
+asia_customer_count = 0;
+
+# Pass 1: Count matching customers (region = ASIA)
+for (i in 1:customer_nrows) {
+ region_val = as.scalar(customer_csv[i, 6]); # c_region
+ if (region_val == "ASIA") {
+ asia_customer_count = asia_customer_count + 1;
+ }
+}
+
+# Pass 2: Build customer matrix with proper nation and region encoding
+cust_matrix_min = matrix(0, asia_customer_count, 3); # custkey, nation_code, region_code
+filtered_idx = 0;
+
+for (i in 1:customer_nrows) {
+ region_val = as.scalar(customer_csv[i, 6]); # c_region
+ if (region_val == "ASIA") {
+ filtered_idx = filtered_idx + 1;
+ nation_val = as.scalar(customer_csv[i, 5]); # c_nation
+
+ cust_matrix_min[filtered_idx, 1] = as.scalar(customer_keys_matrix[i, 1]); # customer_key
+ cust_matrix_min[filtered_idx, 3] = 4; # encoded value for ASIA region (from original metadata)
+
+ # Map nation names to codes (using original metadata encodings)
+ if (nation_val == "CHINA") cust_matrix_min[filtered_idx, 2] = 247;
+ else if (nation_val == "INDIA") cust_matrix_min[filtered_idx, 2] = 36;
+ else if (nation_val == "INDONESIA") cust_matrix_min[filtered_idx, 2] = 243;
+ else if (nation_val == "JAPAN") cust_matrix_min[filtered_idx, 2] = 24;
+ else if (nation_val == "VIETNAM") cust_matrix_min[filtered_idx, 2] = 230;
+ else cust_matrix_min[filtered_idx, 2] = -1; # unknown nation
+ }
+}
+
+# ON-THE-FLY SUPPLIER TABLE FILTERING AND ENCODING (S_REGION = 'ASIA')
+# Two-pass approach for suppliers
+supplier_keys_matrix = as.matrix(supplier_csv[, 1]); # supplier_key
+supplier_nrows = nrow(supplier_csv);
+asia_supplier_count = 0;
+
+# Pass 1: Count matching suppliers
+for (i in 1:supplier_nrows) {
+ region_val = as.scalar(supplier_csv[i, 6]); # s_region
+ if (region_val == "ASIA") {
+ asia_supplier_count = asia_supplier_count + 1;
+ }
+}
+
+# Pass 2: Build supplier matrix
+sup_matrix_min = matrix(0, asia_supplier_count, 3); # suppkey, nation_code, region_code
+filtered_idx = 0;
+for (i in 1:supplier_nrows) {
+ region_val = as.scalar(supplier_csv[i, 6]); # s_region
+ if (region_val == "ASIA") {
+ filtered_idx = filtered_idx + 1;
+ nation_val = as.scalar(supplier_csv[i, 5]); # s_nation
+
+ sup_matrix_min[filtered_idx, 1] = as.scalar(supplier_keys_matrix[i, 1]); # supplier_key
+ sup_matrix_min[filtered_idx, 3] = 5; # encoded value for ASIA region (from original metadata)
+
+ # Map nation names to codes (using original metadata encodings)
+ if (nation_val == "CHINA") sup_matrix_min[filtered_idx, 2] = 27;
+ else if (nation_val == "INDIA") sup_matrix_min[filtered_idx, 2] = 12;
+ else if (nation_val == "INDONESIA") sup_matrix_min[filtered_idx, 2] = 48;
+ else if (nation_val == "JAPAN") sup_matrix_min[filtered_idx, 2] = 73;
+ else if (nation_val == "VIETNAM") sup_matrix_min[filtered_idx, 2] = 85;
+ else sup_matrix_min[filtered_idx, 2] = -1; # unknown nation
+ }
+}
+
+
+# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION --
+# We already filtered during matrix construction, but we can use RA selection for consistency
+# C_REGION = 'ASIA' : 4 (Our encoded value)
+c_reg_filt = raSel::m_raSelection(cust_matrix_min, col=3, op="==", val=4);
+
+# S_REGION = 'ASIA' : 5 (Our encoded value)
+s_reg_filt = raSel::m_raSelection(sup_matrix_min, col=3, op="==", val=5);
+
+# D_YEAR BETWEEN 1992 & 1997
+d_year_filt = raSel::m_raSelection(date_matrix_min, col=2, op=">=", val=1992);
+d_year_filt = raSel::m_raSelection(d_year_filt, col=2, op="<=", val=1997);
+
+
+# -- JOIN TABLES WITH RA-JOIN FUNCTION --
+# JOINING MINIMIZED LINEORDER TABLE WITH FILTERED CUSTOMER TABLE WHERE LO_CUSTKEY = C_CUSTKEY
+lo_cust = raJoin::m_raJoin(A=lineorder_matrix_min, colA=1, B=c_reg_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ SUPPLIER WHERE LO_SUPPKEY = S_SUPPKEY
+lo_cust_sup = raJoin::m_raJoin(A=lo_cust, colA=2, B=s_reg_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ DATE WHERE LO_ORDERDATE = D_DATEKEY
+joined_matrix = raJoin::m_raJoin(A=lo_cust_sup, colA=3, B=d_year_filt, colB=1, method="sort-merge");
+
+
+# -- GROUP-BY & AGGREGATION --
+# LO_REVENUE : COLUMN 4 OF LINEORDER-MIN-MATRIX
+revenue = joined_matrix[, 4];
+# D_YEAR : COLUMN 2 OF DATE-MIN-MATRIX
+d_year = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_matrix_min) + ncol(sup_matrix_min) + 2)];
+# C_NATION : COLUMN 2 OF CUST-MIN-MATRIX
+c_nation = joined_matrix[,(ncol(lineorder_matrix_min) + 2)];
+# S_NATION : COLUMN 2 OF SUP-MIN-MATRIX
+s_nation = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_matrix_min) + 2)];
+
+# CALCULATING COMBINATION KEY WITH PRIORITY: C_NATION, S_NATION, D_YEAR
+max_c_nation = max(c_nation);
+max_s_nation = max(s_nation);
+max_d_year = max(d_year);
+
+c_nation_scale_f = ceil(max_c_nation) + 1;
+s_nation_scale_f = ceil(max_s_nation) + 1;
+d_year_scale_f = ceil(max_d_year) + 1;
+
+combined_key = c_nation * s_nation_scale_f * d_year_scale_f + s_nation * d_year_scale_f + d_year;
+
+group_input = cbind(revenue, combined_key);
+agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop");
+
+key = agg_result[, 1];
+revenue = rowSums(agg_result[, 2:ncol(agg_result)]);
+
+# EXTRACTING C_NATION, S_NATION & D_YEAR
+d_year = round(key %% d_year_scale_f);
+c_nation = round(floor(key / (s_nation_scale_f * d_year_scale_f)));
+s_nation = round((floor(key / d_year_scale_f)) %% s_nation_scale_f);
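+
+# Illustrative round trip of the three-part key (not executed): with the scale factors above,
+# encoding (c_nation, s_nation, d_year) as c*S*Y + s*Y + y and decoding with floor/modulo
+# recovers all three fields, e.g.
+#   ex_key = 247 * s_nation_scale_f * d_year_scale_f + max_s_nation * d_year_scale_f + max_d_year;
+#   round(ex_key %% d_year_scale_f)                               == max_d_year
+#   round(floor(ex_key / d_year_scale_f) %% s_nation_scale_f)     == max_s_nation
+#   round(floor(ex_key / (s_nation_scale_f * d_year_scale_f)))    == 247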
+
+result = cbind(c_nation, s_nation, d_year, revenue);
+
+
+# -- SORTING --
+# PRIORITY 1 D_YEAR (ASC), 2 REVENUE (DESC)
+result_ordered = order(target=result, by=4, decreasing=TRUE, index.return=FALSE);
+result_ordered = order(target=result_ordered, by=3, decreasing=FALSE, index.return=FALSE);
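+
+# The two order() calls implement ORDER BY d_year ASC, REVENUE DESC by sorting on the secondary
+# key first and the primary key last, relying on tied rows keeping their relative order.
+# Example with rows (1993,50), (1992,10), (1992,70):
+#   after sorting by revenue DESC: (1992,70), (1993,50), (1992,10)
+#   after sorting by d_year ASC  : (1992,70), (1992,10), (1993,50)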
+
+# -- DECODING C_NATION & S_NATION --
+# Map nation codes back to nation names (using original metadata codes)
+print("Processing " + nrow(result_ordered) + " result rows...");
+
+print("Q3.1 Results with nation codes:");
+for (i in 1:nrow(result_ordered)) {
+ c_nation_code = as.scalar(result_ordered[i, 1]);
+ s_nation_code = as.scalar(result_ordered[i, 2]);
+ year_val = as.scalar(result_ordered[i, 3]);
+ revenue_val = as.scalar(result_ordered[i, 4]);
+
+ print(c_nation_code + ".000 " + s_nation_code + ".000 " + year_val + ".000 " + revenue_val + ".000");
+}
+
+# Calculate and print total revenue
+total_revenue = sum(result_ordered[, 4]);
+print("");
+print("TOTAL REVENUE: " + as.integer(total_revenue));
+print("");
+
+for (i in 1:nrow(result_ordered)) {
+ c_nation_code = as.scalar(result_ordered[i, 1]);
+ s_nation_code = as.scalar(result_ordered[i, 2]);
+ year_val = as.scalar(result_ordered[i, 3]);
+ revenue_val = as.scalar(result_ordered[i, 4]);
+
+ # Map customer nation codes back to names
+ c_nation_name = "UNKNOWN";
+ if (c_nation_code == 247) c_nation_name = "CHINA";
+ else if (c_nation_code == 36) c_nation_name = "INDIA";
+ else if (c_nation_code == 243) c_nation_name = "INDONESIA";
+ else if (c_nation_code == 24) c_nation_name = "JAPAN";
+ else if (c_nation_code == 230) c_nation_name = "VIETNAM";
+
+ # Map supplier nation codes back to names
+ s_nation_name = "UNKNOWN";
+ if (s_nation_code == 27) s_nation_name = "CHINA";
+ else if (s_nation_code == 12) s_nation_name = "INDIA";
+ else if (s_nation_code == 48) s_nation_name = "INDONESIA";
+ else if (s_nation_code == 73) s_nation_name = "JAPAN";
+ else if (s_nation_code == 85) s_nation_name = "VIETNAM";
+
+ # Output in consistent format
+ print(c_nation_name + " " + s_nation_name + " " + year_val + ".000 " + revenue_val + ".000");
+}
+
+# Frame format output
+print("");
+print("# FRAME: nrow = " + nrow(result_ordered) + ", ncol = 4");
+print("# C1 C2 C3 C4");
+print("# STRING STRING INT32 INT32");
+
+for (i in 1:nrow(result_ordered)) {
+ c_nation_code = as.scalar(result_ordered[i, 1]);
+ s_nation_code = as.scalar(result_ordered[i, 2]);
+ year_val = as.scalar(result_ordered[i, 3]);
+ revenue_val = as.scalar(result_ordered[i, 4]);
+
+ # Map nation codes to names for frame output
+ c_nation_name = "UNKNOWN";
+ if (c_nation_code == 247) c_nation_name = "CHINA";
+ else if (c_nation_code == 36) c_nation_name = "INDIA";
+ else if (c_nation_code == 243) c_nation_name = "INDONESIA";
+ else if (c_nation_code == 24) c_nation_name = "JAPAN";
+ else if (c_nation_code == 230) c_nation_name = "VIETNAM";
+
+ s_nation_name = "UNKNOWN";
+ if (s_nation_code == 27) s_nation_name = "CHINA";
+ else if (s_nation_code == 12) s_nation_name = "INDIA";
+ else if (s_nation_code == 48) s_nation_name = "INDONESIA";
+ else if (s_nation_code == 73) s_nation_name = "JAPAN";
+ else if (s_nation_code == 85) s_nation_name = "VIETNAM";
+
+ print(c_nation_name + " " + s_nation_name + " " + as.integer(year_val) + " " + as.integer(revenue_val));
+}
+
diff --git a/scripts/staging/ssb/queries/q3_2.dml b/scripts/staging/ssb/queries/q3_2.dml
new file mode 100644
index 00000000000..d979c0cfbbb
--- /dev/null
+++ b/scripts/staging/ssb/queries/q3_2.dml
@@ -0,0 +1,237 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+/* DML script implementing the SSB query Q3.2 in SystemDS.
+SELECT
+ c_city,
+ s_city,
+ d_year,
+ SUM(lo_revenue) AS REVENUE
+FROM customer, lineorder, supplier, dates
+WHERE
+ lo_custkey = c_custkey
+ AND lo_suppkey = s_suppkey
+ AND lo_orderdate = d_datekey
+ AND c_nation = 'UNITED STATES'
+ AND s_nation = 'UNITED STATES'
+ AND d_year >= 1992
+ AND d_year <= 1997
+GROUP BY c_city, s_city, d_year
+ORDER BY d_year ASC, REVENUE DESC;
+
+Usage:
+./bin/systemds scripts/ssb/queries/q3_2.dml -nvargs input_dir="/path/to/data"
+./bin/systemds scripts/ssb/queries/q3_2.dml -nvargs input_dir="/Users/ghafekalsaho/Desktop/data"
+
+Parameters:
+input_dir - Path to input directory containing the table files (e.g., ./data)
+*/
+
+# -- SOURCING THE RA-FUNCTIONS --
+source("./scripts/builtin/raSelection.dml") as raSel
+source("./scripts/builtin/raJoin.dml") as raJoin
+source("./scripts/builtin/raGroupby.dml") as raGrp
+
+# -- PARAMETER HANDLING --
+input_dir = ifdef($input_dir, "./data");
+
+
+# -- READING INPUT FILES --
+# CSV TABLES
+date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+customer_csv = read(input_dir + "/customer.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+
+# -- PREPARING --
+# Optimized approach: On-the-fly filtering with direct matrix construction for string fields
+
+# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : D_YEAR
+date_csv_min = cbind(date_csv[, 1], date_csv[, 5]);
+date_matrix_min = as.matrix(date_csv_min);
+
+# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-3 : LO_CUSTKEY | COL-5 : LO_SUPPKEY |
+# COL-6 : LO_ORDERDATE | COL-13 : LO_REVENUE
+lineorder_csv_min = cbind(lineorder_csv[, 3], lineorder_csv[, 5], lineorder_csv[, 6], lineorder_csv[, 13]);
+lineorder_matrix_min = as.matrix(lineorder_csv_min);
+
+# ON-THE-FLY CUSTOMER TABLE FILTERING AND ENCODING (C_NATION = 'UNITED STATES')
+# Two-pass approach: Count first, then filter and encode
+customer_keys_matrix = as.matrix(customer_csv[, 1]); # customer_key
+customer_nrows = nrow(customer_csv);
+us_customer_count = 0;
+
+# Pass 1: Count matching customers (nation = UNITED STATES)
+for (i in 1:customer_nrows) {
+ nation_val = as.scalar(customer_csv[i, 5]); # c_nation
+ if (nation_val == "UNITED STATES") {
+ us_customer_count = us_customer_count + 1;
+ }
+}
+
+# Pass 2: Build customer matrix with proper city and nation encoding
+cust_matrix_min = matrix(0, us_customer_count, 3); # custkey, city_code, nation_code
+filtered_idx = 0;
+
+for (i in 1:customer_nrows) {
+ nation_val = as.scalar(customer_csv[i, 5]); # c_nation
+ if (nation_val == "UNITED STATES") {
+ filtered_idx = filtered_idx + 1;
+ city_val = as.scalar(customer_csv[i, 4]); # c_city
+
+ cust_matrix_min[filtered_idx, 1] = as.scalar(customer_keys_matrix[i, 1]); # customer_key
+ cust_matrix_min[filtered_idx, 3] = 1; # encoded value for UNITED STATES nation
+
+ # Encode the city as the numeric suffix of its name ('UNITED ST0' .. 'UNITED ST9'),
+ # so identical cities share one code and the group-by below really groups per city
+ city_code = -1; # fallback for unexpected city names
+ if (city_val == "UNITED ST0") city_code = 0;
+ else if (city_val == "UNITED ST1") city_code = 1;
+ else if (city_val == "UNITED ST2") city_code = 2;
+ else if (city_val == "UNITED ST3") city_code = 3;
+ else if (city_val == "UNITED ST4") city_code = 4;
+ else if (city_val == "UNITED ST5") city_code = 5;
+ else if (city_val == "UNITED ST6") city_code = 6;
+ else if (city_val == "UNITED ST7") city_code = 7;
+ else if (city_val == "UNITED ST8") city_code = 8;
+ else if (city_val == "UNITED ST9") city_code = 9;
+ cust_matrix_min[filtered_idx, 2] = city_code;
+ }
+}
+
+# ON-THE-FLY SUPPLIER TABLE FILTERING AND ENCODING (S_NATION = 'UNITED STATES')
+# Two-pass approach for suppliers
+supplier_keys_matrix = as.matrix(supplier_csv[, 1]); # supplier_key
+supplier_nrows = nrow(supplier_csv);
+us_supplier_count = 0;
+
+# Pass 1: Count matching suppliers
+for (i in 1:supplier_nrows) {
+ nation_val = as.scalar(supplier_csv[i, 5]); # s_nation
+ if (nation_val == "UNITED STATES") {
+ us_supplier_count = us_supplier_count + 1;
+ }
+}
+
+# Pass 2: Build supplier matrix with the same suffix-based city encoding
+sup_matrix_min = matrix(0, us_supplier_count, 3); # suppkey, city_code, nation_code
+filtered_idx = 0;
+
+for (i in 1:supplier_nrows) {
+ nation_val = as.scalar(supplier_csv[i, 5]); # s_nation
+ if (nation_val == "UNITED STATES") {
+ filtered_idx = filtered_idx + 1;
+ city_val = as.scalar(supplier_csv[i, 4]); # s_city
+
+ sup_matrix_min[filtered_idx, 1] = as.scalar(supplier_keys_matrix[i, 1]); # supplier_key
+ sup_matrix_min[filtered_idx, 3] = 1; # encoded value for UNITED STATES nation
+
+ # Encode the city as the numeric suffix of its name ('UNITED ST0' .. 'UNITED ST9')
+ city_code = -1; # fallback for unexpected city names
+ if (city_val == "UNITED ST0") city_code = 0;
+ else if (city_val == "UNITED ST1") city_code = 1;
+ else if (city_val == "UNITED ST2") city_code = 2;
+ else if (city_val == "UNITED ST3") city_code = 3;
+ else if (city_val == "UNITED ST4") city_code = 4;
+ else if (city_val == "UNITED ST5") city_code = 5;
+ else if (city_val == "UNITED ST6") city_code = 6;
+ else if (city_val == "UNITED ST7") city_code = 7;
+ else if (city_val == "UNITED ST8") city_code = 8;
+ else if (city_val == "UNITED ST9") city_code = 9;
+ sup_matrix_min[filtered_idx, 2] = city_code;
+ }
+}
+
+# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION --
+# We already filtered during matrix construction, but we can use RA selection for consistency
+# C_NATION = 'UNITED STATES' : 1 (Our encoded value)
+c_nat_filt = raSel::m_raSelection(cust_matrix_min, col=3, op="==", val=1);
+
+# S_NATION = 'UNITED STATES' : 1 (Our encoded value)
+s_nat_filt = raSel::m_raSelection(sup_matrix_min, col=3, op="==", val=1);
+
+# D_YEAR BETWEEN 1992 & 1997
+d_year_filt = raSel::m_raSelection(date_matrix_min, col=2, op=">=", val=1992);
+d_year_filt = raSel::m_raSelection(d_year_filt, col=2, op="<=", val=1997);
+
+
+# -- JOIN TABLES WITH RA-JOIN FUNCTION --
+# JOINING MINIMIZED LINEORDER TABLE WITH FILTERED CUSTOMER TABLE WHERE LO_CUSTKEY = C_CUSTKEY
+lo_cust = raJoin::m_raJoin(A=lineorder_matrix_min, colA=1, B=c_nat_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ SUPPLIER WHERE LO_SUPPKEY = S_SUPPKEY
+lo_cust_sup = raJoin::m_raJoin(A=lo_cust, colA=2, B=s_nat_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ DATE WHERE LO_ORDERDATE = D_DATEKEY
+joined_matrix = raJoin::m_raJoin(A=lo_cust_sup, colA=3, B=d_year_filt, colB=1, method="sort-merge");
+
+
+# -- GROUP-BY & AGGREGATION --
+# LO_REVENUE : COLUMN 4 OF LINEORDER-MIN-MATRIX (LO_PARTKEY is not loaded for this query)
+revenue = joined_matrix[, 4];
+# D_YEAR : COLUMN 2 OF DATE-MIN-MATRIX
+d_year = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_matrix_min) + ncol(sup_matrix_min) + 2)];
+# C_CITY : COLUMN 2 OF CUST-MIN-MATRIX
+c_city = joined_matrix[,(ncol(lineorder_matrix_min) + 2)];
+# S_CITY : COLUMN 2 OF SUP-MIN-MATRIX
+s_city = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_matrix_min) + 2)];
+
+# CALCULATING COMBINATION KEY WITH PRIORITY: C_CITY, S_CITY & D_YEAR
+max_c_city = max(c_city);
+max_s_city = max(s_city);
+max_d_year = max(d_year);
+
+c_city_scale_f = ceil(max_c_city) + 1;
+s_city_scale_f = ceil(max_s_city) + 1;
+d_year_scale_f = ceil(max_d_year) + 1;
+
+combined_key = c_city * s_city_scale_f * d_year_scale_f + s_city * d_year_scale_f + d_year;
+
+group_input = cbind(revenue, combined_key);
+agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop");
+
+key = agg_result[, 1];
+revenue = rowSums(agg_result[, 2:ncol(agg_result)]);
+
+# EXTRACTING C_CITY, S_CITY & D_YEAR
+d_year = round(key %% d_year_scale_f);
+c_city = round(floor(key / (s_city_scale_f * d_year_scale_f)));
+s_city = round((floor(key / d_year_scale_f)) %% s_city_scale_f);
+
+result = cbind(c_city, s_city, d_year, revenue);
+
+
+# -- SORTING --
+# PRIORITY 1 D_YEAR (ASC), 2 REVENUE (DESC)
+result_ordered = order(target=result, by=4, decreasing=TRUE, index.return=FALSE);
+result_ordered = order(target=result_ordered, by=3, decreasing=FALSE, index.return=FALSE);
+
+
+# -- DECODING C_CITY & S_CITY CODES --
+# The city code is the numeric suffix of the city name, so the names can be rebuilt directly.
+print("Q3.2 Results:");
+print("# FRAME: nrow = " + nrow(result_ordered) + ", ncol = 4");
+print("# C1 C2 C3 C4");
+print("# STRING STRING INT32 INT32");
+
+for (i in 1:nrow(result_ordered)) {
+ c_city_code = as.scalar(result_ordered[i, 1]);
+ s_city_code = as.scalar(result_ordered[i, 2]);
+ year_val = as.scalar(result_ordered[i, 3]);
+ revenue_val = as.scalar(result_ordered[i, 4]);
+
+ # Rebuild the city names from their numeric suffix codes
+ c_city_name = "UNITED ST" + as.integer(c_city_code);
+ s_city_name = "UNITED ST" + as.integer(s_city_code);
+
+ print(c_city_name + " " + s_city_name + " " + as.integer(year_val) + " " + as.integer(revenue_val));
+}
+
+# Calculate total revenue for validation
+total_revenue = sum(result_ordered[, 4]);
+print("");
+print("Total number of result rows: " + nrow(result_ordered));
+print("Total revenue: " + as.integer(total_revenue));
+print("Q3.2 finished");
+
diff --git a/scripts/staging/ssb/queries/q3_3.dml b/scripts/staging/ssb/queries/q3_3.dml
new file mode 100644
index 00000000000..5476eb6fe08
--- /dev/null
+++ b/scripts/staging/ssb/queries/q3_3.dml
@@ -0,0 +1,239 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+/* DML script implementing the SSB query Q3.3 in SystemDS.
+SELECT
+ c_city,
+ s_city,
+ d_year,
+ SUM(lo_revenue) AS REVENUE
+FROM customer, lineorder, supplier, dates
+WHERE
+ lo_custkey = c_custkey
+ AND lo_suppkey = s_suppkey
+ AND lo_orderdate = d_datekey
+ AND (
+ c_city = 'UNITED KI1'
+ OR c_city = 'UNITED KI5'
+ )
+ AND (
+ s_city = 'UNITED KI1'
+ OR s_city = 'UNITED KI5'
+ )
+ AND d_year >= 1992
+ AND d_year <= 1997
+GROUP BY c_city, s_city, d_year
+ORDER BY d_year ASC, REVENUE DESC;
+*/
+
+# -- PARAMETER HANDLING --
+input_dir = ifdef($input_dir, "./data");
+
+# -- SOURCING THE RA-FUNCTIONS --
+source("./scripts/builtin/raSelection.dml") as raSel
+source("./scripts/builtin/raJoin.dml") as raJoin
+source("./scripts/builtin/raGroupby.dml") as raGrp
+
+
+# -- READING INPUT FILES --
+# CSV TABLES
+date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+#part_csv = read(input_dir + "/part.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+customer_csv = read(input_dir + "/customer.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+
+
+# -- PREPARING --
+# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : D_YEAR
+date_csv_min = cbind(date_csv[, 1], date_csv[, 5]);
+date_matrix_min = as.matrix(date_csv_min);
+
+# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-3 : LO_CUSTKEY | COL-4 : LO_PARTKEY |
+# COL-5 : LO_SUPPKEY | COL-6 : LO_ORDERDATE | COL-13 : LO_REVENUE
+lineorder_csv_min = cbind(lineorder_csv[, 3], lineorder_csv[, 4], lineorder_csv[, 5], lineorder_csv[, 6], lineorder_csv[, 13]);
+lineorder_matrix_min = as.matrix(lineorder_csv_min);
+
+# ON-THE-FLY CUSTOMER TABLE FILTERING AND ENCODING (C_CITY = 'UNITED KI1' OR 'UNITED KI5')
+customer_keys_matrix = as.matrix(customer_csv[, 1]); # customer_key
+customer_nrows = nrow(customer_csv);
+matching_customer_count = 0;
+
+# Pass 1: Count matching customers
+for (i in 1:customer_nrows) {
+ city_val = as.scalar(customer_csv[i, 4]); # c_city
+ if (city_val == "UNITED KI1" | city_val == "UNITED KI5") {
+ matching_customer_count = matching_customer_count + 1;
+ }
+}
+
+# Pass 2: Build customer matrix with dynamic city encoding
+cust_matrix_min = matrix(0, matching_customer_count, 2); # custkey, city_code
+filtered_idx = 0;
+
+for (i in 1:customer_nrows) {
+ city_val = as.scalar(customer_csv[i, 4]); # c_city
+ if (city_val == "UNITED KI1" | city_val == "UNITED KI5") {
+ filtered_idx = filtered_idx + 1;
+ cust_matrix_min[filtered_idx, 1] = as.scalar(customer_keys_matrix[i, 1]); # customer_key
+
+ # Use consistent encoding: 1 for UNITED KI1, 2 for UNITED KI5
+ if (city_val == "UNITED KI1") {
+ cust_matrix_min[filtered_idx, 2] = 1;
+ } else {
+ cust_matrix_min[filtered_idx, 2] = 2;
+ }
+ }
+}
+
+# ON-THE-FLY SUPPLIER TABLE FILTERING AND ENCODING (S_CITY = 'UNITED KI1' OR 'UNITED KI5')
+supplier_keys_matrix = as.matrix(supplier_csv[, 1]); # supplier_key
+supplier_nrows = nrow(supplier_csv);
+matching_supplier_count = 0;
+
+# Pass 1: Count matching suppliers
+for (i in 1:supplier_nrows) {
+ city_val = as.scalar(supplier_csv[i, 4]); # s_city
+ if (city_val == "UNITED KI1" | city_val == "UNITED KI5") {
+ matching_supplier_count = matching_supplier_count + 1;
+ }
+}
+
+# Pass 2: Build supplier matrix with dynamic city encoding
+sup_matrix_min = matrix(0, matching_supplier_count, 2); # suppkey, city_code
+filtered_idx = 0;
+
+for (i in 1:supplier_nrows) {
+ city_val = as.scalar(supplier_csv[i, 4]); # s_city
+ if (city_val == "UNITED KI1" | city_val == "UNITED KI5") {
+ filtered_idx = filtered_idx + 1;
+ sup_matrix_min[filtered_idx, 1] = as.scalar(supplier_keys_matrix[i, 1]); # supplier_key
+
+ # Use consistent encoding: 1 for UNITED KI1, 2 for UNITED KI5
+ if (city_val == "UNITED KI1") {
+ sup_matrix_min[filtered_idx, 2] = 1;
+ } else {
+ sup_matrix_min[filtered_idx, 2] = 2;
+ }
+ }
+}
+
+
+# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION --
+# Customers and suppliers were already restricted to the two target cities during matrix
+# construction, so no further RA selection is needed for them.
+c_city_filt = cust_matrix_min; # Already filtered for target cities
+s_city_filt = sup_matrix_min; # Already filtered for target cities
+
+# D_YEAR BETWEEN 1992 & 1997
+d_year_filt = raSel::m_raSelection(date_matrix_min, col=2, op=">=", val=1992);
+d_year_filt = raSel::m_raSelection(d_year_filt, col=2, op="<=", val=1997);
+
+
+# -- JOIN TABLES WITH RA-JOIN FUNCTION --
+# JOINING MINIMIZED LINEORDER TABLE WITH FILTERED CUSTOMER TABLE WHERE LO_CUSTKEY = C_CUSTKEY
+lo_cust = raJoin::m_raJoin(A=lineorder_matrix_min, colA=1, B=c_city_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ SUPPLIER WHERE LO_SUPPKEY = S_SUPPKEY
+lo_cust_sup = raJoin::m_raJoin(A=lo_cust, colA=3, B=s_city_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ DATE WHERE LO_ORDERDATE = D_DATEKEY
+joined_matrix = raJoin::m_raJoin(A=lo_cust_sup, colA=4, B=d_year_filt, colB=1, method="sort-merge");
+#print(nrow(joined_matrix));
+
+
+# -- GROUP-BY & AGGREGATION --
+# LO_REVENUE : COLUMN 5 OF LINEORDER-MIN-MATRIX
+revenue = joined_matrix[, 5];
+# D_YEAR : COLUMN 2 OF DATE-MIN-MATRIX
+d_year = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_matrix_min) + ncol(sup_matrix_min) + 2)];
+# C_CITY : COLUMN 2 OF CUST-MIN-MATRIX
+c_city = joined_matrix[,(ncol(lineorder_matrix_min) + 2)];
+# S_CITY : COLUMN 2 OF SUP-MIN-MATRIX
+s_city = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_matrix_min) + 2)];
+
+# CALCULATING COMBINATION KEY WITH PRIORITY: C_CITY, S_CITY & D_YEAR
+max_c_city = max(c_city);
+max_s_city = max(s_city);
+max_d_year = max(d_year);
+
+c_city_scale_f = ceil(max_c_city) + 1;
+s_city_scale_f = ceil(max_s_city) + 1;
+d_year_scale_f = ceil(max_d_year) + 1;
+
+combined_key = c_city * s_city_scale_f * d_year_scale_f + s_city * d_year_scale_f + d_year;
+
+group_input = cbind(revenue, combined_key);
+agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop");
+
+key = agg_result[, 1];
+revenue = rowSums(agg_result[, 2:ncol(agg_result)]);
+
+# EXTRACTING C_CITY, S_CITY & D_YEAR
+d_year = round(key %% d_year_scale_f);
+c_city = round(floor(key / (s_city_scale_f * d_year_scale_f)));
+s_city = round((floor(key / d_year_scale_f)) %% s_city_scale_f);
+
+result = cbind(c_city, s_city, d_year, revenue);
+
+
+# -- SORTING --
+# PRIORITY 1 D_YEAR (ASC), 2 REVENUE (DESC)
+result_ordered = order(target=result, by=4, decreasing=TRUE, index.return=FALSE);
+result_ordered = order(target=result_ordered, by=3, decreasing=FALSE, index.return=FALSE);
+
+
+# -- OUTPUT RESULTS --
+print("Q3.3 Results:");
+print("# FRAME: nrow = " + nrow(result_ordered) + ", ncol = 4");
+print("# C1 C2 C3 C4");
+print("# STRING STRING INT32 INT32");
+
+for (i in 1:nrow(result_ordered)) {
+ c_city_code = as.scalar(result_ordered[i, 1]);
+ s_city_code = as.scalar(result_ordered[i, 2]);
+ year_val = as.scalar(result_ordered[i, 3]);
+ revenue_val = as.scalar(result_ordered[i, 4]);
+
+ # Map back to original city names based on the encoding used
+ if (c_city_code == 1) {
+ c_city_name = "UNITED KI1";
+ } else {
+ c_city_name = "UNITED KI5";
+ }
+
+ if (s_city_code == 1) {
+ s_city_name = "UNITED KI1";
+ } else {
+ s_city_name = "UNITED KI5";
+ }
+
+ print(c_city_name + " " + s_city_name + " " + as.integer(year_val) + " " + as.integer(revenue_val));
+}
+
+# Calculate total revenue for validation
+total_revenue = sum(result_ordered[, 4]);
+print("");
+print("Total number of result rows: " + nrow(result_ordered));
+print("Total revenue: " + as.integer(total_revenue));
+print("Q3.3 finished");
+
diff --git a/scripts/staging/ssb/queries/q3_4.dml b/scripts/staging/ssb/queries/q3_4.dml
new file mode 100644
index 00000000000..fd276e5090e
--- /dev/null
+++ b/scripts/staging/ssb/queries/q3_4.dml
@@ -0,0 +1,262 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+/* DML-script implementing the ssb query Q3.4 in SystemDS.
+SELECT
+ c_city,
+ s_city,
+ d_year,
+ SUM(lo_revenue) AS REVENUE
+FROM customer, lineorder, supplier, dates
+WHERE
+ lo_custkey = c_custkey
+ AND lo_suppkey = s_suppkey
+ AND lo_orderdate = d_datekey
+ AND (
+ c_city = 'UNITED KI1'
+ OR c_city = 'UNITED KI5'
+ )
+ AND (
+ s_city = 'UNITED KI1'
+ OR s_city = 'UNITED KI5'
+ )
+ AND d_yearmonth = 'Dec1997'
+GROUP BY c_city, s_city, d_year
+ORDER BY d_year ASC, REVENUE DESC;
+*/
+
+# -- PARAMETER HANDLING --
+input_dir = ifdef($input_dir, "./data");
+
+# -- SOURCING THE RA-FUNCTIONS --
+source("./scripts/builtin/raSelection.dml") as raSel
+source("./scripts/builtin/raJoin.dml") as raJoin
+source("./scripts/builtin/raGroupby.dml") as raGrp
+
+
+# -- READING INPUT FILES --
+# CSV TABLES
+date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+#part_csv = read(input_dir + "/part.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+customer_csv = read(input_dir + "/customer.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+
+
+# -- PREPARING --
+# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : D_YEAR
+date_csv_min = cbind(date_csv[, 1], date_csv[, 5]);
+date_matrix_min = as.matrix(date_csv_min);
+
+# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-3 : LO_CUSTKEY | COL-4 : LO_PARTKEY |
+# COL-5 : LO_SUPPKEY | COL-6 : LO_ORDERDATE | COL-13 : LO_REVENUE
+lineorder_csv_min = cbind(lineorder_csv[, 3], lineorder_csv[, 4], lineorder_csv[, 5], lineorder_csv[, 6], lineorder_csv[, 13]);
+lineorder_matrix_min = as.matrix(lineorder_csv_min);
+
+# ON-THE-FLY CUSTOMER TABLE FILTERING AND ENCODING (C_CITY = 'UNITED KI1' OR 'UNITED KI5')
+customer_keys_matrix = as.matrix(customer_csv[, 1]); # customer_key
+customer_nrows = nrow(customer_csv);
+matching_customer_count = 0;
+
+# Pass 1: Count matching customers
+for (i in 1:customer_nrows) {
+ city_val = as.scalar(customer_csv[i, 4]); # c_city
+ if (city_val == "UNITED KI1" | city_val == "UNITED KI5") {
+ matching_customer_count = matching_customer_count + 1;
+ }
+}
+
+# Pass 2: Build customer matrix with dynamic city encoding
+cust_matrix_min = matrix(0, matching_customer_count, 2); # custkey, city_code
+filtered_idx = 0;
+
+for (i in 1:customer_nrows) {
+ city_val = as.scalar(customer_csv[i, 4]); # c_city
+ if (city_val == "UNITED KI1" | city_val == "UNITED KI5") {
+ filtered_idx = filtered_idx + 1;
+ cust_matrix_min[filtered_idx, 1] = as.scalar(customer_keys_matrix[i, 1]); # customer_key
+
+ # Use consistent encoding: 1 for UNITED KI1, 2 for UNITED KI5
+ if (city_val == "UNITED KI1") {
+ cust_matrix_min[filtered_idx, 2] = 1;
+ } else {
+ cust_matrix_min[filtered_idx, 2] = 2;
+ }
+ }
+}
+
+# ON-THE-FLY SUPPLIER TABLE FILTERING AND ENCODING (S_CITY = 'UNITED KI1' OR 'UNITED KI5')
+supplier_keys_matrix = as.matrix(supplier_csv[, 1]); # supplier_key
+supplier_nrows = nrow(supplier_csv);
+matching_supplier_count = 0;
+
+# Pass 1: Count matching suppliers
+for (i in 1:supplier_nrows) {
+ city_val = as.scalar(supplier_csv[i, 4]); # s_city
+ if (city_val == "UNITED KI1" | city_val == "UNITED KI5") {
+ matching_supplier_count = matching_supplier_count + 1;
+ }
+}
+
+# Pass 2: Build supplier matrix with dynamic city encoding
+sup_matrix_min = matrix(0, matching_supplier_count, 2); # suppkey, city_code
+filtered_idx = 0;
+
+for (i in 1:supplier_nrows) {
+ city_val = as.scalar(supplier_csv[i, 4]); # s_city
+ if (city_val == "UNITED KI1" | city_val == "UNITED KI5") {
+ filtered_idx = filtered_idx + 1;
+ sup_matrix_min[filtered_idx, 1] = as.scalar(supplier_keys_matrix[i, 1]); # supplier_key
+
+ # Use consistent encoding: 1 for UNITED KI1, 2 for UNITED KI5
+ if (city_val == "UNITED KI1") {
+ sup_matrix_min[filtered_idx, 2] = 1;
+ } else {
+ sup_matrix_min[filtered_idx, 2] = 2;
+ }
+ }
+}
+
+
+# -- FILTERING THE DATA WITH RA-SELECTION FUNCTION --
+# Since we already filtered during matrix construction, we can use the full matrices
+c_city_filt = cust_matrix_min; # Already filtered for target cities
+s_city_filt = sup_matrix_min; # Already filtered for target cities
+
+# D_YEARMONTH = 'Dec1997' - Need precise filtering for Dec1997 only
+# Build filtered date matrix manually since we need string matching on d_yearmonth
+date_full_frame = cbind(date_csv[, 1], date_csv[, 5], date_csv[, 7]); # datekey, year, yearmonth
+date_nrows = nrow(date_full_frame);
+matching_dates = matrix(0, 31, 2); # We know 31 entries exist, store datekey and year
+filtered_idx = 0;
+
+for (i in 1:date_nrows) {
+ yearmonth_val = as.scalar(date_full_frame[i, 3]); # d_yearmonth
+ if (yearmonth_val == "Dec1997") {
+ filtered_idx = filtered_idx + 1;
+ matching_dates[filtered_idx, 1] = as.scalar(date_matrix_min[i, 1]); # datekey
+ matching_dates[filtered_idx, 2] = as.scalar(date_matrix_min[i, 2]); # d_year
+ }
+}
+
+d_year_filt = matching_dates;
+
+
+# -- JOIN TABLES WITH RA-JOIN FUNCTION --
+# JOINING MINIMIZED LINEORDER TABLE WITH FILTERED CUSTOMER TABLE WHERE LO_CUSTKEY = C_CUSTKEY
+lo_cust = raJoin::m_raJoin(A=lineorder_matrix_min, colA=1, B=c_city_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ SUPPLIER WHERE LO_SUPPKEY = S_SUPPKEY
+lo_cust_sup = raJoin::m_raJoin(A=lo_cust, colA=3, B=s_city_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ DATE WHERE LO_ORDERDATE = D_DATEKEY
+joined_matrix = raJoin::m_raJoin(A=lo_cust_sup, colA=4, B=d_year_filt, colB=1, method="sort-merge");
+
+# Check if we have any results
+if (nrow(joined_matrix) == 0) {
+ print("Q3.4 Results:");
+ print("# FRAME: nrow = 0, ncol = 4");
+ print("# C1 C2 C3 C4");
+ print("# STRING STRING INT32 INT32");
+ print("");
+ print("Total number of result rows: 0");
+ print("Total revenue: 0");
+ print("Q3.4 finished - no matching data for Dec1997");
+} else {
+
+
+# -- GROUP-BY & AGGREGATION --
+# LO_REVENUE : COLUMN 5 OF LINEORDER-MIN-MATRIX
+revenue = joined_matrix[, 5];
+# D_YEAR : COLUMN 2 OF DATE-MIN-MATRIX
+d_year = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_matrix_min) + ncol(sup_matrix_min) + 2)];
+# C_CITY : COLUMN 2 OF CUST-MIN-MATRIX
+c_city = joined_matrix[,(ncol(lineorder_matrix_min) + 2)];
+# S_CITY : COLUMN 2 OF SUP-MIN-MATRIX
+s_city = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_matrix_min) + 2)];
+
+# CALCULATING COMBINATION KEY WITH PRIORITY: C_CITY, S_CITY & D_YEAR
+max_c_city = max(c_city);
+max_s_city = max(s_city);
+max_d_year = max(d_year);
+
+c_city_scale_f = ceil(max_c_city) + 1;
+s_city_scale_f = ceil(max_s_city) + 1;
+d_year_scale_f = ceil(max_d_year) + 1;
+
+combined_key = c_city * s_city_scale_f * d_year_scale_f + s_city * d_year_scale_f + d_year;
+
+group_input = cbind(revenue, combined_key);
+agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop");
+
+key = agg_result[, 1];
+revenue = rowSums(agg_result[, 2:ncol(agg_result)]);
+
+# EXTRACTING C_CITY, S_CITY & D_YEAR
+d_year = round(key %% d_year_scale_f);
+c_city = round(floor(key / (s_city_scale_f * d_year_scale_f)));
+s_city = round((floor(key / d_year_scale_f)) %% s_city_scale_f);
+
+result = cbind(c_city, s_city, d_year, revenue);
+
+
+# -- SORTING --
+# PRIORITY 1 D_YEAR (ASC), 2 REVENUE (DESC)
+result_ordered = order(target=result, by=4, decreasing=TRUE, index.return=FALSE);
+result_ordered = order(target=result_ordered, by=3, decreasing=FALSE, index.return=FALSE);
+
+
+# -- OUTPUT RESULTS --
+print("Q3.4 Results:");
+print("# FRAME: nrow = " + nrow(result_ordered) + ", ncol = 4");
+print("# C1 C2 C3 C4");
+print("# STRING STRING INT32 INT32");
+
+for (i in 1:nrow(result_ordered)) {
+ c_city_code = as.scalar(result_ordered[i, 1]);
+ s_city_code = as.scalar(result_ordered[i, 2]);
+ year_val = as.scalar(result_ordered[i, 3]);
+ revenue_val = as.scalar(result_ordered[i, 4]);
+
+ # Map back to original city names based on the encoding used
+ if (c_city_code == 1) {
+ c_city_name = "UNITED KI1";
+ } else {
+ c_city_name = "UNITED KI5";
+ }
+
+ if (s_city_code == 1) {
+ s_city_name = "UNITED KI1";
+ } else {
+ s_city_name = "UNITED KI5";
+ }
+
+ print(c_city_name + " " + s_city_name + " " + as.integer(year_val) + " " + as.integer(revenue_val));
+}
+
+# Calculate total revenue for validation
+total_revenue = sum(result_ordered[, 4]);
+print("");
+print("Total number of result rows: " + nrow(result_ordered));
+print("Total revenue: " + as.integer(total_revenue));
+print("Q3.4 finished");
+}
diff --git a/scripts/staging/ssb/queries/q4_1.dml b/scripts/staging/ssb/queries/q4_1.dml
new file mode 100644
index 00000000000..191cbc9db90
--- /dev/null
+++ b/scripts/staging/ssb/queries/q4_1.dml
@@ -0,0 +1,264 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+/* DML-script implementing the ssb query Q4.1 in SystemDS with Dynamic Encoding.
+SELECT
+ d_year,
+ c_nation,
+ SUM(lo_revenue - lo_supplycost) AS PROFIT
+FROM dates, customer, supplier, part, lineorder
+WHERE
+ lo_custkey = c_custkey
+ AND lo_suppkey = s_suppkey
+ AND lo_partkey = p_partkey
+ AND lo_orderdate = d_datekey
+ AND c_region = 'AMERICA'
+ AND s_region = 'AMERICA'
+ AND (
+ p_mfgr = 'MFGR#1'
+ OR p_mfgr = 'MFGR#2'
+ )
+GROUP BY d_year, c_nation
+ORDER BY d_year, c_nation;
+*/
+
+# Input parameter
+input_dir = $input_dir;
+
+# -- SOURCING THE RA-FUNCTIONS --
+source("./scripts/builtin/raSelection.dml") as raSel
+source("./scripts/builtin/raJoin.dml") as raJoin
+source("./scripts/builtin/raGroupby.dml") as raGrp
+
+
+# -- READING INPUT FILES --
+# CSV TABLES
+date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+part_csv = read(input_dir + "/part.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+customer_csv = read(input_dir + "/customer.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+
+
+# -- MANUAL FILTERING AND DATA PREPARATION --
+# Extract minimal data needed for the query
+date_matrix_min = as.matrix(cbind(date_csv[, 1], date_csv[, 5]));
+lineorder_matrix_min = as.matrix(cbind(lineorder_csv[, 3], lineorder_csv[, 4], lineorder_csv[, 5],
+ lineorder_csv[, 6], lineorder_csv[, 13], lineorder_csv[, 14]));
+
+# Build filtered parts list (MFGR#1 and MFGR#2)
+part_filtered_keys = matrix(0, rows=0, cols=1);
+
+for(i in 1:nrow(part_csv)) {
+ mfgr_val = as.scalar(part_csv[i, 3]);
+ if(mfgr_val == "MFGR#1" | mfgr_val == "MFGR#2") {
+ # Extract key and create single-element matrix
+ key_val = as.double(as.scalar(part_csv[i, 1]));
+ key_matrix = matrix(key_val, rows=1, cols=1);
+
+ # Append to filtered results
+ part_filtered_keys = rbind(part_filtered_keys, key_matrix);
+ }
+}
+part_count = nrow(part_filtered_keys);
+if(part_count == 0) {
+ part_filtered_keys = matrix(0, rows=1, cols=1); # Fallback for empty case
+}
+
+# Build filtered customers list (AMERICA region) with dynamic encoding
+cust_filtered_keys = matrix(0, rows=0, cols=1);
+cust_filtered_nations = matrix(0, rows=0, cols=1);
+
+for(i in 1:nrow(customer_csv)) {
+ region_val = as.scalar(customer_csv[i, 6]);
+ if(region_val == "AMERICA") {
+ # Extract key and create single-element matrix
+ key_val = as.double(as.scalar(customer_csv[i, 1]));
+ key_matrix = matrix(key_val, rows=1, cols=1);
+
+ # Extract nation and encode
+ nation_str = as.scalar(customer_csv[i, 5]);
+ if(nation_str == "ARGENTINA") {
+ nation_val = 3;
+ } else if(nation_str == "CANADA") {
+ nation_val = 5;
+ } else if(nation_str == "PERU") {
+ nation_val = 8;
+ } else if(nation_str == "BRAZIL") {
+ nation_val = 13;
+ } else if(nation_str == "UNITED STATES") {
+ nation_val = 25;
+ } else {
+ nation_val = 0; # Unknown nation
+ }
+ nation_matrix = matrix(nation_val, rows=1, cols=1);
+
+ # Append to filtered results
+ cust_filtered_keys = rbind(cust_filtered_keys, key_matrix);
+ cust_filtered_nations = rbind(cust_filtered_nations, nation_matrix);
+ }
+}
+
+cust_count = nrow(cust_filtered_keys);
+if(cust_count > 0) {
+ # Create customer matrix from filtered data
+ cust_filtered_data = cbind(cust_filtered_keys, cust_filtered_nations);
+} else {
+ cust_filtered_data = matrix(0, rows=1, cols=2); # Fallback for empty case
+}
+
+# Build filtered suppliers list (AMERICA region)
+supp_filtered_keys = matrix(0, rows=0, cols=1);
+
+for(i in 1:nrow(supplier_csv)) {
+ region_val = as.scalar(supplier_csv[i, 6]);
+ if(region_val == "AMERICA") {
+ # Extract key and create single-element matrix
+ key_val = as.double(as.scalar(supplier_csv[i, 1]));
+ key_matrix = matrix(key_val, rows=1, cols=1);
+
+ # Append to filtered results
+ supp_filtered_keys = rbind(supp_filtered_keys, key_matrix);
+ }
+}
+supp_count = nrow(supp_filtered_keys);
+if(supp_count == 0) {
+ supp_filtered_keys = matrix(0, rows=1, cols=1); # Fallback for empty case
+}
+
+# Ensure filtered matrices are properly formatted
+if(cust_count > 0) {
+ cust_matrix_formatted = cust_filtered_data; # Use the already created matrix
+} else {
+ cust_matrix_formatted = matrix(0, rows=1, cols=2);
+}
+
+if(supp_count > 0) {
+ supp_matrix_formatted = supp_filtered_keys; # Use the already created matrix
+} else {
+ supp_matrix_formatted = matrix(0, rows=1, cols=1);
+}
+
+if(part_count > 0) {
+ part_matrix_formatted = part_filtered_keys; # Use the already created matrix
+} else {
+ part_matrix_formatted = matrix(0, rows=1, cols=1);
+}
+
+# -- JOIN TABLES WITH RA-JOIN FUNCTION (SORT-MERGE METHOD) --
+# Remove any potential zero values from customer matrix
+valid_cust_mask = (cust_matrix_formatted[, 1] > 0);
+if(sum(valid_cust_mask) > 0) {
+ cust_clean = removeEmpty(target=cust_matrix_formatted, margin="rows", select=valid_cust_mask);
+} else {
+ stop("No valid customer data");
+}
+
+# Join lineorder with filtered customer table (lo_custkey = c_custkey)
+lo_cust = raJoin::m_raJoin(A=lineorder_matrix_min, colA=1, B=cust_clean, colB=1, method="sort-merge");
+
+# Join with filtered supplier table (lo_suppkey = s_suppkey)
+lo_cust_sup = raJoin::m_raJoin(A=lo_cust, colA=3, B=supp_matrix_formatted, colB=1, method="sort-merge");
+
+# Join with filtered part table (lo_partkey = p_partkey)
+lo_cust_sup_part = raJoin::m_raJoin(A=lo_cust_sup, colA=2, B=part_matrix_formatted, colB=1, method="sort-merge");
+
+# Join with date table (lo_orderdate = d_datekey)
+joined_matrix = raJoin::m_raJoin(A=lo_cust_sup_part, colA=4, B=date_matrix_min, colB=1, method="sort-merge");
+# -- GROUP-BY & AGGREGATION --
+lo_revenue = joined_matrix[, 5];
+lo_supplycost = joined_matrix[, 6];
+d_year = joined_matrix[, ncol(joined_matrix)]; # last column (d_year)
+c_nation = joined_matrix[, 8]; # customer nation column
+
+profit = lo_revenue - lo_supplycost;
+
+# Create nation mapping for grouping
+unique_nations = unique(c_nation);
+nation_encoding = matrix(0, rows=nrow(unique_nations), cols=1);
+for(i in 1:nrow(unique_nations)) {
+ nation_encoding[i, 1] = i;
+}
+
+# Encode nations to numbers for grouping
+c_nation_encoded = matrix(0, rows=nrow(c_nation), cols=1);
+for(i in 1:nrow(c_nation)) {
+ for(j in 1:nrow(unique_nations)) {
+ if(as.scalar(c_nation[i, 1]) == as.scalar(unique_nations[j, 1])) {
+ c_nation_encoded[i, 1] = j;
+ }
+ }
+}
+
+# Create combined grouping key
+max_nation = max(c_nation_encoded);
+max_year = max(d_year);
+
+nation_scale = ceil(max_nation) + 1;
+year_scale = ceil(max_year) + 1;
+
+combined_key = c_nation_encoded * year_scale + d_year;
+
+# Group and aggregate
+group_input = cbind(profit, combined_key);
+agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop");
+
+# Extract results
+key = agg_result[, 1];
+profit_sum = rowSums(agg_result[, 2:ncol(agg_result)]);
+
+# Decode results
+d_year_result = round(key %% year_scale);
+c_nation_encoded_result = round(floor(key / year_scale));
+
+# Prepare for sorting
+result = cbind(d_year_result, c_nation_encoded_result, profit_sum);
+
+# Sort by year, then by nation
+result_ordered = order(target=result, by=2, decreasing=FALSE, index.return=FALSE);
+result_ordered = order(target=result_ordered, by=1, decreasing=FALSE, index.return=FALSE);
+
+# Create nation name lookup based on encoding
+nation_lookup = matrix(0, rows=nrow(result_ordered), cols=1);
+for(i in 1:nrow(result_ordered)) {
+  nation_idx = as.integer(as.scalar(result_ordered[i, 2]));
+  # result_ordered[, 2] holds the group index into unique_nations (1..n), so map it
+  # back to the original nation code before applying the lookup.
+  nation_code = as.scalar(unique_nations[nation_idx, 1]);
+  if(nation_code == 3) {
+    nation_lookup[i, 1] = 1; # ARGENTINA
+  } else if(nation_code == 5) {
+    nation_lookup[i, 1] = 2; # CANADA
+  } else if(nation_code == 8) {
+    nation_lookup[i, 1] = 3; # PERU
+  } else if(nation_code == 13) {
+    nation_lookup[i, 1] = 4; # BRAZIL
+  } else if(nation_code == 25) {
+    nation_lookup[i, 1] = 5; # UNITED STATES
+  } else {
+    nation_lookup[i, 1] = 0; # UNKNOWN
+  }
+}
+
+# Create final result with proper data types
+year_frame = as.frame(result_ordered[, 1]);
+profit_frame = as.frame(result_ordered[, 3]);
+
+# Output final results (Year, Nation_Code, Profit)
+print(result_ordered);
\ No newline at end of file
diff --git a/scripts/staging/ssb/queries/q4_2.dml b/scripts/staging/ssb/queries/q4_2.dml
new file mode 100644
index 00000000000..cef79f5f344
--- /dev/null
+++ b/scripts/staging/ssb/queries/q4_2.dml
@@ -0,0 +1,235 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+/* DML-script implementing the ssb query Q4.2 in SystemDS with on-the-fly encoding (no external meta files).
+SELECT
+ d_year,
+ s_nation,
+ p_category,
+ SUM(lo_revenue - lo_supplycost) AS PROFIT
+FROM dates, customer, supplier, part, lineorder
+WHERE
+ lo_custkey = c_custkey
+ AND lo_suppkey = s_suppkey
+ AND lo_partkey = p_partkey
+ AND lo_orderdate = d_datekey
+ AND c_region = 'AMERICA'
+ AND s_region = 'AMERICA'
+ AND (
+ d_year = 1997
+ OR d_year = 1998
+ )
+ AND (
+ p_mfgr = 'MFGR#1'
+ OR p_mfgr = 'MFGR#2'
+ )
+GROUP BY d_year, s_nation, p_category
+ORDER BY d_year, s_nation, p_category;
+*/
+
+# -- SOURCING THE RA-FUNCTIONS --
+source("./scripts/builtin/raSelection.dml") as raSel
+source("./scripts/builtin/raJoin.dml") as raJoin
+source("./scripts/builtin/raGroupby.dml") as raGrp
+
+## Input parameter
+input_dir = $input_dir;
+
+# -- READING INPUT FILES --
+# CSV TABLES
+date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+part_csv = read(input_dir + "/part.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+customer_csv = read(input_dir + "/customer.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+
+
+# -- PREPARING --
+# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : D_YEAR
+date_csv_min = cbind(date_csv[, 1], date_csv[, 5]);
+date_matrix_min = as.matrix(date_csv_min);
+
+# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-3 : LO_CUSTKEY | COL-4 : LO_PARTKEY |
+# COL-5 : LO_SUPPKEY | COL-6 : LO_ORDERDATE | COL-13 : LO_REVENUE | COL-14 : LO_SUPPLYCOST
+lineorder_csv_min = cbind(lineorder_csv[, 3], lineorder_csv[, 4], lineorder_csv[, 5], lineorder_csv[, 6], lineorder_csv[, 13], lineorder_csv[, 14]);
+lineorder_matrix_min = as.matrix(lineorder_csv_min);
+
+## PART on-the-fly encoding: encode p_category (col 4); filter by p_mfgr (col 3)
+[part_cat_enc_f, part_cat_meta] = transformencode(target=part_csv[,4], spec="{ \"ids\": false, \"recode\": [\"C1\"] }");
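+# transformencode with a recode spec maps each distinct string in the column to an
+# integer code (1..N, assignment order not guaranteed) and returns the code matrix plus
+# a meta frame; transformdecode with the same meta recovers the original strings later.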
+
+## CUSTOMER filter: keep only c_region == 'AMERICA'; we only need c_custkey
+cust_filt_keys = matrix(0, rows=0, cols=1);
+for (i in 1:nrow(customer_csv)) {
+ if (as.scalar(customer_csv[i,6]) == "AMERICA") {
+ key_val = as.double(as.scalar(customer_csv[i,1]));
+ cust_filt_keys = rbind(cust_filt_keys, matrix(key_val, rows=1, cols=1));
+ }
+}
+if (nrow(cust_filt_keys) == 0) { cust_filt_keys = matrix(0, rows=1, cols=1); }
+
+## SUPPLIER on-the-fly encoding: encode s_nation (col 5); filter by s_region (col 6)
+[sup_nat_enc_f, sup_nat_meta] = transformencode(target=supplier_csv[,5], spec="{ \"ids\": false, \"recode\": [\"C1\"] }");
+sup_filt_keys = matrix(0, rows=0, cols=1);
+sup_filt_nat = matrix(0, rows=0, cols=1);
+for (i in 1:nrow(supplier_csv)) {
+ if (as.scalar(supplier_csv[i,6]) == "AMERICA") {
+ key_val = as.double(as.scalar(supplier_csv[i,1]));
+ nat_code = as.double(as.scalar(sup_nat_enc_f[i,1]));
+ sup_filt_keys = rbind(sup_filt_keys, matrix(key_val, rows=1, cols=1));
+ sup_filt_nat = rbind(sup_filt_nat, matrix(nat_code, rows=1, cols=1));
+ }
+}
+if (nrow(sup_filt_keys) == 0) { sup_filt_keys = matrix(0, rows=1, cols=1); sup_filt_nat = matrix(0, rows=1, cols=1); }
+sup_filt = cbind(sup_filt_keys, sup_filt_nat);
+
+
+## -- FILTERING THE DATA --
+# P_MFGR = 'MFGR#1' OR 'MFGR#2' -> build filtered part table keeping key and encoded category
+part_filt_keys = matrix(0, rows=0, cols=1);
+part_filt_cat = matrix(0, rows=0, cols=1);
+for (i in 1:nrow(part_csv)) {
+ mfgr_val = as.scalar(part_csv[i,3]);
+ if (mfgr_val == "MFGR#1" | mfgr_val == "MFGR#2") {
+ key_val = as.double(as.scalar(part_csv[i,1]));
+ cat_code = as.double(as.scalar(part_cat_enc_f[i,1]));
+ part_filt_keys = rbind(part_filt_keys, matrix(key_val, rows=1, cols=1));
+ part_filt_cat = rbind(part_filt_cat, matrix(cat_code, rows=1, cols=1));
+ }
+}
+if (nrow(part_filt_keys) == 0) { part_filt_keys = matrix(0, rows=1, cols=1); part_filt_cat = matrix(0, rows=1, cols=1); }
+part_filt = cbind(part_filt_keys, part_filt_cat);
+
+## D_YEAR = 1997 OR 1998
+d_year_filt_1 = raSel::m_raSelection(date_matrix_min, col=2, op="==", val=1997);
+d_year_filt_2 = raSel::m_raSelection(date_matrix_min, col=2, op="==", val=1998);
+d_year_filt = rbind(d_year_filt_1, d_year_filt_2);
+
+
+# -- JOIN TABLES WITH RA-JOIN FUNCTION --
+# JOINING MINIMIZED LINEORDER TABLE WITH FILTERED CUSTOMER TABLE WHERE LO_CUSTKEY = C_CUSTKEY
+lo_cust = raJoin::m_raJoin(A=lineorder_matrix_min, colA=1, B=cust_filt_keys, colB=1, method="sort-merge");
+
+# JOIN: ⨝ SUPPLIER WHERE LO_SUPPKEY = S_SUPPKEY (carry s_nation code)
+lo_cust_sup = raJoin::m_raJoin(A=lo_cust, colA=3, B=sup_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ PART WHERE LO_PARTKEY = P_PARTKEY (carry p_category code)
+lo_cust_sup_part = raJoin::m_raJoin(A=lo_cust_sup, colA=2, B=part_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ DATE WHERE LO_ORDERDATE = D_DATEKEY
+joined_matrix = raJoin::m_raJoin(A=lo_cust_sup_part, colA=4, B=d_year_filt, colB=1, method="sort-merge");
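+# Resulting column layout (assuming raJoin appends B's columns after A's, which the
+# index arithmetic below relies on):
+#   [1-6] lineorder_min | [7] c_custkey | [8-9] s_suppkey, s_nation code |
+#   [10-11] p_partkey, p_category code | [12-13] d_datekey, d_year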
+
+
+# -- GROUP-BY & AGGREGATION --
+# LO_REVENUE : COLUMN 5 OF LINEORDER-MIN-MATRIX
+lo_revenue = joined_matrix[, 5];
+# LO_SUPPLYCOST : COLUMN 6 OF LINEORDER-MIN-MATRIX
+lo_supplycost = joined_matrix[, 6];
+# D_YEAR : COLUMN 2 OF DATE-MIN-MATRIX (last added 2nd col)
+d_year = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_filt_keys) + ncol(sup_filt) + ncol(part_filt) + 2)];
+# S_NATION (encoded) : COLUMN 2 OF SUPPLIER-FILTERED MATRIX
+s_nation = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_filt_keys) + 2)];
+# P_CATEGORY (encoded) : COLUMN 2 OF PART-FILTERED MATRIX
+p_category = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(cust_filt_keys) + ncol(sup_filt) + 2)];
+
+profit = lo_revenue - lo_supplycost;
+
+# CALCULATING COMBINATION KEY WITH PRIORITY: D_YEAR, S_NATION, P_CATEGORY (internal codes for grouping)
+max_s_nation_grp = max(s_nation);
+max_p_category_grp = max(p_category);
+max_d_year_grp = max(d_year);
+
+s_nation_scale_grp = ceil(max_s_nation_grp) + 1;
+p_category_scale_grp = ceil(max_p_category_grp) + 1;
+d_year_scale_grp = ceil(max_d_year_grp) + 1;
+
+combined_key_grp = d_year * s_nation_scale_grp * p_category_scale_grp + s_nation * p_category_scale_grp + p_category;
+
+group_input = cbind(profit, combined_key_grp);
+agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop");
+
+key_grp = agg_result[, 1];
+profit_sum = rowSums(agg_result[, 2:ncol(agg_result)]);
+
+# EXTRACTING D_YEAR, S_NATION, P_CATEGORY (internal codes)
+d_year_grp = round(floor(key_grp / (s_nation_scale_grp * p_category_scale_grp)));
+s_nation_grp = round(floor((key_grp %% (s_nation_scale_grp * p_category_scale_grp)) / p_category_scale_grp));
+p_category_grp = round(key_grp %% p_category_scale_grp);
+
+# Decode specs for later
+sup_dec_spec = "{ \"recode\": [\"C1\"] }";
+part_dec_spec = "{ \"recode\": [\"C1\"] }";
+
+# Decode categories for display-code mapping (unordered)
+p_cat_dec_all = transformdecode(target=p_category_grp, spec=part_dec_spec, meta=part_cat_meta);
+
+# Build display codes to match legacy meta mapping for p_category
+p_category_disp = matrix(0, rows=nrow(p_cat_dec_all), cols=1);
+for (i in 1:nrow(p_cat_dec_all)) {
+ cat_str = as.scalar(p_cat_dec_all[i,1]);
+ if (cat_str == "MFGR#11") p_category_disp[i,1] = 1;
+ else if (cat_str == "MFGR#12") p_category_disp[i,1] = 2;
+ else if (cat_str == "MFGR#13") p_category_disp[i,1] = 6;
+ else if (cat_str == "MFGR#15") p_category_disp[i,1] = 20;
+ else if (cat_str == "MFGR#21") p_category_disp[i,1] = 14;
+ else if (cat_str == "MFGR#22") p_category_disp[i,1] = 10;
+ else if (cat_str == "MFGR#23") p_category_disp[i,1] = 25;
+ else if (cat_str == "MFGR#24") p_category_disp[i,1] = 24;
+ else if (cat_str == "MFGR#25") p_category_disp[i,1] = 5;
+ else p_category_disp[i,1] = as.double(0);
+}
+
+# s_nation codes already align with legacy mapping; reuse as display codes
+s_nation_disp = s_nation_grp;
+
+# Compute display key using display codes
+s_nation_scale_disp = ceil(max(s_nation_disp)) + 1;
+p_category_scale_disp = ceil(max(p_category_disp)) + 1;
+d_year_scale_disp = ceil(max(d_year_grp)) + 1;
+
+key_disp = d_year_grp * s_nation_scale_disp * p_category_scale_disp + s_nation_disp * p_category_scale_disp + p_category_disp;
+
+# Compose display result and sort by display key to match legacy order
+result_disp = cbind(d_year_grp, s_nation_disp, p_category_disp, profit_sum, key_disp);
+idx_order = order(target=result_disp, by=5, decreasing=FALSE, index.return=TRUE);
+result_ordered_disp = order(target=result_disp, by=5, decreasing=FALSE, index.return=FALSE);
+print(result_ordered_disp);
+
+# Build permutation matrix to reorder matrices by idx_order
+n_rows = nrow(result_disp);
+Iseq = seq(1, n_rows, 1);
+P = table(Iseq, idx_order, n_rows, n_rows);
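+# P is a row-permutation matrix: row i has a single 1 in column idx_order[i], so
+# (P %*% x)[i] = x[idx_order[i]]. E.g. idx_order = (3,1,2) turns (a,b,c) into (c,a,b).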
+
+# Reorder grouped codes and measures using permutation
+d_year_ord = P %*% d_year_grp;
+s_nation_ord = P %*% s_nation_grp;
+p_category_ord = P %*% p_category_grp;
+profit_sum_ord = P %*% profit_sum;
+
+# Decode internal codes in the same display order
+s_nat_dec_ord = transformdecode(target=s_nation_ord, spec=sup_dec_spec, meta=sup_nat_meta);
+p_cat_dec_ord = transformdecode(target=p_category_ord, spec=part_dec_spec, meta=part_cat_meta);
+
+# Final decoded frame (aligned to display order)
+res = cbind(as.frame(d_year_ord), s_nat_dec_ord, p_cat_dec_ord, as.frame(profit_sum_ord));
+print(res);
+
diff --git a/scripts/staging/ssb/queries/q4_3.dml b/scripts/staging/ssb/queries/q4_3.dml
new file mode 100644
index 00000000000..554aae99e36
--- /dev/null
+++ b/scripts/staging/ssb/queries/q4_3.dml
@@ -0,0 +1,195 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+/* DML-script implementing the ssb query Q4.3 in SystemDS with on-the-fly encoding (no external meta files).
+SELECT
+ d_year,
+ s_city,
+ p_brand,
+ SUM(lo_revenue - lo_supplycost) AS PROFIT
+FROM dates, customer, supplier, part, lineorder
+WHERE
+ lo_custkey = c_custkey
+ AND lo_suppkey = s_suppkey
+ AND lo_partkey = p_partkey
+ AND lo_orderdate = d_datekey
+ AND s_nation = 'UNITED STATES'
+ AND (
+ d_year = 1997
+ OR d_year = 1998
+ )
+ AND p_category = 'MFGR#14'
+GROUP BY d_year, s_city, p_brand
+ORDER BY d_year, s_city, p_brand;
+*/
+
+# -- SOURCING THE RA-FUNCTIONS --
+source("./scripts/builtin/raSelection.dml") as raSel
+source("./scripts/builtin/raJoin.dml") as raJoin
+source("./scripts/builtin/raGroupby.dml") as raGrp
+
+## Input parameter
+input_dir = $input_dir;
+
+# -- READING INPUT FILES --
+# CSV TABLES
+date_csv = read(input_dir + "/date.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+lineorder_csv = read(input_dir + "/lineorder.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+part_csv = read(input_dir + "/part.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+supplier_csv = read(input_dir + "/supplier.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+customer_csv = read(input_dir + "/customer.tbl", data_type="frame", format="csv", header=FALSE, sep="|");
+
+
+# -- PREPARING --
+# EXTRACTING MINIMAL DATE DATA TO OPTIMIZE RUNTIME => COL-1 : DATE-KEY | COL-5 : D_YEAR
+date_csv_min = cbind(date_csv[, 1], date_csv[, 5]);
+date_matrix_min = as.matrix(date_csv_min);
+
+# EXTRACTING MINIMAL LINEORDER DATA TO OPTIMIZE RUNTIME => COL-3 : LO_CUSTKEY | COL-4 : LO_PARTKEY |
+# COL-5 : LO_SUPPKEY | COL-6 : LO_ORDERDATE | COL-13 : LO_REVENUE | COL-14 : LO_SUPPLYCOST
+lineorder_csv_min = cbind(lineorder_csv[, 3], lineorder_csv[, 4], lineorder_csv[, 5], lineorder_csv[, 6], lineorder_csv[, 13], lineorder_csv[, 14]);
+lineorder_matrix_min = as.matrix(lineorder_csv_min);
+
+## Prepare PART on-the-fly encodings (only need p_brand encoding, filter by p_category string)
+# We'll encode column 5 (p_brand) on-the-fly and later filter by category string 'MFGR#14'.
+[part_brand_enc_f, part_brand_meta] = transformencode(target=part_csv[,5], spec="{ \"ids\": false, \"recode\": [\"C1\"] }");
+
+# EXTRACTING MINIMAL CUSTOMER DATA TO OPTIMIZE RUNTIME => COL-1 : CUSTOMER-KEY
+cust_csv_min = customer_csv[, 1];
+cust_matrix_min = as.matrix(cust_csv_min);
+
+## Prepare SUPPLIER on-the-fly encodings (encode s_city, filter by s_nation string)
+[sup_city_enc_f, sup_city_meta] = transformencode(target=supplier_csv[,4], spec="{ \"ids\": false, \"recode\": [\"C1\"] }");
+
+
+## -- FILTERING THE DATA WITH RA-SELECTION FUNCTION / LOOPS --
+# D_YEAR = 1997 OR 1998
+d_year_filt_1 = raSel::m_raSelection(date_matrix_min, col=2, op="==", val=1997);
+d_year_filt_2 = raSel::m_raSelection(date_matrix_min, col=2, op="==", val=1998);
+d_year_filt = rbind(d_year_filt_1, d_year_filt_2);
+
+# Build filtered SUPPLIER table (s_nation == 'UNITED STATES'), keeping key and encoded city
+sup_filt_keys = matrix(0, rows=0, cols=1);
+sup_filt_city = matrix(0, rows=0, cols=1);
+for (i in 1:nrow(supplier_csv)) {
+ if (as.scalar(supplier_csv[i,5]) == "UNITED STATES") {
+ key_val = as.double(as.scalar(supplier_csv[i,1]));
+ city_code = as.double(as.scalar(sup_city_enc_f[i,1]));
+ sup_filt_keys = rbind(sup_filt_keys, matrix(key_val, rows=1, cols=1));
+ sup_filt_city = rbind(sup_filt_city, matrix(city_code, rows=1, cols=1));
+ }
+}
+if (nrow(sup_filt_keys) == 0) {
+ # Fallback to avoid empty join
+ sup_filt_keys = matrix(0, rows=1, cols=1);
+ sup_filt_city = matrix(0, rows=1, cols=1);
+}
+sup_filt = cbind(sup_filt_keys, sup_filt_city);
+
+# Build filtered PART table (p_category == 'MFGR#14'), keeping key and encoded brand
+part_filt_keys = matrix(0, rows=0, cols=1);
+part_filt_brand = matrix(0, rows=0, cols=1);
+for (i in 1:nrow(part_csv)) {
+ if (as.scalar(part_csv[i,4]) == "MFGR#14") {
+ key_val = as.double(as.scalar(part_csv[i,1]));
+ brand_code = as.double(as.scalar(part_brand_enc_f[i,1]));
+ part_filt_keys = rbind(part_filt_keys, matrix(key_val, rows=1, cols=1));
+ part_filt_brand = rbind(part_filt_brand, matrix(brand_code, rows=1, cols=1));
+ }
+}
+if (nrow(part_filt_keys) == 0) {
+ part_filt_keys = matrix(0, rows=1, cols=1);
+ part_filt_brand = matrix(0, rows=1, cols=1);
+}
+part_filt = cbind(part_filt_keys, part_filt_brand);
+
+
+# -- JOIN TABLES WITH RA-JOIN FUNCTION --
+# JOINING MINIMIZED LINEORDER TABLE WITH FILTERED SUPPLIER TABLE WHERE LO_SUPPKEY = S_SUPPKEY
+lo_sup = raJoin::m_raJoin(A=lineorder_matrix_min, colA=3, B=sup_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ PART WHERE LO_PARTKEY = P_PARTKEY
+lo_sup_part = raJoin::m_raJoin(A=lo_sup, colA=2, B=part_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ DATE WHERE LO_ORDERDATE = D_DATEKEY
+lo_sup_part_date = raJoin::m_raJoin(A=lo_sup_part, colA=4, B=d_year_filt, colB=1, method="sort-merge");
+
+# JOIN: ⨝ CUSTOMER WHERE LO_CUSTKEY = C_CUSTKEY (no filter used, but keep join for parity)
+joined_matrix = raJoin::m_raJoin(A=lo_sup_part_date, colA=1, B=cust_matrix_min, colB=1, method="sort-merge");
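+# Resulting column layout (assuming raJoin appends B's columns after A's):
+#   [1-6] lineorder_min | [7-8] s_suppkey, s_city code | [9-10] p_partkey, p_brand code |
+#   [11-12] d_datekey, d_year | [13] c_custkey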
+
+
+# -- GROUP-BY & AGGREGATION --
+# LO_REVENUE : COLUMN 5 OF LINEORDER-MIN-MATRIX
+lo_revenue = joined_matrix[, 5];
+# LO_SUPPLYCOST : COLUMN 6 OF LINEORDER-MIN-MATRIX
+lo_supplycost = joined_matrix[, 6];
+# D_YEAR : last column added in the previous join with date (2nd col of date_min)
+d_year = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(sup_filt) + ncol(part_filt) + 2)];
+# S_CITY (encoded) : COLUMN 2 OF SUPPLIER-FILTERED MATRIX
+s_city = joined_matrix[,(ncol(lineorder_matrix_min) + 2)];
+# P_BRAND (encoded) : COLUMN 2 OF PART-FILTERED MATRIX
+p_brand = joined_matrix[,(ncol(lineorder_matrix_min) + ncol(sup_filt) + 2)];
+
+profit = lo_revenue - lo_supplycost;
+
+# CALCULATING COMBINATION KEY WITH PRIORITY: D_YEAR, S_CITY, P_BRAND
+max_s_city = max(s_city);
+max_p_brand = max(p_brand);
+max_d_year = max(d_year);
+
+s_city_scale_f = ceil(max_s_city) + 1;
+p_brand_scale_f = ceil(max_p_brand) + 1;
+d_year_scale_f = ceil(max_d_year) + 1;
+
+combined_key = d_year * s_city_scale_f * p_brand_scale_f + s_city * p_brand_scale_f + p_brand;
+
+group_input = cbind(profit, combined_key);
+agg_result = raGrp::m_raGroupby(X=group_input, col=2, method="nested-loop");
+
+key = agg_result[, 1];
+profit = rowSums(agg_result[, 2:ncol(agg_result)]);
+
+# EXTRACTING D_YEAR, S_CITY, P_BRAND
+d_year = round(floor(key / (s_city_scale_f * p_brand_scale_f)));
+s_city = round(floor((key %% (s_city_scale_f * p_brand_scale_f)) / p_brand_scale_f));
+p_brand = round(key %% p_brand_scale_f);
+
+result = cbind(d_year, s_city, p_brand, profit, key);
+
+# -- SORTING --
+# PRIORITY 1 D_YEAR, 2 S_CITY, 3 P_BRAND
+result_ordered = order(target=result, by=5, decreasing=FALSE, index.return=FALSE);
+print(result_ordered);
+
+# -- DECODING S_CITY & P_BRAND (using on-the-fly meta from transformencode) --
+sup_dec_spec = "{ \"recode\": [\"C1\"] }";
+part_dec_spec = "{ \"recode\": [\"C1\"] }";
+
+s_city_dec = transformdecode(target=result_ordered[, 2], spec=sup_dec_spec, meta=sup_city_meta);
+p_brand_dec = transformdecode(target=result_ordered[, 3], spec=part_dec_spec, meta=part_brand_meta);
+
+res = cbind(as.frame(result_ordered[, 1]), s_city_dec, p_brand_dec, as.frame(result_ordered[, 4]));
+
+print(res);
diff --git a/scripts/staging/ssb/shell/run_all_perf.sh b/scripts/staging/ssb/shell/run_all_perf.sh
new file mode 100644
index 00000000000..71f6810681e
--- /dev/null
+++ b/scripts/staging/ssb/shell/run_all_perf.sh
@@ -0,0 +1,1530 @@
+#!/usr/bin/env bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+# Multi-Engine SSB Performance Benchmark Runner
+# =============================================
+#
+# CORE SCRIPTS STATUS:
+# - Version: 1.0 (September 5, 2025)
+# - Status: Production-Ready with Advanced Statistical Analysis
+#
+# ENHANCED FEATURES IMPLEMENTED:
+# ✓ Multi-engine benchmarking (SystemDS, PostgreSQL, DuckDB)
+# ✓ Advanced statistical analysis (mean, stdev, p95, CV) with high-precision calculations
+# ✓ Single-pass timing optimization eliminating cache effects between measurements
+# ✓ Cross-engine core timing support (SystemDS stats, PostgreSQL EXPLAIN, DuckDB JSON profiling)
+# ✓ Adaptive terminal layout with dynamic column scaling and multi-row statistics display
+# ✓ Comprehensive metadata collection (system info, software versions, data build info)
+# ✓ Environment verification and graceful degradation for missing engines
+# ✓ Real-time progress indicators with proper terminal width handling
+# ✓ Precision timing measurements with millisecond accuracy using /usr/bin/time -p
+# ✓ Robust error handling with pre-flight validation and error propagation
+# ✓ CSV and JSON output with timestamped files and complete statistical data
+# ✓ Fastest engine detection with tie handling
+# ✓ Database connection validation and parallel execution control (disabled for fair comparison)
+# ✓ Cross-platform compatibility (macOS/Linux) with intelligent executable discovery
+# ✓ Reproducible benchmarking with configurable seeds and detailed run configuration
+#
+# RECENT IMPORTANT ADDITIONS:
+# - Accepts --input-dir=PATH and forwards it into SystemDS DML runs via
+# `-nvargs input_dir=/path/to/data`. This allows DML queries to load data from
+# custom locations without hardcoded paths.
+# - Runner performs a pre-flight input-dir existence check and exits early with
+# a clear message when the directory is missing.
+# - Test-run output is scanned for runtime SystemDS errors; when detected the
+# runner marks the query as failed and includes an `error_message` field in
+# the generated JSON results to aid debugging and CI automation.
+#
+# STATISTICAL MEASUREMENTS:
+# - Mean: Arithmetic average execution time (typical performance expectation)
+# - Standard Deviation: Population stdev measuring consistency/reliability
+# - P95 Percentile: 95th percentile for worst-case performance bounds
+# - Coefficient of Variation: Relative variability as percentage for cross-scale comparison
+# - Display Format: "1200.0 (±14.1ms/1.2%, p95:1220.0ms)" showing all key metrics
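+#   Worked example (hypothetical timings): five runs of 1180/1190/1200/1210/1220 ms give
+#   mean 1200.0, population stdev sqrt(200) ~= 14.1, nearest-rank p95 = 1220.0, and
+#   CV = 14.1/1200 ~= 1.2%, i.e. exactly the display string shown above.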
+#
+# ENGINES SUPPORTED:
+# - SystemDS: Machine learning platform with DML queries (single-threaded via XML config)
+# - PostgreSQL: Industry-standard relational database (parallel workers disabled)
+# - DuckDB: High-performance analytical database (single-threaded via PRAGMA)
+#
+# USAGE (from repo root):
+# scripts/ssb/shell/run_all_perf.sh # run full benchmark with all engines
+# scripts/ssb/shell/run_all_perf.sh --stats # enable internal engine timing statistics
+# scripts/ssb/shell/run_all_perf.sh --warmup=3 --repeats=10 # custom warmup and repetition settings
+# scripts/ssb/shell/run_all_perf.sh --layout=wide # force wide table layout
+# scripts/ssb/shell/run_all_perf.sh --seed=12345 # reproducible benchmark with specific seed
+# scripts/ssb/shell/run_all_perf.sh q1.1 q2.3 q4.1 # benchmark specific queries only
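+#   scripts/ssb/shell/run_all_perf.sh --input-dir=/path/to/ssb-data # forward data dir to DML via -nvargs input_dir=...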
+#
+set -euo pipefail
+export LC_ALL=C
+
+REPEATS=5
+WARMUP=1
+POSTGRES_DB="ssb"
+POSTGRES_USER="$(whoami)"
+POSTGRES_HOST="localhost"
+
+export _JAVA_OPTIONS="${_JAVA_OPTIONS:-} -Xms2g -Xmx2g -XX:+UseParallelGC -XX:ParallelGCThreads=1"
+
+# Determine script directory and project root (repo root)
+if command -v realpath >/dev/null 2>&1; then
+ SCRIPT_DIR="$(dirname "$(realpath "$0")")"
+else
+  SCRIPT_DIR="$(python - "$0" <<'PY'
+import os, sys
+print(os.path.dirname(os.path.abspath(sys.argv[1])))
+PY
+)"
+fi
+# Resolve repository root robustly (script may be in scripts/ssb/shell)
+if command -v git >/dev/null 2>&1 && git -C "$SCRIPT_DIR" rev-parse --show-toplevel >/dev/null 2>&1; then
+ PROJECT_ROOT="$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel)"
+else
+ # Fallback: ascend until we find markers (.git or pom.xml)
+ __dir="$SCRIPT_DIR"
+ PROJECT_ROOT=""
+ while [[ "$__dir" != "/" ]]; do
+ if [[ -d "$__dir/.git" || -f "$__dir/pom.xml" ]]; then
+ PROJECT_ROOT="$__dir"; break
+ fi
+ __dir="$(dirname "$__dir")"
+ done
+ : "${PROJECT_ROOT:=$(cd "$SCRIPT_DIR/../../../" && pwd)}"
+fi
+
+# Create single-thread configuration
+CONF_DIR="$PROJECT_ROOT/conf"
+SINGLE_THREAD_CONF="$CONF_DIR/single_thread.xml"
+mkdir -p "$CONF_DIR"
+if [[ ! -f "$SINGLE_THREAD_CONF" ]]; then
+cat > "$SINGLE_THREAD_CONF" <<'XML'
+<root>
+  <!-- disable parallel CP operations and pin SystemDS to a single thread -->
+  <sysds.cp.parallel.ops>false</sysds.cp.parallel.ops>
+  <sysds.num.threads>1</sysds.num.threads>
+</root>
+XML
+fi
+SYS_EXTRA_ARGS=( "-config" "$SINGLE_THREAD_CONF" )
+
+# Query and system directories
+QUERY_DIR="$PROJECT_ROOT/scripts/ssb/queries"
+
+# Locate SystemDS binary
+SYSTEMDS_CMD="$PROJECT_ROOT/bin/systemds"
+if [[ ! -x "$SYSTEMDS_CMD" ]]; then
+ SYSTEMDS_CMD="$(command -v systemds || true)"
+fi
+if [[ -z "$SYSTEMDS_CMD" || ! -x "$SYSTEMDS_CMD" ]]; then
+ echo "SystemDS binary not found." >&2
+ exit 1
+fi
+
+# Database directories and executables
+# SQL files were moved under scripts/ssb/sql
+SQL_DIR="$PROJECT_ROOT/scripts/ssb/sql"
+
+# Try to find PostgreSQL psql executable
+PSQL_EXEC=""
+for path in "/opt/homebrew/opt/libpq/bin/psql" "/usr/local/bin/psql" "/usr/bin/psql" "$(command -v psql || true)"; do
+ if [[ -x "$path" ]]; then
+ PSQL_EXEC="$path"
+ break
+ fi
+done
+
+# Try to find DuckDB executable
+DUCKDB_EXEC=""
+for path in "/opt/homebrew/bin/duckdb" "/usr/local/bin/duckdb" "/usr/bin/duckdb" "$(command -v duckdb || true)"; do
+ if [[ -x "$path" ]]; then
+ DUCKDB_EXEC="$path"
+ break
+ fi
+done
+
+DUCKDB_DB_PATH="$SQL_DIR/ssb.duckdb"
+
+# Environment verification
+verify_environment() {
+ local ok=true
+ echo "Verifying environment..."
+
+ if [[ ! -x "$SYSTEMDS_CMD" ]]; then
+ echo "✗ SystemDS binary missing ($SYSTEMDS_CMD)" >&2
+ ok=false
+ else
+ echo "✓ SystemDS binary found: $SYSTEMDS_CMD"
+ fi
+
+ if [[ -z "$PSQL_EXEC" || ! -x "$PSQL_EXEC" ]]; then
+ echo "✗ psql not found (tried common paths)" >&2
+ echo " PostgreSQL benchmarks will be skipped" >&2
+ PSQL_EXEC=""
+ else
+ echo "✓ psql found: $PSQL_EXEC"
+ if ! "$PSQL_EXEC" -U "$POSTGRES_USER" -h "$POSTGRES_HOST" -d "$POSTGRES_DB" -c "SELECT 1" >/dev/null 2>&1; then
+ echo "✗ Could not connect to PostgreSQL database ($POSTGRES_DB)" >&2
+ echo " PostgreSQL benchmarks will be skipped" >&2
+ PSQL_EXEC=""
+ else
+ echo "✓ PostgreSQL database connection successful"
+ fi
+ fi
+
+ if [[ -z "$DUCKDB_EXEC" || ! -x "$DUCKDB_EXEC" ]]; then
+ echo "✗ DuckDB not found (tried common paths)" >&2
+ echo " DuckDB benchmarks will be skipped" >&2
+ DUCKDB_EXEC=""
+ else
+ echo "✓ DuckDB found: $DUCKDB_EXEC"
+ if [[ ! -f "$DUCKDB_DB_PATH" ]]; then
+ echo "✗ DuckDB database missing ($DUCKDB_DB_PATH)" >&2
+ echo " DuckDB benchmarks will be skipped" >&2
+ DUCKDB_EXEC=""
+ elif ! "$DUCKDB_EXEC" "$DUCKDB_DB_PATH" -c "SELECT 1" >/dev/null 2>&1; then
+ echo "✗ DuckDB database could not be opened" >&2
+ echo " DuckDB benchmarks will be skipped" >&2
+ DUCKDB_EXEC=""
+ else
+ echo "✓ DuckDB database accessible"
+ fi
+ fi
+
+ if [[ ! -x "$SYSTEMDS_CMD" ]]; then
+ echo "Error: SystemDS is required but not found" >&2
+ exit 1
+ fi
+
+ echo ""
+}
+
+# Convert seconds to milliseconds
+sec_to_ms() {
+ awk -v sec="$1" 'BEGIN{printf "%.1f", sec * 1000}'
+}
+
+# Statistical functions for multiple measurements
+calculate_statistics() {
+ local values=("$@")
+ local n=${#values[@]}
+
+ if [[ $n -eq 0 ]]; then
+ echo "0|0|0"
+ return
+ fi
+
+ if [[ $n -eq 1 ]]; then
+ # mean|stdev|p95
+ printf '%.1f|0.0|%.1f\n' "${values[0]}" "${values[0]}"
+ return
+ fi
+
+ # Compute mean and population stdev with higher precision in a single awk pass
+ local mean_stdev
+ mean_stdev=$(printf '%s\n' "${values[@]}" | awk '
+ { x[NR]=$1; s+=$1 }
+ END {
+ n=NR; if(n==0){ printf "0|0"; exit }
+ m=s/n;
+ ss=0; for(i=1;i<=n;i++){ d=x[i]-m; ss+=d*d }
+ stdev=sqrt(ss/n);
+ printf "%.6f|%.6f", m, stdev
+ }')
+
+ local mean=$(echo "$mean_stdev" | cut -d'|' -f1)
+ local stdev=$(echo "$mean_stdev" | cut -d'|' -f2)
+
+ # Calculate p95 (nearest-rank: ceil(0.95*n))
+ local sorted_values=($(printf '%s\n' "${values[@]}" | sort -n))
+ local p95_index=$(awk -v n="$n" 'BEGIN{ idx = int(0.95*n + 0.999999); if(idx<1) idx=1; if(idx>n) idx=n; print idx-1 }')
+ local p95=${sorted_values[$p95_index]}
+
+ # Format to one decimal place
+ printf '%.1f|%.1f|%.1f\n' "$mean" "$stdev" "$p95"
+}
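+# Example (hypothetical values): calculate_statistics 1180 1190 1200 1210 1220
+# prints "1200.0|14.1|1220.0", i.e. mean|stdev|p95 with nearest-rank p95.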
+
+# Format statistics for display
+format_statistics() {
+ local mean="$1"
+ local stdev="$2"
+ local p95="$3"
+ local repeats="$4"
+
+ if [[ $repeats -eq 1 ]]; then
+ echo "$mean"
+ else
+ # Calculate coefficient of variation (CV) as percentage
+ local cv_percent=0
+ if [[ $(awk -v mean="$mean" 'BEGIN{print (mean > 0)}') -eq 1 ]]; then
+ cv_percent=$(awk -v stdev="$stdev" -v mean="$mean" 'BEGIN{printf "%.1f", (stdev * 100) / mean}')
+ fi
+ echo "$mean (±${stdev}ms/${cv_percent}%, p95:${p95}ms)"
+ fi
+}
+
+# Format only the stats line (without the mean), e.g., "(±10.2ms/0.6%, p95:1740.0ms)"
+format_stats_only() {
+ local mean="$1"
+ local stdev="$2"
+ local p95="$3"
+ local repeats="$4"
+
+ if [[ $repeats -eq 1 ]]; then
+ echo ""
+ return
+ fi
+ # Only for numeric means
+ if ! [[ "$mean" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
+ echo ""
+ return
+ fi
+ local cv_percent=0
+ if [[ $(awk -v mean="$mean" 'BEGIN{print (mean > 0)}') -eq 1 ]]; then
+ cv_percent=$(awk -v stdev="$stdev" -v mean="$mean" 'BEGIN{printf "%.1f", (stdev * 100) / mean}')
+ fi
+ echo "(±${stdev}ms/${cv_percent}%, p95:${p95}ms)"
+}
+
+# Format only the CV line (±stdev/CV%)
+format_cv_only() {
+ local mean="$1"; local stdev="$2"; local repeats="$3"
+ if [[ $repeats -eq 1 ]]; then echo ""; return; fi
+ if ! [[ "$mean" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then echo ""; return; fi
+ local cv_percent=0
+ if [[ $(awk -v mean="$mean" 'BEGIN{print (mean > 0)}') -eq 1 ]]; then
+ cv_percent=$(awk -v stdev="$stdev" -v mean="$mean" 'BEGIN{printf "%.1f", (stdev * 100) / mean}')
+ fi
+ echo "±${stdev}ms/${cv_percent}%"
+}
+
+# Format only the p95 line
+format_p95_only() {
+ local p95="$1"; local repeats="$2"
+ if [[ $repeats -eq 1 ]]; then echo ""; return; fi
+ if ! [[ "$p95" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then echo ""; return; fi
+ echo "p95:${p95}ms"
+}
+
+# Column widths for wide layout - optimized for 125-char terminals
+WIDE_COL_WIDTHS=(8 14 14 12 16 12 12 18)
+
+# Draw a grid line like +----------+----------------+...
+grid_line_wide() {
+ local parts=("+")
+ for w in "${WIDE_COL_WIDTHS[@]}"; do
+ parts+=("$(printf '%*s' "$((w+2))" '' | tr ' ' '-')+")
+ done
+ printf '%s\n' "${parts[*]}" | tr -d ' '
+}
+
+# Print a grid row with vertical separators using the wide layout widths
+grid_row_wide() {
+ local -a cells=("$@")
+ local cols=${#WIDE_COL_WIDTHS[@]}
+ while [[ ${#cells[@]} -lt $cols ]]; do
+ cells+=("")
+ done
+
+ # Build a printf format string that right-aligns numeric and statistic-like cells
+ # (numbers, lines starting with ± or p95, or containing p95/±) while leaving the
+ # first column (query) left-aligned for readability.
+ local fmt=""
+ for i in $(seq 0 $((cols-1))); do
+ local w=${WIDE_COL_WIDTHS[i]}
+ if [[ $i -eq 0 ]]; then
+ # Query name: left-align
+ fmt+="| %-${w}s"
+ else
+ local cell="${cells[i]}"
+ # Heuristic: right-align if the cell is a plain number or contains statistic markers
+ if [[ "$cell" =~ ^[[:space:]]*[0-9]+(\.[0-9]+)?[[:space:]]*$ ]] || [[ "$cell" == ±* ]] || [[ "$cell" == *'±'* ]] || [[ "$cell" == p95* ]] || [[ "$cell" == *'p95'* ]] || [[ "$cell" == \(* ]]; then
+ fmt+=" | %${w}s"
+ else
+ fmt+=" | %-${w}s"
+ fi
+ fi
+ done
+ fmt+=" |\n"
+
+ printf "$fmt" "${cells[@]}"
+}
+
+# Time a command and return real time in ms
+time_command_ms() {
+ local out
+ # Properly capture stderr from /usr/bin/time while suppressing stdout of the command
+ out=$({ /usr/bin/time -p "$@" > /dev/null; } 2>&1)
+ local real_sec=$(echo "$out" | awk '/^real /{print $2}')
+ if [[ -z "$real_sec" || ! "$real_sec" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
+ echo "(error)"
+ return 1
+ fi
+ sec_to_ms "$real_sec"
+}
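+# Example (assuming /usr/bin/time is available): time_command_ms sleep 1
+# prints roughly "1000.0"; a missing or non-numeric 'real' line yields "(error)" and status 1.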
+
+# Time a command, capturing stdout to a file, and return real time in ms
+time_command_ms_capture() {
+ local stdout_file="$1"; shift
+ local out
+ out=$({ /usr/bin/time -p "$@" > "$stdout_file"; } 2>&1)
+ local real_sec=$(echo "$out" | awk '/^real /{print $2}')
+ if [[ -z "$real_sec" || ! "$real_sec" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
+ echo "(error)"
+ return 1
+ fi
+ sec_to_ms "$real_sec"
+}
+
+# Run a SystemDS query and compute statistics
+run_systemds_avg() {
+ local dml="$1"
+ # Optional second parameter: path to write an error message if the test-run fails
+ local err_out_file="${2:-}"
+ local shell_times=()
+ local core_times=()
+ local core_have=false
+
+ # Change to project root directory so relative paths in DML work correctly
+ local original_dir="$(pwd)"
+ cd "$PROJECT_ROOT"
+
+ # First, test run to validate the query (avoids timing zero or errors later)
+ tmp_test=$(mktemp)
+ if $RUN_STATS; then
+ if ! "$SYSTEMDS_CMD" "$dml" -stats "${SYS_EXTRA_ARGS[@]}" "${NVARGS[@]}" > "$tmp_test" 2>&1; then
+ err_msg=$(sed -n '1,200p' "$tmp_test" | tr '\n' ' ')
+ echo "Error: SystemDS test run failed for $dml: $err_msg" >&2
+ # Write error message to provided error file for JSON capture
+ if [[ -n "$err_out_file" ]]; then printf '%s' "$err_msg" > "$err_out_file" || true; fi
+ rm -f "$tmp_test"
+ echo "(error)|0|0|(n/a)|0|0"
+ cd "$original_dir"; return
+ fi
+ err_msg=$(sed -n '/An Error Occurred :/,$ p' "$tmp_test" | sed -n '1,200p' | tr '\n' ' ')
+ if [[ -n "$err_msg" ]]; then
+ echo "Error: SystemDS reported runtime error for $dml: $err_msg" >&2
+ if [[ -n "$err_out_file" ]]; then printf '%s' "$err_msg" > "$err_out_file" || true; fi
+ rm -f "$tmp_test"
+ echo "(error)|0|0|(n/a)|0|0"
+ cd "$original_dir"; return
+ fi
+ else
+ if ! "$SYSTEMDS_CMD" "$dml" "${SYS_EXTRA_ARGS[@]}" "${NVARGS[@]}" > "$tmp_test" 2>&1; then
+ err_msg=$(sed -n '1,200p' "$tmp_test" | tr '\n' ' ')
+ echo "Error: SystemDS test run failed for $dml: $err_msg" >&2
+ if [[ -n "$err_out_file" ]]; then printf '%s' "$err_msg" > "$err_out_file" || true; fi
+ rm -f "$tmp_test"
+ echo "(error)|0|0|(n/a)|0|0"
+ cd "$original_dir"; return
+ fi
+ err_msg=$(sed -n '/An Error Occurred :/,$ p' "$tmp_test" | sed -n '1,200p' | tr '\n' ' ')
+ if [[ -n "$err_msg" ]]; then
+ echo "Error: SystemDS reported runtime error for $dml: $err_msg" >&2
+ if [[ -n "$err_out_file" ]]; then printf '%s' "$err_msg" > "$err_out_file" || true; fi
+ rm -f "$tmp_test"
+ echo "(error)|0|0|(n/a)|0|0"
+ cd "$original_dir"; return
+ fi
+ fi
+ rm -f "$tmp_test"
+
+ # Warmup runs
+ for ((w=1; w<=WARMUP; w++)); do
+ if $RUN_STATS; then
+ "$SYSTEMDS_CMD" "$dml" -stats "${SYS_EXTRA_ARGS[@]}" "${NVARGS[@]}" > /dev/null 2>&1 || true
+ else
+ "$SYSTEMDS_CMD" "$dml" "${SYS_EXTRA_ARGS[@]}" "${NVARGS[@]}" > /dev/null 2>&1 || true
+ fi
+ done
+
+ # Timed runs - collect all measurements
+ for ((i=1; i<=REPEATS; i++)); do
+ if $RUN_STATS; then
+ local shell_ms
+ local temp_file
+ temp_file=$(mktemp)
+ shell_ms=$(time_command_ms_capture "$temp_file" "$SYSTEMDS_CMD" "$dml" -stats "${SYS_EXTRA_ARGS[@]}" "${NVARGS[@]}") || {
+ rm -f "$temp_file"; cd "$original_dir"; echo "(error)|0|0|(n/a)|0|0"; return; }
+ shell_times+=("$shell_ms")
+
+ # Extract SystemDS internal timing from the same run
+ local internal_sec
+ internal_sec=$(awk '/Total execution time:/ {print $4}' "$temp_file" | tail -1 || true)
+ rm -f "$temp_file"
+ if [[ -n "$internal_sec" ]] && [[ "$internal_sec" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
+ local core_ms
+ core_ms=$(awk -v sec="$internal_sec" 'BEGIN{printf "%.1f", sec * 1000}')
+ core_times+=("$core_ms")
+ core_have=true
+ fi
+ else
+ local shell_ms
+ shell_ms=$(time_command_ms "$SYSTEMDS_CMD" "$dml" "${SYS_EXTRA_ARGS[@]}" "${NVARGS[@]}") || { cd "$original_dir"; echo "(error)|0|0|(n/a)|0|0"; return; }
+ shell_times+=("$shell_ms")
+ fi
+ done
+
+ # Return to original directory
+ cd "$original_dir"
+
+ # Calculate statistics for shell times
+ local shell_stats
+ shell_stats=$(calculate_statistics "${shell_times[@]}")
+
+ # Calculate statistics for core times if available
+ local core_stats
+ if $RUN_STATS && $core_have && [[ ${#core_times[@]} -gt 0 ]]; then
+ core_stats=$(calculate_statistics "${core_times[@]}")
+ else
+ core_stats="(n/a)|0|0"
+ fi
+
+ echo "$shell_stats|$core_stats"
+}
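+# Example (hypothetical): run_systemds_avg "$QUERY_DIR/q1_1.dml" "$err_file" prints a single
+# line like "1200.0|14.1|1220.0|1150.0|10.2|1160.0", i.e. shell mean|stdev|p95 followed by
+# core mean|stdev|p95 (the core triple is "(n/a)|0|0" unless --stats is enabled).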
+
+# Run a PostgreSQL query and compute statistics
+run_psql_avg_ms() {
+ local sql_file="$1"
+
+ # Check if PostgreSQL is available
+ if [[ -z "$PSQL_EXEC" ]]; then
+ echo "(unavailable)|0|0|(n/a)|0|0"
+ return
+ fi
+
+ # Test run first
+ "$PSQL_EXEC" -U "$POSTGRES_USER" -h "$POSTGRES_HOST" -d "$POSTGRES_DB" \
+ -v ON_ERROR_STOP=1 -q \
+ -c "SET max_parallel_workers=0; SET max_parallel_maintenance_workers=0; SET max_parallel_workers_per_gather=0; SET parallel_leader_participation=off;" \
+ -f "$sql_file" >/dev/null 2>/dev/null || {
+ echo "(error)|0|0|(n/a)|0|0"
+ return
+ }
+
+ local shell_times=()
+ local core_times=()
+ local core_have=false
+
+ for ((i=1; i<=REPEATS; i++)); do
+ # Wall-clock shell time
+ local ms
+ ms=$(time_command_ms "$PSQL_EXEC" -U "$POSTGRES_USER" -h "$POSTGRES_HOST" -d "$POSTGRES_DB" \
+ -v ON_ERROR_STOP=1 -q \
+ -c "SET max_parallel_workers=0; SET max_parallel_maintenance_workers=0; SET max_parallel_workers_per_gather=0; SET parallel_leader_participation=off;" \
+ -f "$sql_file" 2>/dev/null) || {
+ echo "(error)|0|0|(n/a)|0|0"
+ return
+ }
+ shell_times+=("$ms")
+
+ # Core execution time using EXPLAIN ANALYZE (if --stats enabled)
+ if $RUN_STATS; then
+ local tmp_explain
+ tmp_explain=$(mktemp)
+
+ # Create EXPLAIN ANALYZE version of the query
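+      # (note: EXPLAIN ANALYZE executes the statement, so this is an extra run,
+      #  separate from the wall-clock measurement above)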
+ echo "SET max_parallel_workers=0; SET max_parallel_maintenance_workers=0; SET max_parallel_workers_per_gather=0; SET parallel_leader_participation=off;" > "$tmp_explain"
+ echo "EXPLAIN (ANALYZE, BUFFERS, FORMAT TEXT)" >> "$tmp_explain"
+ cat "$sql_file" >> "$tmp_explain"
+
+ # Execute EXPLAIN ANALYZE and extract execution time
+ local explain_output core_ms
+ explain_output=$("$PSQL_EXEC" -U "$POSTGRES_USER" -h "$POSTGRES_HOST" -d "$POSTGRES_DB" \
+ -v ON_ERROR_STOP=1 -q -f "$tmp_explain" 2>/dev/null || true)
+
+ if [[ -n "$explain_output" ]]; then
+ # Extract "Execution Time: X.XXX ms" from EXPLAIN ANALYZE output
+ local exec_time_ms
+ exec_time_ms=$(echo "$explain_output" | grep -oE "Execution Time: [0-9]+\.[0-9]+" | grep -oE "[0-9]+\.[0-9]+" | head -1 || true)
+
+ if [[ -n "$exec_time_ms" ]] && [[ "$exec_time_ms" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
+ core_ms=$(awk -v ms="$exec_time_ms" 'BEGIN{printf "%.1f", ms}')
+ core_times+=("$core_ms")
+ core_have=true
+ fi
+ fi
+
+ rm -f "$tmp_explain"
+ fi
+ done
+
+ # Build outputs
+ local shell_stats core_stats
+ shell_stats=$(calculate_statistics "${shell_times[@]}")
+ if $RUN_STATS && $core_have && [[ ${#core_times[@]} -gt 0 ]]; then
+ core_stats=$(calculate_statistics "${core_times[@]}")
+ else
+ core_stats="(n/a)|0|0"
+ fi
+ echo "$shell_stats|$core_stats"
+}
+
+# Run a DuckDB query and compute statistics
+run_duckdb_avg_ms() {
+ local sql_file="$1"
+
+ # Check if DuckDB is available
+ if [[ -z "$DUCKDB_EXEC" ]]; then
+ echo "(unavailable)|0|0|(n/a)|0|0"
+ return
+ fi
+
+ # Test run with minimal setup (no profiling)
+ local tmp_test
+ tmp_test=$(mktemp)
+ printf 'PRAGMA threads=1;\n' > "$tmp_test"
+ cat "$sql_file" >> "$tmp_test"
+ "$DUCKDB_EXEC" "$DUCKDB_DB_PATH" < "$tmp_test" >/dev/null 2>&1 || {
+ rm -f "$tmp_test"
+ echo "(error)|0|0|(n/a)|0|0"
+ return
+ }
+ rm -f "$tmp_test"
+
+ local shell_times=()
+ local core_times=()
+ local core_have=false
+
+ for ((i=1; i<=REPEATS; i++)); do
+ local tmp_sql iter_json
+ tmp_sql=$(mktemp)
+    if $RUN_STATS; then
+      # Enable JSON profiling per-run and write it to a temporary file
+      # (PRAGMA names assume a recent DuckDB CLI; the profile is parsed below)
+      iter_json=$(mktemp -t duckprof.XXXXXX).json
+      cat > "$tmp_sql" <<SQL
+PRAGMA threads=1;
+PRAGMA enable_profiling='json';
+PRAGMA profiling_output='$iter_json';
+SQL
+    else
+      printf 'PRAGMA threads=1;\n' > "$tmp_sql"
+    fi
+    cat "$sql_file" >> "$tmp_sql"
+
+ # Wall-clock shell time
+ local ms
+ ms=$(time_command_ms "$DUCKDB_EXEC" "$DUCKDB_DB_PATH" < "$tmp_sql") || {
+ rm -f "$tmp_sql" ${iter_json:+"$iter_json"}
+ echo "(error)|0|0|(n/a)|0|0"
+ return
+ }
+ shell_times+=("$ms")
+
+ # Parse core latency from JSON profile if available
+ if $RUN_STATS && [[ -n "${iter_json:-}" && -f "$iter_json" ]]; then
+ local core_sec
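+      # the JSON profile is expected to expose a top-level "latency" field in seconds,
+      # e.g. {"latency": 0.0123, ...}, matching the jq/grep extraction below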
+ if command -v jq >/dev/null 2>&1; then
+ core_sec=$(jq -r '.latency // empty' "$iter_json" 2>/dev/null || true)
+ else
+ core_sec=$(grep -oE '"latency"\s*:\s*[0-9.]+' "$iter_json" 2>/dev/null | sed -E 's/.*:\s*//' | head -1 || true)
+ fi
+ if [[ -n "$core_sec" ]] && [[ "$core_sec" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
+ local core_ms
+ core_ms=$(awk -v s="$core_sec" 'BEGIN{printf "%.1f", s*1000}')
+ core_times+=("$core_ms")
+ core_have=true
+ fi
+ fi
+
+ rm -f "$tmp_sql" ${iter_json:+"$iter_json"}
+ done
+
+ # Build outputs
+ local shell_stats core_stats
+ shell_stats=$(calculate_statistics "${shell_times[@]}")
+ if $RUN_STATS && $core_have && [[ ${#core_times[@]} -gt 0 ]]; then
+ core_stats=$(calculate_statistics "${core_times[@]}")
+ else
+ core_stats="(n/a)|0|0"
+ fi
+ echo "$shell_stats|$core_stats"
+}
+
+# Help function
+show_help() {
+ cat << 'EOF'
+Multi-Engine SSB Performance Benchmark Runner v1.0
+
+USAGE (from repo root):
+ scripts/ssb/shell/run_all_perf.sh [OPTIONS] [QUERIES...]
+
+OPTIONS:
+ -stats, --stats Enable SystemDS internal statistics collection
+ -warmup=N, --warmup=N Set number of warmup runs (default: 1)
+ -repeats=N, --repeats=N Set number of timing repetitions (default: 5)
+ -seed=N, --seed=N Set random seed for reproducible results (default: auto-generated)
+ -stacked, --stacked Use stacked, multi-line layout (best for narrow terminals)
+ -layout=MODE, --layout=MODE Set layout: auto|wide|stacked (default: auto)
+ Note: --layout=stacked is equivalent to --stacked
+ --layout=wide forces wide table layout
+ -input-dir=PATH, --input-dir=PATH Specify custom data directory (default: $PROJECT_ROOT/data)
+ -output-dir=PATH, --output-dir=PATH Specify custom output directory (default: $PROJECT_ROOT/scripts/ssb/shell/ssbOutputData/PerformanceData)
+ -h, -help, --help, --h Show this help message
+ -v, -version, --version, --v Show version information
+
+QUERIES:
+ If no queries are specified, all available SSB queries (q*.dml) will be executed.
+ To run specific queries, provide their names (with or without .dml extension):
+ scripts/ssb/shell/run_all_perf.sh q1.1 q2.3 q4.1
+
+EXAMPLES (from repo root):
+ scripts/ssb/shell/run_all_perf.sh # Run full benchmark with all engines
+ scripts/ssb/shell/run_all_perf.sh --warmup=3 --repeats=10 # Custom warmup and repetition settings
+ scripts/ssb/shell/run_all_perf.sh -warmup=3 -repeats=10 # Same with single dashes
+ scripts/ssb/shell/run_all_perf.sh --stats # Enable SystemDS internal timing
+ scripts/ssb/shell/run_all_perf.sh --layout=wide # Force wide table layout
+ scripts/ssb/shell/run_all_perf.sh --stacked # Force stacked layout for narrow terminals
+ scripts/ssb/shell/run_all_perf.sh q1.1 q2.3 # Benchmark specific queries only
+ scripts/ssb/shell/run_all_perf.sh --seed=12345 # Reproducible benchmark run
+ scripts/ssb/shell/run_all_perf.sh --input-dir=/path/to/data # Custom data directory
+ scripts/ssb/shell/run_all_perf.sh -input-dir=/path/to/data # Same as above (single dash)
+ scripts/ssb/shell/run_all_perf.sh --output-dir=/tmp/results # Custom output directory
+ scripts/ssb/shell/run_all_perf.sh -output-dir=/tmp/results # Same as above (single dash)
+
+ENGINES:
+ - SystemDS: Machine learning platform with DML queries
+ - PostgreSQL: Industry-standard relational database (if available)
+ - DuckDB: High-performance analytical database (if available)
+
+OUTPUT:
+ Results are saved in CSV and JSON formats with comprehensive metadata:
+ - Performance timing statistics (mean, stdev, p95)
+ - Engine comparison and fastest detection
+ - System information and run configuration
+
+STATISTICAL OUTPUT FORMAT:
+ 1824 (±10, p95:1840)
+ │ │ └── 95th percentile (worst-case bound)
+ │ └── Standard deviation (consistency measure)
+ └── Mean execution time (typical performance)
+
+For more information, see the documentation in scripts/ssb/README.md
+EOF
+}
+
+# Parse arguments
+RUN_STATS=false
+QUERIES=()
+SEED=""
+LAYOUT="auto"
+INPUT_DIR=""
+OUTPUT_DIR=""
+
+# Support both --opt=value and --opt value forms
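+# (e.g. both "--repeats=5" and "--repeats 5" are accepted: a bare option stores its
+#  name in EXPECT_OPT and the following argument is consumed as its value)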
+EXPECT_OPT=""
+for arg in "$@"; do
+ if [[ -n "$EXPECT_OPT" ]]; then
+ case "$EXPECT_OPT" in
+ seed)
+ SEED="$arg"
+ EXPECT_OPT=""
+ continue
+ ;;
+ input-dir)
+ INPUT_DIR="$arg"
+ EXPECT_OPT=""
+ continue
+ ;;
+ output-dir)
+ OUTPUT_DIR="$arg"
+ EXPECT_OPT=""
+ continue
+ ;;
+ warmup)
+ WARMUP="$arg"
+ if ! [[ "$WARMUP" =~ ^[0-9]+$ ]] || [[ "$WARMUP" -lt 0 ]]; then
+ echo "Error: --warmup requires a non-negative integer (e.g., --warmup 2)" >&2
+ exit 1
+ fi
+ EXPECT_OPT=""
+ continue
+ ;;
+ repeats)
+ REPEATS="$arg"
+ if ! [[ "$REPEATS" =~ ^[0-9]+$ ]] || [[ "$REPEATS" -lt 1 ]]; then
+ echo "Error: --repeats requires a positive integer (e.g., --repeats 5)" >&2
+ exit 1
+ fi
+ EXPECT_OPT=""
+ continue
+ ;;
+ layout)
+ LAYOUT="$arg"
+ if [[ "$LAYOUT" != "auto" && "$LAYOUT" != "wide" && "$LAYOUT" != "stacked" ]]; then
+ echo "Error: --layout requires one of: auto, wide, stacked (e.g., --layout wide)" >&2
+ exit 1
+ fi
+ EXPECT_OPT=""
+ continue
+ ;;
+ esac
+ fi
+
+ if [[ "$arg" == "--help" || "$arg" == "-help" || "$arg" == "-h" || "$arg" == "--h" ]]; then
+ show_help
+ exit 0
+ elif [[ "$arg" == "--version" || "$arg" == "-version" || "$arg" == "-v" || "$arg" == "--v" ]]; then
+ echo "Multi-Engine SSB Performance Benchmark Runner v1.0"
+ echo "First Public Release: September 5, 2025"
+ exit 0
+ elif [[ "$arg" == "--stats" || "$arg" == "-stats" ]]; then
+ RUN_STATS=true
+ elif [[ "$arg" == --seed=* || "$arg" == -seed=* ]]; then
+ SEED="${arg#*seed=}"
+ elif [[ "$arg" == "--seed" || "$arg" == "-seed" ]]; then
+ EXPECT_OPT="seed"
+ elif [[ "$arg" == --warmup=* || "$arg" == -warmup=* ]]; then
+ WARMUP="${arg#*warmup=}"
+ if ! [[ "$WARMUP" =~ ^[0-9]+$ ]] || [[ "$WARMUP" -lt 0 ]]; then
+ echo "Error: -warmup/--warmup requires a non-negative integer (e.g., -warmup=2)" >&2
+ exit 1
+ fi
+ elif [[ "$arg" == --input-dir=* || "$arg" == -input-dir=* ]]; then
+ INPUT_DIR="${arg#*input-dir=}"
+ elif [[ "$arg" == "--input-dir" || "$arg" == "-input-dir" ]]; then
+ EXPECT_OPT="input-dir"
+ elif [[ "$arg" == --output-dir=* || "$arg" == -output-dir=* ]]; then
+ OUTPUT_DIR="${arg#*output-dir=}"
+ elif [[ "$arg" == "--output-dir" || "$arg" == "-output-dir" ]]; then
+ EXPECT_OPT="output-dir"
+ elif [[ "$arg" == "--warmup" || "$arg" == "-warmup" ]]; then
+ EXPECT_OPT="warmup"
+ elif [[ "$arg" == --repeats=* || "$arg" == -repeats=* ]]; then
+ REPEATS="${arg#*repeats=}"
+ if ! [[ "$REPEATS" =~ ^[0-9]+$ ]] || [[ "$REPEATS" -lt 1 ]]; then
+ echo "Error: -repeats/--repeats requires a positive integer (e.g., -repeats=5)" >&2
+ exit 1
+ fi
+ elif [[ "$arg" == "--repeats" || "$arg" == "-repeats" ]]; then
+ EXPECT_OPT="repeats"
+ elif [[ "$arg" == "--stacked" || "$arg" == "-stacked" ]]; then
+ LAYOUT="stacked"
+ elif [[ "$arg" == --layout=* || "$arg" == -layout=* ]]; then
+ LAYOUT="${arg#*layout=}"
+ if [[ "$LAYOUT" != "auto" && "$LAYOUT" != "wide" && "$LAYOUT" != "stacked" ]]; then
+ echo "Error: -layout/--layout requires one of: auto, wide, stacked (e.g., --layout=wide)" >&2
+ exit 1
+ fi
+ elif [[ "$arg" == "--layout" || "$arg" == "-layout" ]]; then
+ EXPECT_OPT="layout"
+ else
+ # Check if argument looks like an unrecognized option (starts with dash)
+ if [[ "$arg" == -* ]]; then
+ echo "Error: Unrecognized option '$arg'" >&2
+ echo "Use --help or -h to see available options." >&2
+ exit 1
+ else
+ # Treat as query name
+ QUERIES+=( "$(echo "$arg" | tr '.' '_')" )
+ fi
+ fi
+ done
+
+# If the last option expected a value but none was provided
+if [[ -n "$EXPECT_OPT" ]]; then
+ case "$EXPECT_OPT" in
+ seed) echo "Error: -seed/--seed requires a value (e.g., -seed=12345)" >&2 ;;
+ warmup) echo "Error: -warmup/--warmup requires a value (e.g., -warmup=2)" >&2 ;;
+ repeats) echo "Error: -repeats/--repeats requires a value (e.g., -repeats=5)" >&2 ;;
+ layout) echo "Error: -layout/--layout requires a value (e.g., -layout=wide)" >&2 ;;
+ esac
+ exit 1
+fi
+
+# Generate seed if not provided
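+# (RANDOM yields 0..32767, so this composes a seed in the range 0..1073741823)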
+if [[ -z "$SEED" ]]; then
+ SEED=$((RANDOM * 32768 + RANDOM))
+fi
+if [[ ${#QUERIES[@]} -eq 0 ]]; then
+ for f in "$QUERY_DIR"/q*.dml; do
+ [[ -e "$f" ]] || continue
+ bname="$(basename "$f")"
+ QUERIES+=( "${bname%.dml}" )
+ done
+fi
+
+# Set data directory
+if [[ -z "$INPUT_DIR" ]]; then
+ INPUT_DIR="$PROJECT_ROOT/data"
+fi
+
+# Set output directory
+if [[ -z "$OUTPUT_DIR" ]]; then
+ OUTPUT_DIR="$PROJECT_ROOT/scripts/ssb/shell/ssbOutputData/PerformanceData"
+fi
+
+# Normalize paths by removing trailing slashes
+INPUT_DIR="${INPUT_DIR%/}"
+OUTPUT_DIR="${OUTPUT_DIR%/}"
+
+# Pass input directory to DML scripts via SystemDS named arguments
+NVARGS=( -nvargs "input_dir=${INPUT_DIR}" )
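+# Inside a DML script the value is then available as the named argument $input_dir,
+# e.g. something like read($input_dir + "/lineorder.tbl") (illustrative path)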
+
+# Validate data directory
+if [[ ! -d "$INPUT_DIR" ]]; then
+ echo "Error: Data directory '$INPUT_DIR' does not exist." >&2
+ echo "Please ensure the directory exists or specify a different path with -input-dir." >&2
+ exit 1
+fi
+
+# Ensure output directory exists
+mkdir -p "$OUTPUT_DIR"
+
+# Metadata collection functions
+collect_system_metadata() {
+ local timestamp hostname systemds_version jdk_version postgres_version duckdb_version cpu_info ram_info
+
+ # Basic system info
+ timestamp=$(date -u '+%Y-%m-%d %H:%M:%S UTC')
+ hostname=$(hostname 2>/dev/null || echo "unknown")
+
+ # SystemDS version
+ if [[ -x "$SYSTEMDS_CMD" ]]; then
+ # Try to get version from pom.xml first
+ if [[ -f "$PROJECT_ROOT/pom.xml" ]]; then
+      systemds_version=$(grep -A1 'org.apache.systemds' "$PROJECT_ROOT/pom.xml" | grep '<version>' | sed 's/.*<version>\(.*\)<\/version>.*/\1/' | head -1 2>/dev/null || echo "unknown")
+ else
+ systemds_version="unknown"
+ fi
+
+ # If pom.xml method failed, try alternative methods
+ if [[ "$systemds_version" == "unknown" ]]; then
+ # Try to extract from SystemDS JAR manifest
+ if [[ -f "$PROJECT_ROOT/target/systemds.jar" ]]; then
+ systemds_version=$(unzip -p "$PROJECT_ROOT/target/systemds.jar" META-INF/MANIFEST.MF 2>/dev/null | grep "Implementation-Version" | cut -d: -f2 | tr -d ' ' || echo "unknown")
+ else
+ # Try to find any SystemDS JAR and extract version
+ local jar_file=$(find "$PROJECT_ROOT" -name "systemds*.jar" | head -1 2>/dev/null)
+ if [[ -n "$jar_file" ]]; then
+ systemds_version=$(unzip -p "$jar_file" META-INF/MANIFEST.MF 2>/dev/null | grep "Implementation-Version" | cut -d: -f2 | tr -d ' ' || echo "unknown")
+ else
+ systemds_version="unknown"
+ fi
+ fi
+ fi
+ else
+ systemds_version="unknown"
+ fi
+
+ # JDK version
+ if command -v java >/dev/null 2>&1; then
+ jdk_version=$(java -version 2>&1 | grep -v "Picked up" | head -1 | sed 's/.*"\(.*\)".*/\1/' || echo "unknown")
+ else
+ jdk_version="unknown"
+ fi
+
+ # PostgreSQL version
+ if command -v psql >/dev/null 2>&1; then
+ postgres_version=$(psql --version 2>/dev/null | head -1 || echo "not available")
+ else
+ postgres_version="not available"
+ fi
+
+ # DuckDB version
+ if command -v duckdb >/dev/null 2>&1; then
+ duckdb_version=$(duckdb --version 2>/dev/null || echo "not available")
+ else
+ duckdb_version="not available"
+ fi
+
+ # System resources
+ if [[ "$(uname)" == "Darwin" ]]; then
+ # macOS
+ cpu_info=$(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "unknown")
+ ram_info=$(( $(sysctl -n hw.memsize 2>/dev/null || echo 0) / 1024 / 1024 / 1024 ))GB
+ else
+ # Linux
+ cpu_info=$(grep "model name" /proc/cpuinfo | head -1 | cut -d: -f2- | sed 's/^ *//' 2>/dev/null || echo "unknown")
+ ram_info=$(( $(grep MemTotal /proc/meminfo | awk '{print $2}' 2>/dev/null || echo 0) / 1024 / 1024 ))GB
+ fi
+
+ # Store metadata globally
+ RUN_TIMESTAMP="$timestamp"
+ RUN_HOSTNAME="$hostname"
+ RUN_SYSTEMDS_VERSION="$systemds_version"
+ RUN_JDK_VERSION="$jdk_version"
+ RUN_POSTGRES_VERSION="$postgres_version"
+ RUN_DUCKDB_VERSION="$duckdb_version"
+ RUN_CPU_INFO="$cpu_info"
+ RUN_RAM_INFO="$ram_info"
+}
+
+collect_data_metadata() {
+ # Check for SSB data directory and get basic stats
+ local ssb_data_dir="$INPUT_DIR"
+ local json_parts=()
+ local display_parts=()
+
+ if [[ -d "$ssb_data_dir" ]]; then
+ # Try to get row counts from data files (if they exist)
+ for table in customer part supplier date; do
+ local file="$ssb_data_dir/${table}.tbl"
+ if [[ -f "$file" ]]; then
+ local count=$(wc -l < "$file" 2>/dev/null | tr -d ' ' || echo "0")
+ json_parts+=(" \"$table\": \"$count\"")
+ display_parts+=("$table:$count")
+ fi
+ done
+ # Check for any lineorder*.tbl file (SSB fact table)
+ local lineorder_file=$(find "$ssb_data_dir" -name "lineorder*.tbl" -type f | head -1)
+ if [[ -n "$lineorder_file" && -f "$lineorder_file" ]]; then
+ local count=$(wc -l < "$lineorder_file" 2>/dev/null | tr -d ' ' || echo "0")
+ json_parts+=(" \"lineorder\": \"$count\"")
+ display_parts+=("lineorder:$count")
+ fi
+ fi
+
+ if [[ ${#json_parts[@]} -eq 0 ]]; then
+ RUN_DATA_INFO='"No data files found"'
+ RUN_DATA_DISPLAY="No data files found"
+ else
+ # Join array elements with commas and newlines, wrap in braces for JSON
+ local formatted_json="{\n"
+ for i in "${!json_parts[@]}"; do
+ formatted_json+="${json_parts[$i]}"
+ if [[ $i -lt $((${#json_parts[@]} - 1)) ]]; then
+ formatted_json+=",\n"
+ else
+ formatted_json+="\n"
+ fi
+ done
+ formatted_json+=" }"
+ RUN_DATA_INFO="$formatted_json"
+
+ # Join with spaces for display
+ local IFS=" "
+ RUN_DATA_DISPLAY="${display_parts[*]}"
+ fi
+}
+
+print_metadata_header() {
+ echo "=================================================================================="
+ echo " MULTI-ENGINE PERFORMANCE BENCHMARK METADATA"
+ echo "=================================================================================="
+ echo "Timestamp: $RUN_TIMESTAMP"
+ echo "Hostname: $RUN_HOSTNAME"
+ echo "Seed: $SEED"
+ echo
+ echo "Software Versions:"
+ echo " SystemDS: $RUN_SYSTEMDS_VERSION"
+ echo " JDK: $RUN_JDK_VERSION"
+ echo " PostgreSQL: $RUN_POSTGRES_VERSION"
+ echo " DuckDB: $RUN_DUCKDB_VERSION"
+ echo
+ echo "System Resources:"
+ echo " CPU: $RUN_CPU_INFO"
+ echo " RAM: $RUN_RAM_INFO"
+ echo
+ echo "Data Build Info:"
+ echo " SSB Data: $RUN_DATA_DISPLAY"
+ echo
+ echo "Run Configuration:"
+ echo " Statistics: $(if $RUN_STATS; then echo "enabled"; else echo "disabled"; fi)"
+ echo " Queries: ${#QUERIES[@]} selected"
+ echo " Warmup Runs: $WARMUP"
+ echo " Repeat Runs: $REPEATS"
+ echo "=================================================================================="
+ echo
+}
+
+# Progress indicator function
+progress_indicator() {
+ local query_name="$1"
+ local stage="$2"
+ # Use terminal width for proper clearing, fallback to 120 chars if tput fails
+ local term_width
+ term_width=$(tput cols 2>/dev/null || echo 120)
+ local spaces=$(printf "%*s" "$term_width" "")
+ echo -ne "\r$spaces\r$query_name: Running $stage..."
+}
+
+# Clear progress line function
+clear_progress() {
+ local term_width
+ term_width=$(tput cols 2>/dev/null || echo 120)
+ local spaces=$(printf "%*s" "$term_width" "")
+ echo -ne "\r$spaces\r"
+}
+
+# Main execution
+# Collect metadata
+collect_system_metadata
+collect_data_metadata
+
+# Print metadata header
+print_metadata_header
+
+verify_environment
+echo
+echo "NOTE (macOS): You cannot drop OS caches like Linux (sync; echo 3 > /proc/sys/vm/drop_caches)."
+echo "We mitigate with warm-up runs and repeated averages to ensure consistent measurements."
+echo
+echo "INTERPRETATION GUIDE:"
+echo "- SystemDS Shell (ms): Total execution time including JVM startup, I/O, and computation"
+echo "- SystemDS Core (ms): Pure computation time excluding JVM overhead (only with --stats)"
+echo "- PostgreSQL (ms): Single-threaded execution time with parallel workers disabled"
+echo "- PostgreSQL Core (ms): Query execution time from EXPLAIN ANALYZE (only with --stats)"
+echo "- DuckDB (ms): Single-threaded execution time with threads=1 pragma"
+echo "- DuckDB Core (ms): Engine-internal latency from JSON profiling (with --stats)"
+echo "- (missing): SQL file not found for this query"
+echo "- (n/a): Core timing unavailable (run with --stats flag for internal timing)"
+echo
+echo "NOTE: All engines use single-threaded execution for fair comparison."
+echo " Multiple runs with averaging provide statistical reliability."
+echo
+echo "Single-threaded execution; warm-up runs: $WARMUP, timed runs: $REPEATS"
+echo "Row 1 shows mean (ms); Row 2 shows ±stdev/CV; Row 3 shows p95 (ms)."
+echo "Core execution times available for all engines with --stats flag."
+echo
+term_width=$(tput cols 2>/dev/null || echo 120)
+if [[ "$LAYOUT" == "auto" ]]; then
+ if [[ $term_width -ge 140 ]]; then
+ LAYOUT_MODE="wide"
+ else
+ LAYOUT_MODE="stacked"
+ fi
+else
+ LAYOUT_MODE="$LAYOUT"
+fi
+
+# If the user requested wide layout but the terminal is too narrow, fall back to stacked
+if [[ "$LAYOUT_MODE" == "wide" ]]; then
+ # compute total printable width: sum(widths) + 3*cols + 1 (accounting for separators)
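+  # e.g. 8 columns whose widths sum to 80 -> 80 + 3*8 + 1 = 105 printable characters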
+ sumw=0
+ for w in "${WIDE_COL_WIDTHS[@]}"; do sumw=$((sumw + w)); done
+ cols=${#WIDE_COL_WIDTHS[@]}
+ total_width=$((sumw + 3*cols + 1))
+ if [[ $total_width -gt $term_width ]]; then
+ # Try to scale columns down proportionally to fit terminal width
+ reserved=$((3*cols + 1))
+ avail=$((term_width - reserved))
+ if [[ $avail -le 0 ]]; then
+      LAYOUT_MODE="stacked"  # nothing fits next to the separators; fall back to stacked
+ else
+ # Minimum sensible widths per column (keep labels readable)
+ MIN_COL_WIDTHS=(6 8 8 6 10 6 6 16)
+ # Start with proportional distribution
+ declare -a new_widths=()
+ for w in "${WIDE_COL_WIDTHS[@]}"; do
+ nw=$(( w * avail / sumw ))
+ if [[ $nw -lt 1 ]]; then nw=1; fi
+ new_widths+=("$nw")
+ done
+ # Enforce minimums
+ sum_new=0
+ for i in "${!new_widths[@]}"; do
+ if [[ ${new_widths[i]} -lt ${MIN_COL_WIDTHS[i]:-4} ]]; then
+ new_widths[i]=${MIN_COL_WIDTHS[i]:-4}
+ fi
+ sum_new=$((sum_new + new_widths[i]))
+ done
+ # If even minimums exceed available, fallback to stacked
+ if [[ $sum_new -gt $avail ]]; then
+        LAYOUT_MODE="stacked"
+ else
+ # Distribute remaining columns' widths left-to-right
+ rem=$((avail - sum_new))
+ i=0
+ while [[ $rem -gt 0 ]]; do
+ new_widths[i]=$((new_widths[i] + 1))
+ rem=$((rem - 1))
+ i=$(( (i + 1) % cols ))
+ done
+ # Replace WIDE_COL_WIDTHS with the scaled values for printing
+ WIDE_COL_WIDTHS=("${new_widths[@]}")
+ # Recompute total_width for logging
+ sumw=0
+ for w in "${WIDE_COL_WIDTHS[@]}"; do sumw=$((sumw + w)); done
+ total_width=$((sumw + reserved))
+ echo "Info: scaled wide layout to fit terminal ($term_width cols): table width $total_width"
+ fi
+ fi
+ fi
+fi
+
+if [[ "$LAYOUT_MODE" == "wide" ]]; then
+ grid_line_wide
+ grid_row_wide \
+ "Query" \
+ "SysDS Shell" "SysDS Core" \
+ "PostgreSQL" "PostgreSQL Core" \
+ "DuckDB" "DuckDB Core" \
+ "Fastest"
+ grid_row_wide "" "mean" "mean" "mean" "mean" "mean" "mean" ""
+ grid_row_wide "" "±/CV" "±/CV" "±/CV" "±/CV" "±/CV" "±/CV" ""
+ grid_row_wide "" "p95" "p95" "p95" "p95" "p95" "p95" ""
+ grid_line_wide
+else
+ echo "================================================================================"
+ echo "Stacked layout (use --layout=wide for table view)."
+  echo "Per engine: line 1 shows mean (ms); the lines below show ±stdev/CV and p95 (ms)."
+ echo "--------------------------------------------------------------------------------"
+fi
+# Prepare output file paths and write CSV header with comprehensive metadata
+# Ensure results directory exists and create timestamped filenames
+RESULT_DIR="$OUTPUT_DIR"
+mkdir -p "$RESULT_DIR"
+RESULT_BASENAME="ssb_results_$(date -u +%Y%m%dT%H%M%SZ)"
+RESULT_CSV="$RESULT_DIR/${RESULT_BASENAME}.csv"
+RESULT_JSON="$RESULT_DIR/${RESULT_BASENAME}.json"
+
+{
+ echo "# Multi-Engine Performance Benchmark Results"
+ echo "# Timestamp: $RUN_TIMESTAMP"
+ echo "# Hostname: $RUN_HOSTNAME"
+ echo "# Seed: $SEED"
+ echo "# SystemDS: $RUN_SYSTEMDS_VERSION"
+ echo "# JDK: $RUN_JDK_VERSION"
+ echo "# PostgreSQL: $RUN_POSTGRES_VERSION"
+ echo "# DuckDB: $RUN_DUCKDB_VERSION"
+ echo "# CPU: $RUN_CPU_INFO"
+ echo "# RAM: $RUN_RAM_INFO"
+ echo "# Data: $RUN_DATA_DISPLAY"
+ echo "# Warmup: $WARMUP, Repeats: $REPEATS"
+ echo "# Statistics: $(if $RUN_STATS; then echo "enabled"; else echo "disabled"; fi)"
+ echo "#"
+ echo "query,systemds_shell_display,systemds_shell_mean,systemds_shell_stdev,systemds_shell_p95,systemds_core_display,systemds_core_mean,systemds_core_stdev,systemds_core_p95,postgres_display,postgres_mean,postgres_stdev,postgres_p95,postgres_core_display,postgres_core_mean,postgres_core_stdev,postgres_core_p95,duckdb_display,duckdb_mean,duckdb_stdev,duckdb_p95,duckdb_core_display,duckdb_core_mean,duckdb_core_stdev,duckdb_core_p95,fastest"
+} > "$RESULT_CSV"
+for base in "${QUERIES[@]}"; do
+ # Show progress indicator for SystemDS
+ progress_indicator "$base" "SystemDS"
+
+ dml_path="$QUERY_DIR/${base}.dml"
+ # Parse SystemDS results: shell_mean|shell_stdev|shell_p95|core_mean|core_stdev|core_p95
+ # Capture potential SystemDS test-run error messages for JSON reporting
+ tmp_err_msg=$(mktemp)
+ systemds_result="$(run_systemds_avg "$dml_path" "$tmp_err_msg")"
+ # Read any captured error message
+ sysds_err_text="$(sed -n '1,200p' "$tmp_err_msg" 2>/dev/null | tr '\n' ' ' || true)"
+ rm -f "$tmp_err_msg"
+ IFS='|' read -r sd_shell_mean sd_shell_stdev sd_shell_p95 sd_core_mean sd_core_stdev sd_core_p95 <<< "$systemds_result"
+
+ # Format SystemDS results for display
+ if [[ "$sd_shell_mean" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
+ sd_shell_display=$(format_statistics "$sd_shell_mean" "$sd_shell_stdev" "$sd_shell_p95" "$REPEATS")
+ else
+ sd_shell_display="$sd_shell_mean"
+ sd_shell_stdev="0"
+ sd_shell_p95="0"
+ fi
+ if [[ "$sd_core_mean" == "(n/a)" ]]; then
+ sd_core_display="(n/a)"
+ else
+ sd_core_display=$(format_statistics "$sd_core_mean" "$sd_core_stdev" "$sd_core_p95" "$REPEATS")
+ fi
+
+ sql_name="${base//_/.}.sql"
+ sql_path="$SQL_DIR/$sql_name"
+ pg_display="(missing)"
+ duck_display="(missing)"
+
+ if [[ -n "$PSQL_EXEC" && -f "$sql_path" ]]; then
+ progress_indicator "$base" "PostgreSQL"
+ pg_result="$(run_psql_avg_ms "$sql_path")"
+ IFS='|' read -r pg_mean pg_stdev pg_p95 pg_core_mean pg_core_stdev pg_core_p95 <<< "$pg_result"
+ if [[ "$pg_mean" == "(unavailable)" || "$pg_mean" == "(error)" ]]; then
+ pg_display="$pg_mean"
+ pg_core_display="$pg_mean"
+ pg_stdev="0"
+ pg_p95="0"
+ pg_core_mean="(n/a)"
+ pg_core_stdev="0"
+ pg_core_p95="0"
+ else
+ pg_display=$(format_statistics "$pg_mean" "$pg_stdev" "$pg_p95" "$REPEATS")
+ if [[ "$pg_core_mean" != "(n/a)" ]]; then
+ pg_core_display=$(format_statistics "$pg_core_mean" "$pg_core_stdev" "$pg_core_p95" "$REPEATS")
+ else
+ pg_core_display="(n/a)"
+ fi
+ fi
+ elif [[ -z "$PSQL_EXEC" ]]; then
+ pg_display="(unavailable)"
+ pg_core_display="(unavailable)"
+ pg_mean="(unavailable)"
+ pg_core_mean="(unavailable)"
+ pg_stdev="0"
+ pg_p95="0"
+ pg_core_stdev="0"
+ pg_core_p95="0"
+ else
+ pg_display="(missing)"
+ pg_core_display="(missing)"
+ pg_mean="(missing)"
+ pg_core_mean="(missing)"
+ pg_stdev="0"
+ pg_p95="0"
+ pg_core_stdev="0"
+ pg_core_p95="0"
+ fi
+
+ if [[ -n "$DUCKDB_EXEC" && -f "$sql_path" ]]; then
+ progress_indicator "$base" "DuckDB"
+ duck_result="$(run_duckdb_avg_ms "$sql_path")"
+ IFS='|' read -r duck_mean duck_stdev duck_p95 duck_core_mean duck_core_stdev duck_core_p95 <<< "$duck_result"
+ if [[ "$duck_mean" == "(unavailable)" || "$duck_mean" == "(error)" ]]; then
+ duck_display="$duck_mean"
+ duck_stdev="0"
+ duck_p95="0"
+ duck_core_display="(n/a)"
+ duck_core_mean="(n/a)"
+ duck_core_stdev="0"
+ duck_core_p95="0"
+ else
+ duck_display=$(format_statistics "$duck_mean" "$duck_stdev" "$duck_p95" "$REPEATS")
+ if [[ "$duck_core_mean" == "(n/a)" ]]; then
+ duck_core_display="(n/a)"
+ else
+ duck_core_display=$(format_statistics "$duck_core_mean" "$duck_core_stdev" "$duck_core_p95" "$REPEATS")
+ fi
+ fi
+ elif [[ -z "$DUCKDB_EXEC" ]]; then
+ duck_display="(unavailable)"
+ duck_mean="(unavailable)"
+ duck_stdev="0"
+ duck_p95="0"
+ duck_core_display="(unavailable)"
+ duck_core_mean="(unavailable)"
+ duck_core_stdev="0"
+ duck_core_p95="0"
+ else
+ duck_display="(missing)"
+ duck_mean="(missing)"
+ duck_stdev="0"
+ duck_p95="0"
+ duck_core_display="(missing)"
+ duck_core_mean="(missing)"
+ duck_core_stdev="0"
+ duck_core_p95="0"
+ fi
+
+ # Determine fastest engine based on mean values
+ fastest=""
+ min_ms=999999999
+ for engine in systemds pg duck; do
+ val=""
+ eng_name=""
+ case "$engine" in
+ systemds) val="$sd_shell_mean"; eng_name="SystemDS";;
+ pg) val="$pg_mean"; eng_name="PostgreSQL";;
+ duck) val="$duck_mean"; eng_name="DuckDB";;
+ esac
+ # Check if value is a valid number (including decimal)
+ if [[ "$val" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
+ # Use awk for floating point comparison
+ if [[ $(awk -v val="$val" -v min="$min_ms" 'BEGIN{print (val < min)}') -eq 1 ]]; then
+ min_ms=$(awk -v val="$val" 'BEGIN{printf "%.1f", val}')
+ fastest="$eng_name"
+ elif [[ $(awk -v val="$val" -v min="$min_ms" 'BEGIN{print (val == min)}') -eq 1 ]] && [[ -n "$fastest" ]]; then
+ fastest="$fastest+$eng_name" # Show ties
+ fi
+ fi
+ done
+ [[ -z "$fastest" ]] && fastest="(n/a)"
+
+ # Determine SystemDS per-query status and include any error message captured
+ systemds_status="success"
+ systemds_error_message=null
+ if [[ "$sd_shell_mean" == "(error)" ]] || [[ -n "$sysds_err_text" ]]; then
+ systemds_status="error"
+ if [[ -n "$sysds_err_text" ]]; then
+ # Escape quotes for JSON embedding
+ esc=$(printf '%s' "$sysds_err_text" | sed -e 's/"/\\"/g')
+ systemds_error_message="\"$esc\""
+ else
+ systemds_error_message="\"SystemDS reported an error during test-run\""
+ fi
+ fi
+
+ # Prepare mean-only and stats-only cells
+ # Means: use numeric mean when available; otherwise use existing display label (unavailable/missing)
+ sd_shell_mean_cell=$([[ "$sd_shell_mean" =~ ^[0-9]+(\.[0-9]+)?$ ]] && echo "$sd_shell_mean" || echo "$sd_shell_display")
+ sd_core_mean_cell=$([[ "$sd_core_mean" =~ ^[0-9]+(\.[0-9]+)?$ ]] && echo "$sd_core_mean" || echo "$sd_core_display")
+ pg_mean_cell=$([[ "$pg_mean" =~ ^[0-9]+(\.[0-9]+)?$ ]] && echo "$pg_mean" || echo "$pg_display")
+ pg_core_mean_cell=$([[ "$pg_core_mean" =~ ^[0-9]+(\.[0-9]+)?$ ]] && echo "$pg_core_mean" || echo "$pg_core_display")
+ duck_mean_cell=$([[ "$duck_mean" =~ ^[0-9]+(\.[0-9]+)?$ ]] && echo "$duck_mean" || echo "$duck_display")
+ duck_core_mean_cell=$([[ "$duck_core_mean" =~ ^[0-9]+(\.[0-9]+)?$ ]] && echo "$duck_core_mean" || echo "$duck_core_display")
+
+ # Stats lines split: CV and p95
+ sd_shell_cv_cell=$(format_cv_only "$sd_shell_mean" "$sd_shell_stdev" "$REPEATS")
+ sd_core_cv_cell=$(format_cv_only "$sd_core_mean" "$sd_core_stdev" "$REPEATS")
+ pg_cv_cell=$(format_cv_only "$pg_mean" "$pg_stdev" "$REPEATS")
+ pg_core_cv_cell=$(format_cv_only "$pg_core_mean" "$pg_core_stdev" "$REPEATS")
+ duck_cv_cell=$(format_cv_only "$duck_mean" "$duck_stdev" "$REPEATS")
+ duck_core_cv_cell=$(format_cv_only "$duck_core_mean" "$duck_core_stdev" "$REPEATS")
+
+ sd_shell_p95_cell=$(format_p95_only "$sd_shell_p95" "$REPEATS")
+ sd_core_p95_cell=$(format_p95_only "$sd_core_p95" "$REPEATS")
+ pg_p95_cell=$(format_p95_only "$pg_p95" "$REPEATS")
+ pg_core_p95_cell=$(format_p95_only "$pg_core_p95" "$REPEATS")
+ duck_p95_cell=$(format_p95_only "$duck_p95" "$REPEATS")
+ duck_core_p95_cell=$(format_p95_only "$duck_core_p95" "$REPEATS")
+
+ # Clear progress line and display final results
+ clear_progress
+ if [[ "$LAYOUT_MODE" == "wide" ]]; then
+ # Three-line table style with grid separators
+ grid_row_wide \
+ "$base" \
+ "$sd_shell_mean_cell" "$sd_core_mean_cell" \
+ "$pg_mean_cell" "$pg_core_mean_cell" \
+ "$duck_mean_cell" "$duck_core_mean_cell" \
+ "$fastest"
+ grid_row_wide \
+ "" \
+ "$sd_shell_cv_cell" "$sd_core_cv_cell" \
+ "$pg_cv_cell" "$pg_core_cv_cell" \
+ "$duck_cv_cell" "$duck_core_cv_cell" \
+ ""
+ grid_row_wide \
+ "" \
+ "$sd_shell_p95_cell" "$sd_core_p95_cell" \
+ "$pg_p95_cell" "$pg_core_p95_cell" \
+ "$duck_p95_cell" "$duck_core_p95_cell" \
+ ""
+ grid_line_wide
+ else
+ # Stacked layout for narrow terminals
+ echo "Query : $base Fastest: $fastest"
+ printf ' %-20s %s\n' "SystemDS Shell:" "$sd_shell_mean_cell"
+ [[ -n "$sd_shell_cv_cell" ]] && printf ' %-20s %s\n' "" "$sd_shell_cv_cell"
+ [[ -n "$sd_shell_p95_cell" ]] && printf ' %-20s %s\n' "" "$sd_shell_p95_cell"
+ printf ' %-20s %s\n' "SystemDS Core:" "$sd_core_mean_cell"
+ [[ -n "$sd_core_cv_cell" ]] && printf ' %-20s %s\n' "" "$sd_core_cv_cell"
+ [[ -n "$sd_core_p95_cell" ]] && printf ' %-20s %s\n' "" "$sd_core_p95_cell"
+ printf ' %-20s %s\n' "PostgreSQL:" "$pg_mean_cell"
+ [[ -n "$pg_cv_cell" ]] && printf ' %-20s %s\n' "" "$pg_cv_cell"
+ [[ -n "$pg_p95_cell" ]] && printf ' %-20s %s\n' "" "$pg_p95_cell"
+ printf ' %-20s %s\n' "PostgreSQL Core:" "$pg_core_mean_cell"
+ [[ -n "$pg_core_cv_cell" ]] && printf ' %-20s %s\n' "" "$pg_core_cv_cell"
+ [[ -n "$pg_core_p95_cell" ]] && printf ' %-20s %s\n' "" "$pg_core_p95_cell"
+ printf ' %-20s %s\n' "DuckDB:" "$duck_mean_cell"
+ [[ -n "$duck_cv_cell" ]] && printf ' %-20s %s\n' "" "$duck_cv_cell"
+ [[ -n "$duck_p95_cell" ]] && printf ' %-20s %s\n' "" "$duck_p95_cell"
+ printf ' %-20s %s\n' "DuckDB Core:" "$duck_core_mean_cell"
+ [[ -n "$duck_core_cv_cell" ]] && printf ' %-20s %s\n' "" "$duck_core_cv_cell"
+ [[ -n "$duck_core_p95_cell" ]] && printf ' %-20s %s\n' "" "$duck_core_p95_cell"
+ echo "--------------------------------------------------------------------------------"
+ fi
+
+ # Write comprehensive data to CSV
+ echo "$base,\"$sd_shell_display\",$sd_shell_mean,$sd_shell_stdev,$sd_shell_p95,\"$sd_core_display\",$sd_core_mean,$sd_core_stdev,$sd_core_p95,\"$pg_display\",$pg_mean,$pg_stdev,$pg_p95,\"$pg_core_display\",$pg_core_mean,$pg_core_stdev,$pg_core_p95,\"$duck_display\",$duck_mean,$duck_stdev,$duck_p95,\"$duck_core_display\",$duck_core_mean,$duck_core_stdev,$duck_core_p95,$fastest" >> "$RESULT_CSV"
+
+  # Build JSON entry for this query (per-engine mean/stdev/p95, mirroring the CSV columns)
+  json_entry=$(cat <<JSON
+    {
+      "query": "$base",
+      "fastest": "$fastest",
+      "systemds": {
+        "status": "$systemds_status",
+        "error_message": $systemds_error_message,
+        "shell_ms": { "mean": "$sd_shell_mean", "stdev": "$sd_shell_stdev", "p95": "$sd_shell_p95" },
+        "core_ms": { "mean": "$sd_core_mean", "stdev": "$sd_core_stdev", "p95": "$sd_core_p95" }
+      },
+      "postgresql": {
+        "shell_ms": { "mean": "$pg_mean", "stdev": "$pg_stdev", "p95": "$pg_p95" },
+        "core_ms": { "mean": "$pg_core_mean", "stdev": "$pg_core_stdev", "p95": "$pg_core_p95" }
+      },
+      "duckdb": {
+        "shell_ms": { "mean": "$duck_mean", "stdev": "$duck_stdev", "p95": "$duck_p95" },
+        "core_ms": { "mean": "$duck_core_mean", "stdev": "$duck_core_stdev", "p95": "$duck_core_p95" }
+      }
+    }
+JSON
+)
+  # Accumulate entries; the complete JSON document is written after the loop
+  if [[ -n "${JSON_ENTRIES:-}" ]]; then
+    JSON_ENTRIES+=",
+$json_entry"
+  else
+    JSON_ENTRIES="$json_entry"
+  fi
+done
+
+# Write JSON results with run metadata and the per-query entries collected above
+{
+  echo "{"
+  echo "  \"benchmark_type\": \"ssb_multi_engine\","
+  echo "  \"timestamp\": \"$RUN_TIMESTAMP\","
+  echo "  \"hostname\": \"$RUN_HOSTNAME\","
+  echo "  \"seed\": $SEED,"
+  echo "  \"software_versions\": {"
+  echo "    \"systemds\": \"$RUN_SYSTEMDS_VERSION\","
+  echo "    \"jdk\": \"$RUN_JDK_VERSION\","
+  echo "    \"postgresql\": \"$RUN_POSTGRES_VERSION\","
+  echo "    \"duckdb\": \"$RUN_DUCKDB_VERSION\""
+  echo "  },"
+  echo "  \"system_resources\": {"
+  echo "    \"cpu\": \"$RUN_CPU_INFO\","
+  echo "    \"ram\": \"$RUN_RAM_INFO\""
+  echo "  },"
+  echo -e "  \"data_build_info\": $RUN_DATA_INFO,"
+  echo "  \"run_configuration\": {"
+  echo "    \"statistics_enabled\": $(if $RUN_STATS; then echo "true"; else echo "false"; fi),"
+  echo "    \"warmup_runs\": $WARMUP,"
+  echo "    \"repeat_runs\": $REPEATS,"
+  echo "    \"queries_selected\": ${#QUERIES[@]}"
+  echo "  },"
+  echo "  \"results\": ["
+  echo "${JSON_ENTRIES:-}"
+  echo "  ]"
+  echo "}"
+} > "$RESULT_JSON"
+
+echo "Results saved to $RESULT_CSV"
+echo "Results saved to $RESULT_JSON"
diff --git a/scripts/staging/ssb/shell/run_ssb.sh b/scripts/staging/ssb/shell/run_ssb.sh
new file mode 100644
index 00000000000..b7ad9c57d32
--- /dev/null
+++ b/scripts/staging/ssb/shell/run_ssb.sh
@@ -0,0 +1,876 @@
+#!/usr/bin/env bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# SystemDS Star Schema Benchmark (SSB) Runner
+# ===========================================
+#
+# CORE SCRIPTS STATUS:
+# - Version: 1.0 (September 5, 2025)
+# - Status: Production-Ready with Advanced User Experience
+# - First Public Release: September 5, 2025
+#
+# FEATURES IMPLEMENTED:
+# ✓ Basic SSB query execution with SystemDS 3.4.0-SNAPSHOT
+# ✓ Single-threaded configuration for consistent benchmarking
+# ✓ Progress indicators with real-time updates
+# ✓ Comprehensive timing measurements using /usr/bin/time
+# ✓ Query result extraction (scalar and table formats)
+# ✓ Success/failure tracking with detailed reporting
+# ✓ Query summary table with execution status
+# ✓ "See below" notation with result reprinting (NEW)
+# ✓ Long table outputs displayed after summary (NEW)
+# ✓ Error handling with timeout protection
+# ✓ Cross-platform compatibility (macOS/Linux)
+#
+# RECENT IMPORTANT ADDITIONS:
+# - Accepts --input-dir=PATH and forwards it into DML runs as a SystemDS named
+# argument: -nvargs input_dir=/path/to/data (DML can use sys.vinput_dir or
+# the named argument to locate data files instead of hardcoded `data/`).
+# - Fast-fail on missing input directory: the runner verifies the provided
+# input path exists and exits with a clear error message if not.
+# - Runtime SystemDS error detection: test-run output is scanned for runtime
+# error blocks (e.g., "An Error Occurred : ..."). Queries with runtime
+# failures are reported as `status: "error"` and include `error_message`
+# in generated JSON metadata for easier debugging and CI integration.
+#
+# MAJOR FEATURES IN v1.0 (First Public Release):
+# - Complete SSB query execution with SystemDS 3.4.0-SNAPSHOT
+# - Enhanced "see below" notation with result reprinting
+# - Long table outputs displayed after summary for better UX
+# - Eliminated need to scroll back through terminal output
+# - Maintained array alignment for consistent result tracking
+# - JSON metadata contains complete query results, not "see below"
+#   - Added --output-dir option for custom output directory
+# - Multi-format output: TXT, CSV, JSON for each query result
+# - Structured output directory with comprehensive run.json metadata file
+#
+# DEPENDENCIES:
+# - SystemDS binary (3.4.0-SNAPSHOT or later)
+# - Single-threaded configuration file (auto-generated)
+# - SSB query files in scripts/ssb/queries/
+# - Bash 4.0+ with timeout support
+#
+# USAGE (from repo root):
+# scripts/ssb/shell/run_ssb.sh # run all SSB queries
+# scripts/ssb/shell/run_ssb.sh q1.1 q2.3 # run specific queries
+# scripts/ssb/shell/run_ssb.sh --stats # enable internal statistics
+# scripts/ssb/shell/run_ssb.sh q3.1 --stats # run specific query with stats
+# scripts/ssb/shell/run_ssb.sh --seed=12345 # run with specific seed for reproducibility
+#   scripts/ssb/shell/run_ssb.sh --output-dir=/path    # specify output directory for results
+#
+set -euo pipefail
+export LC_ALL=C
+
+# Determine script directory and project root (repo root)
+if command -v realpath >/dev/null 2>&1; then
+ SCRIPT_DIR="$(dirname "$(realpath "$0")")"
+else
+ SCRIPT_DIR="$(python - <<'PY'
+import os, sys
+print(os.path.dirname(os.path.abspath(sys.argv[1])))
+PY
+"$0")"
+fi
+if command -v git >/dev/null 2>&1 && git -C "$SCRIPT_DIR" rev-parse --show-toplevel >/dev/null 2>&1; then
+ PROJECT_ROOT="$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel)"
+else
+ __dir="$SCRIPT_DIR"
+ PROJECT_ROOT=""
+ while [[ "$__dir" != "/" ]]; do
+ if [[ -d "$__dir/.git" || -f "$__dir/pom.xml" ]]; then
+ PROJECT_ROOT="$__dir"; break
+ fi
+ __dir="$(dirname "$__dir")"
+ done
+ : "${PROJECT_ROOT:=$(cd "$SCRIPT_DIR/../../../" && pwd)}"
+fi
+
+# Locate SystemDS executable
+SYSTEMDS_CMD="$PROJECT_ROOT/bin/systemds"
+if [[ ! -x "$SYSTEMDS_CMD" ]]; then
+ SYSTEMDS_CMD="$(command -v systemds || true)"
+fi
+if [[ -z "$SYSTEMDS_CMD" || ! -x "$SYSTEMDS_CMD" ]]; then
+ echo "Error: could not find SystemDS executable." >&2
+ echo " Tried: $PROJECT_ROOT/bin/systemds and PATH" >&2
+ exit 1
+fi
+
+# Ensure single-threaded configuration file exists
+CONF_DIR="$PROJECT_ROOT/conf"
+SINGLE_THREAD_CONF="$CONF_DIR/single_thread.xml"
+mkdir -p "$CONF_DIR"
+if [[ ! -f "$SINGLE_THREAD_CONF" ]]; then
+cat > "$SINGLE_THREAD_CONF" <<'XML'
+<root>
+   <!-- disable multi-threaded (parallel) CP operators -->
+   <sysds.cp.parallel.ops>false</sysds.cp.parallel.ops>
+   <!-- restrict operations to a single thread -->
+   <sysds.num.threads>1</sysds.num.threads>
+</root>
+XML
+fi
+SYS_EXTRA_ARGS=( "-config" "$SINGLE_THREAD_CONF" )
+
+# Query directory
+QUERY_DIR="$PROJECT_ROOT/scripts/ssb/queries"
+
+# Verify query directory exists
+if [[ ! -d "$QUERY_DIR" ]]; then
+ echo "Error: Query directory not found: $QUERY_DIR" >&2
+ exit 1
+fi
+
+# Help function
+show_help() {
+ cat << 'EOF'
+SystemDS Star Schema Benchmark (SSB) Runner v1.0
+
+USAGE (from repo root):
+ scripts/ssb/shell/run_ssb.sh [OPTIONS] [QUERIES...]
+
+OPTIONS:
+ --stats, -stats Enable SystemDS internal statistics collection
+ --seed=N, -seed=N Set random seed for reproducible results (default: auto-generated)
+ --output-dir=PATH, -output-dir=PATH Specify custom output directory (default: $PROJECT_ROOT/scripts/ssb/shell/ssbOutputData/QueryData)
+ --input-dir=PATH, -input-dir=PATH Specify custom data directory (default: $PROJECT_ROOT/data)
+ --help, -help, -h, --h Show this help message
+ --version, -version, -v, --v Show version information
+
+QUERIES:
+ If no queries are specified, all available SSB queries (q*.dml) will be executed.
+ To run specific queries, provide their names (with or without .dml extension):
+    scripts/ssb/shell/run_ssb.sh q1.1 q2.3 q4.1
+
+EXAMPLES (from repo root):
+ scripts/ssb/shell/run_ssb.sh # Run all SSB queries
+ scripts/ssb/shell/run_ssb.sh --stats # Run all queries with statistics
+ scripts/ssb/shell/run_ssb.sh -stats # Same as above (single dash)
+ scripts/ssb/shell/run_ssb.sh q1.1 q2.3 # Run specific queries only
+ scripts/ssb/shell/run_ssb.sh --seed=12345 --stats # Reproducible run with statistics
+ scripts/ssb/shell/run_ssb.sh -seed=12345 -stats # Same as above (single dash)
+ scripts/ssb/shell/run_ssb.sh --output-dir=/tmp/results # Custom output directory
+ scripts/ssb/shell/run_ssb.sh -output-dir=/tmp/results # Same as above (single dash)
+ scripts/ssb/shell/run_ssb.sh --input-dir=/path/to/data # Custom data directory
+ scripts/ssb/shell/run_ssb.sh -input-dir=/path/to/data # Same as above (single dash)
+
+OUTPUT:
+ Results are saved in multiple formats:
+ - TXT: Human-readable format
+ - CSV: Machine-readable data format
+ - JSON: Structured format with metadata
+ - run.json: Complete run metadata and results
+
+For more information, see the documentation in scripts/ssb/README.md
+EOF
+}
+
+# Parse arguments
+RUN_STATS=false
+QUERIES=()
+SEED=""
+OUT_DIR=""
+INPUT_DIR=""
+for arg in "$@"; do
+ if [[ "$arg" == "--help" || "$arg" == "-help" || "$arg" == "-h" || "$arg" == "--h" ]]; then
+ show_help
+ exit 0
+ elif [[ "$arg" == "--version" || "$arg" == "-version" || "$arg" == "-v" || "$arg" == "--v" ]]; then
+ echo "SystemDS Star Schema Benchmark (SSB) Runner v1.0"
+ echo "First Public Release: September 5, 2025"
+ exit 0
+ elif [[ "$arg" == "--stats" || "$arg" == "-stats" ]]; then
+ RUN_STATS=true
+ elif [[ "$arg" == --seed=* || "$arg" == -seed=* ]]; then
+ if [[ "$arg" == --seed=* ]]; then
+ SEED="${arg#--seed=}"
+ else
+ SEED="${arg#-seed=}"
+ fi
+ elif [[ "$arg" == "--seed" || "$arg" == "-seed" ]]; then
+ echo "Error: --seed/-seed requires a value (e.g., --seed=12345 or -seed=12345)" >&2
+ exit 1
+ elif [[ "$arg" == --output-dir=* || "$arg" == -output-dir=* ]]; then
+ if [[ "$arg" == --output-dir=* ]]; then
+ OUT_DIR="${arg#--output-dir=}"
+ else
+ OUT_DIR="${arg#-output-dir=}"
+ fi
+ elif [[ "$arg" == "--output-dir" || "$arg" == "-output-dir" ]]; then
+ echo "Error: --output-dir/-output-dir requires a value (e.g., --output-dir=/path/to/output or -output-dir=/path/to/output)" >&2
+ exit 1
+ elif [[ "$arg" == --input-dir=* || "$arg" == -input-dir=* ]]; then
+ if [[ "$arg" == --input-dir=* ]]; then
+ INPUT_DIR="${arg#--input-dir=}"
+ else
+ INPUT_DIR="${arg#-input-dir=}"
+ fi
+ elif [[ "$arg" == "--input-dir" || "$arg" == "-input-dir" ]]; then
+ echo "Error: --input-dir/-input-dir requires a value (e.g., --input-dir=/path/to/data or -input-dir=/path/to/data)" >&2
+ exit 1
+ else
+ # Check if argument looks like an unrecognized option (starts with dash)
+ if [[ "$arg" == -* ]]; then
+ echo "Error: Unrecognized option '$arg'" >&2
+ echo "Use --help or -h to see available options." >&2
+ exit 1
+ else
+ # Treat as query name
+ name="$(echo "$arg" | tr '.' '_')"
+ QUERIES+=( "$name.dml" )
+ fi
+ fi
+done
+
+# Set default output directory if not provided
+if [[ -z "$OUT_DIR" ]]; then
+ OUT_DIR="$PROJECT_ROOT/scripts/ssb/shell/ssbOutputData/QueryData"
+fi
+
+# Set default input data directory if not provided
+if [[ -z "$INPUT_DIR" ]]; then
+ INPUT_DIR="$PROJECT_ROOT/data"
+fi
+
+# Normalize paths by removing trailing slashes
+INPUT_DIR="${INPUT_DIR%/}"
+OUT_DIR="${OUT_DIR%/}"
+
+# Ensure output directory exists
+mkdir -p "$OUT_DIR"
+
+# Pass input directory to DML scripts via SystemDS named arguments
+NVARGS=( -nvargs "input_dir=${INPUT_DIR}" )
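+# (DML scripts consume this as the named argument $input_dir)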
+
+# Validate input data directory exists
+if [[ ! -d "$INPUT_DIR" ]]; then
+ echo "Error: Input data directory '$INPUT_DIR' does not exist." >&2
+ echo "Please create the directory or specify a valid path with --input-dir=PATH" >&2
+ exit 1
+fi
+
+# Generate seed if not provided
+if [[ -z "$SEED" ]]; then
+ SEED=$((RANDOM * 32768 + RANDOM))
+fi
+
+# Discover queries if none provided
+shopt -s nullglob
+if [[ ${#QUERIES[@]} -eq 0 ]]; then
+ for f in "$QUERY_DIR"/q*.dml; do
+ if [[ -f "$f" ]]; then
+ QUERIES+=("$(basename "$f")")
+ fi
+ done
+ if [[ ${#QUERIES[@]} -eq 0 ]]; then
+ echo "Error: No query files found in $QUERY_DIR" >&2
+ exit 1
+ fi
+fi
+shopt -u nullglob
+
+# Metadata collection functions
+collect_system_metadata() {
+ local timestamp hostname systemds_version jdk_version cpu_info ram_info
+
+ # Basic system info
+ timestamp=$(date -u '+%Y-%m-%d %H:%M:%S UTC')
+ hostname=$(hostname 2>/dev/null || echo "unknown")
+
+ # SystemDS version
+ if [[ -x "$SYSTEMDS_CMD" ]]; then
+ # Try to get version from pom.xml first
+ if [[ -f "$PROJECT_ROOT/pom.xml" ]]; then
+      systemds_version=$(grep -A1 'org.apache.systemds' "$PROJECT_ROOT/pom.xml" | grep '<version>' | sed 's/.*<version>\(.*\)<\/version>.*/\1/' | head -1 2>/dev/null || echo "unknown")
+ else
+ systemds_version="unknown"
+ fi
+
+ # If pom.xml method failed, try alternative methods
+ if [[ "$systemds_version" == "unknown" ]]; then
+ # Try to extract from SystemDS JAR manifest
+ if [[ -f "$PROJECT_ROOT/target/systemds.jar" ]]; then
+ systemds_version=$(unzip -p "$PROJECT_ROOT/target/systemds.jar" META-INF/MANIFEST.MF 2>/dev/null | grep "Implementation-Version" | cut -d: -f2 | tr -d ' ' || echo "unknown")
+ else
+ # Try to find any SystemDS JAR and extract version
+ local jar_file=$(find "$PROJECT_ROOT" -name "systemds*.jar" | head -1 2>/dev/null)
+ if [[ -n "$jar_file" ]]; then
+ systemds_version=$(unzip -p "$jar_file" META-INF/MANIFEST.MF 2>/dev/null | grep "Implementation-Version" | cut -d: -f2 | tr -d ' ' || echo "unknown")
+ else
+ systemds_version="unknown"
+ fi
+ fi
+ fi
+ else
+ systemds_version="unknown"
+ fi
+
+ # JDK version
+ if command -v java >/dev/null 2>&1; then
+ jdk_version=$(java -version 2>&1 | head -1 | sed 's/.*"\(.*\)".*/\1/' || echo "unknown")
+ else
+ jdk_version="unknown"
+ fi
+
+ # System resources
+ if [[ "$(uname)" == "Darwin" ]]; then
+ # macOS
+ cpu_info=$(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "unknown")
+ ram_info=$(( $(sysctl -n hw.memsize 2>/dev/null || echo 0) / 1024 / 1024 / 1024 ))GB
+ else
+ # Linux
+ cpu_info=$(grep "model name" /proc/cpuinfo | head -1 | cut -d: -f2- | sed 's/^ *//' 2>/dev/null || echo "unknown")
+ ram_info=$(( $(grep MemTotal /proc/meminfo | awk '{print $2}' 2>/dev/null || echo 0) / 1024 / 1024 ))GB
+ fi
+
+ # Store metadata globally
+ RUN_TIMESTAMP="$timestamp"
+ RUN_HOSTNAME="$hostname"
+ RUN_SYSTEMDS_VERSION="$systemds_version"
+ RUN_JDK_VERSION="$jdk_version"
+ RUN_CPU_INFO="$cpu_info"
+ RUN_RAM_INFO="$ram_info"
+}
+
+collect_data_metadata() {
+ # Check for SSB data directory and get basic stats
+ local ssb_data_dir="$INPUT_DIR"
+ local json_parts=()
+ local display_parts=()
+
+ if [[ -d "$ssb_data_dir" ]]; then
+ # Try to get row counts from data files (if they exist)
+ for table in customer part supplier date; do
+ local file="$ssb_data_dir/${table}.tbl"
+ if [[ -f "$file" ]]; then
+ local count=$(wc -l < "$file" 2>/dev/null | tr -d ' ' || echo "0")
+ json_parts+=(" \"$table\": \"$count\"")
+ display_parts+=("$table:$count")
+ fi
+ done
+ # Check for any lineorder*.tbl file (SSB fact table)
+ local lineorder_file=$(find "$ssb_data_dir" -name "lineorder*.tbl" -type f | head -1)
+ if [[ -n "$lineorder_file" && -f "$lineorder_file" ]]; then
+ local count=$(wc -l < "$lineorder_file" 2>/dev/null | tr -d ' ' || echo "0")
+ json_parts+=(" \"lineorder\": \"$count\"")
+ display_parts+=("lineorder:$count")
+ fi
+ fi
+
+ if [[ ${#json_parts[@]} -eq 0 ]]; then
+ RUN_DATA_INFO='"No data files found"'
+ RUN_DATA_DISPLAY="No data files found"
+ else
+ # Join array elements with commas and newlines, wrap in braces for JSON
+ local formatted_json="{\n"
+ for i in "${!json_parts[@]}"; do
+ formatted_json+="${json_parts[$i]}"
+ if [[ $i -lt $((${#json_parts[@]} - 1)) ]]; then
+ formatted_json+=",\n"
+ else
+ formatted_json+="\n"
+ fi
+ done
+ formatted_json+=" }"
+ RUN_DATA_INFO="$formatted_json"
+
+ # Join with spaces for display
+ local IFS=" "
+ RUN_DATA_DISPLAY="${display_parts[*]}"
+ fi
+}
+
+# Output format functions
+create_output_structure() {
+ local run_id="$1"
+ local base_dir="$OUT_DIR/ssb_run_$run_id"
+
+ # Create output directory structure
+ mkdir -p "$base_dir"/{txt,csv,json}
+
+ # Set global variables for output paths
+ OUTPUT_BASE_DIR="$base_dir"
+ OUTPUT_TXT_DIR="$base_dir/txt"
+ OUTPUT_CSV_DIR="$base_dir/csv"
+ OUTPUT_JSON_DIR="$base_dir/json"
+ OUTPUT_METADATA_FILE="$base_dir/run.json"
+}
+
+save_query_result_txt() {
+ local query_name="$1"
+ local result_data="$2"
+ local output_file="$OUTPUT_TXT_DIR/${query_name}.txt"
+
+ {
+ echo "========================================="
+ echo "SSB Query: $query_name"
+ echo "========================================="
+ echo "Timestamp: $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
+ echo "Seed: $SEED"
+ echo ""
+ echo "Result:"
+ echo "---------"
+ echo "$result_data"
+ echo ""
+ echo "========================================="
+ } > "$output_file"
+}
+
+save_query_result_csv() {
+ local query_name="$1"
+ local result_data="$2"
+ local output_file="$OUTPUT_CSV_DIR/${query_name}.csv"
+
+ # Check if result is a single scalar value (including negative numbers and scientific notation)
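+  # e.g. 687752409, -1.5, or 1.2e+06 count as scalars; anything else is treated as a table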
+ if [[ "$result_data" =~ ^-?[0-9]+(\.[0-9]+)?([eE][+-]?[0-9]+)?$ ]]; then
+ # Scalar result
+ {
+ echo "query,result"
+ echo "$query_name,$result_data"
+ } > "$output_file"
+ else
+ # Table result - try to convert to CSV format
+ {
+ echo "# SSB Query: $query_name"
+ echo "# Timestamp: $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
+ echo "# Seed: $SEED"
+ # Convert space-separated table data to CSV
+      echo "$result_data" | sed 's/  */,/g' | sed 's/^,//g' | sed 's/,$//g'
+ } > "$output_file"
+ fi
+}
+
+save_query_result_json() {
+ local query_name="$1"
+ local result_data="$2"
+ local output_file="$OUTPUT_JSON_DIR/${query_name}.json"
+
+ # Escape quotes and special characters for JSON
+ local escaped_result=$(echo "$result_data" | sed 's/\\/\\\\/g' | sed 's/"/\\"/g' | tr '\n' ' ')
+
+ {
+ echo "{"
+ echo " \"query\": \"$query_name\","
+ echo " \"timestamp\": \"$(date -u '+%Y-%m-%d %H:%M:%S UTC')\","
+ echo " \"seed\": $SEED,"
+ echo " \"result\": \"$escaped_result\","
+ echo " \"metadata\": {"
+ echo " \"systemds_version\": \"$RUN_SYSTEMDS_VERSION\","
+ echo " \"hostname\": \"$RUN_HOSTNAME\""
+ echo " }"
+ echo "}"
+ } > "$output_file"
+}
+
+save_all_formats() {
+ local query_name="$1"
+ local result_data="$2"
+
+ save_query_result_txt "$query_name" "$result_data"
+ save_query_result_csv "$query_name" "$result_data"
+ save_query_result_json "$query_name" "$result_data"
+}
+
+# Collect metadata
+collect_system_metadata
+collect_data_metadata
+
+# Create output directory structure with timestamp-based run ID
+RUN_ID="$(date +%Y%m%d_%H%M%S)"
+create_output_structure "$RUN_ID"
+
+# Execute queries
+count=0
+failed=0
+SUCCESSFUL_QUERIES=() # Array to track successfully executed queries
+ALL_RUN_QUERIES=() # Array to track all queries that were attempted (in order)
+QUERY_STATUS=() # Array to track status: "success" or "error"
+QUERY_ERROR_MSG=() # Array to store error messages for failed queries
+QUERY_RESULTS=() # Array to track query results for display
+QUERY_FULL_RESULTS=() # Array to track complete query results for JSON
+QUERY_STATS=() # Array to track SystemDS statistics for JSON
+QUERY_TIMINGS=() # Array to track execution timing statistics
+LONG_OUTPUTS=() # Array to store long table outputs for display after summary
+
+# Progress indicator function
+progress_indicator() {
+ local query_name="$1"
+ local current="$2"
+ local total="$3"
+ echo -ne "\r[$current/$total] Running: $query_name "
+}
+
+for q in "${QUERIES[@]}"; do
+ dml="$QUERY_DIR/$q"
+ if [[ ! -f "$dml" ]]; then
+ echo "Warning: query file '$dml' not found; skipping." >&2
+ continue
+ fi
+
+ # Show progress
+ progress_indicator "$q" "$((count + failed + 1))" "${#QUERIES[@]}"
+
+ # Change to project root directory so relative paths in DML work correctly
+ cd "$PROJECT_ROOT"
+
+ # Clear progress line before showing output
+ echo -ne "\r \r"
+ echo "[$((count + failed + 1))/${#QUERIES[@]}] Running: $q"
+
+ # Record attempted query
+ ALL_RUN_QUERIES+=("$q")
+
+ if $RUN_STATS; then
+ # Capture output to extract result
+ temp_output=$(mktemp)
+ if "$SYSTEMDS_CMD" "$dml" -stats "${SYS_EXTRA_ARGS[@]}" "${NVARGS[@]}" | tee "$temp_output"; then
+ # Even when SystemDS exits 0, the DML can emit runtime errors. Detect common error markers.
+ error_msg=$(sed -n '/An Error Occurred :/,$ p' "$temp_output" | sed -n '1,200p' | tr '\n' ' ' | sed 's/^ *//;s/ *$//')
+ if [[ -n "$error_msg" ]]; then
+ echo "Error: Query $q reported runtime error" >&2
+ echo "$error_msg" >&2
+ failed=$((failed+1))
+ QUERY_STATUS+=("error")
+ QUERY_ERROR_MSG+=("$error_msg")
+ # Maintain array alignment
+ QUERY_STATS+=("")
+ QUERY_RESULTS+=("N/A")
+ QUERY_FULL_RESULTS+=("N/A")
+ LONG_OUTPUTS+=("")
+ else
+ count=$((count+1))
+ SUCCESSFUL_QUERIES+=("$q") # Track successful query
+ QUERY_STATUS+=("success")
+ # Extract result - try multiple patterns with timeouts to prevent hanging:
+ # 1. Simple scalar pattern like "REVENUE: 687752409"
+ result=$(timeout 5s grep -E "^[A-Z_]+:\s*[0-9]+" "$temp_output" | tail -1 | awk '{print $2}' 2>/dev/null || true)
+ full_result="$result" # For scalar results, display and full results are the same
+
+ # 2. If no scalar pattern, check for table output and get row count
+ if [[ -z "$result" ]]; then
+ # Look for frame info like "# FRAME: nrow = 53, ncol = 3"
+ nrows=$(timeout 5s grep "# FRAME: nrow =" "$temp_output" | awk '{print $5}' | tr -d ',' 2>/dev/null || true)
+ if [[ -n "$nrows" ]]; then
+ result="${nrows} rows (see below)"
+ # Extract and store the long output for later display (excluding statistics)
+ long_output=$(grep -v "^#" "$temp_output" | grep -v "WARNING" | grep -v "WARN" | grep -v "^$" | sed '/^SystemDS Statistics:/,$ d')
+ LONG_OUTPUTS+=("$long_output")
+ # For JSON, store the actual table data
+ full_result="$long_output"
+ else
+ # Count actual data rows (lines with numbers, excluding headers and comments) - limit to prevent hanging
+ nrows=$(timeout 5s grep -E "^[0-9]" "$temp_output" | sed '/^SystemDS Statistics:/,$ d' | head -1000 | wc -l | tr -d ' ' 2>/dev/null || echo "0")
+ if [[ "$nrows" -gt 0 ]]; then
+ result="${nrows} rows (see below)"
+ # Extract and store the long output for later display (excluding statistics)
+ long_output=$(grep -E "^[0-9]" "$temp_output" | sed '/^SystemDS Statistics:/,$ d' | head -1000)
+ LONG_OUTPUTS+=("$long_output")
+ # For JSON, store the actual table data
+ full_result="$long_output"
+ else
+ result="N/A"
+ full_result="N/A"
+ LONG_OUTPUTS+=("") # Empty placeholder to maintain array alignment
+ fi
+ fi
+ else
+ LONG_OUTPUTS+=("") # Empty placeholder for scalar results to maintain array alignment
+ fi
+ QUERY_RESULTS+=("$result") # Track query result for display
+ QUERY_FULL_RESULTS+=("$full_result") # Track complete query result for JSON
+
+ # Save result in all formats
+ query_name_clean="${q%.dml}"
+
+ # Extract and store statistics for JSON (preserving newlines)
+ stats_output=$(sed -n '/^SystemDS Statistics:/,$ p' "$temp_output")
+ QUERY_STATS+=("$stats_output") # Track statistics for JSON
+
+ save_all_formats "$query_name_clean" "$full_result"
+ fi
+ else
+ echo "Error: Query $q failed" >&2
+ failed=$((failed+1))
+ QUERY_STATUS+=("error")
+ QUERY_ERROR_MSG+=("Query execution failed (non-zero exit)")
+ # Add empty stats entry for failed queries to maintain array alignment
+ QUERY_STATS+=("")
+ fi
+ rm -f "$temp_output"
+ else
+ # Capture output to extract result
+ temp_output=$(mktemp)
+ if "$SYSTEMDS_CMD" "$dml" "${SYS_EXTRA_ARGS[@]}" "${NVARGS[@]}" | tee "$temp_output"; then
+ # Detect runtime errors in output even if command returned 0
+ error_msg=$(sed -n '/An Error Occurred :/,$ p' "$temp_output" | sed -n '1,200p' | tr '\n' ' ' | sed 's/^ *//;s/ *$//')
+ if [[ -n "$error_msg" ]]; then
+ echo "Error: Query $q reported runtime error" >&2
+ echo "$error_msg" >&2
+ failed=$((failed+1))
+ QUERY_STATUS+=("error")
+ QUERY_ERROR_MSG+=("$error_msg")
+ QUERY_STATS+=("")
+ QUERY_RESULTS+=("N/A")
+ QUERY_FULL_RESULTS+=("N/A")
+ LONG_OUTPUTS+=("")
+ else
+ count=$((count+1))
+ SUCCESSFUL_QUERIES+=("$q") # Track successful query
+ QUERY_STATUS+=("success")
+ # Extract result - try multiple patterns with timeouts to prevent hanging:
+ # 1. Simple scalar pattern like "REVENUE: 687752409"
+ result=$(timeout 5s grep -E "^[A-Z_]+:\s*[0-9]+" "$temp_output" | tail -1 | awk '{print $2}' 2>/dev/null || true)
+ full_result="$result" # For scalar results, display and full results are the same
+
+ # 2. If no scalar pattern, check for table output and get row count
+ if [[ -z "$result" ]]; then
+ # Look for frame info like "# FRAME: nrow = 53, ncol = 3"
+ nrows=$(timeout 5s grep "# FRAME: nrow =" "$temp_output" | awk '{print $5}' | tr -d ',' 2>/dev/null || true)
+ if [[ -n "$nrows" ]]; then
+ result="${nrows} rows (see below)"
+ # Extract and store the long output for later display
+ long_output=$(grep -v "^#" "$temp_output" | grep -v "WARNING" | grep -v "WARN" | grep -v "^$" | tail -n +1)
+ LONG_OUTPUTS+=("$long_output")
+ # For JSON, store the actual table data
+ full_result="$long_output"
+ else
+ # Count actual data rows (lines with numbers, excluding headers and comments) - limit to prevent hanging
+ nrows=$(timeout 5s grep -E "^[0-9]" "$temp_output" | head -1000 | wc -l | tr -d ' ' 2>/dev/null || echo "0")
+ if [[ "$nrows" -gt 0 ]]; then
+ result="${nrows} rows (see below)"
+ # Extract and store the long output for later display
+ long_output=$(grep -E "^[0-9]" "$temp_output" | head -1000)
+ LONG_OUTPUTS+=("$long_output")
+ # For JSON, store the actual table data
+ full_result="$long_output"
+ else
+ result="N/A"
+ full_result="N/A"
+ LONG_OUTPUTS+=("") # Empty placeholder to maintain array alignment
+ fi
+ fi
+ else
+ LONG_OUTPUTS+=("") # Empty placeholder for scalar results to maintain array alignment
+ fi
+ QUERY_RESULTS+=("$result") # Track query result for display
+ QUERY_FULL_RESULTS+=("$full_result") # Track complete query result for JSON
+
+ # Add empty stats entry for non-stats runs to maintain array alignment
+ QUERY_STATS+=("")
+
+ # Save result in all formats
+ query_name_clean="${q%.dml}"
+ save_all_formats "$query_name_clean" "$full_result"
+ fi
+ else
+ echo "Error: Query $q failed" >&2
+ failed=$((failed+1))
+ QUERY_STATUS+=("error")
+ QUERY_ERROR_MSG+=("Query execution failed (non-zero exit)")
+ # Add empty stats entry for failed queries to maintain array alignment
+ QUERY_STATS+=("")
+ fi
+ rm -f "$temp_output"
+ fi
+done
+
+# Summary
+echo ""
+echo "========================================="
+echo "SSB benchmark completed!"
+echo "Total queries executed: $count"
+if [[ $failed -gt 0 ]]; then
+ echo "Failed queries: $failed"
+fi
+if $RUN_STATS; then
+ echo "Statistics: enabled"
+else
+ echo "Statistics: disabled"
+fi
+
+# Display run metadata summary
+echo ""
+echo "========================================="
+echo "RUN METADATA SUMMARY"
+echo "========================================="
+echo "Timestamp: $RUN_TIMESTAMP"
+echo "Hostname: $RUN_HOSTNAME"
+echo "Seed: $SEED"
+echo ""
+echo "Software Versions:"
+echo " SystemDS: $RUN_SYSTEMDS_VERSION"
+echo " JDK: $RUN_JDK_VERSION"
+echo ""
+echo "System Resources:"
+echo " CPU: $RUN_CPU_INFO"
+echo " RAM: $RUN_RAM_INFO"
+echo ""
+echo "Data Build Info:"
+echo " SSB Data: $RUN_DATA_DISPLAY"
+echo "========================================="
+
+# Generate metadata JSON file (include all attempted queries with status and error messages)
+{
+ echo "{"
+ echo " \"benchmark_type\": \"ssb_systemds\","
+ echo " \"timestamp\": \"$RUN_TIMESTAMP\","
+ echo " \"hostname\": \"$RUN_HOSTNAME\","
+ echo " \"seed\": $SEED,"
+ echo " \"software_versions\": {"
+ echo " \"systemds\": \"$RUN_SYSTEMDS_VERSION\","
+ echo " \"jdk\": \"$RUN_JDK_VERSION\""
+ echo " },"
+ echo " \"system_resources\": {"
+ echo " \"cpu\": \"$RUN_CPU_INFO\","
+ echo " \"ram\": \"$RUN_RAM_INFO\""
+ echo " },"
+ echo -e " \"data_build_info\": $RUN_DATA_INFO,"
+ echo " \"run_configuration\": {"
+ echo " \"statistics_enabled\": $(if $RUN_STATS; then echo "true"; else echo "false"; fi),"
+ echo " \"queries_selected\": ${#QUERIES[@]},"
+ echo " \"queries_executed\": $count,"
+ echo " \"queries_failed\": $failed"
+ echo " },"
+ echo " \"results\": ["
+ for i in "${!ALL_RUN_QUERIES[@]}"; do
+ query="${ALL_RUN_QUERIES[$i]}"
+ status="${QUERY_STATUS[$i]:-error}"
+ error_msg="${QUERY_ERROR_MSG[$i]:-}"
+ # Find matching full_result and stats by searching SUCCESSFUL_QUERIES index
+ full_result=""
+ stats_result=""
+ if [[ "$status" == "success" ]]; then
+ # Find index in SUCCESSFUL_QUERIES
+ for j in "${!SUCCESSFUL_QUERIES[@]}"; do
+ if [[ "${SUCCESSFUL_QUERIES[$j]}" == "$query" ]]; then
+ full_result="${QUERY_FULL_RESULTS[$j]}"
+ stats_result="${QUERY_STATS[$j]}"
+ break
+ fi
+ done
+ fi
+ # Escape quotes and newlines for JSON
+ escaped_result=$(echo "$full_result" | sed 's/\\/\\\\/g' | sed 's/"/\\"/g' | tr '\n' ' ')
+ escaped_error=$(echo "$error_msg" | sed 's/\\/\\\\/g' | sed 's/"/\\"/g' | tr '\n' ' ')
+
+ echo " {"
+ echo " \"query\": \"${query%.dml}\","
+ echo " \"status\": \"$status\","
+ echo " \"error_message\": \"$escaped_error\","
+ echo " \"result\": \"$escaped_result\""
+ if [[ -n "$stats_result" ]]; then
+ echo " ,\"stats\": ["
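+      # Emit each escaped statistics line as a quoted, comma-separated JSON array element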
+ echo "$stats_result" | sed 's/\\/\\\\/g' | sed 's/"/\\"/g' | sed 's/\t/ /g' | awk '
+ BEGIN { first = 1 }
+ {
+ if (!first) printf ",\n"
+ printf " \"%s\"", $0
+ first = 0
+ }
+ END { if (!first) printf "\n" }
+ '
+ echo " ]"
+ fi
+ if [[ $i -lt $((${#ALL_RUN_QUERIES[@]} - 1)) ]]; then
+ echo " },"
+ else
+ echo " }"
+ fi
+ done
+ echo " ]"
+ echo "}"
+} > "$OUTPUT_METADATA_FILE"
+
+echo ""
+echo "Metadata saved to $OUTPUT_METADATA_FILE"
+echo "Output directory: $OUTPUT_BASE_DIR"
+echo " - TXT files: $OUTPUT_TXT_DIR"
+echo " - CSV files: $OUTPUT_CSV_DIR"
+echo " - JSON files: $OUTPUT_JSON_DIR"
+
+# Detailed per-query summary (show status and error messages if any)
+if [[ ${#ALL_RUN_QUERIES[@]} -gt 0 ]]; then
+ echo ""
+ echo "==================================================="
+ echo "QUERIES SUMMARY"
+ echo "==================================================="
+ printf "%-4s %-15s %-30s %s\n" "No." "Query" "Result" "Status"
+ echo "---------------------------------------------------"
+ for i in "${!ALL_RUN_QUERIES[@]}"; do
+ query="${ALL_RUN_QUERIES[$i]}"
+ query_display="${query%.dml}" # Remove .dml extension for display
+ status="${QUERY_STATUS[$i]:-error}"
+ if [[ "$status" == "success" ]]; then
+ # Find index in SUCCESSFUL_QUERIES to fetch result
+ result=""
+ for j in "${!SUCCESSFUL_QUERIES[@]}"; do
+ if [[ "${SUCCESSFUL_QUERIES[$j]}" == "$query" ]]; then
+ result="${QUERY_RESULTS[$j]}"
+ break
+ fi
+ done
+ printf "%-4d %-15s %-30s %s\n" "$((i+1))" "$query_display" "$result" "✓ Success"
+ else
+ err="${QUERY_ERROR_MSG[$i]:-Unknown error}"
+ printf "%-4d %-15s %-30s %s\n" "$((i+1))" "$query_display" "N/A" "ERROR: ${err}"
+ fi
+ done
+echo "==================================================="
+fi
+
+# Display long outputs for queries that had table results
+if [[ ${#SUCCESSFUL_QUERIES[@]} -gt 0 ]]; then
+ # Check if we have any long outputs to display
+ has_long_outputs=false
+ for i in "${!LONG_OUTPUTS[@]}"; do
+ if [[ -n "${LONG_OUTPUTS[$i]}" ]]; then
+ has_long_outputs=true
+ break
+ fi
+ done
+
+ if $has_long_outputs; then
+ echo ""
+ echo "========================================="
+ echo "DETAILED QUERY RESULTS"
+ echo "========================================="
+ for i in "${!SUCCESSFUL_QUERIES[@]}"; do
+ if [[ -n "${LONG_OUTPUTS[$i]}" ]]; then
+ query="${SUCCESSFUL_QUERIES[$i]}"
+ query_display="${query%.dml}" # Remove .dml extension for display
+ echo ""
+ echo "[$((i+1))] Results for $query_display:"
+ echo "----------------------------------------"
+ echo "${LONG_OUTPUTS[$i]}"
+ echo "----------------------------------------"
+ fi
+ done
+ echo "========================================="
+ fi
+fi
+
+# Exit with appropriate code
+if [[ $failed -gt 0 ]]; then
+ exit 1
+fi
diff --git a/src/main/java/org/apache/sysds/api/DMLOptions.java b/src/main/java/org/apache/sysds/api/DMLOptions.java
index 917aecc4ab3..10c41e3d0a8 100644
--- a/src/main/java/org/apache/sysds/api/DMLOptions.java
+++ b/src/main/java/org/apache/sysds/api/DMLOptions.java
@@ -19,6 +19,10 @@
package org.apache.sysds.api;
+import java.nio.file.Files;
+import java.nio.file.InvalidPathException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;
@@ -66,6 +70,10 @@ public class DMLOptions {
public boolean gpu = false; // Whether to use the GPU
public boolean forceGPU = false; // Whether to ignore memory & estimates and always use the GPU
public boolean ooc = false; // Whether to use the OOC backend
+ public boolean oocLogEvents = false; // Whether to record I/O and task compute events (fine-grained; may impact performance with many small tasks)
+ public String oocLogPath = "./"; // Directory in which to save the recorded event logs (CSV)
+ public boolean oocStats = false; // Whether to record and print coarse-grained OOC statistics
+ public int oocStatsCount = 10; // Default number of OOC heavy hitters to report
public boolean debug = false; // to go into debug mode to be able to step through a program
public String filePath = null; // path to script
public String script = null; // the script itself
@@ -105,7 +113,11 @@ public String toString() {
", fedStats=" + fedStats +
", fedStatsCount=" + fedStatsCount +
", fedMonitoring=" + fedMonitoring +
- ", fedMonitoringAddress" + fedMonitoringAddress +
+ ", fedMonitoringAddress=" + fedMonitoringAddress +
+ ", oocStats=" + oocStats +
+ ", oocStatsCount=" + oocStatsCount +
+ ", oocLogEvents=" + oocLogEvents +
+ ", oocLogPath=" + oocLogPath +
", memStats=" + memStats +
", explainType=" + explainType +
", execMode=" + execMode +
@@ -193,7 +205,7 @@ else if (lineageType.equalsIgnoreCase("debugger"))
else if (execMode.equalsIgnoreCase("hybrid")) dmlOptions.execMode = ExecMode.HYBRID;
else if (execMode.equalsIgnoreCase("spark")) dmlOptions.execMode = ExecMode.SPARK;
else throw new org.apache.commons.cli.ParseException("Invalid argument specified for -exec option, must be one of [hadoop, singlenode, hybrid, HYBRID, spark]");
- }
+ }
if (line.hasOption("explain")) {
dmlOptions.explainType = ExplainType.RUNTIME;
String explainType = line.getOptionValue("explain");
@@ -259,6 +271,33 @@ else if (lineageType.equalsIgnoreCase("debugger"))
}
}
+ dmlOptions.oocStats = line.hasOption("oocStats");
+ if (dmlOptions.oocStats) {
+ String oocStatsCount = line.getOptionValue("oocStats");
+ if (oocStatsCount != null) {
+ try {
+ dmlOptions.oocStatsCount = Integer.parseInt(oocStatsCount);
+ } catch (NumberFormatException e) {
+ throw new org.apache.commons.cli.ParseException("Invalid argument specified for -oocStats option, must be a valid integer");
+ }
+ }
+ }
+
+ dmlOptions.oocLogEvents = line.hasOption("oocLogEvents");
+ if (dmlOptions.oocLogEvents) {
+ String eventLogPath = line.getOptionValue("oocLogEvents");
+ if (eventLogPath != null) {
+ try {
+ Path p = Paths.get(eventLogPath);
+ if (!Files.isDirectory(p))
+ throw new org.apache.commons.cli.ParseException("Invalid argument specified for -oocLogEvents option, must be a valid directory");
+ } catch (InvalidPathException e) {
+ throw new org.apache.commons.cli.ParseException("Invalid argument specified for -oocLogEvents option, must be a valid path");
+ }
+ dmlOptions.oocLogPath = eventLogPath;
+ }
+ }
+
dmlOptions.memStats = line.hasOption("mem");
dmlOptions.clean = line.hasOption("clean");
@@ -387,6 +426,12 @@ private static Options createCLIOptions() {
Option fedStatsOpt = OptionBuilder.withArgName("count")
.withDescription("monitors and reports summary execution statistics of federated workers; heavy hitter is 10 unless overridden; default off")
.hasOptionalArg().create("fedStats");
+ Option oocStatsOpt = OptionBuilder
+ .withDescription("monitors and reports summary execution statistics of OOC operators and tasks; heavy hitter is 10 unless overridden; default off")
+ .hasOptionalArg().create("oocStats");
+ Option oocLogEventsOpt = OptionBuilder
+ .withDescription("records fine-grained events of compute tasks, I/O, and cache; -oocLogEvents [dir='./']")
+ .hasOptionalArg().create("oocLogEvents");
Option memOpt = OptionBuilder.withDescription("monitors and reports max memory consumption in CP; default off")
.create("mem");
Option explainOpt = OptionBuilder.withArgName("level")
@@ -452,6 +497,8 @@ private static Options createCLIOptions() {
options.addOption(statsOpt);
options.addOption(ngramsOpt);
options.addOption(fedStatsOpt);
+ options.addOption(oocStatsOpt);
+ options.addOption(oocLogEventsOpt);
options.addOption(memOpt);
options.addOption(explainOpt);
options.addOption(execOpt);
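
For reference, a minimal sketch (not part of the patch) of how the new OOC flags are expected to surface on the parsed options, assuming the existing DMLOptions.parseCLIArguments entry point that DMLScript already uses; the script name and log directory below are hypothetical.

import org.apache.sysds.api.DMLOptions;

public class OocOptionsSketch {
	public static void main(String[] args) throws Exception {
		String[] cli = new String[] {
			"-f", "script.dml",    // hypothetical DML script
			"-ooc",                // enable the OOC backend
			"-oocStats", "20",     // optional count, overrides the default heavy-hitter count of 10
			"-oocLogEvents", "./"  // optional directory, must already exist; defaults to "./"
		};
		DMLOptions opts = DMLOptions.parseCLIArguments(cli);
		// expected: oocStats=true, oocStatsCount=20, oocLogEvents=true, oocLogPath=./
		System.out.println(opts.oocStats + " " + opts.oocStatsCount);
		System.out.println(opts.oocLogEvents + " " + opts.oocLogPath);
	}
}
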
diff --git a/src/main/java/org/apache/sysds/api/DMLScript.java b/src/main/java/org/apache/sysds/api/DMLScript.java
index 65805b5c2ed..81acb9deacd 100644
--- a/src/main/java/org/apache/sysds/api/DMLScript.java
+++ b/src/main/java/org/apache/sysds/api/DMLScript.java
@@ -75,6 +75,7 @@
import org.apache.sysds.runtime.lineage.LineageCacheConfig;
import org.apache.sysds.runtime.lineage.LineageCacheConfig.LineageCachePolicy;
import org.apache.sysds.runtime.lineage.LineageCacheConfig.ReuseCacheType;
+import org.apache.sysds.runtime.ooc.stats.OOCEventLog;
import org.apache.sysds.runtime.util.CommonThreadPool;
import org.apache.sysds.runtime.util.HDFSTool;
import org.apache.sysds.runtime.util.LocalFileUtils;
@@ -149,6 +150,12 @@ public class DMLScript
public static boolean SYNCHRONIZE_GPU = true;
// Set OOC backend
public static boolean USE_OOC = DMLOptions.defaultOptions.ooc;
+ // Record and print OOC statistics
+ public static boolean OOC_STATISTICS = DMLOptions.defaultOptions.oocStats;
+ public static int OOC_STATISTICS_COUNT = DMLOptions.defaultOptions.oocStatsCount;
+ // Record and save fine-grained OOC event logs as CSV to the specified directory
+ public static boolean OOC_LOG_EVENTS = DMLOptions.defaultOptions.oocLogEvents;
+ public static String OOC_LOG_PATH = DMLOptions.defaultOptions.oocLogPath;
// Enable eager CUDA free on rmvar
public static boolean EAGER_CUDA_FREE = false;
@@ -272,6 +279,10 @@ public static boolean executeScript( String[] args )
USE_ACCELERATOR = dmlOptions.gpu;
FORCE_ACCELERATOR = dmlOptions.forceGPU;
USE_OOC = dmlOptions.ooc;
+ OOC_STATISTICS = dmlOptions.oocStats;
+ OOC_STATISTICS_COUNT = dmlOptions.oocStatsCount;
+ OOC_LOG_EVENTS = dmlOptions.oocLogEvents;
+ OOC_LOG_PATH = dmlOptions.oocLogPath;
EXPLAIN = dmlOptions.explainType;
EXEC_MODE = dmlOptions.execMode;
LINEAGE = dmlOptions.lineage;
@@ -323,11 +334,14 @@ public static boolean executeScript( String[] args )
LineageCacheConfig.setCachePolicy(LINEAGE_POLICY);
LineageCacheConfig.setEstimator(LINEAGE_ESTIMATE);
+ if (dmlOptions.oocLogEvents)
+ OOCEventLog.setup(100000);
+
String dmlScriptStr = readDMLScript(isFile, fileOrScript);
Map<String, String> argVals = dmlOptions.argVals;
DML_FILE_PATH_ANTLR_PARSER = dmlOptions.filePath;
-
+
//Step 3: invoke dml script
printInvocationInfo(fileOrScript, fnameOptConfig, argVals);
execute(dmlScriptStr, fnameOptConfig, argVals, args);
diff --git a/src/main/java/org/apache/sysds/api/PythonDMLScript.java b/src/main/java/org/apache/sysds/api/PythonDMLScript.java
index 3b1864d71dd..1a74ba0ea49 100644
--- a/src/main/java/org/apache/sysds/api/PythonDMLScript.java
+++ b/src/main/java/org/apache/sysds/api/PythonDMLScript.java
@@ -1,18 +1,18 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
+ * or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
+ * KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
@@ -24,9 +24,11 @@
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.sysds.api.jmlc.Connection;
+import org.apache.sysds.common.Types.ValueType;
-import org.apache.sysds.common.Types;
import org.apache.sysds.runtime.DMLRuntimeException;
+import org.apache.sysds.runtime.frame.data.FrameBlock;
+import org.apache.sysds.runtime.frame.data.columns.Array;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
import org.apache.sysds.runtime.util.CommonThreadPool;
import org.apache.sysds.runtime.util.UnixPipeUtils;
@@ -79,7 +81,7 @@ public static void main(String[] args) throws Exception {
* therefore use logging framework. and terminate program.
*/
LOG.info("failed startup", p4e);
- System.exit(-1);
+ exitHandler.exit(-1);
}
catch(Exception e) {
throw new DMLException("Failed startup and maintaining Python gateway", e);
@@ -116,59 +118,59 @@ public void openPipes(String path, int num) throws IOException {
}
}
- public MatrixBlock startReadingMbFromPipe(int id, int rlen, int clen, Types.ValueType type) throws IOException {
+ public MatrixBlock startReadingMbFromPipe(int id, int rlen, int clen, ValueType type) throws IOException {
long limit = (long) rlen * clen;
LOG.debug("trying to read matrix from "+id+" with "+rlen+" rows and "+clen+" columns. Total size: "+limit);
if(limit > Integer.MAX_VALUE)
throw new DMLRuntimeException("Dense NumPy array of size " + limit +
" cannot be converted to MatrixBlock");
- MatrixBlock mb = new MatrixBlock(rlen, clen, false, -1);
+ MatrixBlock mb;
if(fromPython != null){
BufferedInputStream pipe = fromPython.get(id);
double[] denseBlock = new double[(int) limit];
- UnixPipeUtils.readNumpyArrayInBatches(pipe, id, BATCH_SIZE, (int) limit, type, denseBlock, 0);
- mb.init(denseBlock, rlen, clen);
+ long nnz = UnixPipeUtils.readNumpyArrayInBatches(pipe, id, BATCH_SIZE, (int) limit, type, denseBlock, 0);
+ mb = new MatrixBlock(rlen, clen, denseBlock);
+ mb.setNonZeros(nnz);
} else {
throw new DMLRuntimeException("FIFO Pipes are not initialized.");
}
- mb.recomputeNonZeros();
- mb.examSparsity();
LOG.debug("Reading from Python finished");
+ mb.examSparsity();
return mb;
}
- public MatrixBlock startReadingMbFromPipes(int[] blockSizes, int rlen, int clen, Types.ValueType type) throws ExecutionException, InterruptedException {
+ public MatrixBlock startReadingMbFromPipes(int[] blockSizes, int rlen, int clen, ValueType type) throws ExecutionException, InterruptedException {
long limit = (long) rlen * clen;
if(limit > Integer.MAX_VALUE)
throw new DMLRuntimeException("Dense NumPy array of size " + limit +
" cannot be converted to MatrixBlock");
- MatrixBlock mb = new MatrixBlock(rlen, clen, false, -1);
+ MatrixBlock mb = new MatrixBlock(rlen, clen, false, rlen*clen);
if(fromPython != null){
ExecutorService pool = CommonThreadPool.get();
double[] denseBlock = new double[(int) limit];
int offsetOut = 0;
- List<Future<?>> futures = new ArrayList<>();
+ List<Future<Long>> futures = new ArrayList<>();
for (int i = 0; i < blockSizes.length; i++) {
BufferedInputStream pipe = fromPython.get(i);
int id = i, blockSize = blockSizes[i], _offsetOut = offsetOut;
- Callable<Object> task = () -> {
- UnixPipeUtils.readNumpyArrayInBatches(pipe, id, BATCH_SIZE, blockSize, type, denseBlock, _offsetOut);
- return null;
+ Callable<Long> task = () -> {
+ return UnixPipeUtils.readNumpyArrayInBatches(pipe, id, BATCH_SIZE, blockSize, type, denseBlock, _offsetOut);
};
futures.add(pool.submit(task));
offsetOut += blockSize;
}
- // Wait for all tasks and propagate exceptions
- for (Future<?> f : futures) {
- f.get();
+ // Wait for all tasks and propagate exceptions, sum up nonzeros
+ long nnz = 0;
+ for (Future<Long> f : futures) {
+ nnz += f.get();
}
- mb.init(denseBlock, rlen, clen);
+ mb = new MatrixBlock(rlen, clen, denseBlock);
+ mb.setNonZeros(nnz);
} else {
throw new DMLRuntimeException("FIFO Pipes are not initialized.");
}
- mb.recomputeNonZeros();
mb.examSparsity();
return mb;
}
@@ -181,7 +183,7 @@ public void startWritingMbToPipe(int id, MatrixBlock mb) throws IOException {
LOG.debug("Trying to write matrix ["+baseDir + "-"+ id+"] with "+rlen+" rows and "+clen+" columns. Total size: "+numElem*8);
BufferedOutputStream out = toPython.get(id);
- long bytes = UnixPipeUtils.writeNumpyArrayInBatches(out, id, BATCH_SIZE, numElem, Types.ValueType.FP64, mb);
+ long bytes = UnixPipeUtils.writeNumpyArrayInBatches(out, id, BATCH_SIZE, numElem, ValueType.FP64, mb);
LOG.debug("Writing of " + bytes +" Bytes to Python ["+baseDir + "-"+ id+"] finished");
} else {
@@ -189,6 +191,43 @@ public void startWritingMbToPipe(int id, MatrixBlock mb) throws IOException {
}
}
+ public void startReadingColFromPipe(int id, FrameBlock fb, int rows, int totalBytes, int col, ValueType type, boolean any) throws IOException {
+ if (fromPython == null) {
+ throw new DMLRuntimeException("FIFO Pipes are not initialized.");
+ }
+
+ BufferedInputStream pipe = fromPython.get(id);
+ LOG.debug("Start reading FrameBlock column from pipe #" + id + " with type " + type);
+
+ // Delegate to UnixPipeUtils
+ Array<?> arr = UnixPipeUtils.readFrameColumnFromPipe(pipe, id, rows, totalBytes, BATCH_SIZE, type);
+ // Set column into FrameBlock
+ fb.setColumn(col, arr);
+ ValueType[] schema = fb.getSchema();
+ // inplace update the schema for cases: int8 -> int32
+ schema[col] = arr.getValueType();
+
+ LOG.debug("Finished reading FrameBlock column from pipe #" + id);
+ }
+
+ public void startWritingColToPipe(int id, FrameBlock fb, int col) throws IOException {
+ if (toPython == null) {
+ throw new DMLRuntimeException("FIFO Pipes are not initialized.");
+ }
+
+ BufferedOutputStream pipe = toPython.get(id);
+ ValueType type = fb.getSchema()[col];
+ int rows = fb.getNumRows();
+ Array<?> array = fb.getColumn(col);
+
+ LOG.debug("Start writing FrameBlock column #" + col + " to pipe #" + id + " with type " + type + " and " + rows + " rows");
+
+ // Delegate to UnixPipeUtils
+ long bytes = UnixPipeUtils.writeFrameColumnToPipe(pipe, id, BATCH_SIZE, array, type);
+
+ LOG.debug("Finished writing FrameBlock column #" + col + " to pipe #" + id + ". Total bytes: " + bytes);
+ }
+
public void closePipes() throws IOException {
LOG.debug("Closing all pipes in Java");
for (BufferedInputStream pipe : fromPython.values())
@@ -198,6 +237,20 @@ public void closePipes() throws IOException {
LOG.debug("Closed all pipes in Java");
}
+ @FunctionalInterface
+ public interface ExitHandler {
+ void exit(int status);
+ }
+
+ private static volatile ExitHandler exitHandler = System::exit;
+
+ public static void setExitHandler(ExitHandler handler) {
+ exitHandler = handler == null ? System::exit : handler;
+ }
+
+ public static void resetExitHandler() {
+ exitHandler = System::exit;
+ }
protected static class DMLGateWayListener extends DefaultGatewayServerListener {
private static final Log LOG = LogFactory.getLog(DMLGateWayListener.class.getName());
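
A minimal sketch (not part of the patch) of how the new exit hook can be used, e.g. from a JVM-side test, so that a failed Py4J gateway startup raises an observable exception instead of terminating the process; the exception type and surrounding wiring are illustrative only.

import org.apache.sysds.api.PythonDMLScript;

public class ExitHandlerSketch {
	public static void main(String[] args) {
		// replace the default System::exit with a handler that surfaces the status code
		PythonDMLScript.setExitHandler(status -> {
			throw new IllegalStateException("PythonDMLScript requested exit with status " + status);
		});
		try {
			// ... start the gateway / exercise the code under test here ...
		}
		finally {
			PythonDMLScript.resetExitHandler(); // restore the default System::exit behavior
		}
	}
}
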
diff --git a/src/main/java/org/apache/sysds/api/ScriptExecutorUtils.java b/src/main/java/org/apache/sysds/api/ScriptExecutorUtils.java
index fd344e44e9b..0d15218bf9a 100644
--- a/src/main/java/org/apache/sysds/api/ScriptExecutorUtils.java
+++ b/src/main/java/org/apache/sysds/api/ScriptExecutorUtils.java
@@ -36,6 +36,7 @@
import org.apache.sysds.runtime.instructions.gpu.context.GPUObject;
import org.apache.sysds.runtime.lineage.LineageEstimatorStatistics;
import org.apache.sysds.runtime.lineage.LineageGPUCacheEviction;
+import org.apache.sysds.runtime.ooc.cache.OOCCacheManager;
import org.apache.sysds.utils.Statistics;
public class ScriptExecutorUtils {
@@ -127,6 +128,9 @@ public static void executeRuntimeProgram(Program rtprog, ExecutionContext ec, DM
if (DMLScript.LINEAGE_ESTIMATE)
System.out.println(LineageEstimatorStatistics.displayLineageEstimates());
+
+ if (DMLScript.USE_OOC)
+ OOCCacheManager.reset();
}
}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java
index 4feab311c76..dc1f23b83fc 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -392,6 +392,8 @@ public enum Builtins {
RMEMPTY("removeEmpty", false, true),
SCALE("scale", true, false),
SCALEAPPLY("scaleApply", true, false),
+ SCALEROBUST("scaleRobust", true, false),
+ SCALEROBUSTAPPLY("scaleRobustApply", true, false),
SCALE_MINMAX("scaleMinMax", true, false),
TIME("time", false),
TOKENIZE("tokenize", false, true),
diff --git a/src/main/java/org/apache/sysds/common/Opcodes.java b/src/main/java/org/apache/sysds/common/Opcodes.java
index 251f773a18c..1b0536416d6 100644
--- a/src/main/java/org/apache/sysds/common/Opcodes.java
+++ b/src/main/java/org/apache/sysds/common/Opcodes.java
@@ -19,7 +19,25 @@
package org.apache.sysds.common;
-import org.apache.sysds.lops.*;
+import org.apache.sysds.lops.Append;
+import org.apache.sysds.lops.DataGen;
+import org.apache.sysds.lops.LeftIndex;
+import org.apache.sysds.lops.RightIndex;
+import org.apache.sysds.lops.Compression;
+import org.apache.sysds.lops.DeCompression;
+import org.apache.sysds.lops.Local;
+import org.apache.sysds.lops.Checkpoint;
+import org.apache.sysds.lops.WeightedCrossEntropy;
+import org.apache.sysds.lops.WeightedCrossEntropyR;
+import org.apache.sysds.lops.WeightedDivMM;
+import org.apache.sysds.lops.WeightedDivMMR;
+import org.apache.sysds.lops.WeightedSigmoid;
+import org.apache.sysds.lops.WeightedSigmoidR;
+import org.apache.sysds.lops.WeightedSquaredLoss;
+import org.apache.sysds.lops.WeightedSquaredLossR;
+import org.apache.sysds.lops.WeightedUnaryMM;
+import org.apache.sysds.lops.WeightedUnaryMMR;
+
import org.apache.sysds.common.Types.OpOp1;
import org.apache.sysds.hops.FunctionOp;
diff --git a/src/main/java/org/apache/sysds/hops/NaryOp.java b/src/main/java/org/apache/sysds/hops/NaryOp.java
index 6962beadcbc..d752316a526 100644
--- a/src/main/java/org/apache/sysds/hops/NaryOp.java
+++ b/src/main/java/org/apache/sysds/hops/NaryOp.java
@@ -165,7 +165,7 @@ else if ( areDimsBelowThreshold() )
setRequiresRecompileIfNecessary();
//ensure cp exec type for single-node operations
- if ( _op == OpOpN.PRINTF || _op == OpOpN.EVAL || _op == OpOpN.LIST
+ if ( _op == OpOpN.PRINTF || _op == OpOpN.EVAL || _op == OpOpN.LIST || _op == OpOpN.EINSUM
//TODO: cbind/rbind of lists only support in CP right now
|| (_op == OpOpN.CBIND && getInput().get(0).getDataType().isList())
|| (_op == OpOpN.RBIND && getInput().get(0).getDataType().isList())
diff --git a/src/main/java/org/apache/sysds/hops/ReorgOp.java b/src/main/java/org/apache/sysds/hops/ReorgOp.java
index 5fc73e2bd3f..f43d1cc2baf 100644
--- a/src/main/java/org/apache/sysds/hops/ReorgOp.java
+++ b/src/main/java/org/apache/sysds/hops/ReorgOp.java
@@ -173,7 +173,9 @@ _op, getDataType(), getValueType(), et,
for (int i = 0; i < 2; i++)
linputs[i] = getInput().get(i).constructLops();
- Transform transform1 = new Transform(linputs, _op, getDataType(), getValueType(), et, 1);
+ Transform transform1 = new Transform(
+ linputs, _op, getDataType(), getValueType(), et,
+ OptimizerUtils.getConstrainedNumThreads(_maxNumThreads));
setOutputDimensions(transform1);
setLineNumbers(transform1);
diff --git a/src/main/java/org/apache/sysds/hops/fedplanner/FederatedPlanCostEstimator.java b/src/main/java/org/apache/sysds/hops/fedplanner/FederatedPlanCostEstimator.java
index 0c5d6c0290e..fe40c83e690 100644
--- a/src/main/java/org/apache/sysds/hops/fedplanner/FederatedPlanCostEstimator.java
+++ b/src/main/java/org/apache/sysds/hops/fedplanner/FederatedPlanCostEstimator.java
@@ -19,6 +19,12 @@
package org.apache.sysds.hops.fedplanner;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.NoSuchElementException;
+
import org.apache.commons.lang3.tuple.Pair;
import org.apache.sysds.common.Types;
import org.apache.sysds.hops.DataOp;
@@ -28,7 +34,6 @@
import org.apache.sysds.hops.fedplanner.FederatedMemoTable.HopCommon;
import org.apache.sysds.runtime.instructions.fed.FEDInstruction.FederatedOutput;
-import java.util.*;
/**
* Cost estimator for federated execution plans.
diff --git a/src/main/java/org/apache/sysds/hops/fedplanner/FederatedPlanRewireTransTable.java b/src/main/java/org/apache/sysds/hops/fedplanner/FederatedPlanRewireTransTable.java
index f41d924999e..9a41509e81f 100644
--- a/src/main/java/org/apache/sysds/hops/fedplanner/FederatedPlanRewireTransTable.java
+++ b/src/main/java/org/apache/sysds/hops/fedplanner/FederatedPlanRewireTransTable.java
@@ -23,15 +23,29 @@
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.sysds.common.Types;
import org.apache.sysds.common.Types.ParamBuiltinOp;
-import org.apache.sysds.hops.*;
+import org.apache.sysds.hops.AggBinaryOp;
+import org.apache.sysds.hops.AggUnaryOp;
+import org.apache.sysds.hops.BinaryOp;
+import org.apache.sysds.hops.DataGenOp;
+import org.apache.sysds.hops.DataOp;
+import org.apache.sysds.hops.DnnOp;
+import org.apache.sysds.hops.FunctionOp;
import org.apache.sysds.hops.FunctionOp.FunctionType;
-import org.apache.sysds.parser.*;
+import org.apache.sysds.hops.Hop;
+import org.apache.sysds.hops.IndexingOp;
+import org.apache.sysds.hops.LeftIndexingOp;
+import org.apache.sysds.hops.LiteralOp;
+import org.apache.sysds.hops.NaryOp;
+import org.apache.sysds.hops.ParameterizedBuiltinOp;
+import org.apache.sysds.hops.QuaternaryOp;
+import org.apache.sysds.hops.ReorgOp;
+import org.apache.sysds.hops.TernaryOp;
+import org.apache.sysds.hops.UnaryOp;
import org.apache.sysds.hops.fedplanner.FederatedMemoTable.HopCommon;
import org.apache.sysds.hops.rewrite.HopRewriteUtils;
import org.apache.sysds.runtime.controlprogram.federated.FederatedRange;
import org.apache.sysds.runtime.controlprogram.federated.FederationUtils;
import org.apache.sysds.runtime.util.UtilFunctions;
-import java.util.*;
import org.apache.sysds.runtime.instructions.fed.InitFEDInstruction;
import org.apache.sysds.runtime.controlprogram.federated.FederatedData;
import java.net.InetAddress;
@@ -49,8 +63,26 @@
import org.apache.sysds.common.Types.DataType;
import org.apache.sysds.common.Types.ReOrgOp;
import org.apache.sysds.lops.MMTSJ.MMTSJType;
+import org.apache.sysds.parser.DMLProgram;
+import org.apache.sysds.parser.DataExpression;
+import org.apache.sysds.parser.ForStatement;
+import org.apache.sysds.parser.ForStatementBlock;
+import org.apache.sysds.parser.FunctionStatement;
+import org.apache.sysds.parser.FunctionStatementBlock;
+import org.apache.sysds.parser.IfStatement;
+import org.apache.sysds.parser.IfStatementBlock;
+import org.apache.sysds.parser.StatementBlock;
+import org.apache.sysds.parser.VariableSet;
+import org.apache.sysds.parser.WhileStatement;
+import org.apache.sysds.parser.WhileStatementBlock;
+
import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
import java.util.List;
+import java.util.Map;
+import java.util.Set;
public class FederatedPlanRewireTransTable {
diff --git a/src/main/java/org/apache/sysds/hops/fedplanner/FederatedPlannerFedCostBased.java b/src/main/java/org/apache/sysds/hops/fedplanner/FederatedPlannerFedCostBased.java
index 99043860bd7..9204b65c793 100644
--- a/src/main/java/org/apache/sysds/hops/fedplanner/FederatedPlannerFedCostBased.java
+++ b/src/main/java/org/apache/sysds/hops/fedplanner/FederatedPlannerFedCostBased.java
@@ -64,23 +64,23 @@ public void rewriteFunctionDynamic(FunctionStatementBlock function, LocalVariabl
private void rewriteHop(FedPlan optimalPlan, FederatedMemoTable memoTable, Set<Long> visited) {
long hopID = optimalPlan.getHopRef().getHopID();
- if (visited.contains(hopID)) {
- return;
- } else {
- visited.add(hopID);
- }
+ if (visited.contains(hopID)) {
+ return;
+ } else {
+ visited.add(hopID);
+ }
- for (Pair<Long, FederatedOutput> childFedPlanPair : optimalPlan.getChildFedPlans()) {
- FedPlan childPlan = memoTable.getFedPlanAfterPrune(childFedPlanPair);
-
- // DEBUG: Check if getFedPlanAfterPrune returns null
- if (childPlan == null) {
+ for (Pair<Long, FederatedOutput> childFedPlanPair : optimalPlan.getChildFedPlans()) {
+ FedPlan childPlan = memoTable.getFedPlanAfterPrune(childFedPlanPair);
+
+ // DEBUG: Check if getFedPlanAfterPrune returns null
+ if (childPlan == null) {
FederatedPlannerLogger.logNullChildPlanDebug(childFedPlanPair, optimalPlan, memoTable);
- continue;
- }
-
+ continue;
+ }
+
rewriteHop(childPlan, memoTable, visited);
- }
+ }
if (optimalPlan.getFedOutType() == FEDInstruction.FederatedOutput.LOUT) {
optimalPlan.setFederatedOutput(FEDInstruction.FederatedOutput.LOUT);
diff --git a/src/main/java/org/apache/sysds/hops/fedplanner/FederatedPlannerLogger.java b/src/main/java/org/apache/sysds/hops/fedplanner/FederatedPlannerLogger.java
index 35742ce1e4f..d2c8134c765 100644
--- a/src/main/java/org/apache/sysds/hops/fedplanner/FederatedPlannerLogger.java
+++ b/src/main/java/org/apache/sysds/hops/fedplanner/FederatedPlannerLogger.java
@@ -39,560 +39,560 @@
* This class integrates the functionality of the former FederatedMemoTablePrinter.
*/
public class FederatedPlannerLogger {
-
- /**
- * Logs hop information including name, hop ID, child hop IDs, privacy constraint, and ftype
- * @param hop The hop to log information for
- * @param privacyConstraintMap Map containing privacy constraints for hops
- * @param fTypeMap Map containing FType information for hops
- * @param logPrefix Prefix string to identify the log source
- */
- public static void logHopInfo(Hop hop, Map privacyConstraintMap,
- Map fTypeMap, String logPrefix) {
- StringBuilder childIds = new StringBuilder();
- if (hop.getInput() != null && !hop.getInput().isEmpty()) {
- for (int i = 0; i < hop.getInput().size(); i++) {
- if (i > 0) childIds.append(",");
- childIds.append(hop.getInput().get(i).getHopID());
- }
- } else {
- childIds.append("none");
- }
-
- Privacy privacyConstraint = privacyConstraintMap.get(hop.getHopID());
- FType ftype = fTypeMap.get(hop.getHopID());
-
- // Get hop type and opcode information
- String hopType = hop.getClass().getSimpleName();
- String opCode = hop.getOpString();
-
- System.out.println("[" + logPrefix + "] (ID:" + hop.getHopID() + " Name:" + hop.getName() +
- ") Type:" + hopType + " OpCode:" + opCode +
- " ChildIDs:(" + childIds.toString() + ") Privacy:" +
- (privacyConstraint != null ? privacyConstraint : "null") +
- " FType:" + (ftype != null ? ftype : "null"));
- }
-
- /**
- * Logs basic hop information without privacy and FType details
- * @param hop The hop to log information for
- * @param logPrefix Prefix string to identify the log source
- */
- public static void logBasicHopInfo(Hop hop, String logPrefix) {
- StringBuilder childIds = new StringBuilder();
- if (hop.getInput() != null && !hop.getInput().isEmpty()) {
- for (int i = 0; i < hop.getInput().size(); i++) {
- if (i > 0) childIds.append(",");
- childIds.append(hop.getInput().get(i).getHopID());
- }
- } else {
- childIds.append("none");
- }
-
- String hopType = hop.getClass().getSimpleName();
- String opCode = hop.getOpString();
-
- System.out.println("[" + logPrefix + "] (ID:" + hop.getHopID() + " Name:" + hop.getName() +
- ") Type:" + hopType + " OpCode:" + opCode +
- " ChildIDs:(" + childIds.toString() + ")");
- }
-
- /**
- * Logs detailed hop information with dimension and data type
- * @param hop The hop to log information for
- * @param privacyConstraintMap Map containing privacy constraints for hops
- * @param fTypeMap Map containing FType information for hops
- * @param logPrefix Prefix string to identify the log source
- */
- public static void logDetailedHopInfo(Hop hop, Map privacyConstraintMap,
- Map fTypeMap, String logPrefix) {
- StringBuilder childIds = new StringBuilder();
- if (hop.getInput() != null && !hop.getInput().isEmpty()) {
- for (int i = 0; i < hop.getInput().size(); i++) {
- if (i > 0) childIds.append(",");
- childIds.append(hop.getInput().get(i).getHopID());
- }
- } else {
- childIds.append("none");
- }
-
- Privacy privacyConstraint = privacyConstraintMap.get(hop.getHopID());
- FType ftype = fTypeMap.get(hop.getHopID());
-
- String hopType = hop.getClass().getSimpleName();
- String opCode = hop.getOpString();
- String dataType = hop.getDataType().toString();
- String dimensions = "[" + hop.getDim1() + "x" + hop.getDim2() + "]";
-
- System.out.println("[" + logPrefix + "] (ID:" + hop.getHopID() + " Name:" + hop.getName() +
- ") Type:" + hopType + " OpCode:" + opCode + " DataType:" + dataType +
- " Dims:" + dimensions + " ChildIDs:(" + childIds.toString() + ") Privacy:" +
- (privacyConstraint != null ? privacyConstraint : "null") +
- " FType:" + (ftype != null ? ftype : "null"));
- }
-
- /**
- * Logs error information for null fed plan scenarios
- * @param hopID The hop ID that caused the error
- * @param logPrefix Prefix string to identify the log source
- */
- public static void logNullFedPlanError(long hopID, String logPrefix) {
- System.err.println("[" + logPrefix + "] childFedPlan is null for hopID: " + hopID);
- }
-
- /**
- * Logs detailed error information for conflict resolution scenarios
- * @param hopID The hop ID that caused the error
- * @param fedPlan The federated plan with error details
- * @param logPrefix Prefix string to identify the log source
- */
- public static void logConflictResolutionError(long hopID, Object fedPlan, String logPrefix) {
- System.err.println("[" + logPrefix + "] confilctLOutFedPlan or confilctFOutFedPlan is null for hopID: " + hopID);
- System.err.println(" Child Hop Details:");
- if (fedPlan != null) {
- // Note: This assumes fedPlan has a getHopRef() method
- // In actual implementation, you might need to cast or handle differently
- System.err.println(" - Class: N/A");
- System.err.println(" - Name: N/A");
- System.err.println(" - OpString: N/A");
- System.err.println(" - HopID: " + hopID);
- }
- }
-
- /**
- * Logs debug information for getFederatedType function
- * @param hop The hop being analyzed
- * @param returnFType The FType that will be returned
- * @param reason The reason for the FType decision
- */
- public static void logGetFederatedTypeDebug(Hop hop, FType returnFType, String reason) {
- String hopName = hop.getName() != null ? hop.getName() : "null";
- long hopID = hop.getHopID();
- String operationType = hop.getClass().getSimpleName();
- String opCode = hop.getOpString();
-
- System.out.println("[GetFederatedType] HopName: " + hopName + " | HopID: " + hopID +
- " | OperationType: " + operationType + " | OpCode: " + opCode +
- " | ReturnFType: " + (returnFType != null ? returnFType : "null") +
- " | Reason: " + reason);
- }
-
- /**
- * Logs detailed hop error information with complete hop details
- * @param hop The hop that caused the error
- * @param logPrefix Prefix string to identify the log source
- * @param additionalMessage Additional error message
- */
- public static void logHopErrorDetails(Hop hop, String logPrefix, String additionalMessage) {
- System.err.println("[" + logPrefix + "] " + additionalMessage);
- System.err.println(" Child Hop Details:");
- System.err.println(" - Class: " + hop.getClass().getSimpleName());
- System.err.println(" - Name: " + (hop.getName() != null ? hop.getName() : "null"));
- System.err.println(" - OpString: " + hop.getOpString());
- System.err.println(" - HopID: " + hop.getHopID());
- }
-
- /**
- * Logs detailed null child plan debugging information
- * @param childFedPlanPair The child federated plan pair that is null
- * @param optimalPlan The current optimal plan (parent)
- * @param memoTable The memo table for lookups
- */
- public static void logNullChildPlanDebug(Pair childFedPlanPair,
- FedPlan optimalPlan,
- org.apache.sysds.hops.fedplanner.FederatedMemoTable memoTable) {
- FederatedOutput alternativeFedType = (childFedPlanPair.getRight() == FederatedOutput.LOUT) ?
- FederatedOutput.FOUT : FederatedOutput.LOUT;
- FedPlan alternativeChildPlan = memoTable.getFedPlanAfterPrune(childFedPlanPair.getLeft(), alternativeFedType);
-
- // Get child hop info
- Hop childHop = null;
- String childInfo = "UNKNOWN";
- if (alternativeChildPlan != null) {
- childHop = alternativeChildPlan.getHopRef();
- // Check if required fed type plan exists
- String requiredExists = memoTable.getFedPlanAfterPrune(childFedPlanPair.getLeft(), childFedPlanPair.getRight()) != null ? "O" : "X";
- // Check if alternative fed type plan exists
- String altExists = alternativeChildPlan != null ? "O" : "X";
-
- childInfo = String.format("ID:%d|Name:%s|Op:%s|RequiredFedType:%s(%s)|AltFedType:%s(%s)",
- childHop.getHopID(),
- childHop.getName() != null ? childHop.getName() : "null",
- childHop.getOpString(),
- childFedPlanPair.getRight(),
- requiredExists,
- alternativeFedType,
- altExists);
- }
-
- // Current parent hop info
- String currentParentInfo = String.format("ID:%d|Name:%s|Op:%s|FedType:%s|RequiredChild:%s",
- optimalPlan.getHopID(),
- optimalPlan.getHopRef().getName() != null ? optimalPlan.getHopRef().getName() : "null",
- optimalPlan.getHopRef().getOpString(),
- optimalPlan.getFedOutType(),
- childFedPlanPair.getRight());
-
- // Alternative parent info (if child has other parents)
- String alternativeParentInfo = "NONE";
- if (childHop != null) {
- List parents = childHop.getParent();
- for (Hop parent : parents) {
- if (parent.getHopID() != optimalPlan.getHopID()) {
- // Try to find alt parent's fed plan info
- String altParentFedType = "UNKNOWN";
- String altParentRequiredChild = "UNKNOWN";
-
- // Check both LOUT and FOUT plans for alt parent
- FedPlan altParentPlanLOUT = memoTable.getFedPlanAfterPrune(parent.getHopID(), FederatedOutput.LOUT);
- FedPlan altParentPlanFOUT = memoTable.getFedPlanAfterPrune(parent.getHopID(), FederatedOutput.FOUT);
-
- if (altParentPlanLOUT != null) {
- altParentFedType = "LOUT";
- // Find what this alt parent expects from child
- for (Pair altChildPair : altParentPlanLOUT.getChildFedPlans()) {
- if (altChildPair.getLeft() == childHop.getHopID()) {
- altParentRequiredChild = altChildPair.getRight().toString();
- break;
- }
- }
- } else if (altParentPlanFOUT != null) {
- altParentFedType = "FOUT";
- // Find what this alt parent expects from child
- for (Pair altChildPair : altParentPlanFOUT.getChildFedPlans()) {
- if (altChildPair.getLeft() == childHop.getHopID()) {
- altParentRequiredChild = altChildPair.getRight().toString();
- break;
- }
- }
- }
-
- alternativeParentInfo = String.format("ID:%d|Name:%s|Op:%s|FedType:%s|RequiredChild:%s",
- parent.getHopID(),
- parent.getName() != null ? parent.getName() : "null",
- parent.getOpString(),
- altParentFedType,
- altParentRequiredChild);
- break;
- }
- }
- }
-
- System.err.println("[DEBUG] NULL CHILD PLAN DETECTED:");
- System.err.println(" Child: " + childInfo);
- System.err.println(" Current Parent: " + currentParentInfo);
- System.err.println(" Alt Parent: " + alternativeParentInfo);
- System.err.println(" Alt Plan Exists: " + (alternativeChildPlan != null));
- }
-
- /**
- * Logs debugging information for TransRead hop rewiring process
- * @param hopName The name of the TransRead hop
- * @param hopID The ID of the TransRead hop
- * @param childHops List of child hops found during rewiring
- * @param isEmptyChildHops Whether the child hops list is empty
- * @param logPrefix Prefix string to identify the log source
- */
- public static void logTransReadRewireDebug(String hopName, long hopID, List childHops,
- boolean isEmptyChildHops, String logPrefix) {
- if (isEmptyChildHops) {
- System.err.println("[" + logPrefix + "] (hopName: " + hopName + ", hopID: " + hopID + ") child hops is empty");
- }
- }
-
- /**
- * Logs debugging information for filtered child hops during TransRead rewiring
- * @param hopName The name of the TransRead hop
- * @param hopID The ID of the TransRead hop
- * @param filteredChildHops List of filtered child hops
- * @param isEmptyFilteredChildHops Whether the filtered child hops list is empty
- * @param logPrefix Prefix string to identify the log source
- */
- public static void logFilteredChildHopsDebug(String hopName, long hopID, List filteredChildHops,
- boolean isEmptyFilteredChildHops, String logPrefix) {
- if (isEmptyFilteredChildHops) {
- System.err.println("[" + logPrefix + "] (hopName: " + hopName + ", hopID: " + hopID + ") filtered child hops is empty");
- }
- }
-
- /**
- * Logs detailed FType mismatch error information for TransRead hop
- * @param hop The TransRead hop with FType mismatch
- * @param filteredChildHops List of filtered child hops
- * @param fTypeMap Map containing FType information for hops
- * @param expectedFType The expected FType
- * @param mismatchedFType The mismatched FType
- * @param mismatchIndex The index where mismatch occurred
- */
- public static void logFTypeMismatchError(Hop hop, List filteredChildHops, Map fTypeMap,
- FType expectedFType, FType mismatchedFType, int mismatchIndex) {
- String hopName = hop.getName();
- long hopID = hop.getHopID();
-
- System.err.println("[Error] FType MISMATCH DETECTED for TransRead (hopName: " + hopName + ", hopID: " + hopID + ")");
- System.err.println("[Error] TRANSREAD HOP DETAILS - Type: " + hop.getClass().getSimpleName() +
- ", OpType: " + (hop instanceof org.apache.sysds.hops.DataOp ?
- ((org.apache.sysds.hops.DataOp)hop).getOp() : "N/A") +
- ", DataType: " + hop.getDataType() +
- ", Dims: [" + hop.getDim1() + "x" + hop.getDim2() + "]");
- System.err.println("[Error] FILTERED CHILD HOPS FTYPE ANALYSIS:");
-
- for (int j = 0; j < filteredChildHops.size(); j++) {
- Hop childHop = filteredChildHops.get(j);
- FType childFType = fTypeMap.get(childHop.getHopID());
- System.err.println("[Error] FilteredChild[" + j + "] - Name: " + childHop.getName() +
- ", ID: " + childHop.getHopID() +
- ", FType: " + childFType +
- ", Type: " + childHop.getClass().getSimpleName() +
- ", OpType: " + (childHop instanceof org.apache.sysds.hops.DataOp ?
- ((org.apache.sysds.hops.DataOp)childHop).getOp().toString() : "N/A") +
- ", Dims: [" + childHop.getDim1() + "x" + childHop.getDim2() + "]");
- }
-
- System.err.println("[Error] Expected FType: " + expectedFType +
- ", Mismatched FType: " + mismatchedFType +
- " at child index: " + mismatchIndex);
- }
-
- /**
- * Logs FType debug information for DataOp operations (FEDERATED, TRANSIENTWRITE, TRANSIENTREAD)
- * @param hop The DataOp hop being analyzed
- * @param fType The FType that was determined for this operation
- * @param opType The operation type (FEDERATED, TRANSIENTWRITE, TRANSIENTREAD)
- * @param reason The reason for the FType decision
- */
- public static void logDataOpFTypeDebug(Hop hop, FType fType, String opType, String reason) {
- String hopName = hop.getName() != null ? hop.getName() : "null";
- long hopID = hop.getHopID();
- String hopClass = hop.getClass().getSimpleName();
- String dimensions = "[" + hop.getDim1() + "x" + hop.getDim2() + "]";
-
- System.out.println("[GetFederatedType] HopName: " + hopName +
- " | HopID: " + hopID +
- " | HopClass: " + hopClass +
- " | OpType: " + opType +
- " | Dims: " + dimensions +
- " | FType: " + (fType != null ? fType : "null") +
- " | Reason: " + reason);
- }
-
- // ========== FederatedMemoTable Printing Methods ==========
-
- /**
- * Recursively prints a tree representation of the DAG starting from the given root FedPlan.
- * Includes information about hopID, fedOutType, TotalCost, SelfCost, and NetCost for each node.
- * Additionally, prints the additional total cost once at the beginning.
- *
- * @param rootFedPlan The starting point FedPlan to print
- * @param rootHopStatSet Set of root hop statistics
- * @param memoTable The memoization table containing FedPlan variants
- * @param additionalTotalCost The additional cost to be printed once
- */
- public static void printFedPlanTree(FederatedMemoTable.FedPlan rootFedPlan, Set rootHopStatSet,
- FederatedMemoTable memoTable, double additionalTotalCost) {
- System.out.println("Additional Cost: " + additionalTotalCost);
- Set visited = new HashSet<>();
- printFedPlanTreeRecursive(rootFedPlan, memoTable, visited, 0);
- for (Long hopID : rootHopStatSet) {
- FedPlan plan = memoTable.getFedPlanAfterPrune(hopID, FederatedOutput.LOUT);
- if (plan == null){
- plan = memoTable.getFedPlanAfterPrune(hopID, FederatedOutput.FOUT);
- }
- printNotReferencedFedPlanRecursive(plan, memoTable, visited, 1);
- }
- }
+ /**
+ * Logs hop information including name, hop ID, child hop IDs, privacy constraint, and ftype
+ * @param hop The hop to log information for
+ * @param privacyConstraintMap Map containing privacy constraints for hops
+ * @param fTypeMap Map containing FType information for hops
+ * @param logPrefix Prefix string to identify the log source
+ */
+ public static void logHopInfo(Hop hop, Map privacyConstraintMap,
+ Map fTypeMap, String logPrefix) {
+ StringBuilder childIds = new StringBuilder();
+ if (hop.getInput() != null && !hop.getInput().isEmpty()) {
+ for (int i = 0; i < hop.getInput().size(); i++) {
+ if (i > 0) childIds.append(",");
+ childIds.append(hop.getInput().get(i).getHopID());
+ }
+ } else {
+ childIds.append("none");
+ }
+
+ Privacy privacyConstraint = privacyConstraintMap.get(hop.getHopID());
+ FType ftype = fTypeMap.get(hop.getHopID());
+
+ // Get hop type and opcode information
+ String hopType = hop.getClass().getSimpleName();
+ String opCode = hop.getOpString();
+
+ System.out.println("[" + logPrefix + "] (ID:" + hop.getHopID() + " Name:" + hop.getName() +
+ ") Type:" + hopType + " OpCode:" + opCode +
+ " ChildIDs:(" + childIds.toString() + ") Privacy:" +
+ (privacyConstraint != null ? privacyConstraint : "null") +
+ " FType:" + (ftype != null ? ftype : "null"));
+ }
+
+ /**
+ * Logs basic hop information without privacy and FType details
+ * @param hop The hop to log information for
+ * @param logPrefix Prefix string to identify the log source
+ */
+ public static void logBasicHopInfo(Hop hop, String logPrefix) {
+ StringBuilder childIds = new StringBuilder();
+ if (hop.getInput() != null && !hop.getInput().isEmpty()) {
+ for (int i = 0; i < hop.getInput().size(); i++) {
+ if (i > 0) childIds.append(",");
+ childIds.append(hop.getInput().get(i).getHopID());
+ }
+ } else {
+ childIds.append("none");
+ }
+
+ String hopType = hop.getClass().getSimpleName();
+ String opCode = hop.getOpString();
+
+ System.out.println("[" + logPrefix + "] (ID:" + hop.getHopID() + " Name:" + hop.getName() +
+ ") Type:" + hopType + " OpCode:" + opCode +
+ " ChildIDs:(" + childIds.toString() + ")");
+ }
+
+ /**
+ * Logs detailed hop information with dimension and data type
+ * @param hop The hop to log information for
+ * @param privacyConstraintMap Map containing privacy constraints for hops
+ * @param fTypeMap Map containing FType information for hops
+ * @param logPrefix Prefix string to identify the log source
+ */
+ public static void logDetailedHopInfo(Hop hop, Map privacyConstraintMap,
+ Map fTypeMap, String logPrefix) {
+ StringBuilder childIds = new StringBuilder();
+ if (hop.getInput() != null && !hop.getInput().isEmpty()) {
+ for (int i = 0; i < hop.getInput().size(); i++) {
+ if (i > 0) childIds.append(",");
+ childIds.append(hop.getInput().get(i).getHopID());
+ }
+ } else {
+ childIds.append("none");
+ }
+
+ Privacy privacyConstraint = privacyConstraintMap.get(hop.getHopID());
+ FType ftype = fTypeMap.get(hop.getHopID());
+
+ String hopType = hop.getClass().getSimpleName();
+ String opCode = hop.getOpString();
+ String dataType = hop.getDataType().toString();
+ String dimensions = "[" + hop.getDim1() + "x" + hop.getDim2() + "]";
+
+ System.out.println("[" + logPrefix + "] (ID:" + hop.getHopID() + " Name:" + hop.getName() +
+ ") Type:" + hopType + " OpCode:" + opCode + " DataType:" + dataType +
+ " Dims:" + dimensions + " ChildIDs:(" + childIds.toString() + ") Privacy:" +
+ (privacyConstraint != null ? privacyConstraint : "null") +
+ " FType:" + (ftype != null ? ftype : "null"));
+ }
+
+ /**
+ * Logs error information for null fed plan scenarios
+ * @param hopID The hop ID that caused the error
+ * @param logPrefix Prefix string to identify the log source
+ */
+ public static void logNullFedPlanError(long hopID, String logPrefix) {
+ System.err.println("[" + logPrefix + "] childFedPlan is null for hopID: " + hopID);
+ }
+
+ /**
+ * Logs detailed error information for conflict resolution scenarios
+ * @param hopID The hop ID that caused the error
+ * @param fedPlan The federated plan with error details
+ * @param logPrefix Prefix string to identify the log source
+ */
+ public static void logConflictResolutionError(long hopID, Object fedPlan, String logPrefix) {
+ System.err.println("[" + logPrefix + "] confilctLOutFedPlan or confilctFOutFedPlan is null for hopID: " + hopID);
+ System.err.println(" Child Hop Details:");
+ if (fedPlan != null) {
+ // Note: This assumes fedPlan has a getHopRef() method
+ // In actual implementation, you might need to cast or handle differently
+ System.err.println(" - Class: N/A");
+ System.err.println(" - Name: N/A");
+ System.err.println(" - OpString: N/A");
+ System.err.println(" - HopID: " + hopID);
+ }
+ }
+
+ /**
+ * Logs debug information for getFederatedType function
+ * @param hop The hop being analyzed
+ * @param returnFType The FType that will be returned
+ * @param reason The reason for the FType decision
+ */
+ public static void logGetFederatedTypeDebug(Hop hop, FType returnFType, String reason) {
+ String hopName = hop.getName() != null ? hop.getName() : "null";
+ long hopID = hop.getHopID();
+ String operationType = hop.getClass().getSimpleName();
+ String opCode = hop.getOpString();
+
+ System.out.println("[GetFederatedType] HopName: " + hopName + " | HopID: " + hopID +
+ " | OperationType: " + operationType + " | OpCode: " + opCode +
+ " | ReturnFType: " + (returnFType != null ? returnFType : "null") +
+ " | Reason: " + reason);
+ }
+
+ /**
+ * Logs detailed hop error information with complete hop details
+ * @param hop The hop that caused the error
+ * @param logPrefix Prefix string to identify the log source
+ * @param additionalMessage Additional error message
+ */
+ public static void logHopErrorDetails(Hop hop, String logPrefix, String additionalMessage) {
+ System.err.println("[" + logPrefix + "] " + additionalMessage);
+ System.err.println(" Child Hop Details:");
+ System.err.println(" - Class: " + hop.getClass().getSimpleName());
+ System.err.println(" - Name: " + (hop.getName() != null ? hop.getName() : "null"));
+ System.err.println(" - OpString: " + hop.getOpString());
+ System.err.println(" - HopID: " + hop.getHopID());
+ }
+
+ /**
+ * Logs detailed null child plan debugging information
+ * @param childFedPlanPair The child federated plan pair that is null
+ * @param optimalPlan The current optimal plan (parent)
+ * @param memoTable The memo table for lookups
+ */
+ public static void logNullChildPlanDebug(Pair childFedPlanPair,
+ FedPlan optimalPlan,
+ org.apache.sysds.hops.fedplanner.FederatedMemoTable memoTable) {
+ FederatedOutput alternativeFedType = (childFedPlanPair.getRight() == FederatedOutput.LOUT) ?
+ FederatedOutput.FOUT : FederatedOutput.LOUT;
+ FedPlan alternativeChildPlan = memoTable.getFedPlanAfterPrune(childFedPlanPair.getLeft(), alternativeFedType);
+
+ // Get child hop info
+ Hop childHop = null;
+ String childInfo = "UNKNOWN";
+ if (alternativeChildPlan != null) {
+ childHop = alternativeChildPlan.getHopRef();
+ // Check if required fed type plan exists
+ String requiredExists = memoTable.getFedPlanAfterPrune(childFedPlanPair.getLeft(), childFedPlanPair.getRight()) != null ? "O" : "X";
+ // Check if alternative fed type plan exists
+ String altExists = alternativeChildPlan != null ? "O" : "X";
+
+ childInfo = String.format("ID:%d|Name:%s|Op:%s|RequiredFedType:%s(%s)|AltFedType:%s(%s)",
+ childHop.getHopID(),
+ childHop.getName() != null ? childHop.getName() : "null",
+ childHop.getOpString(),
+ childFedPlanPair.getRight(),
+ requiredExists,
+ alternativeFedType,
+ altExists);
+ }
+
+ // Current parent hop info
+ String currentParentInfo = String.format("ID:%d|Name:%s|Op:%s|FedType:%s|RequiredChild:%s",
+ optimalPlan.getHopID(),
+ optimalPlan.getHopRef().getName() != null ? optimalPlan.getHopRef().getName() : "null",
+ optimalPlan.getHopRef().getOpString(),
+ optimalPlan.getFedOutType(),
+ childFedPlanPair.getRight());
+
+ // Alternative parent info (if child has other parents)
+ String alternativeParentInfo = "NONE";
+ if (childHop != null) {
+ List parents = childHop.getParent();
+ for (Hop parent : parents) {
+ if (parent.getHopID() != optimalPlan.getHopID()) {
+ // Try to find alt parent's fed plan info
+ String altParentFedType = "UNKNOWN";
+ String altParentRequiredChild = "UNKNOWN";
+
+ // Check both LOUT and FOUT plans for alt parent
+ FedPlan altParentPlanLOUT = memoTable.getFedPlanAfterPrune(parent.getHopID(), FederatedOutput.LOUT);
+ FedPlan altParentPlanFOUT = memoTable.getFedPlanAfterPrune(parent.getHopID(), FederatedOutput.FOUT);
+
+ if (altParentPlanLOUT != null) {
+ altParentFedType = "LOUT";
+ // Find what this alt parent expects from child
+ for (Pair altChildPair : altParentPlanLOUT.getChildFedPlans()) {
+ if (altChildPair.getLeft() == childHop.getHopID()) {
+ altParentRequiredChild = altChildPair.getRight().toString();
+ break;
+ }
+ }
+ } else if (altParentPlanFOUT != null) {
+ altParentFedType = "FOUT";
+ // Find what this alt parent expects from child
+ for (Pair altChildPair : altParentPlanFOUT.getChildFedPlans()) {
+ if (altChildPair.getLeft() == childHop.getHopID()) {
+ altParentRequiredChild = altChildPair.getRight().toString();
+ break;
+ }
+ }
+ }
+
+ alternativeParentInfo = String.format("ID:%d|Name:%s|Op:%s|FedType:%s|RequiredChild:%s",
+ parent.getHopID(),
+ parent.getName() != null ? parent.getName() : "null",
+ parent.getOpString(),
+ altParentFedType,
+ altParentRequiredChild);
+ break;
+ }
+ }
+ }
+
+ System.err.println("[DEBUG] NULL CHILD PLAN DETECTED:");
+ System.err.println(" Child: " + childInfo);
+ System.err.println(" Current Parent: " + currentParentInfo);
+ System.err.println(" Alt Parent: " + alternativeParentInfo);
+ System.err.println(" Alt Plan Exists: " + (alternativeChildPlan != null));
+ }
+
+ /**
+ * Logs debugging information for TransRead hop rewiring process
+ * @param hopName The name of the TransRead hop
+ * @param hopID The ID of the TransRead hop
+ * @param childHops List of child hops found during rewiring
+ * @param isEmptyChildHops Whether the child hops list is empty
+ * @param logPrefix Prefix string to identify the log source
+ */
+ public static void logTransReadRewireDebug(String hopName, long hopID, List<Hop> childHops,
+ boolean isEmptyChildHops, String logPrefix) {
+ if (isEmptyChildHops) {
+ System.err.println("[" + logPrefix + "] (hopName: " + hopName + ", hopID: " + hopID + ") child hops is empty");
+ }
+ }
+
+ /**
+ * Logs debugging information for filtered child hops during TransRead rewiring
+ * @param hopName The name of the TransRead hop
+ * @param hopID The ID of the TransRead hop
+ * @param filteredChildHops List of filtered child hops
+ * @param isEmptyFilteredChildHops Whether the filtered child hops list is empty
+ * @param logPrefix Prefix string to identify the log source
+ */
+ public static void logFilteredChildHopsDebug(String hopName, long hopID, List<Hop> filteredChildHops,
+ boolean isEmptyFilteredChildHops, String logPrefix) {
+ if (isEmptyFilteredChildHops) {
+ System.err.println("[" + logPrefix + "] (hopName: " + hopName + ", hopID: " + hopID + ") filtered child hops is empty");
+ }
+ }
+
+ /**
+ * Logs detailed FType mismatch error information for TransRead hop
+ * @param hop The TransRead hop with FType mismatch
+ * @param filteredChildHops List of filtered child hops
+ * @param fTypeMap Map containing FType information for hops
+ * @param expectedFType The expected FType
+ * @param mismatchedFType The mismatched FType
+ * @param mismatchIndex The index where mismatch occurred
+ */
+ public static void logFTypeMismatchError(Hop hop, List<Hop> filteredChildHops, Map<Long, FType> fTypeMap,
+ FType expectedFType, FType mismatchedFType, int mismatchIndex) {
+ String hopName = hop.getName();
+ long hopID = hop.getHopID();
+
+ System.err.println("[Error] FType MISMATCH DETECTED for TransRead (hopName: " + hopName + ", hopID: " + hopID + ")");
+ System.err.println("[Error] TRANSREAD HOP DETAILS - Type: " + hop.getClass().getSimpleName() +
+ ", OpType: " + (hop instanceof org.apache.sysds.hops.DataOp ?
+ ((org.apache.sysds.hops.DataOp)hop).getOp() : "N/A") +
+ ", DataType: " + hop.getDataType() +
+ ", Dims: [" + hop.getDim1() + "x" + hop.getDim2() + "]");
+ System.err.println("[Error] FILTERED CHILD HOPS FTYPE ANALYSIS:");
+
+ for (int j = 0; j < filteredChildHops.size(); j++) {
+ Hop childHop = filteredChildHops.get(j);
+ FType childFType = fTypeMap.get(childHop.getHopID());
+ System.err.println("[Error] FilteredChild[" + j + "] - Name: " + childHop.getName() +
+ ", ID: " + childHop.getHopID() +
+ ", FType: " + childFType +
+ ", Type: " + childHop.getClass().getSimpleName() +
+ ", OpType: " + (childHop instanceof org.apache.sysds.hops.DataOp ?
+ ((org.apache.sysds.hops.DataOp)childHop).getOp().toString() : "N/A") +
+ ", Dims: [" + childHop.getDim1() + "x" + childHop.getDim2() + "]");
+ }
+
+ System.err.println("[Error] Expected FType: " + expectedFType +
+ ", Mismatched FType: " + mismatchedFType +
+ " at child index: " + mismatchIndex);
+ }
+
+ /**
+ * Logs FType debug information for DataOp operations (FEDERATED, TRANSIENTWRITE, TRANSIENTREAD)
+ * @param hop The DataOp hop being analyzed
+ * @param fType The FType that was determined for this operation
+ * @param opType The operation type (FEDERATED, TRANSIENTWRITE, TRANSIENTREAD)
+ * @param reason The reason for the FType decision
+ */
+ public static void logDataOpFTypeDebug(Hop hop, FType fType, String opType, String reason) {
+ String hopName = hop.getName() != null ? hop.getName() : "null";
+ long hopID = hop.getHopID();
+ String hopClass = hop.getClass().getSimpleName();
+ String dimensions = "[" + hop.getDim1() + "x" + hop.getDim2() + "]";
+
+ System.out.println("[GetFederatedType] HopName: " + hopName +
+ " | HopID: " + hopID +
+ " | HopClass: " + hopClass +
+ " | OpType: " + opType +
+ " | Dims: " + dimensions +
+ " | FType: " + (fType != null ? fType : "null") +
+ " | Reason: " + reason);
+ }
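+
+ // Illustrative usage sketch (editor's note; the FType value and reason string are assumptions):
+ //   logDataOpFTypeDebug(hop, FType.ROW, "TRANSIENTREAD", "inherited from matching transient write");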
+
+ // ========== FederatedMemoTable Printing Methods ==========
+
+ /**
+ * Recursively prints a tree representation of the DAG starting from the given root FedPlan.
+ * Includes information about hopID, fedOutType, TotalCost, SelfCost, and NetCost for each node.
+ * The additional total cost is printed once at the beginning.
+ *
+ * @param rootFedPlan The starting point FedPlan to print
+ * @param rootHopStatSet Set of root hop statistics
+ * @param memoTable The memoization table containing FedPlan variants
+ * @param additionalTotalCost The additional cost to be printed once
+ */
+ public static void printFedPlanTree(FederatedMemoTable.FedPlan rootFedPlan, Set<Long> rootHopStatSet,
+ FederatedMemoTable memoTable, double additionalTotalCost) {
+ System.out.println("Additional Cost: " + additionalTotalCost);
+ Set<Long> visited = new HashSet<>();
+ printFedPlanTreeRecursive(rootFedPlan, memoTable, visited, 0);
- /**
- * Helper method to recursively print the FedPlan tree for not referenced plans.
- *
- * @param plan The current FedPlan to print
- * @param memoTable The memoization table containing FedPlan variants
- * @param visited Set to keep track of visited FedPlans (prevents cycles)
- * @param depth The current depth level for indentation
- */
- private static void printNotReferencedFedPlanRecursive(FederatedMemoTable.FedPlan plan, FederatedMemoTable memoTable,
- Set<Long> visited, int depth) {
- long hopID = plan.getHopRef().getHopID();
+ for (Long hopID : rootHopStatSet) {
+ FedPlan plan = memoTable.getFedPlanAfterPrune(hopID, FederatedOutput.LOUT);
+ if (plan == null){
+ plan = memoTable.getFedPlanAfterPrune(hopID, FederatedOutput.FOUT);
+ }
+ printNotReferencedFedPlanRecursive(plan, memoTable, visited, 1);
+ }
+ }
- if (visited.contains(hopID)) {
- return;
- }
+ /**
+ * Helper method to recursively print the FedPlan tree for not referenced plans.
+ *
+ * @param plan The current FedPlan to print
+ * @param memoTable The memoization table containing FedPlan variants
+ * @param visited Set to keep track of visited FedPlans (prevents cycles)
+ * @param depth The current depth level for indentation
+ */
+ private static void printNotReferencedFedPlanRecursive(FederatedMemoTable.FedPlan plan, FederatedMemoTable memoTable,
+ Set<Long> visited, int depth) {
+ long hopID = plan.getHopRef().getHopID();
- visited.add(hopID);
- printFedPlan(plan, memoTable, depth, true);
+ if (visited.contains(hopID)) {
+ return;
+ }
- // Process child nodes
- List<Pair<Long, FederatedOutput>> childFedPlanPairs = plan.getChildFedPlans();
- for (int i = 0; i < childFedPlanPairs.size(); i++) {
- Pair<Long, FederatedOutput> childFedPlanPair = childFedPlanPairs.get(i);
- FederatedMemoTable.FedPlanVariants childVariants = memoTable.getFedPlanVariants(childFedPlanPair);
- if (childVariants == null || childVariants.isEmpty())
- continue;
+ visited.add(hopID);
+ printFedPlan(plan, memoTable, depth, true);
- for (FederatedMemoTable.FedPlan childPlan : childVariants.getFedPlanVariants()) {
- printNotReferencedFedPlanRecursive(childPlan, memoTable, visited, depth + 1);
- }
- }
- }
+ // Process child nodes
+ List<Pair<Long, FederatedOutput>> childFedPlanPairs = plan.getChildFedPlans();
+ for (int i = 0; i < childFedPlanPairs.size(); i++) {
+ Pair<Long, FederatedOutput> childFedPlanPair = childFedPlanPairs.get(i);
+ FederatedMemoTable.FedPlanVariants childVariants = memoTable.getFedPlanVariants(childFedPlanPair);
+ if (childVariants == null || childVariants.isEmpty())
+ continue;
- /**
- * Helper method to recursively print the FedPlan tree.
- *
- * @param plan The current FedPlan to print
- * @param memoTable The memoization table containing FedPlan variants
- * @param visited Set to keep track of visited FedPlans (prevents cycles)
- * @param depth The current depth level for indentation
- */
- private static void printFedPlanTreeRecursive(FederatedMemoTable.FedPlan plan, FederatedMemoTable memoTable,
- Set<Long> visited, int depth) {
- long hopID = 0;
+ for (FederatedMemoTable.FedPlan childPlan : childVariants.getFedPlanVariants()) {
+ printNotReferencedFedPlanRecursive(childPlan, memoTable, visited, depth + 1);
+ }
+ }
+ }
- if (depth == 0) {
- hopID = -1;
- } else {
- hopID = plan.getHopRef().getHopID();
- }
+ /**
+ * Helper method to recursively print the FedPlan tree.
+ *
+ * @param plan The current FedPlan to print
+ * @param memoTable The memoization table containing FedPlan variants
+ * @param visited Set to keep track of visited FedPlans (prevents cycles)
+ * @param depth The current depth level for indentation
+ */
+ private static void printFedPlanTreeRecursive(FederatedMemoTable.FedPlan plan, FederatedMemoTable memoTable,
+ Set<Long> visited, int depth) {
+ long hopID = 0;
- if (visited.contains(hopID)) {
- return;
- }
+ if (depth == 0) {
+ hopID = -1;
+ } else {
+ hopID = plan.getHopRef().getHopID();
+ }
- visited.add(hopID);
- printFedPlan(plan, memoTable, depth, false);
-
- // Process child nodes
- List<Pair<Long, FederatedOutput>> childFedPlanPairs = plan.getChildFedPlans();
- for (int i = 0; i < childFedPlanPairs.size(); i++) {
- Pair<Long, FederatedOutput> childFedPlanPair = childFedPlanPairs.get(i);
- FederatedMemoTable.FedPlanVariants childVariants = memoTable.getFedPlanVariants(childFedPlanPair);
- if (childVariants == null || childVariants.isEmpty())
- continue;
+ if (visited.contains(hopID)) {
+ return;
+ }
- for (FederatedMemoTable.FedPlan childPlan : childVariants.getFedPlanVariants()) {
- printFedPlanTreeRecursive(childPlan, memoTable, visited, depth + 1);
- }
- }
- }
+ visited.add(hopID);
+ printFedPlan(plan, memoTable, depth, false);
+
+ // Process child nodes
+ List<Pair<Long, FederatedOutput>> childFedPlanPairs = plan.getChildFedPlans();
+ for (int i = 0; i < childFedPlanPairs.size(); i++) {
+ Pair<Long, FederatedOutput> childFedPlanPair = childFedPlanPairs.get(i);
+ FederatedMemoTable.FedPlanVariants childVariants = memoTable.getFedPlanVariants(childFedPlanPair);
+ if (childVariants == null || childVariants.isEmpty())
+ continue;
- /**
- * Prints detailed information about a FedPlan including costs, dimensions, and memory estimates.
- *
- * @param plan The FedPlan to print
- * @param memoTable The memoization table containing FedPlan variants
- * @param depth The current depth level for indentation
- * @param isNotReferenced Whether this plan is not referenced
- */
- private static void printFedPlan(FederatedMemoTable.FedPlan plan, FederatedMemoTable memoTable, int depth, boolean isNotReferenced) {
- StringBuilder sb = new StringBuilder();
- Hop hop = null;
+ for (FederatedMemoTable.FedPlan childPlan : childVariants.getFedPlanVariants()) {
+ printFedPlanTreeRecursive(childPlan, memoTable, visited, depth + 1);
+ }
+ }
+ }
- if (depth == 0){
- sb.append("(R) ROOT [Root]");
- } else {
- hop = plan.getHopRef();
- // Add FedPlan information
- sb.append(String.format("(%d) ", hop.getHopID()))
- .append(hop.getOpString())
- .append(" [");
+ /**
+ * Prints detailed information about a FedPlan including costs, dimensions, and memory estimates.
+ *
+ * @param plan The FedPlan to print
+ * @param memoTable The memoization table containing FedPlan variants
+ * @param depth The current depth level for indentation
+ * @param isNotReferenced Whether this plan is not referenced
+ */
+ private static void printFedPlan(FederatedMemoTable.FedPlan plan, FederatedMemoTable memoTable, int depth, boolean isNotReferenced) {
+ StringBuilder sb = new StringBuilder();
+ Hop hop = null;
- if (isNotReferenced) {
- if (depth == 1) {
- sb.append("NRef(TOP)");
- } else {
- sb.append("NRef");
- }
- } else{
- sb.append(plan.getFedOutType());
- }
- sb.append("]");
- }
+ if (depth == 0){
+ sb.append("(R) ROOT [Root]");
+ } else {
+ hop = plan.getHopRef();
+ // Add FedPlan information
+ sb.append(String.format("(%d) ", hop.getHopID()))
+ .append(hop.getOpString())
+ .append(" [");
- StringBuilder childs = new StringBuilder();
- childs.append(" (");
+ if (isNotReferenced) {
+ if (depth == 1) {
+ sb.append("NRef(TOP)");
+ } else {
+ sb.append("NRef");
+ }
+ } else{
+ sb.append(plan.getFedOutType());
+ }
+ sb.append("]");
+ }
- boolean childAdded = false;
- for (Pair<Long, FederatedOutput> childPair : plan.getChildFedPlans()){
- childs.append(childAdded?",":"");
- childs.append(childPair.getLeft());
- childAdded = true;
- }
-
- childs.append(")");
+ StringBuilder childs = new StringBuilder();
+ childs.append(" (");
- if (childAdded)
- sb.append(childs.toString());
+ boolean childAdded = false;
+ for (Pair<Long, FederatedOutput> childPair : plan.getChildFedPlans()){
+ childs.append(childAdded?",":"");
+ childs.append(childPair.getLeft());
+ childAdded = true;
+ }
+
+ childs.append(")");
- if (depth == 0){
- sb.append(String.format(" {Total: %.1f}", plan.getCumulativeCost()));
- System.out.println(sb);
- return;
- }
+ if (childAdded)
+ sb.append(childs.toString());
- sb.append(String.format(" {Total: %.1f, Self: %.1f, Net: %.1f, Weight: %.1f}",
- plan.getCumulativeCost(),
- plan.getSelfCost(),
- plan.getForwardingCost(),
- plan.getComputeWeight()));
+ if (depth == 0){
+ sb.append(String.format(" {Total: %.1f}", plan.getCumulativeCost()));
+ System.out.println(sb);
+ return;
+ }
- // Add matrix characteristics
- sb.append(" [")
- .append(hop.getDim1()).append(", ")
- .append(hop.getDim2()).append(", ")
- .append(hop.getBlocksize()).append(", ")
- .append(hop.getNnz());
+ sb.append(String.format(" {Total: %.1f, Self: %.1f, Net: %.1f, Weight: %.1f}",
+ plan.getCumulativeCost(),
+ plan.getSelfCost(),
+ plan.getForwardingCost(),
+ plan.getComputeWeight()));
- if (hop.getUpdateType().isInPlace()) {
- sb.append(", ").append(hop.getUpdateType().toString().toLowerCase());
- }
- sb.append("]");
+ // Add matrix characteristics
+ sb.append(" [")
+ .append(hop.getDim1()).append(", ")
+ .append(hop.getDim2()).append(", ")
+ .append(hop.getBlocksize()).append(", ")
+ .append(hop.getNnz());
- // Add memory estimates
- sb.append(" [")
- .append(OptimizerUtils.toMB(hop.getInputMemEstimate())).append(", ")
- .append(OptimizerUtils.toMB(hop.getIntermediateMemEstimate())).append(", ")
- .append(OptimizerUtils.toMB(hop.getOutputMemEstimate())).append(" -> ")
- .append(OptimizerUtils.toMB(hop.getMemEstimate())).append("MB]");
+ if (hop.getUpdateType().isInPlace()) {
+ sb.append(", ").append(hop.getUpdateType().toString().toLowerCase());
+ }
+ sb.append("]");
- // Add reblock and checkpoint requirements
- if (hop.requiresReblock() && hop.requiresCheckpoint()) {
- sb.append(" [rblk, chkpt]");
- } else if (hop.requiresReblock()) {
- sb.append(" [rblk]");
- } else if (hop.requiresCheckpoint()) {
- sb.append(" [chkpt]");
- }
+ // Add memory estimates
+ sb.append(" [")
+ .append(OptimizerUtils.toMB(hop.getInputMemEstimate())).append(", ")
+ .append(OptimizerUtils.toMB(hop.getIntermediateMemEstimate())).append(", ")
+ .append(OptimizerUtils.toMB(hop.getOutputMemEstimate())).append(" -> ")
+ .append(OptimizerUtils.toMB(hop.getMemEstimate())).append("MB]");
- // Add execution type
- if (hop.getExecType() != null) {
- sb.append(", ").append(hop.getExecType());
- }
-
- if (childAdded){
- sb.append(" [Edges]{");
- for (Pair<Long, FederatedOutput> childPair : plan.getChildFedPlans()){
- // Add forwarding weight for each edge
- FedPlan childPlan = memoTable.getFedPlanAfterPrune(childPair.getLeft(), childPair.getRight());
-
- if (childPlan == null) {
- sb.append(String.format("(ID:%d, NULL)", childPair.getLeft()));
- } else {
- String isForwardingCostOccured = "";
- if (childPair.getRight() == plan.getFedOutType()){
- isForwardingCostOccured = "X";
- } else {
- isForwardingCostOccured = "O";
- }
- sb.append(String.format("(ID:%d, %s, C:%.1f, F:%.1f, FW:%.1f)", childPair.getLeft(), isForwardingCostOccured,
- childPlan.getCumulativeCostPerParents(),
- plan.getChildForwardingWeight(childPlan.getLoopContext()) * childPlan.getForwardingCostPerParents(),
- plan.getChildForwardingWeight(childPlan.getLoopContext())));
- }
- sb.append(childAdded?",":"");
- }
- sb.append("}");
- }
+ // Add reblock and checkpoint requirements
+ if (hop.requiresReblock() && hop.requiresCheckpoint()) {
+ sb.append(" [rblk, chkpt]");
+ } else if (hop.requiresReblock()) {
+ sb.append(" [rblk]");
+ } else if (hop.requiresCheckpoint()) {
+ sb.append(" [chkpt]");
+ }
- System.out.println(sb);
- }
+ // Add execution type
+ if (hop.getExecType() != null) {
+ sb.append(", ").append(hop.getExecType());
+ }
+
+ if (childAdded){
+ sb.append(" [Edges]{");
+ for (Pair<Long, FederatedOutput> childPair : plan.getChildFedPlans()){
+ // Add forwarding weight for each edge
+ FedPlan childPlan = memoTable.getFedPlanAfterPrune(childPair.getLeft(), childPair.getRight());
+
+ if (childPlan == null) {
+ sb.append(String.format("(ID:%d, NULL)", childPair.getLeft()));
+ } else {
+ String isForwardingCostOccured = "";
+ if (childPair.getRight() == plan.getFedOutType()){
+ isForwardingCostOccured = "X";
+ } else {
+ isForwardingCostOccured = "O";
+ }
+ sb.append(String.format("(ID:%d, %s, C:%.1f, F:%.1f, FW:%.1f)", childPair.getLeft(), isForwardingCostOccured,
+ childPlan.getCumulativeCostPerParents(),
+ plan.getChildForwardingWeight(childPlan.getLoopContext()) * childPlan.getForwardingCostPerParents(),
+ plan.getChildForwardingWeight(childPlan.getLoopContext())));
+ }
+ sb.append(childAdded?",":"");
+ }
+ sb.append("}");
+ }
+
+ System.out.println(sb);
+ }
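+
+ // Example of a printed plan line (editor's illustration; the values are made up, the layout
+ // follows the String.format calls above, optional [rblk]/[chkpt], exec type and edge info omitted):
+ //   (42) ba(+*) [LOUT] (17,23) {Total: 120.5, Self: 80.0, Net: 40.5, Weight: 1.0} [1000, 100, 1000, -1] [0, 8, 1 -> 9MB]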
}
\ No newline at end of file
diff --git a/src/main/java/org/apache/sysds/hops/rewrite/ProgramRewriter.java b/src/main/java/org/apache/sysds/hops/rewrite/ProgramRewriter.java
index c2602dba510..aa82adcfdc5 100644
--- a/src/main/java/org/apache/sysds/hops/rewrite/ProgramRewriter.java
+++ b/src/main/java/org/apache/sysds/hops/rewrite/ProgramRewriter.java
@@ -69,15 +69,14 @@ public ProgramRewriter(boolean staticRewrites, boolean dynamicRewrites)
//initialize StatementBlock rewrite ruleSet (with fixed rewrite order)
_sbRuleSet = new ArrayList<>();
-
-
+
+
//STATIC REWRITES (which do not rely on size information)
if( staticRewrites )
{
//add static HOP DAG rewrite rules
_dagRuleSet.add( new RewriteRemoveReadAfterWrite() ); //dependency: before blocksize
_dagRuleSet.add( new RewriteBlockSizeAndReblock() );
- _dagRuleSet.add( new RewriteInjectOOCTee() );
if( OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION )
_dagRuleSet.add( new RewriteRemoveUnnecessaryCasts() );
if( OptimizerUtils.ALLOW_COMMON_SUBEXPRESSION_ELIMINATION )
@@ -94,6 +93,7 @@ public ProgramRewriter(boolean staticRewrites, boolean dynamicRewrites)
if( OptimizerUtils.ALLOW_QUANTIZE_COMPRESS_REWRITE )
_dagRuleSet.add( new RewriteQuantizationFusedCompression() );
+
//add statement block rewrite rules
if( OptimizerUtils.ALLOW_BRANCH_REMOVAL )
_sbRuleSet.add( new RewriteRemoveUnnecessaryBranches() ); //dependency: constant folding
@@ -152,6 +152,7 @@ public ProgramRewriter(boolean staticRewrites, boolean dynamicRewrites)
_dagRuleSet.add( new RewriteConstantFolding() ); //dependency: cse
_sbRuleSet.add( new RewriteRemoveEmptyBasicBlocks() );
_sbRuleSet.add( new RewriteRemoveEmptyForLoops() );
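+ // Illustrative note (not part of the original patch): based on the RewriteInjectOOCTee changes
+ // further below, the OOC tee injection is registered here as a statement-block rewrite so that
+ // transient reads can be counted across DAG boundaries before any TeeOp is injected.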
+ _sbRuleSet.add( new RewriteInjectOOCTee() );
}
/**
diff --git a/src/main/java/org/apache/sysds/hops/rewrite/RewriteInjectOOCTee.java b/src/main/java/org/apache/sysds/hops/rewrite/RewriteInjectOOCTee.java
index 54dffa263eb..7abfb15d1d4 100644
--- a/src/main/java/org/apache/sysds/hops/rewrite/RewriteInjectOOCTee.java
+++ b/src/main/java/org/apache/sysds/hops/rewrite/RewriteInjectOOCTee.java
@@ -25,6 +25,7 @@
import org.apache.sysds.hops.DataOp;
import org.apache.sysds.hops.Hop;
import org.apache.sysds.hops.ReorgOp;
+import org.apache.sysds.parser.StatementBlock;
import java.util.ArrayList;
import java.util.HashMap;
@@ -49,73 +50,20 @@
* 2. Apply Rewrites (Modification): Iterate over the collected candidates, insert a
* {@code TeeOp} for each, and safely rewire the graph.
*/
-public class RewriteInjectOOCTee extends HopRewriteRule {
+public class RewriteInjectOOCTee extends StatementBlockRewriteRule {
public static boolean APPLY_ONLY_XtX_PATTERN = false;
+
+ private static final Map<String, Integer> _transientVars = new HashMap<>();
+ private static final Map<String, List<Hop>> _transientHops = new HashMap<>();
+ private static final Set<String> teeTransientVars = new HashSet<>();
private static final Set<Long> rewrittenHops = new HashSet<>();
private static final Map handledHop = new HashMap<>();
// Maintain a list of candidates to rewrite in the second pass
private final List<Hop> rewriteCandidates = new ArrayList<>();
-
- /**
- * Handle a generic (last-level) hop DAG with multiple roots.
- *
- * @param roots high-level operator roots
- * @param state program rewrite status
- * @return list of high-level operators
- */
- @Override
- public ArrayList<Hop> rewriteHopDAGs(ArrayList<Hop> roots, ProgramRewriteStatus state) {
- if (roots == null) {
- return null;
- }
-
- // Clear candidates for this pass
- rewriteCandidates.clear();
-
- // PASS 1: Identify candidates without modifying the graph
- for (Hop root : roots) {
- root.resetVisitStatus();
- findRewriteCandidates(root);
- }
-
- // PASS 2: Apply rewrites to identified candidates
- for (Hop candidate : rewriteCandidates) {
- applyTopDownTeeRewrite(candidate);
- }
-
- return roots;
- }
-
- /**
- * Handle a predicate hop DAG with exactly one root.
- *
- * @param root high-level operator root
- * @param state program rewrite status
- * @return high-level operator
- */
- @Override
- public Hop rewriteHopDAG(Hop root, ProgramRewriteStatus state) {
- if (root == null) {
- return null;
- }
-
- // Clear candidates for this pass
- rewriteCandidates.clear();
-
- // PASS 1: Identify candidates without modifying the graph
- root.resetVisitStatus();
- findRewriteCandidates(root);
-
- // PASS 2: Apply rewrites to identified candidates
- for (Hop candidate : rewriteCandidates) {
- applyTopDownTeeRewrite(candidate);
- }
-
- return root;
- }
+ private boolean forceTee = false;
/**
* First pass: Find candidates for rewrite without modifying the graph.
@@ -137,6 +85,35 @@ private void findRewriteCandidates(Hop hop) {
findRewriteCandidates(input);
}
+ boolean isRewriteCandidate = DMLScript.USE_OOC
+ && hop.getDataType().isMatrix()
+ && !HopRewriteUtils.isData(hop, OpOpData.TEE)
+ && hop.getParent().size() > 1
+ && (!APPLY_ONLY_XtX_PATTERN || isSelfTranposePattern(hop));
+
+ if (HopRewriteUtils.isData(hop, OpOpData.TRANSIENTREAD) && hop.getDataType().isMatrix()) {
+ _transientVars.compute(hop.getName(), (key, ctr) -> {
+ int incr = (isRewriteCandidate || forceTee) ? 2 : 1;
+
+ int ret = ctr == null ? 0 : ctr;
+ ret += incr;
+
+ if (ret > 1)
+ teeTransientVars.add(hop.getName());
+
+ return ret;
+ });
+
+ _transientHops.compute(hop.getName(), (key, hops) -> {
+ if (hops == null)
+ return new ArrayList<>(List.of(hop));
+ hops.add(hop);
+ return hops;
+ });
+
+ return; // We do not tee transient reads but rather inject before TWrite or PRead as caching stream
+ }
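+
+ // Illustrative note (editor's summary of the counting logic above, not in the original patch):
+ // a transient read counts +2 when it is itself a tee candidate (multiple consumers) or forceTee
+ // is set, and +1 otherwise; once the per-variable count exceeds 1 the variable is recorded in
+ // teeTransientVars, and every recorded TRead hop is later handed to applyTopDownTeeRewrite()
+ // from rewriteStatementBlock()/rewriteStatementBlocks().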
+
// Check if this hop is a candidate for OOC Tee injection
if (DMLScript.USE_OOC
&& hop.getDataType().isMatrix()
@@ -160,11 +137,17 @@ private void applyTopDownTeeRewrite(Hop sharedInput) {
return;
}
+ int consumerCount = sharedInput.getParent().size();
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Inject tee for hop " + sharedInput.getHopID() + " ("
+ + sharedInput.getName() + "), consumers=" + consumerCount);
+ }
+
// Take a defensive copy of consumers before modifying the graph
ArrayList<Hop> consumers = new ArrayList<>(sharedInput.getParent());
// Create the new TeeOp with the original hop as input
- DataOp teeOp = new DataOp("tee_out_" + sharedInput.getName(),
+ DataOp teeOp = new DataOp("tee_out_" + sharedInput.getName(),
sharedInput.getDataType(), sharedInput.getValueType(), Types.OpOpData.TEE, null,
sharedInput.getDim1(), sharedInput.getDim2(), sharedInput.getNnz(), sharedInput.getBlocksize());
HopRewriteUtils.addChildReference(teeOp, sharedInput);
@@ -177,6 +160,11 @@ private void applyTopDownTeeRewrite(Hop sharedInput) {
// Record that we've handled this hop
handledHop.put(sharedInput.getHopID(), teeOp);
rewrittenHops.add(sharedInput.getHopID());
+
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Created tee hop " + teeOp.getHopID() + " -> "
+ + teeOp.getName());
+ }
}
@SuppressWarnings("unused")
@@ -196,4 +184,108 @@ else if (HopRewriteUtils.isMatrixMultiply(parent)) {
}
return hasTransposeConsumer && hasMatrixMultiplyConsumer;
}
+
+ @Override
+ public boolean createsSplitDag() {
+ return false;
+ }
+
+ @Override
+ public List<StatementBlock> rewriteStatementBlock(StatementBlock sb, ProgramRewriteStatus state) {
+ if (!DMLScript.USE_OOC)
+ return List.of(sb);
+
+ rewriteSB(sb, state);
+
+ for (String tVar : teeTransientVars) {
+ List<Hop> tHops = _transientHops.get(tVar);
+
+ if (tHops == null)
+ continue;
+
+ for (Hop affectedHops : tHops) {
+ applyTopDownTeeRewrite(affectedHops);
+ }
+
+ tHops.clear();
+ }
+
+ removeRedundantTeeChains(sb);
+
+ return List.of(sb);
+ }
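+
+ // Illustrative note (editor's summary): rewriteStatementBlocks below applies the same three
+ // steps across all blocks: collect candidates per block, tee the recorded transient reads,
+ // then drop redundant tee-over-tee chains.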
+
+ @Override
+ public List<StatementBlock> rewriteStatementBlocks(List<StatementBlock> sbs, ProgramRewriteStatus state) {
+ if (!DMLScript.USE_OOC)
+ return sbs;
+
+ for (StatementBlock sb : sbs)
+ rewriteSB(sb, state);
+
+ for (String tVar : teeTransientVars) {
+ List<Hop> tHops = _transientHops.get(tVar);
+
+ if (tHops == null)
+ continue;
+
+ for (Hop affectedHops : tHops) {
+ applyTopDownTeeRewrite(affectedHops);
+ }
+ }
+
+ for (StatementBlock sb : sbs)
+ removeRedundantTeeChains(sb);
+
+ return sbs;
+ }
+
+ private void rewriteSB(StatementBlock sb, ProgramRewriteStatus state) {
+ rewriteCandidates.clear();
+
+ if (sb.getHops() != null) {
+ for(Hop hop : sb.getHops()) {
+ hop.resetVisitStatus();
+ findRewriteCandidates(hop);
+ }
+ }
+
+ for (Hop candidate : rewriteCandidates) {
+ applyTopDownTeeRewrite(candidate);
+ }
+ }
+
+ private void removeRedundantTeeChains(StatementBlock sb) {
+ if (sb == null || sb.getHops() == null)
+ return;
+
+ Hop.resetVisitStatus(sb.getHops());
+ for (Hop hop : sb.getHops())
+ removeRedundantTeeChains(hop);
+ Hop.resetVisitStatus(sb.getHops());
+ }
+
+ private void removeRedundantTeeChains(Hop hop) {
+ if (hop.isVisited())
+ return;
+
+ ArrayList<Hop> inputs = new ArrayList<>(hop.getInput());
+ for (Hop in : inputs)
+ removeRedundantTeeChains(in);
+
+ if (HopRewriteUtils.isData(hop, OpOpData.TEE) && hop.getInput().size() == 1) {
+ Hop teeInput = hop.getInput().get(0);
+ if (HopRewriteUtils.isData(teeInput, OpOpData.TEE)) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Remove redundant tee hop " + hop.getHopID()
+ + " (" + hop.getName() + ") -> " + teeInput.getHopID()
+ + " (" + teeInput.getName() + ")");
+ }
+ HopRewriteUtils.rewireAllParentChildReferences(hop, teeInput);
+ HopRewriteUtils.removeAllChildReferences(hop);
+ }
+ }
+
+ hop.setVisited();
+ }
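+
+ // Illustrative note (editor's example, not from the original patch): a chain produced by repeated
+ // injection, e.g. parent -> tee_out_X -> tee_out_X -> X, collapses to parent -> tee_out_X -> X,
+ // because the outer TEE's parents are rewired onto its TEE input and the outer node is unlinked.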
}
diff --git a/src/main/java/org/apache/sysds/hops/rewrite/RewriteMatrixMultChainOptimization.java b/src/main/java/org/apache/sysds/hops/rewrite/RewriteMatrixMultChainOptimization.java
index fdd2f8343fe..960560c254e 100644
--- a/src/main/java/org/apache/sysds/hops/rewrite/RewriteMatrixMultChainOptimization.java
+++ b/src/main/java/org/apache/sysds/hops/rewrite/RewriteMatrixMultChainOptimization.java
@@ -210,7 +210,7 @@ protected void optimizeMMChain(Hop hop, List mmChain, List mmOperators
* Thomas H. Cormen, Charles E. Leiserson, Ronald L. Rivest, Clifford Stein
* Introduction to Algorithms, Third Edition, MIT Press, page 395.
*/
- private static int[][] mmChainDP(double[] dimArray, int size)
+ public static int[][] mmChainDP(double[] dimArray, int size)
{
double[][] dpMatrix = new double[size][size]; //min cost table
int[][] split = new int[size][size]; //min cost index table
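+ // Illustrative note (editor's example, not in the original patch): the tables are filled with the
+ // classic recurrence cost(i,j) = min_k cost(i,k) + cost(k+1,j) + dimArray[i]*dimArray[k+1]*dimArray[j+1];
+ // e.g. for dims {10, 100, 5, 50} (A:10x100, B:100x5, C:5x50) the optimal split is ((AB)C)
+ // with 10*100*5 + 10*5*50 = 7500 multiplications, vs 75000 for (A(BC)).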
diff --git a/src/main/java/org/apache/sysds/lops/Transform.java b/src/main/java/org/apache/sysds/lops/Transform.java
index 0d2e79f83a8..d9537dcca6c 100644
--- a/src/main/java/org/apache/sysds/lops/Transform.java
+++ b/src/main/java/org/apache/sysds/lops/Transform.java
@@ -180,7 +180,7 @@ private String getInstructions(String input1, int numInputs, String output) {
sb.append( this.prepOutputOperand(output));
if( (getExecType()==ExecType.CP || getExecType()==ExecType.FED || getExecType()==ExecType.OOC)
- && (_operation == ReOrgOp.TRANS || _operation == ReOrgOp.REV || _operation == ReOrgOp.SORT) ) {
+ && (_operation == ReOrgOp.TRANS || _operation == ReOrgOp.REV || _operation == ReOrgOp.SORT || _operation == ReOrgOp.ROLL) ) {
sb.append( OPERAND_DELIMITOR );
sb.append( _numThreads );
if ( getExecType()==ExecType.FED ) {
diff --git a/src/main/java/org/apache/sysds/parser/DMLTranslator.java b/src/main/java/org/apache/sysds/parser/DMLTranslator.java
index 092fbffe36d..c6e7188d7bc 100644
--- a/src/main/java/org/apache/sysds/parser/DMLTranslator.java
+++ b/src/main/java/org/apache/sysds/parser/DMLTranslator.java
@@ -1392,7 +1392,7 @@ public void constructHopsForWhileControlBlock(WhileStatementBlock sb) {
public void constructHopsForConditionalPredicate(StatementBlock passedSB) {
- HashMap<String, Hop> _ids = new HashMap<>();
+ HashMap<String, Hop> ids = new HashMap<>();
// set conditional predicate
ConditionalPredicate cp = null;
@@ -1428,7 +1428,7 @@ else if (passedSB instanceof IfStatementBlock) {
null, actualDim1, actualDim2, var.getNnz(), var.getBlocksize());
read.setParseInfo(var);
}
- _ids.put(varName, read);
+ ids.put(varName, read);
}
DataIdentifier target = new DataIdentifier(Expression.getTempName());
@@ -1439,12 +1439,12 @@ else if (passedSB instanceof IfStatementBlock) {
Expression predicate = cp.getPredicate();
if (predicate instanceof RelationalExpression) {
- predicateHops = processRelationalExpression((RelationalExpression) cp.getPredicate(), target, _ids);
+ predicateHops = processRelationalExpression((RelationalExpression) cp.getPredicate(), target, ids);
} else if (predicate instanceof BooleanExpression) {
- predicateHops = processBooleanExpression((BooleanExpression) cp.getPredicate(), target, _ids);
+ predicateHops = processBooleanExpression((BooleanExpression) cp.getPredicate(), target, ids);
} else if (predicate instanceof DataIdentifier) {
// handle data identifier predicate
- predicateHops = processExpression(cp.getPredicate(), null, _ids);
+ predicateHops = processExpression(cp.getPredicate(), null, ids);
} else if (predicate instanceof ConstIdentifier) {
// handle constant identifier
// a) translate 0 --> FALSE; translate 1 --> TRUE
@@ -1463,7 +1463,7 @@ else if (passedSB instanceof IfStatementBlock) {
throw new ParseException(predicate.printErrorLocation() + "String value '" + predicate.toString()
+ "' is not allowed for iterable predicate");
}
- predicateHops = processExpression(cp.getPredicate(), null, _ids);
+ predicateHops = processExpression(cp.getPredicate(), null, ids);
}
//create transient write to internal variable name on top of expression
@@ -1487,7 +1487,7 @@ else if (passedSB instanceof IfStatementBlock)
*/
public void constructHopsForIterablePredicate(ForStatementBlock fsb)
{
- HashMap<String, Hop> _ids = new HashMap<>();
+ HashMap<String, Hop> ids = new HashMap<>();
// set iterable predicate
ForStatement fs = (ForStatement) fsb.getStatement(0);
@@ -1513,13 +1513,13 @@ public void constructHopsForIterablePredicate(ForStatementBlock fsb)
null, actualDim1, actualDim2, var.getNnz(), var.getBlocksize());
read.setParseInfo(var);
}
- _ids.put(varName, read);
+ ids.put(varName, read);
}
}
//create transient write to internal variable name on top of expression
//in order to ensure proper instruction generation
- Hop predicateHops = processTempIntExpression(expr, _ids);
+ Hop predicateHops = processTempIntExpression(expr, ids);
if( predicateHops != null )
predicateHops = HopRewriteUtils.createDataOp(
ProgramBlock.PRED_VAR, predicateHops, OpOpData.TRANSIENTWRITE);
diff --git a/src/main/java/org/apache/sysds/parser/DataExpression.java b/src/main/java/org/apache/sysds/parser/DataExpression.java
index 1b9afb41b68..22dbe21c187 100644
--- a/src/main/java/org/apache/sysds/parser/DataExpression.java
+++ b/src/main/java/org/apache/sysds/parser/DataExpression.java
@@ -1019,6 +1019,18 @@ public void validateExpression(HashMap ids, HashMap args, HashMap
DMLOptions dmlOptions =DMLOptions.defaultOptions;
dmlOptions.argVals = args;
- String dmlScriptStr = readDMLScript(true, filePath);
+ String dmlScriptStr = DMLScript.readDMLScript(true, filePath);
Map argVals = dmlOptions.argVals;
ParserWrapper parser = ParserFactory.createParser();
@@ -235,7 +256,7 @@ else if (originBlock instanceof ForProgramBlock) //incl parfor
public static void setSingleNodeResourceConfigs(long nodeMemory, int nodeCores) {
DMLScript.setGlobalExecMode(Types.ExecMode.SINGLE_NODE);
// use 90% of the node's memory for the JVM heap -> rest needed for the OS
- long effectiveSingleNodeMemory = (long) (nodeMemory * JVM_MEMORY_FACTOR);
+ long effectiveSingleNodeMemory = (long) (nodeMemory * CloudUtils.JVM_MEMORY_FACTOR);
// CPU core would be shared with OS -> no further limitation
InfrastructureAnalyzer.setLocalMaxMemory(effectiveSingleNodeMemory);
InfrastructureAnalyzer.setLocalPar(nodeCores);
@@ -259,9 +280,9 @@ public static void setSparkClusterResourceConfigs(long driverMemory, int driverC
// ------------------- CP (driver) configurations -------------------
// use at most 90% of the node's memory for the JVM heap -> rest needed for the OS and resource management
// adapt the minimum based on the need for YARN RM
- long effectiveDriverMemory = calculateEffectiveDriverMemoryBudget(driverMemory, numExecutors*executorCores);
+ long effectiveDriverMemory = CloudUtils.calculateEffectiveDriverMemoryBudget(driverMemory, numExecutors*executorCores);
// require that always at least half of the memory budget is left for driver memory or 1GB
- if (effectiveDriverMemory <= GBtoBytes(1) || driverMemory > 2*effectiveDriverMemory) {
+ if (effectiveDriverMemory <= CloudUtils.GBtoBytes(1) || driverMemory > 2*effectiveDriverMemory) {
throw new IllegalArgumentException("Driver resources are not sufficient to handle the cluster");
}
// CPU core would be shared -> no further limitation
@@ -279,7 +300,7 @@ public static void setSparkClusterResourceConfigs(long driverMemory, int driverC
// ------------------ Dynamic Spark Configurations -------------------
// calculate the effective resource that would be available for the executor containers in YARN
- int[] effectiveValues = getEffectiveExecutorResources(executorMemory, executorCores, numExecutors);
+ int[] effectiveValues = CloudUtils.getEffectiveExecutorResources(executorMemory, executorCores, numExecutors);
int effectiveExecutorMemory = effectiveValues[0];
int effectiveExecutorCores = effectiveValues[1];
int effectiveNumExecutor = effectiveValues[2];
diff --git a/src/main/java/org/apache/sysds/resource/ResourceOptimizer.java b/src/main/java/org/apache/sysds/resource/ResourceOptimizer.java
index ad75de8abc1..cfa7573152b 100644
--- a/src/main/java/org/apache/sysds/resource/ResourceOptimizer.java
+++ b/src/main/java/org/apache/sysds/resource/ResourceOptimizer.java
@@ -38,7 +38,11 @@
import org.apache.commons.configuration2.io.FileHandler;
import java.io.IOException;
-import java.nio.file.*;
+import java.nio.file.FileAlreadyExistsException;
+import java.nio.file.Files;
+import java.nio.file.InvalidPathException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;
diff --git a/src/main/java/org/apache/sysds/resource/cost/CPCostUtils.java b/src/main/java/org/apache/sysds/resource/cost/CPCostUtils.java
index 0558e17c9cd..cd344612b51 100644
--- a/src/main/java/org/apache/sysds/resource/cost/CPCostUtils.java
+++ b/src/main/java/org/apache/sysds/resource/cost/CPCostUtils.java
@@ -22,9 +22,28 @@
import org.apache.sysds.common.Opcodes;
import org.apache.sysds.common.Types;
import org.apache.sysds.hops.OptimizerUtils;
-import org.apache.sysds.lops.*;
+import org.apache.sysds.lops.MMTSJ;
+import org.apache.sysds.lops.PickByCount;
import org.apache.sysds.runtime.DMLRuntimeException;
-import org.apache.sysds.runtime.instructions.cp.*;
+import org.apache.sysds.runtime.instructions.cp.AggregateBinaryCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.BinaryCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.BuiltinNaryCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.CPInstruction;
+import org.apache.sysds.runtime.instructions.cp.CentralMomentCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.ComputationCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.CovarianceCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.DataGenCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.DnnCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.MMTSJCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.MultiReturnBuiltinCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.ParameterizedBuiltinCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.QuantilePickCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.QuantileSortCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.ReorgCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.StringInitCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.UaggOuterChainCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.UnaryCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.VariableCPInstruction;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
import org.apache.sysds.runtime.matrix.operators.CMOperator;
import org.apache.sysds.utils.stats.InfrastructureAnalyzer;
diff --git a/src/main/java/org/apache/sysds/resource/cost/CostEstimator.java b/src/main/java/org/apache/sysds/resource/cost/CostEstimator.java
index 3d4b460cb18..14c477190b3 100644
--- a/src/main/java/org/apache/sysds/resource/cost/CostEstimator.java
+++ b/src/main/java/org/apache/sysds/resource/cost/CostEstimator.java
@@ -23,28 +23,97 @@
import org.apache.sysds.common.Types.DataType;
import org.apache.sysds.common.Types.FileFormat;
import org.apache.sysds.hops.OptimizerUtils;
+import org.apache.sysds.lops.DataGen;
import org.apache.sysds.lops.LeftIndex;
import org.apache.sysds.lops.MapMult;
import org.apache.sysds.parser.DMLProgram;
import org.apache.sysds.parser.DataIdentifier;
import org.apache.sysds.resource.CloudInstance;
import org.apache.sysds.runtime.DMLRuntimeException;
-import org.apache.sysds.runtime.controlprogram.*;
+import org.apache.sysds.runtime.controlprogram.BasicProgramBlock;
+import org.apache.sysds.runtime.controlprogram.ForProgramBlock;
+import org.apache.sysds.runtime.controlprogram.FunctionProgramBlock;
+import org.apache.sysds.runtime.controlprogram.IfProgramBlock;
+import org.apache.sysds.runtime.controlprogram.Program;
+import org.apache.sysds.runtime.controlprogram.ProgramBlock;
+import org.apache.sysds.runtime.controlprogram.WhileProgramBlock;
import org.apache.sysds.runtime.controlprogram.context.SparkExecutionContext;
import org.apache.sysds.runtime.instructions.Instruction;
import org.apache.sysds.runtime.instructions.InstructionUtils;
-import org.apache.sysds.runtime.instructions.cp.*;
-import org.apache.sysds.runtime.instructions.spark.*;
+import org.apache.sysds.runtime.instructions.cp.AggregateBinaryCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.AggregateUnaryCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.BinaryCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.BuiltinNaryCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.CPInstruction;
+import org.apache.sysds.runtime.instructions.cp.CPOperand;
+import org.apache.sysds.runtime.instructions.cp.CompressionCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.ComputationCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.CtableCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.DataGenCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.DeCompressionCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.FunctionCallCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.IndexingCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.MultiReturnBuiltinCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.MultiReturnParameterizedBuiltinCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.ParameterizedBuiltinCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.ParamservBuiltinCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.ReorgCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.ScalarBuiltinNaryCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.StringInitCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.UnaryCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.VariableCPInstruction;
+import org.apache.sysds.runtime.instructions.spark.AggregateBinarySPInstruction;
+import org.apache.sysds.runtime.instructions.spark.AggregateUnarySPInstruction;
+import org.apache.sysds.runtime.instructions.spark.AggregateUnarySketchSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.AppendMSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.AppendSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.BinaryFrameFrameSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.BinaryFrameMatrixSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.BinaryMatrixBVectorSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.BinaryMatrixMatrixSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.BinaryMatrixScalarSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.BinarySPInstruction;
+import org.apache.sysds.runtime.instructions.spark.CSVReblockSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.CastSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.CentralMomentSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.CheckpointSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.CtableSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.IndexingSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.LIBSVMReblockSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.MapmmChainSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.MapmmSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.MatrixIndexingSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.MatrixReshapeSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.PMapmmSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.ParameterizedBuiltinSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.PmmSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.QuantileSortSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.QuaternarySPInstruction;
+import org.apache.sysds.runtime.instructions.spark.RandSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.ReblockSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.ReorgSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.SPInstruction;
+import org.apache.sysds.runtime.instructions.spark.TernarySPInstruction;
+import org.apache.sysds.runtime.instructions.spark.Tsmm2SPInstruction;
+import org.apache.sysds.runtime.instructions.spark.TsmmSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.UnaryFrameSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.UnaryMatrixSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.UnarySPInstruction;
+import org.apache.sysds.runtime.instructions.spark.WriteSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.ZipmmSPInstruction;
import org.apache.sysds.runtime.meta.DataCharacteristics;
import org.apache.sysds.runtime.meta.MetaDataFormat;
import static org.apache.sysds.lops.Data.PREAD_PREFIX;
-import static org.apache.sysds.lops.DataGen.*;
import static org.apache.sysds.resource.cost.CPCostUtils.opcodeRequiresScan;
-import static org.apache.sysds.resource.cost.IOCostUtils.*;
import static org.apache.sysds.resource.cost.SparkCostUtils.getRandInstTime;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+
/**
* Class for estimating the execution time of a program.
@@ -87,7 +156,7 @@ public CostEstimator(Program program, CloudInstance driverNode, CloudInstance ex
_functions = new HashSet<>();
localMemoryLimit = (long) (OptimizerUtils.getLocalMemBudget() * MEM_ALLOCATION_LIMIT_FRACTION);
freeLocalMemory = localMemoryLimit;
- driverMetrics = new IOMetrics(driverNode);
+ driverMetrics = new IOCostUtils.IOMetrics(driverNode);
if (executorNode == null) {
// estimation for single node execution -> no executor resources
executorMetrics = null;
@@ -100,7 +169,7 @@ public CostEstimator(Program program, CloudInstance driverNode, CloudInstance ex
((double) executorNode.getVCPUs() / dedicatedExecutorCores));
// adapting the rest of the metrics not needed since the OS and resource management tasks
// would not consume large portion of the memory/storage/network bandwidth in the general case
- executorMetrics = new IOMetrics(
+ executorMetrics = new IOCostUtils.IOMetrics(
effectiveExecutorFlops,
dedicatedExecutorCores,
executorNode.getMemoryBandwidth(),
@@ -645,7 +714,7 @@ public double parseSPInst(SPInstruction inst) throws CostEstimationException {
RandSPInstruction rinst = (RandSPInstruction) inst;
String opcode = rinst.getOpcode();
int randType = -1; // default for non-random object generation operations
- if (opcode.equals(RAND_OPCODE) || opcode.equals(FRAME_OPCODE)) {
+ if (opcode.equals(DataGen.RAND_OPCODE) || opcode.equals(DataGen.FRAME_OPCODE)) {
if (rinst.getMinValue() == 0d && rinst.getMaxValue() == 0d) { // empty matrix
randType = 0;
} else if (rinst.getSparsity() == 1.0 && rinst.getMinValue() == rinst.getMaxValue()) { // allocate, array fill
@@ -747,7 +816,10 @@ public double parseSPInst(SPInstruction inst) throws CostEstimationException {
} else {
throw new RuntimeException("Unsupported Unary Spark instruction of type " + inst.getClass().getName());
}
- } else if (inst instanceof BinaryFrameFrameSPInstruction || inst instanceof BinaryFrameMatrixSPInstruction || inst instanceof BinaryMatrixMatrixSPInstruction || inst instanceof BinaryMatrixScalarSPInstruction) {
+ } else if (inst instanceof BinaryFrameFrameSPInstruction
+ || inst instanceof BinaryFrameMatrixSPInstruction
+ || inst instanceof BinaryMatrixMatrixSPInstruction
+ || inst instanceof BinaryMatrixScalarSPInstruction) {
BinarySPInstruction binst = (BinarySPInstruction) inst;
VarStats input1 = getStatsWithDefaultScalar((binst).input1.getName());
VarStats input2 = getStatsWithDefaultScalar((binst).input2.getName());
@@ -942,7 +1014,7 @@ public double getTimeEstimateSparkJob(VarStats varToCollect) {
collectTime = IOCostUtils.getSparkCollectTime(varToCollect.rddStats, driverMetrics, executorMetrics);
} else {
// redirect through HDFS (writing to HDFS on executors and reading back on driver)
- varToCollect.fileInfo = new Object[] {HDFS_SOURCE_IDENTIFIER, FileFormat.BINARY};
+ varToCollect.fileInfo = new Object[] {IOCostUtils.HDFS_SOURCE_IDENTIFIER, FileFormat.BINARY};
collectTime = IOCostUtils.getHadoopWriteTime(varToCollect, executorMetrics) +
IOCostUtils.getFileSystemReadTime(varToCollect, driverMetrics);
}
@@ -985,7 +1057,7 @@ private double loadCPVarStatsAndEstimateTime(VarStats input) throws CostEstimati
// loading from a file
if (input.fileInfo == null || input.fileInfo.length != 2) {
throw new RuntimeException("Time estimation is not possible without file info.");
- } else if (isInvalidDataSource((String) input.fileInfo[0])) {
+ } else if (IOCostUtils.isInvalidDataSource((String) input.fileInfo[0])) {
throw new RuntimeException("Time estimation is not possible for data source: " + input.fileInfo[0]);
}
loadTime = IOCostUtils.getFileSystemReadTime(input, driverMetrics);
@@ -1057,7 +1129,7 @@ private double loadRDDStatsAndEstimateTime(VarStats input) {
if (input.allocatedMemory >= 0) { // generated object locally
if (inputRDD.distributedSize < freeLocalMemory && inputRDD.distributedSize < (0.1 * localMemoryLimit)) {
// in this case transfer the data object over HDF (first set the fileInfo of the input)
- input.fileInfo = new Object[] {HDFS_SOURCE_IDENTIFIER, FileFormat.BINARY};
+ input.fileInfo = new Object[] {IOCostUtils.HDFS_SOURCE_IDENTIFIER, FileFormat.BINARY};
ret = IOCostUtils.getFileSystemWriteTime(input, driverMetrics);
ret += IOCostUtils.getHadoopReadTime(input, executorMetrics);
} else {
diff --git a/src/main/java/org/apache/sysds/resource/cost/SparkCostUtils.java b/src/main/java/org/apache/sysds/resource/cost/SparkCostUtils.java
index addf37d350d..6695a0d820b 100644
--- a/src/main/java/org/apache/sysds/resource/cost/SparkCostUtils.java
+++ b/src/main/java/org/apache/sysds/resource/cost/SparkCostUtils.java
@@ -24,28 +24,59 @@
import org.apache.sysds.conf.ConfigurationManager;
import org.apache.sysds.hops.AggBinaryOp;
import org.apache.sysds.hops.OptimizerUtils;
-import org.apache.sysds.lops.*;
+import org.apache.sysds.lops.DataGen;
+import org.apache.sysds.lops.MMTSJ;
+import org.apache.sysds.resource.cost.IOCostUtils.IOMetrics;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.controlprogram.context.SparkExecutionContext;
import org.apache.sysds.runtime.instructions.InstructionUtils;
import org.apache.sysds.runtime.instructions.cp.CPInstruction.CPType;
-import org.apache.sysds.runtime.instructions.spark.*;
+import org.apache.sysds.runtime.instructions.spark.AggregateUnarySPInstruction;
+import org.apache.sysds.runtime.instructions.spark.AggregateUnarySketchSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.AppendGAlignedSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.AppendMSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.AppendRSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.AppendSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.BinaryFrameFrameSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.BinaryFrameMatrixSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.BinaryMatrixBVectorSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.BinaryMatrixMatrixSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.BinaryMatrixScalarSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.BinarySPInstruction;
+import org.apache.sysds.runtime.instructions.spark.CastSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.CentralMomentSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.CpmmSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.CtableSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.CumulativeAggregateSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.IndexingSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.MapmmChainSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.MapmmSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.PMapmmSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.ParameterizedBuiltinSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.PmmSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.QuantileSortSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.QuaternarySPInstruction;
+import org.apache.sysds.runtime.instructions.spark.RmmSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.SPInstruction;
import org.apache.sysds.runtime.instructions.spark.SPInstruction.SPType;
+import org.apache.sysds.runtime.instructions.spark.TernarySPInstruction;
+import org.apache.sysds.runtime.instructions.spark.Tsmm2SPInstruction;
+import org.apache.sysds.runtime.instructions.spark.TsmmSPInstruction;
+import org.apache.sysds.runtime.instructions.spark.UnarySPInstruction;
+import org.apache.sysds.runtime.instructions.spark.ZipmmSPInstruction;
import org.apache.sysds.runtime.matrix.operators.CMOperator;
import org.apache.sysds.runtime.meta.MatrixCharacteristics;
-import static org.apache.sysds.lops.DataGen.*;
-import static org.apache.sysds.resource.cost.IOCostUtils.*;
public class SparkCostUtils {
public static double getReblockInstTime(String opcode, VarStats input, VarStats output, IOMetrics executorMetrics) {
// Reblock triggers a new stage
// old stage: read text file + shuffle the intermediate text rdd
- double readTime = getHadoopReadTime(input, executorMetrics);
+ double readTime = IOCostUtils.getHadoopReadTime(input, executorMetrics);
long sizeTextFile = OptimizerUtils.estimateSizeTextOutput(input.getM(), input.getN(), input.getNNZ(), (Types.FileFormat) input.fileInfo[1]);
RDDStats textRdd = new RDDStats(sizeTextFile, -1);
- double shuffleTime = getSparkShuffleTime(textRdd, executorMetrics, false);
+ double shuffleTime = IOCostUtils.getSparkShuffleTime(textRdd, executorMetrics, false);
double timeStage1 = readTime + shuffleTime;
// new stage: transform partitioned shuffled text object into partitioned binary object
long nflop = getInstNFLOP(SPType.Reblock, opcode, output);
@@ -53,19 +84,19 @@ public static double getReblockInstTime(String opcode, VarStats input, VarStats
return timeStage1 + timeStage2;
}
- public static double getRandInstTime(String opcode, int randType, VarStats output, IOMetrics executorMetrics) {
- if (opcode.equals(SAMPLE_OPCODE)) {
+ public static double getRandInstTime(String opcode, int randType, VarStats output, IOCostUtils.IOMetrics executorMetrics) {
+ if (opcode.equals(DataGen.SAMPLE_OPCODE)) {
// sample uses sortByKey() op. and it should be handled differently
- throw new RuntimeException("Spark operation Rand with opcode " + SAMPLE_OPCODE + " is not supported yet");
+ throw new RuntimeException("Spark operation Rand with opcode " + DataGen.SAMPLE_OPCODE + " is not supported yet");
}
long nflop;
- if (opcode.equals(RAND_OPCODE) || opcode.equals(FRAME_OPCODE)) {
+ if (opcode.equals(DataGen.RAND_OPCODE) || opcode.equals(DataGen.FRAME_OPCODE)) {
if (randType == 0) return 0; // empty matrix
else if (randType == 1) nflop = 8; // allocate, array fill
else if (randType == 2) nflop = 32; // full rand
else throw new RuntimeException("Unknown type of random instruction");
- } else if (opcode.equals(SEQ_OPCODE)) {
+ } else if (opcode.equals(DataGen.SEQ_OPCODE)) {
nflop = 1;
} else {
throw new DMLRuntimeException("Rand operation with opcode '" + opcode + "' is not supported by SystemDS");
@@ -93,7 +124,7 @@ public static double getAggUnaryInstTime(UnarySPInstruction inst, VarStats input
((AggregateUnarySketchSPInstruction) inst).getAggType();
double shuffleTime;
if (inst instanceof CumulativeAggregateSPInstruction) {
- shuffleTime = getSparkShuffleTime(output.rddStats, executorMetrics, true);
+ shuffleTime = IOCostUtils.getSparkShuffleTime(output.rddStats, executorMetrics, true);
output.rddStats.hashPartitioned = true;
} else {
if (aggType == AggBinaryOp.SparkAggType.SINGLE_BLOCK) {
@@ -111,9 +142,9 @@ public static double getAggUnaryInstTime(UnarySPInstruction inst, VarStats input
input.getNNZ()
);
RDDStats filteredRDD = new RDDStats(diagonalBlockSize, input.rddStats.numPartitions);
- shuffleTime = getSparkShuffleTime(filteredRDD, executorMetrics, true);
+ shuffleTime = IOCostUtils.getSparkShuffleTime(filteredRDD, executorMetrics, true);
} else {
- shuffleTime = getSparkShuffleTime(input.rddStats, executorMetrics, true);
+ shuffleTime = IOCostUtils.getSparkShuffleTime(input.rddStats, executorMetrics, true);
}
output.rddStats.hashPartitioned = true;
output.rddStats.numPartitions = input.rddStats.numPartitions;
@@ -137,17 +168,17 @@ public static double getIndexingInstTime(IndexingSPInstruction inst, VarStats in
int blockSize = ConfigurationManager.getBlocksize();
if (output.getM() <= blockSize && output.getN() <= blockSize) {
// represents single block and multi block cases
- dataTransmissionTime = getSparkCollectTime(output.rddStats, driverMetrics, executorMetrics);
+ dataTransmissionTime = IOCostUtils.getSparkCollectTime(output.rddStats, driverMetrics, executorMetrics);
output.rddStats.isCollected = true;
} else {
// represents general indexing: worst case: shuffling required
- dataTransmissionTime = getSparkShuffleTime(output.rddStats, executorMetrics, true);
+ dataTransmissionTime = IOCostUtils.getSparkShuffleTime(output.rddStats, executorMetrics, true);
}
} else if (opcode.equals(Opcodes.LEFT_INDEX.toString())) {
// model combineByKey() with shuffling the second input
- dataTransmissionTime = getSparkShuffleTime(input2.rddStats, executorMetrics, true);
+ dataTransmissionTime = IOCostUtils.getSparkShuffleTime(input2.rddStats, executorMetrics, true);
} else { // mapLeftIndex
- dataTransmissionTime = getSparkBroadcastTime(input2, driverMetrics, executorMetrics);
+ dataTransmissionTime = IOCostUtils.getSparkBroadcastTime(input2, driverMetrics, executorMetrics);
}
long nflop = getInstNFLOP(SPType.MatrixIndexing, opcode, output);
// scan only the size of the output since filter is applied first
@@ -169,34 +200,34 @@ public static double getBinaryInstTime(SPInstruction inst, VarStats input1, VarS
if (inst instanceof BinaryMatrixMatrixSPInstruction) {
if (inst instanceof BinaryMatrixBVectorSPInstruction) {
// the second matrix is always the broadcast one
- dataTransmissionTime = getSparkBroadcastTime(input2, driverMetrics, executorMetrics);
+ dataTransmissionTime = IOCostUtils.getSparkBroadcastTime(input2, driverMetrics, executorMetrics);
// flatMapToPair() or mapPartitionsToPair() invoked -> no shuffling
output.rddStats.numPartitions = input1.rddStats.numPartitions;
output.rddStats.hashPartitioned = input1.rddStats.hashPartitioned;
} else { // regular BinaryMatrixMatrixSPInstruction
// join() input1 and input2
- dataTransmissionTime = getSparkShuffleWriteTime(input1.rddStats, executorMetrics) +
- getSparkShuffleWriteTime(input2.rddStats, executorMetrics);
+ dataTransmissionTime = IOCostUtils.getSparkShuffleWriteTime(input1.rddStats, executorMetrics) +
+ IOCostUtils.getSparkShuffleWriteTime(input2.rddStats, executorMetrics);
if (input1.rddStats.hashPartitioned) {
output.rddStats.numPartitions = input1.rddStats.numPartitions;
if (!input2.rddStats.hashPartitioned || !(input1.rddStats.numPartitions == input2.rddStats.numPartitions)) {
// shuffle needed for join() -> actual shuffle only for input2
- dataTransmissionTime += getSparkShuffleReadStaticTime(input1.rddStats, executorMetrics) +
- getSparkShuffleReadTime(input2.rddStats, executorMetrics);
+ dataTransmissionTime += IOCostUtils.getSparkShuffleReadStaticTime(input1.rddStats, executorMetrics) +
+ IOCostUtils.getSparkShuffleReadTime(input2.rddStats, executorMetrics);
} else { // no shuffle needed for join() -> only read from local disk
- dataTransmissionTime += getSparkShuffleReadStaticTime(input1.rddStats, executorMetrics) +
- getSparkShuffleReadStaticTime(input2.rddStats, executorMetrics);
+ dataTransmissionTime += IOCostUtils.getSparkShuffleReadStaticTime(input1.rddStats, executorMetrics) +
+ IOCostUtils.getSparkShuffleReadStaticTime(input2.rddStats, executorMetrics);
}
} else if (input2.rddStats.hashPartitioned) {
output.rddStats.numPartitions = input2.rddStats.numPartitions;
// input1 not hash partitioned: shuffle needed for join() -> actual shuffle only for input2
- dataTransmissionTime += getSparkShuffleReadStaticTime(input1.rddStats, executorMetrics) +
- getSparkShuffleReadTime(input2.rddStats, executorMetrics);
+ dataTransmissionTime += IOCostUtils.getSparkShuffleReadStaticTime(input1.rddStats, executorMetrics) +
+ IOCostUtils.getSparkShuffleReadTime(input2.rddStats, executorMetrics);
} else {
// repartition all data needed
output.rddStats.numPartitions = 2 * output.rddStats.numPartitions;
- dataTransmissionTime += getSparkShuffleReadTime(input1.rddStats, executorMetrics) +
- getSparkShuffleReadTime(input2.rddStats, executorMetrics);
+ dataTransmissionTime += IOCostUtils.getSparkShuffleReadTime(input1.rddStats, executorMetrics) +
+ IOCostUtils.getSparkShuffleReadTime(input2.rddStats, executorMetrics);
}
output.rddStats.hashPartitioned = true;
}
@@ -217,16 +248,16 @@ public static double getBinaryInstTime(SPInstruction inst, VarStats input1, VarS
public static double getAppendInstTime(AppendSPInstruction inst, VarStats input1, VarStats input2, VarStats output, IOMetrics driverMetrics, IOMetrics executorMetrics) {
double dataTransmissionTime;
if (inst instanceof AppendMSPInstruction) {
- dataTransmissionTime = getSparkBroadcastTime(input2, driverMetrics, executorMetrics);
+ dataTransmissionTime = IOCostUtils.getSparkBroadcastTime(input2, driverMetrics, executorMetrics);
output.rddStats.hashPartitioned = true;
} else if (inst instanceof AppendRSPInstruction) {
- dataTransmissionTime = getSparkShuffleTime(output.rddStats, executorMetrics, false);
+ dataTransmissionTime = IOCostUtils.getSparkShuffleTime(output.rddStats, executorMetrics, false);
} else if (inst instanceof AppendGAlignedSPInstruction) {
// only changing matrix indexing
dataTransmissionTime = 0;
} else { // AppendGSPInstruction
// shuffle the whole appended matrix
- dataTransmissionTime = getSparkShuffleTime(input2.rddStats, executorMetrics, true);
+ dataTransmissionTime = IOCostUtils.getSparkShuffleTime(input2.rddStats, executorMetrics, true);
output.rddStats.hashPartitioned = true;
}
// opcode not relevant for the nflop estimation of append instructions;
@@ -241,7 +272,7 @@ public static double getReorgInstTime(UnarySPInstruction inst, VarStats input, V
double dataTransmissionTime;
switch (opcode) {
case "rshape":
- dataTransmissionTime = getSparkShuffleTime(input.rddStats, executorMetrics, true);
+ dataTransmissionTime = IOCostUtils.getSparkShuffleTime(input.rddStats, executorMetrics, true);
output.rddStats.hashPartitioned = true;
break;
case "r'":
@@ -249,7 +280,7 @@ public static double getReorgInstTime(UnarySPInstruction inst, VarStats input, V
output.rddStats.hashPartitioned = input.rddStats.hashPartitioned;
break;
case "rev":
- dataTransmissionTime = getSparkShuffleTime(output.rddStats, executorMetrics, true);
+ dataTransmissionTime = IOCostUtils.getSparkShuffleTime(output.rddStats, executorMetrics, true);
output.rddStats.hashPartitioned = true;
break;
case "rdiag":
@@ -267,8 +298,8 @@ public static double getReorgInstTime(UnarySPInstruction inst, VarStats input, V
shuffleFactor = 4;// estimate cost for 2 shuffles
}
// assume case: 4 times shuffling the output
- dataTransmissionTime = getSparkShuffleWriteTime(output.rddStats, executorMetrics) +
- getSparkShuffleReadTime(output.rddStats, executorMetrics);
+ dataTransmissionTime = IOCostUtils.getSparkShuffleWriteTime(output.rddStats, executorMetrics) +
+ IOCostUtils.getSparkShuffleReadTime(output.rddStats, executorMetrics);
dataTransmissionTime *= shuffleFactor;
break;
}
@@ -285,7 +316,7 @@ public static double getTSMMInstTime(UnarySPInstruction inst, VarStats input, Va
if (inst instanceof TsmmSPInstruction) {
type = ((TsmmSPInstruction) inst).getMMTSJType();
// fold() used but result is still a whole matrix block
- dataTransmissionTime = getSparkCollectTime(output.rddStats, driverMetrics, executorMetrics);
+ dataTransmissionTime = IOCostUtils.getSparkCollectTime(output.rddStats, driverMetrics, executorMetrics);
output.rddStats.isCollected = true;
} else { // Tsmm2SPInstruction
type = ((Tsmm2SPInstruction) inst).getMMTSJType();
@@ -296,9 +327,9 @@ public static double getTSMMInstTime(UnarySPInstruction inst, VarStats input, Va
input.getN() - input.characteristics.getBlocksize();
VarStats broadcast = new VarStats("tmp1", new MatrixCharacteristics(rowsRange, colsRange));
broadcast.rddStats = new RDDStats(broadcast);
- dataTransmissionTime = getSparkCollectTime(broadcast.rddStats, driverMetrics, executorMetrics);
- dataTransmissionTime += getSparkBroadcastTime(broadcast, driverMetrics, executorMetrics);
- dataTransmissionTime += getSparkCollectTime(output.rddStats, driverMetrics, executorMetrics);
+ dataTransmissionTime = IOCostUtils.getSparkCollectTime(broadcast.rddStats, driverMetrics, executorMetrics);
+ dataTransmissionTime += IOCostUtils.getSparkBroadcastTime(broadcast, driverMetrics, executorMetrics);
+ dataTransmissionTime += IOCostUtils.getSparkCollectTime(output.rddStats, driverMetrics, executorMetrics);
}
opcode += type.isLeft() ? "_left" : "_right";
long nflop = getInstNFLOP(inst.getSPInstructionType(), opcode, output, input);
@@ -312,8 +343,8 @@ public static double getCentralMomentInstTime(CentralMomentSPInstruction inst, V
double dataTransmissionTime = 0;
if (weights != null) {
- dataTransmissionTime = getSparkShuffleWriteTime(weights.rddStats, executorMetrics) +
- getSparkShuffleReadTime(weights.rddStats, executorMetrics);
+ dataTransmissionTime = IOCostUtils.getSparkShuffleWriteTime(weights.rddStats, executorMetrics) +
+ IOCostUtils.getSparkShuffleReadTime(weights.rddStats, executorMetrics);
}
output.rddStats.isCollected = true;
@@ -327,8 +358,8 @@ public static double getCentralMomentInstTime(CentralMomentSPInstruction inst, V
public static double getCastInstTime(CastSPInstruction inst, VarStats input, VarStats output, IOMetrics executorMetrics) {
double shuffleTime = 0;
if (input.getN() > input.characteristics.getBlocksize()) {
- shuffleTime = getSparkShuffleWriteTime(input.rddStats, executorMetrics) +
- getSparkShuffleReadTime(input.rddStats, executorMetrics);
+ shuffleTime = IOCostUtils.getSparkShuffleWriteTime(input.rddStats, executorMetrics) +
+ IOCostUtils.getSparkShuffleReadTime(input.rddStats, executorMetrics);
output.rddStats.hashPartitioned = true;
}
long nflop = getInstNFLOP(inst.getSPInstructionType(), inst.getOpcode(), output, input);
@@ -341,11 +372,11 @@ public static double getQSortInstTime(QuantileSortSPInstruction inst, VarStats i
double shuffleTime = 0;
if (weights != null) {
opcode += "_wts";
- shuffleTime += getSparkShuffleWriteTime(weights.rddStats, executorMetrics) +
- getSparkShuffleReadTime(weights.rddStats, executorMetrics);
+ shuffleTime += IOCostUtils.getSparkShuffleWriteTime(weights.rddStats, executorMetrics) +
+ IOCostUtils.getSparkShuffleReadTime(weights.rddStats, executorMetrics);
}
- shuffleTime += getSparkShuffleWriteTime(output.rddStats, executorMetrics) +
- getSparkShuffleReadTime(output.rddStats, executorMetrics);
+ shuffleTime += IOCostUtils.getSparkShuffleWriteTime(output.rddStats, executorMetrics) +
+ IOCostUtils.getSparkShuffleReadTime(output.rddStats, executorMetrics);
output.rddStats.hashPartitioned = true;
long nflop = getInstNFLOP(SPType.QSort, opcode, output, input, weights);
@@ -363,12 +394,12 @@ public static double getMatMulInstTime(BinarySPInstruction inst, VarStats input1
// estimate for in1.join(in2)
long joinedSize = input1.rddStats.distributedSize + input2.rddStats.distributedSize;
RDDStats joinedRDD = new RDDStats(joinedSize, -1);
- dataTransmissionTime = getSparkShuffleTime(joinedRDD, executorMetrics, true);
+ dataTransmissionTime = IOCostUtils.getSparkShuffleTime(joinedRDD, executorMetrics, true);
if (aggType == AggBinaryOp.SparkAggType.SINGLE_BLOCK) {
- dataTransmissionTime += getSparkCollectTime(output.rddStats, driverMetrics, executorMetrics);
+ dataTransmissionTime += IOCostUtils.getSparkCollectTime(output.rddStats, driverMetrics, executorMetrics);
output.rddStats.isCollected = true;
} else {
- dataTransmissionTime += getSparkShuffleTime(output.rddStats, executorMetrics, true);
+ dataTransmissionTime += IOCostUtils.getSparkShuffleTime(output.rddStats, executorMetrics, true);
output.rddStats.hashPartitioned = true;
}
numPartitionsForMapping = joinedRDD.numPartitions;
@@ -376,33 +407,33 @@ public static double getMatMulInstTime(BinarySPInstruction inst, VarStats input1
// estimate for in1.join(in2)
long joinedSize = input1.rddStats.distributedSize + input2.rddStats.distributedSize;
RDDStats joinedRDD = new RDDStats(joinedSize, -1);
- dataTransmissionTime = getSparkShuffleTime(joinedRDD, executorMetrics, true);
+ dataTransmissionTime = IOCostUtils.getSparkShuffleTime(joinedRDD, executorMetrics, true);
// estimate for out.combineByKey() per partition
- dataTransmissionTime += getSparkShuffleTime(output.rddStats, executorMetrics, false);
+ dataTransmissionTime += IOCostUtils.getSparkShuffleTime(output.rddStats, executorMetrics, false);
output.rddStats.hashPartitioned = true;
numPartitionsForMapping = joinedRDD.numPartitions;
} else if (inst instanceof MapmmSPInstruction) {
- dataTransmissionTime = getSparkBroadcastTime(input2, driverMetrics, executorMetrics);
+ dataTransmissionTime = IOCostUtils.getSparkBroadcastTime(input2, driverMetrics, executorMetrics);
MapmmSPInstruction mapmminst = (MapmmSPInstruction) inst;
AggBinaryOp.SparkAggType aggType = mapmminst.getAggType();
if (aggType == AggBinaryOp.SparkAggType.SINGLE_BLOCK) {
- dataTransmissionTime += getSparkCollectTime(output.rddStats, driverMetrics, executorMetrics);
+ dataTransmissionTime += IOCostUtils.getSparkCollectTime(output.rddStats, driverMetrics, executorMetrics);
output.rddStats.isCollected = true;
} else {
- dataTransmissionTime += getSparkShuffleTime(output.rddStats, executorMetrics, true);
+ dataTransmissionTime += IOCostUtils.getSparkShuffleTime(output.rddStats, executorMetrics, true);
output.rddStats.hashPartitioned = true;
}
numPartitionsForMapping = input1.rddStats.numPartitions;
} else if (inst instanceof PmmSPInstruction) {
- dataTransmissionTime = getSparkBroadcastTime(input2, driverMetrics, executorMetrics);
+ dataTransmissionTime = IOCostUtils.getSparkBroadcastTime(input2, driverMetrics, executorMetrics);
output.rddStats.numPartitions = input1.rddStats.numPartitions;
- dataTransmissionTime += getSparkShuffleTime(output.rddStats, executorMetrics, true);
+ dataTransmissionTime += IOCostUtils.getSparkShuffleTime(output.rddStats, executorMetrics, true);
output.rddStats.hashPartitioned = true;
numPartitionsForMapping = input1.rddStats.numPartitions;
} else if (inst instanceof ZipmmSPInstruction) {
// assume always a shuffle without data re-distribution
- dataTransmissionTime = getSparkShuffleTime(output.rddStats, executorMetrics, false);
- dataTransmissionTime += getSparkCollectTime(output.rddStats, driverMetrics, executorMetrics);
+ dataTransmissionTime = IOCostUtils.getSparkShuffleTime(output.rddStats, executorMetrics, false);
+ dataTransmissionTime += IOCostUtils.getSparkCollectTime(output.rddStats, driverMetrics, executorMetrics);
numPartitionsForMapping = input1.rddStats.numPartitions;
output.rddStats.isCollected = true;
} else if (inst instanceof PMapmmSPInstruction) {
@@ -425,10 +456,10 @@ public static double getMatMulChainInstTime(MapmmChainSPInstruction inst, VarSta
IOMetrics driverMetrics, IOMetrics executorMetrics) {
double dataTransmissionTime = 0;
if (input3 != null) {
- dataTransmissionTime += getSparkBroadcastTime(input3, driverMetrics, executorMetrics);
+ dataTransmissionTime += IOCostUtils.getSparkBroadcastTime(input3, driverMetrics, executorMetrics);
}
- dataTransmissionTime += getSparkBroadcastTime(input2, driverMetrics, executorMetrics);
- dataTransmissionTime += getSparkCollectTime(output.rddStats, driverMetrics, executorMetrics);
+ dataTransmissionTime += IOCostUtils.getSparkBroadcastTime(input2, driverMetrics, executorMetrics);
+ dataTransmissionTime += IOCostUtils.getSparkCollectTime(output.rddStats, driverMetrics, executorMetrics);
output.rddStats.isCollected = true;
long nflop = getInstNFLOP(SPType.MAPMMCHAIN, inst.getOpcode(), output, input1, input2);
@@ -441,20 +472,20 @@ public static double getCtableInstTime(CtableSPInstruction tableInst, VarStats i
double shuffleTime;
if (opcode.equals(Opcodes.CTABLEEXPAND.toString()) || !input2.isScalar() && input3.isScalar()) { // CTABLE_EXPAND_SCALAR_WEIGHT/CTABLE_TRANSFORM_SCALAR_WEIGHT
// in1.join(in2)
- shuffleTime = getSparkShuffleTime(input2.rddStats, executorMetrics, true);
+ shuffleTime = IOCostUtils.getSparkShuffleTime(input2.rddStats, executorMetrics, true);
} else if (input2.isScalar() && input3.isScalar()) { // CTABLE_TRANSFORM_HISTOGRAM
// no joins
shuffleTime = 0;
} else if (input2.isScalar() && !input3.isScalar()) { // CTABLE_TRANSFORM_WEIGHTED_HISTOGRAM
// in1.join(in3)
- shuffleTime = getSparkShuffleTime(input3.rddStats, executorMetrics, true);
+ shuffleTime = IOCostUtils.getSparkShuffleTime(input3.rddStats, executorMetrics, true);
} else { // CTABLE_TRANSFORM
// in1.join(in2).join(in3)
- shuffleTime = getSparkShuffleTime(input2.rddStats, executorMetrics, true);
- shuffleTime += getSparkShuffleTime(input3.rddStats, executorMetrics, true);
+ shuffleTime = IOCostUtils.getSparkShuffleTime(input2.rddStats, executorMetrics, true);
+ shuffleTime += IOCostUtils.getSparkShuffleTime(input3.rddStats, executorMetrics, true);
}
// combineByKey()
- shuffleTime += getSparkShuffleTime(output.rddStats, executorMetrics, true);
+ shuffleTime += IOCostUtils.getSparkShuffleTime(output.rddStats, executorMetrics, true);
output.rddStats.hashPartitioned = true;
long nflop = getInstNFLOP(SPType.Ctable, opcode, output, input1, input2, input3);
@@ -470,16 +501,16 @@ public static double getParameterizedBuiltinInstTime(ParameterizedBuiltinSPInstr
switch (opcode) {
case "rmempty":
if (input2.rddStats == null) // broadcast
- dataTransmissionTime = getSparkBroadcastTime(input2, driverMetrics, executorMetrics);
+ dataTransmissionTime = IOCostUtils.getSparkBroadcastTime(input2, driverMetrics, executorMetrics);
else // join
- dataTransmissionTime = getSparkShuffleTime(input1.rddStats, executorMetrics, true);
- dataTransmissionTime += getSparkShuffleTime(output.rddStats, executorMetrics, true);
+ dataTransmissionTime = IOCostUtils.getSparkShuffleTime(input1.rddStats, executorMetrics, true);
+ dataTransmissionTime += IOCostUtils.getSparkShuffleTime(output.rddStats, executorMetrics, true);
break;
case "contains":
if (input2.isScalar()) {
dataTransmissionTime = 0;
} else {
- dataTransmissionTime = getSparkBroadcastTime(input2, driverMetrics, executorMetrics);
+ dataTransmissionTime = IOCostUtils.getSparkBroadcastTime(input2, driverMetrics, executorMetrics);
// ignore reduceByKey() cost
}
output.rddStats.isCollected = true;
@@ -505,32 +536,32 @@ public static double getTernaryInstTime(TernarySPInstruction tInst, VarStats inp
if (!input1.isScalar() && !input2.isScalar()) {
inputRddStats = new RDDStats[]{input1.rddStats, input2.rddStats};
// input1.join(input2)
- dataTransmissionTime += getSparkShuffleTime(input1.rddStats, executorMetrics,
+ dataTransmissionTime += IOCostUtils.getSparkShuffleTime(input1.rddStats, executorMetrics,
input1.rddStats.hashPartitioned);
- dataTransmissionTime += getSparkShuffleTime(input2.rddStats, executorMetrics,
+ dataTransmissionTime += IOCostUtils.getSparkShuffleTime(input2.rddStats, executorMetrics,
input2.rddStats.hashPartitioned);
} else if (!input1.isScalar() && !input3.isScalar()) {
inputRddStats = new RDDStats[]{input1.rddStats, input3.rddStats};
// input1.join(input3)
- dataTransmissionTime += getSparkShuffleTime(input1.rddStats, executorMetrics,
+ dataTransmissionTime += IOCostUtils.getSparkShuffleTime(input1.rddStats, executorMetrics,
input1.rddStats.hashPartitioned);
- dataTransmissionTime += getSparkShuffleTime(input3.rddStats, executorMetrics,
+ dataTransmissionTime += IOCostUtils.getSparkShuffleTime(input3.rddStats, executorMetrics,
input3.rddStats.hashPartitioned);
} else if (!input2.isScalar() || !input3.isScalar()) {
inputRddStats = new RDDStats[]{input2.rddStats, input3.rddStats};
// input2.join(input3)
- dataTransmissionTime += getSparkShuffleTime(input2.rddStats, executorMetrics,
+ dataTransmissionTime += IOCostUtils.getSparkShuffleTime(input2.rddStats, executorMetrics,
input2.rddStats.hashPartitioned);
- dataTransmissionTime += getSparkShuffleTime(input3.rddStats, executorMetrics,
+ dataTransmissionTime += IOCostUtils.getSparkShuffleTime(input3.rddStats, executorMetrics,
input3.rddStats.hashPartitioned);
} else if (!input1.isScalar() && !input2.isScalar() && !input3.isScalar()) {
inputRddStats = new RDDStats[]{input1.rddStats, input2.rddStats, input3.rddStats};
// input1.join(input2).join(input3)
- dataTransmissionTime += getSparkShuffleTime(input1.rddStats, executorMetrics,
+ dataTransmissionTime += IOCostUtils.getSparkShuffleTime(input1.rddStats, executorMetrics,
input1.rddStats.hashPartitioned);
- dataTransmissionTime += getSparkShuffleTime(input2.rddStats, executorMetrics,
+ dataTransmissionTime += IOCostUtils.getSparkShuffleTime(input2.rddStats, executorMetrics,
input2.rddStats.hashPartitioned);
- dataTransmissionTime += getSparkShuffleTime(input3.rddStats, executorMetrics,
+ dataTransmissionTime += IOCostUtils.getSparkShuffleTime(input3.rddStats, executorMetrics,
input3.rddStats.hashPartitioned);
}
@@ -547,12 +578,12 @@ public static double getQuaternaryInstTime(QuaternarySPInstruction quatInst, Var
throw new RuntimeException("Spark Quaternary reduce-operations are not supported yet");
}
double dataTransmissionTime;
- dataTransmissionTime = getSparkBroadcastTime(input2, driverMetrics, executorMetrics)
- + getSparkBroadcastTime(input3, driverMetrics, executorMetrics); // for map-side ops only
+ dataTransmissionTime = IOCostUtils.getSparkBroadcastTime(input2, driverMetrics, executorMetrics)
+ + IOCostUtils.getSparkBroadcastTime(input3, driverMetrics, executorMetrics); // for map-side ops only
if (opcode.equals("mapwsloss") || opcode.equals("mapwcemm")) {
output.rddStats.isCollected = true;
} else if (opcode.equals("mapwdivmm")) {
- dataTransmissionTime += getSparkShuffleTime(output.rddStats, executorMetrics, true);
+ dataTransmissionTime += IOCostUtils.getSparkShuffleTime(output.rddStats, executorMetrics, true);
}
long nflop = getInstNFLOP(quatInst.getSPInstructionType(), opcode, output, input1);
@@ -580,12 +611,12 @@ public static double getCPUTime(long nflop, int numPartitions, IOMetrics executo
for (RDDStats input: inputs) {
if (input == null) continue;
// compensates for spill-overs to account for non-compute bound operations
- memScanTime += getMemReadTime(input, executorMetrics);
+ memScanTime += IOCostUtils.getMemReadTime(input, executorMetrics);
}
double numWaves = Math.ceil((double) numPartitions / SparkExecutionContext.getDefaultParallelism(false));
double scaledNFLOP = (numWaves * nflop) / numPartitions;
double cpuComputationTime = scaledNFLOP / executorMetrics.cpuFLOPS;
- double memWriteTime = output != null? getMemWriteTime(output, executorMetrics) : 0;
+ double memWriteTime = output != null? IOCostUtils.getMemWriteTime(output, executorMetrics) : 0;
return Math.max(memScanTime, cpuComputationTime) + memWriteTime;
}
diff --git a/src/main/java/org/apache/sysds/resource/enumeration/EnumerationUtils.java b/src/main/java/org/apache/sysds/resource/enumeration/EnumerationUtils.java
index fa22f6c4f73..67a657c5d25 100644
--- a/src/main/java/org/apache/sysds/resource/enumeration/EnumerationUtils.java
+++ b/src/main/java/org/apache/sysds/resource/enumeration/EnumerationUtils.java
@@ -19,9 +19,12 @@
package org.apache.sysds.resource.enumeration;
-import org.apache.sysds.resource.CloudInstance;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.TreeMap;
-import java.util.*;
+import org.apache.sysds.resource.CloudInstance;
public class EnumerationUtils {
/**
diff --git a/src/main/java/org/apache/sysds/resource/enumeration/Enumerator.java b/src/main/java/org/apache/sysds/resource/enumeration/Enumerator.java
index b2de8410a5e..fb986c32650 100644
--- a/src/main/java/org/apache/sysds/resource/enumeration/Enumerator.java
+++ b/src/main/java/org/apache/sysds/resource/enumeration/Enumerator.java
@@ -33,8 +33,14 @@
import org.apache.sysds.resource.enumeration.EnumerationUtils.ConfigurationPoint;
import org.apache.sysds.resource.enumeration.EnumerationUtils.SolutionPoint;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
import java.util.Map.Entry;
+import java.util.Set;
+import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicReference;
public abstract class Enumerator {
diff --git a/src/main/java/org/apache/sysds/resource/enumeration/GridBasedEnumerator.java b/src/main/java/org/apache/sysds/resource/enumeration/GridBasedEnumerator.java
index 571fc929b5c..5a13a4283f0 100644
--- a/src/main/java/org/apache/sysds/resource/enumeration/GridBasedEnumerator.java
+++ b/src/main/java/org/apache/sysds/resource/enumeration/GridBasedEnumerator.java
@@ -19,7 +19,7 @@
package org.apache.sysds.resource.enumeration;
-import java.util.*;
+import java.util.ArrayList;
public class GridBasedEnumerator extends Enumerator {
/** sets the step size at iterating over number of executors at enumeration */
diff --git a/src/main/java/org/apache/sysds/resource/enumeration/InterestBasedEnumerator.java b/src/main/java/org/apache/sysds/resource/enumeration/InterestBasedEnumerator.java
index 349d44312f5..aed8682dcd9 100644
--- a/src/main/java/org/apache/sysds/resource/enumeration/InterestBasedEnumerator.java
+++ b/src/main/java/org/apache/sysds/resource/enumeration/InterestBasedEnumerator.java
@@ -22,10 +22,19 @@
import org.apache.sysds.hops.Hop;
import org.apache.sysds.hops.OptimizerUtils;
import org.apache.sysds.parser.StatementBlock;
-import org.apache.sysds.runtime.controlprogram.*;
import org.apache.sysds.resource.enumeration.EnumerationUtils.InstanceSearchSpace;
+import org.apache.sysds.runtime.controlprogram.ForProgramBlock;
+import org.apache.sysds.runtime.controlprogram.FunctionProgramBlock;
+import org.apache.sysds.runtime.controlprogram.IfProgramBlock;
+import org.apache.sysds.runtime.controlprogram.Program;
+import org.apache.sysds.runtime.controlprogram.ProgramBlock;
+import org.apache.sysds.runtime.controlprogram.WhileProgramBlock;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
import java.util.stream.Collectors;
import static org.apache.sysds.resource.CloudUtils.JVM_MEMORY_FACTOR;
diff --git a/src/main/java/org/apache/sysds/resource/enumeration/PruneBasedEnumerator.java b/src/main/java/org/apache/sysds/resource/enumeration/PruneBasedEnumerator.java
index 300188d5e6e..8b2ee747326 100644
--- a/src/main/java/org/apache/sysds/resource/enumeration/PruneBasedEnumerator.java
+++ b/src/main/java/org/apache/sysds/resource/enumeration/PruneBasedEnumerator.java
@@ -23,7 +23,13 @@
import org.apache.sysds.common.Opcodes;
import org.apache.sysds.resource.CloudInstance;
import org.apache.sysds.resource.ResourceCompiler;
-import org.apache.sysds.runtime.controlprogram.*;
+import org.apache.sysds.runtime.controlprogram.BasicProgramBlock;
+import org.apache.sysds.runtime.controlprogram.ForProgramBlock;
+import org.apache.sysds.runtime.controlprogram.FunctionProgramBlock;
+import org.apache.sysds.runtime.controlprogram.IfProgramBlock;
+import org.apache.sysds.runtime.controlprogram.Program;
+import org.apache.sysds.runtime.controlprogram.ProgramBlock;
+import org.apache.sysds.runtime.controlprogram.WhileProgramBlock;
import org.apache.sysds.runtime.instructions.Instruction;
import java.util.HashMap;
diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
index 48637595741..e08f731e829 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
@@ -75,7 +75,7 @@
import org.apache.sysds.runtime.data.SparseRow;
import org.apache.sysds.runtime.functionobjects.SwapIndex;
import org.apache.sysds.runtime.instructions.InstructionUtils;
-import org.apache.sysds.runtime.instructions.cp.CM_COV_Object;
+import org.apache.sysds.runtime.instructions.cp.CmCovObject;
import org.apache.sysds.runtime.instructions.cp.ScalarObject;
import org.apache.sysds.runtime.instructions.spark.data.IndexedMatrixValue;
import org.apache.sysds.runtime.matrix.data.CTableMap;
@@ -816,12 +816,12 @@ public MatrixBlock zeroOutOperations(MatrixValue result, IndexRange range) {
}
@Override
- public CM_COV_Object cmOperations(CMOperator op) {
+ public CmCovObject cmOperations(CMOperator op) {
return CLALibCMOps.centralMoment(this, op);
}
@Override
- public CM_COV_Object cmOperations(CMOperator op, MatrixBlock weights) {
+ public CmCovObject cmOperations(CMOperator op, MatrixBlock weights) {
printDecompressWarning("cmOperations");
MatrixBlock right = getUncompressed(weights);
if(isEmpty())
@@ -833,13 +833,13 @@ public CM_COV_Object cmOperations(CMOperator op, MatrixBlock weights) {
}
@Override
- public CM_COV_Object covOperations(COVOperator op, MatrixBlock that) {
+ public CmCovObject covOperations(COVOperator op, MatrixBlock that) {
MatrixBlock right = getUncompressed(that);
return getUncompressed("covOperations", op.getNumThreads()).covOperations(op, right);
}
@Override
- public CM_COV_Object covOperations(COVOperator op, MatrixBlock that, MatrixBlock weights) {
+ public CmCovObject covOperations(COVOperator op, MatrixBlock that, MatrixBlock weights) {
MatrixBlock right1 = getUncompressed(that);
MatrixBlock right2 = getUncompressed(weights);
return getUncompressed("covOperations", op.getNumThreads()).covOperations(op, right1, right2);
diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java
index f6321bc1b6d..af944fce750 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java
@@ -133,11 +133,14 @@ public class CompressionSettings {
public final double[] scaleFactors;
+ public final boolean preferDeltaEncoding;
+
protected CompressionSettings(double samplingRatio, double samplePower, boolean allowSharedDictionary,
String transposeInput, int seed, boolean lossy, EnumSet validCompressions,
boolean sortValuesByLength, PartitionerType columnPartitioner, int maxColGroupCoCode, double coCodePercentage,
int minimumSampleSize, int maxSampleSize, EstimationType estimationType, CostType costComputationType,
- double minimumCompressionRatio, boolean isInSparkInstruction, SORT_TYPE sdcSortType, double[] scaleFactors) {
+ double minimumCompressionRatio, boolean isInSparkInstruction, SORT_TYPE sdcSortType, double[] scaleFactors,
+ boolean preferDeltaEncoding) {
this.samplingRatio = samplingRatio;
this.samplePower = samplePower;
this.allowSharedDictionary = allowSharedDictionary;
@@ -157,6 +160,7 @@ protected CompressionSettings(double samplingRatio, double samplePower, boolean
this.isInSparkInstruction = isInSparkInstruction;
this.sdcSortType = sdcSortType;
this.scaleFactors = scaleFactors;
+ this.preferDeltaEncoding = preferDeltaEncoding;
if(!printedStatus && LOG.isDebugEnabled()) {
printedStatus = true;
diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java
index ae6a0b2d231..02c9f97498d 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java
@@ -53,6 +53,7 @@ public class CompressionSettingsBuilder {
private boolean isInSparkInstruction = false;
private SORT_TYPE sdcSortType = SORT_TYPE.MATERIALIZE;
private double[] scaleFactors = null;
+ private boolean preferDeltaEncoding = false;
public CompressionSettingsBuilder() {
@@ -101,6 +102,7 @@ public CompressionSettingsBuilder copySettings(CompressionSettings that) {
this.maxColGroupCoCode = that.maxColGroupCoCode;
this.coCodePercentage = that.coCodePercentage;
this.minimumSampleSize = that.minimumSampleSize;
+ this.preferDeltaEncoding = that.preferDeltaEncoding;
return this;
}
@@ -336,6 +338,19 @@ public CompressionSettingsBuilder setSDCSortType(SORT_TYPE sdcSortType) {
return this;
}
+ /**
+ * Set whether to prefer delta encoding during compression estimation.
+ * When enabled, the compression estimator will use delta encoding statistics
+ * instead of regular encoding statistics.
+ *
+ * @param preferDeltaEncoding Whether to prefer delta encoding
+ * @return The CompressionSettingsBuilder
+ */
+ public CompressionSettingsBuilder setPreferDeltaEncoding(boolean preferDeltaEncoding) {
+ this.preferDeltaEncoding = preferDeltaEncoding;
+ return this;
+ }
+
/**
* Create the CompressionSettings object to use in the compression.
*
@@ -345,6 +360,6 @@ public CompressionSettings create() {
return new CompressionSettings(samplingRatio, samplePower, allowSharedDictionary, transposeInput, seed, lossy,
validCompressions, sortValuesByLength, columnPartitioner, maxColGroupCoCode, coCodePercentage,
minimumSampleSize, maxSampleSize, estimationType, costType, minimumCompressionRatio, isInSparkInstruction,
- sdcSortType, scaleFactors);
+ sdcSortType, scaleFactors, preferDeltaEncoding);
}
}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
index ec502d6d122..003703f86a4 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
@@ -45,7 +45,7 @@
import org.apache.sysds.runtime.data.SparseBlock;
import org.apache.sysds.runtime.data.SparseBlockMCSR;
import org.apache.sysds.runtime.functionobjects.Plus;
-import org.apache.sysds.runtime.instructions.cp.CM_COV_Object;
+import org.apache.sysds.runtime.instructions.cp.CmCovObject;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
import org.apache.sysds.runtime.matrix.operators.AggregateUnaryOperator;
import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
@@ -610,7 +610,7 @@ public AColGroup addVector(double[] v) {
* @param nRows The number of rows contained in the ColumnGroup.
* @return A Central Moment object.
*/
- public abstract CM_COV_Object centralMoment(CMOperator op, int nRows);
+ public abstract CmCovObject centralMoment(CMOperator op, int nRows);
/**
* Expand the column group to multiple columns. (one hot encode the column group)
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupValue.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupValue.java
index 0cde289b30f..45358c7ce46 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupValue.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroupValue.java
@@ -26,7 +26,7 @@
import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory;
import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
import org.apache.sysds.runtime.functionobjects.Builtin;
-import org.apache.sysds.runtime.instructions.cp.CM_COV_Object;
+import org.apache.sysds.runtime.instructions.cp.CmCovObject;
import org.apache.sysds.runtime.matrix.operators.CMOperator;
public abstract class AColGroupValue extends ADictBasedColGroup {
@@ -189,7 +189,7 @@ public AColGroup replace(double pattern, double replace) {
}
@Override
- public CM_COV_Object centralMoment(CMOperator op, int nRows) {
+ public CmCovObject centralMoment(CMOperator op, int nRows) {
return _dict.centralMoment(op.fn, getCounts(), nRows);
}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java
index 21c6a0e1d80..94137eb6381 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java
@@ -50,7 +50,7 @@
import org.apache.sysds.runtime.data.SparseBlock;
import org.apache.sysds.runtime.data.SparseBlockMCSR;
import org.apache.sysds.runtime.functionobjects.Builtin;
-import org.apache.sysds.runtime.instructions.cp.CM_COV_Object;
+import org.apache.sysds.runtime.instructions.cp.CmCovObject;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
import org.apache.sysds.runtime.matrix.operators.CMOperator;
@@ -518,8 +518,8 @@ protected double[] preAggBuiltinRows(Builtin builtin) {
}
@Override
- public CM_COV_Object centralMoment(CMOperator op, int nRows) {
- CM_COV_Object ret = new CM_COV_Object();
+ public CmCovObject centralMoment(CMOperator op, int nRows) {
+ CmCovObject ret = new CmCovObject();
op.fn.execute(ret, _dict.getValue(0), nRows);
return ret;
}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
index bbcefd134c1..03af6cad162 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
@@ -33,6 +33,7 @@
import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
import org.apache.sysds.runtime.compress.DMLCompressionException;
import org.apache.sysds.runtime.compress.colgroup.ColGroupUtils.P;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.DeltaDictionary;
import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory;
import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary;
@@ -43,6 +44,9 @@
import org.apache.sysds.runtime.compress.colgroup.indexes.RangeIndex;
import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
+import org.apache.sysds.runtime.compress.utils.ACount;
+import org.apache.sysds.runtime.compress.utils.DblArray;
+import org.apache.sysds.runtime.compress.utils.DblArrayCountHashMap;
import org.apache.sysds.runtime.compress.colgroup.offset.AOffsetIterator;
import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory;
import org.apache.sysds.runtime.compress.colgroup.scheme.DDCScheme;
@@ -79,7 +83,7 @@ public class ColGroupDDC extends APreAgg implements IMapToDataGroup {
static final VectorSpecies SPECIES = DoubleVector.SPECIES_PREFERRED;
- private ColGroupDDC(IColIndex colIndexes, IDictionary dict, AMapToData data, int[] cachedCounts) {
+ protected ColGroupDDC(IColIndex colIndexes, IDictionary dict, AMapToData data, int[] cachedCounts) {
super(colIndexes, dict, cachedCounts);
_data = data;
@@ -1107,4 +1111,57 @@ protected boolean allowShallowIdentityRightMult() {
return true;
}
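+ /**
+ * Convert this DDC group into a delta-encoded DDC group: each row is materialized from the
+ * dictionary, row-wise differences are taken (row 0 keeps its absolute values), and a new
+ * dictionary is built over the distinct delta tuples.
+ *
+ * @return An equivalent delta-encoded column group (or an empty group if no tuples are produced)
+ */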
+ public AColGroup convertToDeltaDDC() {
+ int numCols = _colIndexes.size();
+ int numRows = _data.size();
+
+ DblArrayCountHashMap map = new DblArrayCountHashMap(Math.max(numRows, 64));
+ double[] rowDelta = new double[numCols];
+ double[] prevRow = new double[numCols];
+ DblArray dblArray = new DblArray(rowDelta);
+ int[] rowToDictId = new int[numRows];
+
+ double[] dictVals = _dict.getValues();
+
+ for(int i = 0; i < numRows; i++) {
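+ // Pass 1: materialize each row from the DDC dictionary, compute its delta to the previous row
+ // (row 0 keeps its absolute values), and register the delta tuple in the count map.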
+ int dictIdx = _data.getIndex(i);
+ int off = dictIdx * numCols;
+ for(int j = 0; j < numCols; j++) {
+ double val = dictVals[off + j];
+ if(i == 0) {
+ rowDelta[j] = val;
+ prevRow[j] = val;
+ } else {
+ rowDelta[j] = val - prevRow[j];
+ prevRow[j] = val;
+ }
+ }
+
+ rowToDictId[i] = map.increment(dblArray);
+ }
+
+ if(map.size() == 0)
+ return new ColGroupEmpty(_colIndexes);
+
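+ // Pass 2: build the delta dictionary from the distinct delta tuples and remap the per-row ids
+ // to the new compact dictionary ids.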
+ ACount[] vals = map.extractValues();
+ final int nVals = vals.length;
+ final double[] dictValues = new double[nVals * numCols];
+ final int[] oldIdToNewId = new int[map.size()];
+ int idx = 0;
+ for(int i = 0; i < nVals; i++) {
+ final ACount dac = vals[i];
+ final double[] arrData = dac.key().getData();
+ System.arraycopy(arrData, 0, dictValues, idx, numCols);
+ oldIdToNewId[dac.id] = i;
+ idx += numCols;
+ }
+
+ DeltaDictionary deltaDict = new DeltaDictionary(dictValues, numCols);
+ AMapToData newData = MapToFactory.create(numRows, nVals);
+ for(int i = 0; i < numRows; i++) {
+ newData.set(i, oldIdToNewId[rowToDictId[i]]);
+ }
+ return ColGroupDeltaDDC.create(_colIndexes, deltaDict, newData, null);
+ }
+
}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDCFOR.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDCFOR.java
index 70191a27936..d2ee8cd6673 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDCFOR.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDCFOR.java
@@ -48,7 +48,7 @@
import org.apache.sysds.runtime.functionobjects.Minus;
import org.apache.sysds.runtime.functionobjects.Multiply;
import org.apache.sysds.runtime.functionobjects.Plus;
-import org.apache.sysds.runtime.instructions.cp.CM_COV_Object;
+import org.apache.sysds.runtime.instructions.cp.CmCovObject;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
import org.apache.sysds.runtime.matrix.operators.CMOperator;
@@ -403,9 +403,9 @@ public AColGroup rexpandCols(int max, boolean ignore, boolean cast, int nRows) {
}
@Override
- public CM_COV_Object centralMoment(CMOperator op, int nRows) {
+ public CmCovObject centralMoment(CMOperator op, int nRows) {
// should be guaranteed to be one column therefore only one reference value.
- CM_COV_Object ret = _dict.centralMomentWithReference(op.fn, getCounts(), _reference[0], nRows);
+ CmCovObject ret = _dict.centralMomentWithReference(op.fn, getCounts(), _reference[0], nRows);
return ret;
}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDeltaDDC.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDeltaDDC.java
index 2666860ca68..08bdfd1e1d8 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDeltaDDC.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDeltaDDC.java
@@ -19,62 +19,559 @@
package org.apache.sysds.runtime.compress.colgroup;
+import java.io.DataInput;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Comparator;
+
+import org.apache.commons.lang3.NotImplementedException;
+import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
+import org.apache.sysds.runtime.compress.DMLCompressionException;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.DeltaDictionary;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary;
+import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory;
+import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
+import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
+import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
+import org.apache.sysds.runtime.compress.utils.ACount;
+import org.apache.sysds.runtime.compress.utils.DblArray;
+import org.apache.sysds.runtime.compress.utils.DblArrayCountHashMap;
+import org.apache.sysds.runtime.compress.utils.DoubleCountHashMap;
+import org.apache.sysds.runtime.compress.utils.Util;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.SparseBlock;
+import org.apache.sysds.runtime.data.SparseBlockMCSR;
+import org.apache.sysds.runtime.functionobjects.Builtin;
+import org.apache.sysds.runtime.functionobjects.Divide;
+import org.apache.sysds.runtime.functionobjects.Minus;
+import org.apache.sysds.runtime.functionobjects.Multiply;
+import org.apache.sysds.runtime.functionobjects.Plus;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.UnaryOperator;
+
/**
* Class to encapsulate information about a column group that is first delta encoded then encoded with dense dictionary
* encoding (DeltaDDC).
*/
-public class ColGroupDeltaDDC { // extends ColGroupDDC
-
-// private static final long serialVersionUID = -1045556313148564147L;
-
-// /** Constructor for serialization */
-// protected ColGroupDeltaDDC() {
-// }
-
-// private ColGroupDeltaDDC(int[] colIndexes, ADictionary dict, AMapToData data, int[] cachedCounts) {
-// super();
-// LOG.info("Carefully use of DeltaDDC since implementation is not finished.");
-// _colIndexes = colIndexes;
-// _dict = dict;
-// _data = data;
-// }
-
-// public static AColGroup create(int[] colIndices, ADictionary dict, AMapToData data, int[] cachedCounts) {
-// if(dict == null)
-// throw new NotImplementedException("Not implemented constant delta group");
-// else
-// return new ColGroupDeltaDDC(colIndices, dict, data, cachedCounts);
-// }
-
-// public CompressionType getCompType() {
-// return CompressionType.DeltaDDC;
-// }
-
-// @Override
-// protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC,
-// double[] values) {
-// final int nCol = _colIndexes.length;
-// for(int i = rl, offT = rl + offR; i < ru; i++, offT++) {
-// final double[] c = db.values(offT);
-// final int off = db.pos(offT) + offC;
-// final int rowIndex = _data.getIndex(i) * nCol;
-// final int prevOff = (off == 0) ? off : off - nCol;
-// for(int j = 0; j < nCol; j++) {
-// // Here we use the values in the previous row to compute current values along with the delta
-// double newValue = c[prevOff + j] + values[rowIndex + j];
-// c[off + _colIndexes[j]] += newValue;
-// }
-// }
-// }
-
-// @Override
-// protected void decompressToSparseBlockDenseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC,
-// double[] values) {
-// throw new NotImplementedException();
-// }
-
-// @Override
-// public AColGroup scalarOperation(ScalarOperator op) {
-// return new ColGroupDeltaDDC(_colIndexes, _dict.applyScalarOp(op), _data, getCachedCounts());
-// }
+public class ColGroupDeltaDDC extends ColGroupDDC {
+ private static final long serialVersionUID = -1045556313148564147L;
+
+ private ColGroupDeltaDDC(IColIndex colIndexes, IDictionary dict, AMapToData data, int[] cachedCounts) {
+ super(colIndexes, dict, data, cachedCounts);
+ if(CompressedMatrixBlock.debug) {
+ if(!(dict instanceof DeltaDictionary))
+ throw new DMLCompressionException("DeltaDDC must use DeltaDictionary");
+ }
+ }
+
+ public static AColGroup create(IColIndex colIndexes, IDictionary dict, AMapToData data, int[] cachedCounts) {
+ if(dict == null)
+ return new ColGroupEmpty(colIndexes);
+
+ if(!(dict instanceof DeltaDictionary))
+ throw new DMLCompressionException("ColGroupDeltaDDC must use DeltaDictionary");
+
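+ // If every row maps to a single all-zero delta tuple, row 0 and all deltas are zero,
+ // so the whole group can be represented as a constant column group instead.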
+ if(data.getUnique() == 1) {
+ DeltaDictionary deltaDict = (DeltaDictionary) dict;
+ double[] values = deltaDict.getValues();
+ final int nCol = colIndexes.size();
+ boolean allZeros = true;
+ for(int i = 0; i < nCol; i++) {
+ if(!Util.eq(values[i], 0.0)) {
+ allZeros = false;
+ break;
+ }
+ }
+ if(allZeros) {
+ double[] constValues = new double[nCol];
+ System.arraycopy(values, 0, constValues, 0, nCol);
+ return ColGroupConst.create(colIndexes, Dictionary.create(constValues));
+ }
+ }
+
+ return new ColGroupDeltaDDC(colIndexes, dict, data, cachedCounts);
+ }
+
+ @Override
+ public CompressionType getCompType() {
+ return CompressionType.DeltaDDC;
+ }
+
+ @Override
+ public ColGroupType getColGroupType() {
+ return ColGroupType.DeltaDDC;
+ }
+
+ public static ColGroupDeltaDDC read(DataInput in) throws IOException {
+ IColIndex cols = ColIndexFactory.read(in);
+ IDictionary dict = DictionaryFactory.read(in);
+ AMapToData data = MapToFactory.readIn(in);
+ return new ColGroupDeltaDDC(cols, dict, data, null);
+ }
+
+ @Override
+ protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int ru, int offR, int offC,
+ double[] values) {
+ final int nCol = _colIndexes.size();
+ final double[] prevRow = new double[nCol];
+
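+ // When decompression does not start at row 0, reconstruct the running prefix sums up to
+ // row rl-1 so that the first decompressed row has the correct absolute values.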
+ if(rl > 0) {
+ final int dictIdx0 = _data.getIndex(0);
+ final int rowIndex0 = dictIdx0 * nCol;
+ for(int j = 0; j < nCol; j++) {
+ prevRow[j] = values[rowIndex0 + j];
+ }
+ for(int i = 1; i < rl; i++) {
+ final int dictIdx = _data.getIndex(i);
+ final int rowIndex = dictIdx * nCol;
+ for(int j = 0; j < nCol; j++) {
+ prevRow[j] += values[rowIndex + j];
+ }
+ }
+ }
+
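+ // Fast path: the target dense block is contiguous and this group covers all output columns,
+ // so absolute values can be written with a single running prefix sum per column.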
+ if(db.isContiguous() && nCol == db.getDim(1) && offC == 0) {
+ final int nColOut = db.getDim(1);
+ final double[] c = db.values(0);
+ for(int i = rl; i < ru; i++) {
+ final int dictIdx = _data.getIndex(i);
+ final int rowIndex = dictIdx * nCol;
+ final int rowBaseOff = (i + offR) * nColOut;
+
+ if(i == 0 && rl == 0) {
+ for(int j = 0; j < nCol; j++) {
+ final double value = values[rowIndex + j];
+ c[rowBaseOff + j] = value;
+ prevRow[j] = value;
+ }
+ }
+ else {
+ for(int j = 0; j < nCol; j++) {
+ final double delta = values[rowIndex + j];
+ final double newValue = prevRow[j] + delta;
+ c[rowBaseOff + j] = newValue;
+ prevRow[j] = newValue;
+ }
+ }
+ }
+ }
+ else {
+ for(int i = rl, offT = rl + offR; i < ru; i++, offT++) {
+ final double[] c = db.values(offT);
+ final int off = db.pos(offT) + offC;
+ final int dictIdx = _data.getIndex(i);
+ final int rowIndex = dictIdx * nCol;
+
+ if(i == 0 && rl == 0) {
+ for(int j = 0; j < nCol; j++) {
+ final double value = values[rowIndex + j];
+ final int colIdx = _colIndexes.get(j);
+ c[off + colIdx] = value;
+ prevRow[j] = value;
+ }
+ }
+ else {
+ for(int j = 0; j < nCol; j++) {
+ final double delta = values[rowIndex + j];
+ final double newValue = prevRow[j] + delta;
+ final int colIdx = _colIndexes.get(j);
+ c[off + colIdx] = newValue;
+ prevRow[j] = newValue;
+ }
+ }
+ }
+ }
+ }
+
+ @Override
+ protected void decompressToSparseBlockDenseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC,
+ double[] values) {
+ final int nCol = _colIndexes.size();
+ final double[] prevRow = new double[nCol];
+
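+ // Same prefix-sum reconstruction as in the dense case: seed prevRow with the accumulated
+ // values up to row rl, then append the absolute values row by row.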
+ if(rl > 0) {
+ final int dictIdx0 = _data.getIndex(0);
+ final int rowIndex0 = dictIdx0 * nCol;
+ for(int j = 0; j < nCol; j++) {
+ prevRow[j] = values[rowIndex0 + j];
+ }
+ for(int i = 1; i < rl; i++) {
+ final int dictIdx = _data.getIndex(i);
+ final int rowIndex = dictIdx * nCol;
+ for(int j = 0; j < nCol; j++) {
+ prevRow[j] += values[rowIndex + j];
+ }
+ }
+ }
+
+ for(int i = rl, offT = rl + offR; i < ru; i++, offT++) {
+ final int dictIdx = _data.getIndex(i);
+ final int rowIndex = dictIdx * nCol;
+
+ if(i == 0 && rl == 0) {
+ for(int j = 0; j < nCol; j++) {
+ final double value = values[rowIndex + j];
+ final int colIdx = _colIndexes.get(j);
+ ret.append(offT, colIdx + offC, value);
+ prevRow[j] = value;
+ }
+ }
+ else {
+ for(int j = 0; j < nCol; j++) {
+ final double delta = values[rowIndex + j];
+ final double newValue = prevRow[j] + delta;
+ final int colIdx = _colIndexes.get(j);
+ ret.append(offT, colIdx + offC, newValue);
+ prevRow[j] = newValue;
+ }
+ }
+ }
+ }
+
+ @Override
+ protected void decompressToDenseBlockSparseDictionary(DenseBlock db, int rl, int ru, int offR, int offC,
+ SparseBlock sb) {
+ throw new NotImplementedException("Dense block decompression from sparse dictionary for DeltaDDC not yet implemented");
+ }
+
+ @Override
+ protected void decompressToSparseBlockSparseDictionary(SparseBlock ret, int rl, int ru, int offR, int offC,
+ SparseBlock sb) {
+ throw new NotImplementedException("Sparse block decompression from sparse dictionary for DeltaDDC not yet implemented");
+ }
+
+ @Override
+ protected void decompressToDenseBlockTransposedSparseDictionary(DenseBlock db, int rl, int ru, SparseBlock sb) {
+ throw new NotImplementedException("Transposed dense block decompression from sparse dictionary for DeltaDDC not yet implemented");
+ }
+
+ @Override
+ protected void decompressToDenseBlockTransposedDenseDictionary(DenseBlock db, int rl, int ru, double[] dict) {
+ throw new NotImplementedException("Transposed dense block decompression from dense dictionary for DeltaDDC not yet implemented");
+ }
+
+ @Override
+ protected void decompressToSparseBlockTransposedSparseDictionary(SparseBlockMCSR sbr, SparseBlock sb, int nColOut) {
+ throw new NotImplementedException("Transposed sparse block decompression from sparse dictionary for DeltaDDC not yet implemented");
+ }
+
+ @Override
+ protected void decompressToSparseBlockTransposedDenseDictionary(SparseBlockMCSR sbr, double[] dict, int nColOut) {
+ throw new NotImplementedException("Transposed sparse block decompression from dense dictionary for DeltaDDC not yet implemented");
+ }
+
+ @Override
+ public AColGroup scalarOperation(ScalarOperator op) {
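+ // Multiply and divide are applied directly to the delta dictionary (scaling the row
+ // differences scales the reconstructed values); plus and minus only shift row 0's tuple
+ // (see scalarOperationShift); all other operators fall back to a plain DDC conversion.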
+ if(op.fn instanceof Multiply || op.fn instanceof Divide) {
+ double[] val = _dict.getValues();
+ double[] newVal = new double[val.length];
+ for(int i = 0; i < val.length; i++)
+ newVal[i] = op.executeScalar(val[i]);
+ return create(_colIndexes, new DeltaDictionary(newVal, _colIndexes.size()), _data, getCounts());
+ }
+ else if(op.fn instanceof Plus || op.fn instanceof Minus) {
+ return scalarOperationShift(op);
+ }
+ else {
+ AColGroup ddc = convertToDDC();
+ return ddc.scalarOperation(op);
+ }
+ }
+
+ private AColGroup scalarOperationShift(ScalarOperator op) {
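+ // Adding or subtracting a constant leaves all deltas unchanged; only the tuple referenced by
+ // row 0 changes. If that tuple is not shared it is rewritten in place, otherwise an existing
+ // matching tuple is reused (or a new one appended) and row 0 is remapped to it.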
+ final int nCol = _colIndexes.size();
+ final int id0 = _data.getIndex(0);
+ final double[] vals = _dict.getValues();
+ final double[] tuple0 = new double[nCol];
+ for(int j = 0; j < nCol; j++)
+ tuple0[j] = vals[id0 * nCol + j];
+
+ final double[] tupleNew = new double[nCol];
+ for(int j = 0; j < nCol; j++)
+ tupleNew[j] = op.executeScalar(tuple0[j]);
+
+ int[] counts = getCounts();
+ if(counts[id0] == 1) {
+ double[] newVals = vals.clone();
+ for(int j = 0; j < nCol; j++)
+ newVals[id0 * nCol + j] = tupleNew[j];
+ return create(_colIndexes, new DeltaDictionary(newVals, nCol), _data, counts);
+ }
+ else {
+ int idNew = -1;
+ int nEntries = vals.length / nCol;
+ for(int k = 0; k < nEntries; k++) {
+ boolean match = true;
+ for(int j = 0; j < nCol; j++) {
+ if(vals[k * nCol + j] != tupleNew[j]) {
+ match = false;
+ break;
+ }
+ }
+ if(match) {
+ idNew = k;
+ break;
+ }
+ }
+
+ IDictionary newDict = _dict;
+ if(idNew == -1) {
+ double[] newVals = Arrays.copyOf(vals, vals.length + nCol);
+ System.arraycopy(tupleNew, 0, newVals, vals.length, nCol);
+ newDict = new DeltaDictionary(newVals, nCol);
+ idNew = nEntries;
+ }
+
+ AMapToData newData = MapToFactory.create(_data.size(), Math.max(_data.getUpperBoundValue(), idNew) + 1);
+ for(int i = 0; i < _data.size(); i++)
+ newData.set(i, _data.getIndex(i));
+ newData.set(0, idNew);
+
+ return create(_colIndexes, newDict, newData, null);
+ }
+ }
+
+ @Override
+ public AColGroup unaryOperation(UnaryOperator op) {
+ AColGroup ddc = convertToDDC();
+ return ddc.unaryOperation(op);
+ }
+
+ @Override
+ public void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock result, int rl, int ru, int cl, int cu) {
+ throw new NotImplementedException("Left matrix multiplication not supported for DeltaDDC");
+ }
+
+ @Override
+ public void rightDecompressingMult(MatrixBlock right, MatrixBlock ret, int rl, int ru, int nRows, int crl, int cru) {
+ throw new NotImplementedException("Right matrix multiplication not supported for DeltaDDC");
+ }
+
+ @Override
+ public void preAggregateDense(MatrixBlock m, double[] preAgg, int rl, int ru, int cl, int cu) {
+ throw new NotImplementedException("Pre-aggregate dense not supported for DeltaDDC");
+ }
+
+ @Override
+ public void preAggregateSparse(SparseBlock sb, double[] preAgg, int rl, int ru, int cl, int cu) {
+ throw new NotImplementedException("Pre-aggregate sparse not supported for DeltaDDC");
+ }
+
+ @Override
+ public void preAggregateThatDDCStructure(ColGroupDDC that, Dictionary ret) {
+ throw new NotImplementedException("Pre-aggregate DDC structure not supported for DeltaDDC");
+ }
+
+ @Override
+ public void preAggregateThatSDCZerosStructure(ColGroupSDCZeros that, Dictionary ret) {
+ throw new NotImplementedException("Pre-aggregate SDCZeros structure not supported for DeltaDDC");
+ }
+
+ @Override
+ public void preAggregateThatSDCSingleZerosStructure(ColGroupSDCSingleZeros that, Dictionary ret) {
+ throw new NotImplementedException("Pre-aggregate SDCSingleZeros structure not supported for DeltaDDC");
+ }
+
+ @Override
+ protected void preAggregateThatRLEStructure(ColGroupRLE that, Dictionary ret) {
+ throw new NotImplementedException("Pre-aggregate RLE structure not supported for DeltaDDC");
+ }
+
+ @Override
+ protected double computeMxx(double c, Builtin builtin) {
+ throw new NotImplementedException("Compute Min/Max not supported for DeltaDDC");
+ }
+
+ @Override
+ protected void computeColMxx(double[] c, Builtin builtin) {
+ throw new NotImplementedException("Compute Column Min/Max not supported for DeltaDDC");
+ }
+
+ @Override
+ protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru, double[] preAgg) {
+ throw new NotImplementedException("Compute Row Min/Max not supported for DeltaDDC");
+ }
+
+ @Override
+ protected void computeRowSums(double[] c, int rl, int ru, double[] preAgg) {
+ throw new NotImplementedException("Compute Row Sums not supported for DeltaDDC");
+ }
+
+ @Override
+ protected void computeRowProduct(double[] c, int rl, int ru, double[] preAgg) {
+ throw new NotImplementedException("Compute Row Product not supported for DeltaDDC");
+ }
+
+ @Override
+ public boolean containsValue(double pattern) {
+ throw new NotImplementedException("Contains value not supported for DeltaDDC");
+ }
+
+ @Override
+ public AColGroup append(AColGroup g) {
+ throw new NotImplementedException("Append not supported for DeltaDDC");
+ }
+
+ @Override
+ public AColGroup appendNInternal(AColGroup[] g, int blen, int rlen) {
+ throw new NotImplementedException("AppendN not supported for DeltaDDC");
+ }
+
+ @Override
+ public long getNumberNonZeros(int nRows) {
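+ // Non-zeros must be counted on the reconstructed absolute values, not on the deltas,
+ // so each row is materialized via a running prefix sum.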
+ long nnz = 0;
+ final int nCol = _colIndexes.size();
+ final double[] prevRow = new double[nCol];
+
+ for(int i = 0; i < nRows; i++) {
+ final int dictIdx = _data.getIndex(i);
+ final double[] vals = _dict.getValues();
+ final int rowIndex = dictIdx * nCol;
+
+ if(i == 0) {
+ for(int j = 0; j < nCol; j++) {
+ double val = vals[rowIndex + j];
+ prevRow[j] = val;
+ if(val != 0)
+ nnz++;
+ }
+ }
+ else {
+ for(int j = 0; j < nCol; j++) {
+ double val = prevRow[j] + vals[rowIndex + j];
+ prevRow[j] = val;
+ if(val != 0)
+ nnz++;
+ }
+ }
+ }
+ return nnz;
+ }
+
+ @Override
+ public AColGroup sliceRows(int rl, int ru) {
+ final int nCol = _colIndexes.size();
+ double[] firstRowValues = new double[nCol];
+ double[] dictVals = ((DeltaDictionary)_dict).getValues();
+
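+ // Materialize the absolute values of row rl by accumulating deltas from row 0; the slice must start with an absolute (non-delta) first row.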
+ for(int i = 0; i <= rl; i++) {
+ int dictIdx = _data.getIndex(i);
+ int dictOffset = dictIdx * nCol;
+ if(i == 0) {
+ for(int j = 0; j < nCol; j++) firstRowValues[j] = dictVals[dictOffset + j];
+ } else {
+ for(int j = 0; j < nCol; j++) firstRowValues[j] += dictVals[dictOffset + j];
+ }
+ }
+
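+ // Look for an existing dictionary tuple equal to the materialized first row; if none exists it is appended below.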
+ int nEntries = dictVals.length / nCol;
+ int newId = -1;
+ for(int k = 0; k < nEntries; k++) {
+ boolean match = true;
+ for(int j = 0; j < nCol; j++) {
+ if(dictVals[k * nCol + j] != firstRowValues[j]) {
+ match = false;
+ break;
+ }
+ }
+ if(match) {
+ newId = k;
+ break;
+ }
+ }
+
+ IDictionary newDict = _dict;
+ if(newId == -1) {
+ double[] newDictVals = Arrays.copyOf(dictVals, dictVals.length + nCol);
+ System.arraycopy(firstRowValues, 0, newDictVals, dictVals.length, nCol);
+ newDict = new DeltaDictionary(newDictVals, nCol);
+ newId = nEntries;
+ }
+
+ int numRows = ru - rl;
+ AMapToData slicedData = MapToFactory.create(numRows, Math.max(_data.getUpperBoundValue(), newId) + 1);
+ for(int i = 0; i < numRows; i++)
+ slicedData.set(i, _data.getIndex(rl + i));
+
+ slicedData.set(0, newId);
+ return ColGroupDeltaDDC.create(_colIndexes, newDict, slicedData, null);
+ }
+
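+ // Decompresses the delta encoding into absolute values and recompresses them as a standard DDC group.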
+ private AColGroup convertToDDC() {
+ final int nCol = _colIndexes.size();
+ final int nRow = _data.size();
+ double[] values = new double[nRow * nCol];
+
+ double[] prevRow = new double[nCol];
+ for(int i = 0; i < nRow; i++) {
+ final int dictIdx = _data.getIndex(i);
+ final double[] dictVals = _dict.getValues();
+ final int rowIndex = dictIdx * nCol;
+
+ for(int j = 0; j < nCol; j++) {
+ if(i == 0) {
+ prevRow[j] = dictVals[rowIndex + j];
+ }
+ else {
+ prevRow[j] = prevRow[j] + dictVals[rowIndex + j];
+ }
+ values[i * nCol + j] = prevRow[j];
+ }
+ }
+
+ return compress(values, _colIndexes);
+ }
+
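+ // Re-compresses materialized absolute values into a CONST or DDC group; used by convertToDDC.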
+ private static AColGroup compress(double[] values, IColIndex colIndexes) {
+ int nRow = values.length / colIndexes.size();
+ int nCol = colIndexes.size();
+
+ if(nCol == 1) {
+ DoubleCountHashMap map = new DoubleCountHashMap(16);
+ AMapToData mapData = MapToFactory.create(nRow, 256);
+ for(int i = 0; i < nRow; i++) {
+ int id = map.increment(values[i]);
+ if(id >= mapData.getUpperBoundValue()) {
+ mapData = mapData.resize(Math.max(mapData.getUpperBoundValue() * 2, id + 1));
+ }
+ mapData.set(i, id);
+ }
+ if(map.size() == 1)
+ return ColGroupConst.create(colIndexes, Dictionary.create(new double[] {map.getMostFrequent()}));
+
+ IDictionary dict = Dictionary.create(map.getDictionary());
+ return ColGroupDDC.create(colIndexes, dict, mapData.resize(map.size()), null);
+ }
+ else {
+ DblArrayCountHashMap map = new DblArrayCountHashMap(16);
+ AMapToData mapData = MapToFactory.create(nRow, 256);
+ DblArray dblArray = new DblArray(new double[nCol]);
+ for(int i = 0; i < nRow; i++) {
+ System.arraycopy(values, i * nCol, dblArray.getData(), 0, nCol);
+ int id = map.increment(dblArray);
+ if(id >= mapData.getUpperBoundValue()) {
+ mapData = mapData.resize(Math.max(mapData.getUpperBoundValue() * 2, id + 1));
+ }
+ mapData.set(i, id);
+ }
+ if(map.size() == 1) {
+ ACount[] counts = map.extractValues();
+ return ColGroupConst.create(colIndexes, Dictionary.create(counts[0].key().getData()));
+ }
+
+ ACount[] counts = map.extractValues();
+ Arrays.sort(counts, Comparator.comparingInt(x -> x.id));
+
+ double[] dictValues = new double[counts.length * nCol];
+ for(int i = 0; i < counts.length; i++) {
+ System.arraycopy(counts[i].key().getData(), 0, dictValues, i * nCol, nCol);
+ }
+
+ IDictionary dict = Dictionary.create(dictValues);
+ return ColGroupDDC.create(colIndexes, dict, mapData.resize(map.size()), null);
+ }
+ }
}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java
index ba547a8d7aa..6d7872fce54 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java
@@ -49,7 +49,7 @@
import org.apache.sysds.runtime.data.SparseBlockMCSR;
import org.apache.sysds.runtime.functionobjects.Builtin;
import org.apache.sysds.runtime.functionobjects.ValueFunction;
-import org.apache.sysds.runtime.instructions.cp.CM_COV_Object;
+import org.apache.sysds.runtime.instructions.cp.CmCovObject;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
import org.apache.sysds.runtime.matrix.operators.CMOperator;
@@ -303,8 +303,8 @@ protected double[] preAggBuiltinRows(Builtin builtin) {
}
@Override
- public CM_COV_Object centralMoment(CMOperator op, int nRows) {
- CM_COV_Object ret = new CM_COV_Object();
+ public CmCovObject centralMoment(CMOperator op, int nRows) {
+ CmCovObject ret = new CmCovObject();
op.fn.execute(ret, 0.0, nRows);
return ret;
}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
index c6a098f5c32..273df9ff26f 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
@@ -38,6 +38,7 @@
import org.apache.sysds.runtime.compress.bitmap.ABitmap;
import org.apache.sysds.runtime.compress.bitmap.BitmapEncoder;
import org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType;
+import org.apache.sysds.runtime.compress.colgroup.dictionary.DeltaDictionary;
import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory;
import org.apache.sysds.runtime.compress.colgroup.dictionary.IDictionary;
@@ -288,6 +289,12 @@ else if((ct == CompressionType.SDC || ct == CompressionType.CONST) //
else if(ct == CompressionType.DDC) {
return directCompressDDC(colIndexes, cg);
}
+ else if(ct == CompressionType.DeltaDDC) {
+ return directCompressDeltaDDC(colIndexes, cg);
+ }
+ else if(ct == CompressionType.CONST && cs.preferDeltaEncoding) {
+ return directCompressDeltaDDC(colIndexes, cg);
+ }
else if(ct == CompressionType.LinearFunctional) {
if(cs.scaleFactors != null) {
throw new NotImplementedException(); // quantization-fused compression NOT allowed
@@ -684,6 +691,129 @@ private AColGroup directCompressDDCMultiCol(IColIndex colIndexes, CompressedSize
return ColGroupDDC.create(colIndexes, dict, resData, null);
}
+ private AColGroup directCompressDeltaDDC(IColIndex colIndexes, CompressedSizeInfoColGroup cg) throws Exception {
+ if(cs.transposed) {
+ throw new NotImplementedException("Delta encoding for transposed matrices not yet implemented");
+ }
+ if(cs.scaleFactors != null) {
+ throw new NotImplementedException("Delta encoding with quantization not yet implemented");
+ }
+
+ if(colIndexes.size() > 1) {
+ return directCompressDeltaDDCMultiCol(colIndexes, cg);
+ }
+ else {
+ return directCompressDeltaDDCSingleCol(colIndexes, cg);
+ }
+ }
+
+ private AColGroup directCompressDeltaDDCSingleCol(IColIndex colIndexes, CompressedSizeInfoColGroup cg) {
+ final AMapToData d = MapToFactory.create(nRow, Math.max(Math.min(cg.getNumOffs() + 1, nRow), 126));
+ final DoubleCountHashMap map = new DoubleCountHashMap(cg.getNumVals());
+
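+ // The delta reader returns the first row as-is and every later row as the difference to its predecessor.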
+ ReaderColumnSelection reader = ReaderColumnSelection.createDeltaReader(in, colIndexes, cs.transposed, 0, nRow);
+ DblArray cellVals = reader.nextRow();
+ int r = 0;
+ while(r < nRow && cellVals != null) {
+ final int row = reader.getCurrentRowIndex();
+ if(row == r) {
+ final double val = cellVals.getData()[0];
+ final int id = map.increment(val);
+ d.set(row, id);
+ cellVals = reader.nextRow();
+ r++;
+ }
+ else {
+ r = row;
+ }
+ }
+
+ if(map.size() == 0)
+ return new ColGroupEmpty(colIndexes);
+
+ final double[] dictValues = map.getDictionary();
+ IDictionary dict = new DeltaDictionary(dictValues, 1);
+
+ final int nUnique = map.size();
+ final AMapToData resData = d.resize(nUnique);
+ return ColGroupDeltaDDC.create(colIndexes, dict, resData, null);
+ }
+
+ private AColGroup directCompressDeltaDDCMultiCol(IColIndex colIndexes, CompressedSizeInfoColGroup cg) throws Exception {
+ final AMapToData d = MapToFactory.create(nRow, Math.max(Math.min(cg.getNumOffs() + 1, nRow), 126));
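+ // Pre-fill the mapping with a sentinel id so rows skipped by the reader (all-zero delta rows) can be detected afterwards.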
+ final int fill = d.getUpperBoundValue();
+ d.fill(fill);
+
+ final DblArrayCountHashMap map = new DblArrayCountHashMap(Math.max(cg.getNumVals(), 64));
+ boolean extra;
+ if(nRow < CompressionSettings.PAR_DDC_THRESHOLD || k < csi.getNumberColGroups() || pool == null) {
+ extra = readToMapDeltaDDC(colIndexes, map, d, 0, nRow, fill);
+ }
+ else {
+ throw new NotImplementedException("Parallel delta DDC compression not yet implemented");
+ }
+
+ if(map.size() == 0)
+ return new ColGroupEmpty(colIndexes);
+
+ final ACount[] vals = map.extractValues();
+ final int nVals = vals.length;
+ final int nTuplesOut = nVals + (extra ? 1 : 0);
+ final double[] dictValues = new double[nTuplesOut * colIndexes.size()];
+ final int[] oldIdToNewId = new int[map.size()];
+ int idx = 0;
+ for(int i = 0; i < nVals; i++) {
+ final ACount dac = vals[i];
+ final double[] arrData = dac.key().getData();
+ System.arraycopy(arrData, 0, dictValues, idx, colIndexes.size());
+ oldIdToNewId[dac.id] = i;
+ idx += colIndexes.size();
+ }
+ IDictionary dict = new DeltaDictionary(dictValues, colIndexes.size());
+
+ if(extra)
+ d.replace(fill, map.size());
+ final int nUnique = map.size() + (extra ? 1 : 0);
+ final AMapToData resData = d.resize(nUnique);
+ for(int i = 0; i < nRow; i++) {
+ final int oldId = resData.getIndex(i);
+ if(extra && oldId == map.size()) {
+ resData.set(i, nVals);
+ }
+ else if(oldId < oldIdToNewId.length) {
+ resData.set(i, oldIdToNewId[oldId]);
+ }
+ }
+ return ColGroupDeltaDDC.create(colIndexes, dict, resData, null);
+ }
+
+ private boolean readToMapDeltaDDC(IColIndex colIndexes, DblArrayCountHashMap map, AMapToData data, int rl, int ru,
+ int fill) {
+ ReaderColumnSelection reader = ReaderColumnSelection.createDeltaReader(in, colIndexes, cs.transposed, rl, ru);
+
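+ // Rows the reader skips carry an all-zero delta; report them via the returned 'extra' flag so the caller can add an empty tuple.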
+ DblArray cellVals = reader.nextRow();
+ boolean extra = false;
+ int r = rl;
+ while(r < ru && cellVals != null) {
+ final int row = reader.getCurrentRowIndex();
+ if(row == r) {
+ final int id = map.increment(cellVals);
+ data.set(row, id);
+ cellVals = reader.nextRow();
+ r++;
+ }
+ else {
+ r = row;
+ extra = true;
+ }
+ }
+
+ if(r < ru)
+ extra = true;
+
+ return extra;
+ }
+
private boolean readToMapDDC(IColIndex colIndexes, DblArrayCountHashMap map, AMapToData data, int rl, int ru,
int fill) {
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupIO.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupIO.java
index 91442281317..b47100d4e64 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupIO.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupIO.java
@@ -105,10 +105,12 @@ public static AColGroup readColGroup(DataInput in, int nRows) throws IOException
switch(ctype) {
case DDC:
return ColGroupDDC.read(in);
- case DDCFOR:
- return ColGroupDDCFOR.read(in);
- case OLE:
- return ColGroupOLE.read(in, nRows);
+ case DDCFOR:
+ return ColGroupDDCFOR.read(in);
+ case DeltaDDC:
+ return ColGroupDeltaDDC.read(in);
+ case OLE:
+ return ColGroupOLE.read(in, nRows);
case RLE:
return ColGroupRLE.read(in, nRows);
case CONST:
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupLinearFunctional.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupLinearFunctional.java
index 45b4fbeb026..4e9fffaf718 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupLinearFunctional.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupLinearFunctional.java
@@ -40,7 +40,7 @@
import org.apache.sysds.runtime.functionobjects.Minus;
import org.apache.sysds.runtime.functionobjects.Multiply;
import org.apache.sysds.runtime.functionobjects.Plus;
-import org.apache.sysds.runtime.instructions.cp.CM_COV_Object;
+import org.apache.sysds.runtime.instructions.cp.CmCovObject;
import org.apache.sysds.runtime.matrix.data.LibMatrixMult;
import org.apache.sysds.runtime.matrix.data.LibMatrixReorg;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
@@ -618,7 +618,7 @@ public long estimateInMemorySize() {
}
@Override
- public CM_COV_Object centralMoment(CMOperator op, int nRows) {
+ public CmCovObject centralMoment(CMOperator op, int nRows) {
throw new NotImplementedException();
}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java
index 1270823bfdc..4340637a737 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java
@@ -50,7 +50,7 @@
import org.apache.sysds.runtime.compress.utils.Util;
import org.apache.sysds.runtime.data.SparseBlock;
import org.apache.sysds.runtime.functionobjects.Builtin;
-import org.apache.sysds.runtime.instructions.cp.CM_COV_Object;
+import org.apache.sysds.runtime.instructions.cp.CmCovObject;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
import org.apache.sysds.runtime.matrix.operators.CMOperator;
@@ -493,7 +493,7 @@ public AColGroup subtractDefaultTuple() {
}
@Override
- public CM_COV_Object centralMoment(CMOperator op, int nRows) {
+ public CmCovObject centralMoment(CMOperator op, int nRows) {
return _dict.centralMomentWithDefault(op.fn, getCounts(), _defaultTuple[0], nRows);
}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCFOR.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCFOR.java
index 41fb7ac5709..675c1120c38 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCFOR.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCFOR.java
@@ -51,7 +51,7 @@
import org.apache.sysds.runtime.functionobjects.Minus;
import org.apache.sysds.runtime.functionobjects.Multiply;
import org.apache.sysds.runtime.functionobjects.Plus;
-import org.apache.sysds.runtime.instructions.cp.CM_COV_Object;
+import org.apache.sysds.runtime.instructions.cp.CmCovObject;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
import org.apache.sysds.runtime.matrix.operators.CMOperator;
@@ -431,7 +431,7 @@ public AColGroup rexpandCols(int max, boolean ignore, boolean cast, int nRows) {
}
@Override
- public CM_COV_Object centralMoment(CMOperator op, int nRows) {
+ public CmCovObject centralMoment(CMOperator op, int nRows) {
// should be guaranteed to be one column therefore only one reference value.
return _dict.centralMomentWithReference(op.fn, getCounts(), _reference[0], nRows);
}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java
index fa5772c0c3e..a954f380a04 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java
@@ -48,7 +48,7 @@
import org.apache.sysds.runtime.compress.utils.IntArrayList;
import org.apache.sysds.runtime.compress.utils.Util;
import org.apache.sysds.runtime.functionobjects.Builtin;
-import org.apache.sysds.runtime.instructions.cp.CM_COV_Object;
+import org.apache.sysds.runtime.instructions.cp.CmCovObject;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
import org.apache.sysds.runtime.matrix.operators.CMOperator;
@@ -460,7 +460,7 @@ public long getNumberNonZeros(int nRows) {
}
@Override
- public CM_COV_Object centralMoment(CMOperator op, int nRows) {
+ public CmCovObject centralMoment(CMOperator op, int nRows) {
return _dict.centralMomentWithDefault(op.fn, getCounts(), _defaultTuple[0], nRows);
}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java
index 1c3bce2e16c..8d446575975 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java
@@ -56,7 +56,7 @@
import org.apache.sysds.runtime.functionobjects.ReduceAll;
import org.apache.sysds.runtime.functionobjects.ReduceRow;
import org.apache.sysds.runtime.functionobjects.ValueFunction;
-import org.apache.sysds.runtime.instructions.cp.CM_COV_Object;
+import org.apache.sysds.runtime.instructions.cp.CmCovObject;
import org.apache.sysds.runtime.matrix.data.LibMatrixMult;
import org.apache.sysds.runtime.matrix.data.LibMatrixReorg;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
@@ -949,7 +949,7 @@ public void computeColSums(double[] c, int nRows) {
}
@Override
- public CM_COV_Object centralMoment(CMOperator op, int nRows) {
+ public CmCovObject centralMoment(CMOperator op, int nRows) {
return _data.cmOperations(op);
}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressedArray.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressedArray.java
index 31e29341645..08cbab30bcc 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressedArray.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressedArray.java
@@ -28,7 +28,7 @@
import org.apache.sysds.runtime.data.SparseBlock;
import org.apache.sysds.runtime.data.SparseBlockMCSR;
import org.apache.sysds.runtime.frame.data.columns.Array;
-import org.apache.sysds.runtime.instructions.cp.CM_COV_Object;
+import org.apache.sysds.runtime.instructions.cp.CmCovObject;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
import org.apache.sysds.runtime.matrix.operators.AggregateUnaryOperator;
import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
@@ -203,7 +203,7 @@ public void computeColSums(double[] c, int nRows) {
}
@Override
- public CM_COV_Object centralMoment(CMOperator op, int nRows) {
+ public CmCovObject centralMoment(CMOperator op, int nRows) {
throw new UnsupportedOperationException("Unimplemented method 'centralMoment'");
}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java
index 31f2c9fb3c4..0b768ef4a2a 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java
@@ -26,7 +26,7 @@
import org.apache.sysds.runtime.data.SparseBlock;
import org.apache.sysds.runtime.functionobjects.Builtin;
import org.apache.sysds.runtime.functionobjects.ValueFunction;
-import org.apache.sysds.runtime.instructions.cp.CM_COV_Object;
+import org.apache.sysds.runtime.instructions.cp.CmCovObject;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
@@ -40,16 +40,16 @@ public abstract class ADictionary implements IDictionary, Serializable {
public abstract IDictionary clone();
- public final CM_COV_Object centralMoment(ValueFunction fn, int[] counts, int nRows) {
- return centralMoment(new CM_COV_Object(), fn, counts, nRows);
+ public final CmCovObject centralMoment(ValueFunction fn, int[] counts, int nRows) {
+ return centralMoment(new CmCovObject(), fn, counts, nRows);
}
- public final CM_COV_Object centralMomentWithDefault(ValueFunction fn, int[] counts, double def, int nRows) {
- return centralMomentWithDefault(new CM_COV_Object(), fn, counts, def, nRows);
+ public final CmCovObject centralMomentWithDefault(ValueFunction fn, int[] counts, double def, int nRows) {
+ return centralMomentWithDefault(new CmCovObject(), fn, counts, def, nRows);
}
- public final CM_COV_Object centralMomentWithReference(ValueFunction fn, int[] counts, double reference, int nRows) {
- return centralMomentWithReference(new CM_COV_Object(), fn, counts, reference, nRows);
+ public final CmCovObject centralMomentWithReference(ValueFunction fn, int[] counts, double reference, int nRows) {
+ return centralMomentWithReference(new CmCovObject(), fn, counts, reference, nRows);
}
@Override
@@ -144,7 +144,7 @@ public void productWithReference(double[] ret, int[] counts, double[] reference,
}
@Override
- public CM_COV_Object centralMoment(CM_COV_Object ret, ValueFunction fn, int[] counts, int nRows) {
+ public CmCovObject centralMoment(CmCovObject ret, ValueFunction fn, int[] counts, int nRows) {
return getMBDict().centralMoment(ret, fn, counts, nRows);
}
@@ -154,13 +154,13 @@ public double getSparsity() {
}
@Override
- public CM_COV_Object centralMomentWithDefault(CM_COV_Object ret, ValueFunction fn, int[] counts, double def,
+ public CmCovObject centralMomentWithDefault(CmCovObject ret, ValueFunction fn, int[] counts, double def,
int nRows) {
return getMBDict().centralMomentWithDefault(ret, fn, counts, def, nRows);
}
@Override
- public CM_COV_Object centralMomentWithReference(CM_COV_Object ret, ValueFunction fn, int[] counts, double reference,
+ public CmCovObject centralMomentWithReference(CmCovObject ret, ValueFunction fn, int[] counts, double reference,
int nRows) {
return getMBDict().centralMomentWithReference(ret, fn, counts, reference, nRows);
}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DeltaDictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DeltaDictionary.java
index d67ab95f824..d667e76ed5e 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DeltaDictionary.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DeltaDictionary.java
@@ -19,14 +19,13 @@
package org.apache.sysds.runtime.compress.colgroup.dictionary;
+import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.commons.lang3.NotImplementedException;
import org.apache.sysds.runtime.functionobjects.Divide;
-import org.apache.sysds.runtime.functionobjects.Minus;
import org.apache.sysds.runtime.functionobjects.Multiply;
-import org.apache.sysds.runtime.functionobjects.Plus;
import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
/**
@@ -50,26 +49,22 @@ public double[] getValues(){
return _values;
}
+ @Override
+ public double getValue(int i, int col, int nCol) {
+ return _values[i * nCol + col];
+ }
+
@Override
public DeltaDictionary applyScalarOp(ScalarOperator op) {
- final double[] retV = new double[_values.length];
if(op.fn instanceof Multiply || op.fn instanceof Divide) {
+ final double[] retV = new double[_values.length];
for(int i = 0; i < _values.length; i++)
retV[i] = op.executeScalar(_values[i]);
+ return new DeltaDictionary(retV, _numCols);
}
- else if(op.fn instanceof Plus || op.fn instanceof Minus) {
- // With Plus and Minus only the first row needs to be updated when delta encoded
- for(int i = 0; i < _values.length; i++) {
- if(i < _numCols)
- retV[i] = op.executeScalar(_values[i]);
- else
- retV[i] = _values[i];
- }
+ else {
+ throw new NotImplementedException("Scalar op " + op.fn.getClass().getSimpleName() + " not supported in DeltaDictionary");
}
- else
- throw new NotImplementedException();
-
- return new DeltaDictionary(retV, _numCols);
}
@Override
@@ -79,17 +74,30 @@ public long getInMemorySize() {
@Override
public void write(DataOutput out) throws IOException {
- throw new NotImplementedException();
+ out.writeByte(DictionaryFactory.Type.DELTA_DICT.ordinal());
+ out.writeInt(_numCols);
+ out.writeInt(_values.length);
+ for(int i = 0; i < _values.length; i++)
+ out.writeDouble(_values[i]);
+ }
+
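+ // Counterpart to write(); the leading dictionary-type byte has already been consumed by DictionaryFactory.read.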
+ public static DeltaDictionary read(DataInput in) throws IOException {
+ int numCols = in.readInt();
+ int numValues = in.readInt();
+ double[] values = new double[numValues];
+ for(int i = 0; i < numValues; i++)
+ values[i] = in.readDouble();
+ return new DeltaDictionary(values, numCols);
}
@Override
public long getExactSizeOnDisk() {
- throw new NotImplementedException();
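+ // 1 byte dictionary type + 4 bytes numCols + 4 bytes value count + 8 bytes per double value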
+ return 1 + 4 + 4 + 8L * _values.length;
}
@Override
public DictType getDictType() {
- throw new NotImplementedException();
+ return DictType.Delta;
}
@Override
@@ -104,12 +112,19 @@ public int getNumberOfColumns(int nrow){
@Override
public String getString(int colIndexes) {
- throw new NotImplementedException();
+ StringBuilder sb = new StringBuilder();
+ for(int i = 0; i < _values.length; i++) {
+ sb.append(_values[i]);
+ if(i != _values.length - 1) {
+ sb.append((i + 1) % colIndexes == 0 ? "\n" : ", ");
+ }
+ }
+ return sb.toString();
}
@Override
public long getNumberNonZeros(int[] counts, int nCol) {
- throw new NotImplementedException();
+ throw new NotImplementedException("Cannot calculate non-zeros from DeltaDictionary alone");
}
@Override
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java
index 939b48bf424..e94cbd7c570 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java
@@ -37,7 +37,7 @@
import org.apache.sysds.runtime.functionobjects.Multiply;
import org.apache.sysds.runtime.functionobjects.Plus;
import org.apache.sysds.runtime.functionobjects.ValueFunction;
-import org.apache.sysds.runtime.instructions.cp.CM_COV_Object;
+import org.apache.sysds.runtime.instructions.cp.CmCovObject;
import org.apache.sysds.runtime.matrix.data.LibMatrixMult;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
@@ -1078,7 +1078,7 @@ else if(!Double.isInfinite(ret[0]))
}
@Override
- public CM_COV_Object centralMoment(CM_COV_Object ret, ValueFunction fn, int[] counts, int nRows) {
+ public CmCovObject centralMoment(CmCovObject ret, ValueFunction fn, int[] counts, int nRows) {
// should be guaranteed to only contain one value per tuple in dictionary.
for(int i = 0; i < _values.length; i++)
fn.execute(ret, _values[i], counts[i]);
@@ -1089,7 +1089,7 @@ public CM_COV_Object centralMoment(CM_COV_Object ret, ValueFunction fn, int[] co
}
@Override
- public CM_COV_Object centralMomentWithDefault(CM_COV_Object ret, ValueFunction fn, int[] counts, double def,
+ public CmCovObject centralMomentWithDefault(CmCovObject ret, ValueFunction fn, int[] counts, double def,
int nRows) {
// should be guaranteed to only contain one value per tuple in dictionary.
for(int i = 0; i < _values.length; i++)
@@ -1101,7 +1101,7 @@ public CM_COV_Object centralMomentWithDefault(CM_COV_Object ret, ValueFunction f
}
@Override
- public CM_COV_Object centralMomentWithReference(CM_COV_Object ret, ValueFunction fn, int[] counts, double reference,
+ public CmCovObject centralMomentWithReference(CmCovObject ret, ValueFunction fn, int[] counts, double reference,
int nRows) {
// should be guaranteed to only contain one value per tuple in dictionary.
for(int i = 0; i < _values.length; i++)
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DictionaryFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DictionaryFactory.java
index f88ac99b87b..005d14f9ce1 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DictionaryFactory.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DictionaryFactory.java
@@ -52,7 +52,7 @@ public interface DictionaryFactory {
static final Log LOG = LogFactory.getLog(DictionaryFactory.class.getName());
public enum Type {
- FP64_DICT, MATRIX_BLOCK_DICT, INT8_DICT, IDENTITY, IDENTITY_SLICE, PLACE_HOLDER
+ FP64_DICT, MATRIX_BLOCK_DICT, INT8_DICT, IDENTITY, IDENTITY_SLICE, PLACE_HOLDER, DELTA_DICT
}
public static IDictionary read(DataInput in) throws IOException {
@@ -68,6 +68,8 @@ public static IDictionary read(DataInput in) throws IOException {
return IdentityDictionary.read(in);
case IDENTITY_SLICE:
return IdentityDictionarySlice.read(in);
+ case DELTA_DICT:
+ return DeltaDictionary.read(in);
case MATRIX_BLOCK_DICT:
default:
return MatrixBlockDictionary.read(in);
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/IDictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/IDictionary.java
index dddea0eec7a..49330ba2748 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/IDictionary.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/IDictionary.java
@@ -29,7 +29,7 @@
import org.apache.sysds.runtime.data.SparseBlock;
import org.apache.sysds.runtime.functionobjects.Builtin;
import org.apache.sysds.runtime.functionobjects.ValueFunction;
-import org.apache.sysds.runtime.instructions.cp.CM_COV_Object;
+import org.apache.sysds.runtime.instructions.cp.CmCovObject;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
@@ -700,7 +700,7 @@ public IDictionary preaggValuesFromDense(final int numVals, final IColIndex colI
* @param nRows The number of rows in total of the column group
* @return The central moment Object
*/
- public CM_COV_Object centralMoment(ValueFunction fn, int[] counts, int nRows);
+ public CmCovObject centralMoment(ValueFunction fn, int[] counts, int nRows);
/**
* Central moment function to calculate the central moment of this column group. MUST be on a single column
@@ -712,7 +712,7 @@ public IDictionary preaggValuesFromDense(final int numVals, final IColIndex colI
* @param nRows The number of rows in total of the column group
* @return The central moment Object
*/
- public CM_COV_Object centralMoment(CM_COV_Object ret, ValueFunction fn, int[] counts, int nRows);
+ public CmCovObject centralMoment(CmCovObject ret, ValueFunction fn, int[] counts, int nRows);
/**
* Central moment function to calculate the central moment of this column group with a default offset on all missing
@@ -724,7 +724,7 @@ public IDictionary preaggValuesFromDense(final int numVals, final IColIndex colI
* @param nRows The number of rows in total of the column group
* @return The central moment Object
*/
- public CM_COV_Object centralMomentWithDefault(ValueFunction fn, int[] counts, double def, int nRows);
+ public CmCovObject centralMomentWithDefault(ValueFunction fn, int[] counts, double def, int nRows);
/**
* Central moment function to calculate the central moment of this column group with a default offset on all missing
@@ -737,7 +737,7 @@ public IDictionary preaggValuesFromDense(final int numVals, final IColIndex colI
* @param nRows The number of rows in total of the column group
* @return The central moment Object
*/
- public CM_COV_Object centralMomentWithDefault(CM_COV_Object ret, ValueFunction fn, int[] counts, double def,
+ public CmCovObject centralMomentWithDefault(CmCovObject ret, ValueFunction fn, int[] counts, double def,
int nRows);
/**
@@ -750,7 +750,7 @@ public CM_COV_Object centralMomentWithDefault(CM_COV_Object ret, ValueFunction f
* @param nRows The number of rows in total of the column group
* @return The central moment Object
*/
- public CM_COV_Object centralMomentWithReference(ValueFunction fn, int[] counts, double reference, int nRows);
+ public CmCovObject centralMomentWithReference(ValueFunction fn, int[] counts, double reference, int nRows);
/**
* Central moment function to calculate the central moment of this column group with a reference offset on each
@@ -763,7 +763,7 @@ public CM_COV_Object centralMomentWithDefault(CM_COV_Object ret, ValueFunction f
* @param nRows The number of rows in total of the column group
* @return The central moment Object
*/
- public CM_COV_Object centralMomentWithReference(CM_COV_Object ret, ValueFunction fn, int[] counts, double reference,
+ public CmCovObject centralMomentWithReference(CmCovObject ret, ValueFunction fn, int[] counts, double reference,
int nRows);
/**
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java
index 24776f3adc4..71a4112f157 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java
@@ -51,7 +51,7 @@
import org.apache.sysds.runtime.functionobjects.Minus;
import org.apache.sysds.runtime.functionobjects.Plus;
import org.apache.sysds.runtime.functionobjects.ValueFunction;
-import org.apache.sysds.runtime.instructions.cp.CM_COV_Object;
+import org.apache.sysds.runtime.instructions.cp.CmCovObject;
import org.apache.sysds.runtime.matrix.data.LibMatrixAgg;
import org.apache.sysds.runtime.matrix.data.LibMatrixBincell;
import org.apache.sysds.runtime.matrix.data.LibMatrixReorg;
@@ -2425,7 +2425,7 @@ else if(!Double.isInfinite(ret[0]))
}
@Override
- public CM_COV_Object centralMoment(CM_COV_Object ret, ValueFunction fn, int[] counts, int nRows) {
+ public CmCovObject centralMoment(CmCovObject ret, ValueFunction fn, int[] counts, int nRows) {
// should be guaranteed to only contain one value per tuple in dictionary.
if(_data.isInSparseFormat())
throw new DMLCompressionException("The dictionary should not be sparse with one column");
@@ -2438,7 +2438,7 @@ public CM_COV_Object centralMoment(CM_COV_Object ret, ValueFunction fn, int[] co
}
@Override
- public CM_COV_Object centralMomentWithDefault(CM_COV_Object ret, ValueFunction fn, int[] counts, double def,
+ public CmCovObject centralMomentWithDefault(CmCovObject ret, ValueFunction fn, int[] counts, double def,
int nRows) {
// should be guaranteed to only contain one value per tuple in dictionary.
if(_data.isInSparseFormat())
@@ -2453,7 +2453,7 @@ public CM_COV_Object centralMomentWithDefault(CM_COV_Object ret, ValueFunction f
}
@Override
- public CM_COV_Object centralMomentWithReference(CM_COV_Object ret, ValueFunction fn, int[] counts, double reference,
+ public CmCovObject centralMomentWithReference(CmCovObject ret, ValueFunction fn, int[] counts, double reference,
int nRows) {
// should be guaranteed to only contain one value per tuple in dictionary.
if(_data.isInSparseFormat())
diff --git a/src/main/java/org/apache/sysds/runtime/compress/estim/AComEst.java b/src/main/java/org/apache/sysds/runtime/compress/estim/AComEst.java
index 2dce0bafe4e..ef7981e941b 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/estim/AComEst.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/estim/AComEst.java
@@ -197,7 +197,10 @@ public final CompressedSizeInfoColGroup combine(IColIndex combinedColumns, Compr
return null; // This combination is clearly not a good idea; return null to indicate that.
else if(g1.getMap() == null || g2.getMap() == null)
// the previous information did not contain maps, therefore fall back to extract from sample
- return getColGroupInfo(combinedColumns, Math.max(g1V, g2V), (int) max);
+ if(_cs.preferDeltaEncoding)
+ return getDeltaColGroupInfo(combinedColumns, Math.max(g1V, g2V), (int) max);
+ else
+ return getColGroupInfo(combinedColumns, Math.max(g1V, g2V), (int) max);
else // Default combine the previous subject to max value calculated.
return combine(combinedColumns, g1, g2, (int) max);
}
@@ -254,8 +257,12 @@ private List<CompressedSizeInfoColGroup> CompressedSizeInfoColGroupSingleThread(
List<CompressedSizeInfoColGroup> ret = new ArrayList<>(clen);
if(!_cs.transposed && !_data.isEmpty() && _data.isInSparseFormat())
nnzCols = LibMatrixReorg.countNnzPerColumn(_data);
- for(int col = 0; col < clen; col++)
- ret.add(getColGroupInfo(new SingleIndex(col)));
+ for(int col = 0; col < clen; col++) {
+ if(_cs.preferDeltaEncoding)
+ ret.add(getDeltaColGroupInfo(new SingleIndex(col)));
+ else
+ ret.add(getColGroupInfo(new SingleIndex(col)));
+ }
return ret;
}
@@ -286,9 +293,14 @@ private List<CompressedSizeInfoColGroup> CompressedSizeInfoColGroupParallel(int
for(int col = 0; col < clen; col += blkz) {
final int start = col;
final int end = Math.min(clen, col + blkz);
+ final boolean useDelta = _cs.preferDeltaEncoding;
tasks.add(pool.submit(() -> {
- for(int c = start; c < end; c++)
- res[c] = getColGroupInfo(new SingleIndex(c));
+ for(int c = start; c < end; c++) {
+ if(useDelta)
+ res[c] = getDeltaColGroupInfo(new SingleIndex(c));
+ else
+ res[c] = getColGroupInfo(new SingleIndex(c));
+ }
return null;
}));
}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfoColGroup.java b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfoColGroup.java
index 963a044d14f..df353931c0b 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfoColGroup.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfoColGroup.java
@@ -255,7 +255,11 @@ private static double getCompressionSize(IColIndex cols, CompressionType ct, Est
case LinearFunctional:
return ColGroupSizes.estimateInMemorySizeLinearFunctional(numCols, contiguousColumns);
case DeltaDDC:
- throw new NotImplementedException();
+ // DeltaDDC has the same size estimation as DDC since it uses the same structure
+ // The delta encoding is just a different way of interpreting the data
+ nv = fact.numVals + (fact.numOffs < fact.numRows ? 1 : 0);
+ return ColGroupSizes.estimateInMemorySizeDDC(numCols, contiguousColumns, nv, fact.numRows,
+ fact.tupleSparsity, fact.lossy);
case DDC:
nv = fact.numVals + (fact.numOffs < fact.numRows ? 1 : 0);
return ColGroupSizes.estimateInMemorySizeDDC(numCols, contiguousColumns, nv, fact.numRows,
diff --git a/src/main/java/org/apache/sysds/runtime/compress/estim/encoding/EncodingFactory.java b/src/main/java/org/apache/sysds/runtime/compress/estim/encoding/EncodingFactory.java
index b196da658c3..efe1e14c47a 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/estim/encoding/EncodingFactory.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/estim/encoding/EncodingFactory.java
@@ -88,9 +88,8 @@ else if(rowCols.size() == 1) {
* @return A delta encoded encoding.
*/
public static IEncode createFromMatrixBlockDelta(MatrixBlock m, boolean transposed, IColIndex rowCols) {
- throw new NotImplementedException();
- // final int sampleSize = transposed ? m.getNumColumns() : m.getNumRows();
- // return createFromMatrixBlockDelta(m, transposed, rowCols, sampleSize);
+ final int sampleSize = transposed ? m.getNumColumns() : m.getNumRows();
+ return createFromMatrixBlockDelta(m, transposed, rowCols, sampleSize);
}
/**
@@ -107,7 +106,7 @@ public static IEncode createFromMatrixBlockDelta(MatrixBlock m, boolean transpos
*/
public static IEncode createFromMatrixBlockDelta(MatrixBlock m, boolean transposed, IColIndex rowCols,
int sampleSize) {
- throw new NotImplementedException();
+ return createWithDeltaReader(m, rowCols, transposed, sampleSize);
}
/**
@@ -691,4 +690,68 @@ private static IEncode createWithReaderSparse(MatrixBlock m, DblArrayCountHashMa
public static SparseEncoding createSparse(AMapToData map, AOffset off, int nRows) {
return new SparseEncoding(map, off, nRows);
}
+
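+ // Builds a sample encoding over delta values: pass one collects the distinct delta tuples and their offsets,
+ // pass two fills a dense or sparse mapping depending on how many rows are non-zero.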
+ private static IEncode createWithDeltaReader(MatrixBlock m, IColIndex rowCols, boolean transposed, int sampleSize) {
+ final int rl = 0;
+ final int ru = Math.min(sampleSize, transposed ? m.getNumColumns() : m.getNumRows());
+ final ReaderColumnSelection reader1 = ReaderColumnSelection.createDeltaReader(m, rowCols, transposed, rl, ru);
+ final DblArrayCountHashMap map = new DblArrayCountHashMap();
+ final IntArrayList offsets = new IntArrayList();
+ DblArray cellVals = reader1.nextRow();
+ boolean isFirstRow = true;
+
+ while(cellVals != null) {
+ map.increment(cellVals);
+ if(isFirstRow || !cellVals.isEmpty())
+ offsets.appendValue(reader1.getCurrentRowIndex());
+ isFirstRow = false;
+ cellVals = reader1.nextRow();
+ }
+
+ if(offsets.size() == 0)
+ return new EmptyEncoding();
+ else if(map.size() == 1 && offsets.size() == ru)
+ return new ConstEncoding(ru);
+
+ final ReaderColumnSelection reader2 = ReaderColumnSelection.createDeltaReader(m, rowCols, transposed, rl, ru);
+ if(offsets.size() < ru / 4)
+ return createWithDeltaReaderSparse(m, map, rowCols, offsets, ru, reader2);
+ else
+ return createWithDeltaReaderDense(m, map, rowCols, ru, offsets.size() < ru, reader2);
+ }
+
+ private static IEncode createWithDeltaReaderDense(MatrixBlock m, DblArrayCountHashMap map, IColIndex rowCols,
+ int nRows, boolean zero, ReaderColumnSelection reader2) {
+ final int unique = map.size() + (zero ? 1 : 0);
+ final AMapToData d = MapToFactory.create(nRows, unique);
+
+ DblArray cellVals;
+ if(zero)
+ while((cellVals = reader2.nextRow()) != null)
+ d.set(reader2.getCurrentRowIndex(), map.getId(cellVals) + 1);
+ else
+ while((cellVals = reader2.nextRow()) != null)
+ d.set(reader2.getCurrentRowIndex(), map.getId(cellVals));
+
+ return new DenseEncoding(d);
+ }
+
+ private static IEncode createWithDeltaReaderSparse(MatrixBlock m, DblArrayCountHashMap map, IColIndex rowCols,
+ IntArrayList offsets, int nRows, ReaderColumnSelection reader2) {
+ DblArray cellVals = reader2.nextRow();
+ final AMapToData d = MapToFactory.create(offsets.size(), map.size());
+
+ int i = 0;
+ boolean isFirstRow = true;
+ while(cellVals != null) {
+ if(isFirstRow || !cellVals.isEmpty()) {
+ d.set(i++, map.getId(cellVals));
+ }
+ isFirstRow = false;
+ cellVals = reader2.nextRow();
+ }
+
+ final AOffset o = OffsetFactory.createOffset(offsets);
+ return new SparseEncoding(d, o, nRows);
+ }
}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/io/WriterCompressed.java b/src/main/java/org/apache/sysds/runtime/compress/io/WriterCompressed.java
index c4d9db367bb..1ba0ba61d10 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/io/WriterCompressed.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/io/WriterCompressed.java
@@ -46,7 +46,7 @@
import org.apache.sysds.runtime.compress.lib.CLALibSeparator;
import org.apache.sysds.runtime.compress.lib.CLALibSeparator.SeparatedGroups;
import org.apache.sysds.runtime.compress.lib.CLALibSlice;
-import org.apache.sysds.runtime.controlprogram.parfor.LocalTaskQueue;
+import org.apache.sysds.runtime.instructions.ooc.OOCStream;
import org.apache.sysds.runtime.instructions.spark.CompressionSPInstruction.CompressionFunction;
import org.apache.sysds.runtime.instructions.spark.data.IndexedMatrixValue;
import org.apache.sysds.runtime.instructions.spark.utils.RDDConverterUtils;
@@ -410,7 +410,7 @@ public Object call() throws Exception {
}
@Override
- public long writeMatrixFromStream(String fname, LocalTaskQueue<IndexedMatrixValue> stream, long rlen, long clen, int blen) {
+ public long writeMatrixFromStream(String fname, OOCStream<IndexedMatrixValue> stream, long rlen, long clen, int blen) {
throw new UnsupportedOperationException("Writing from an OOC stream is not supported for the HDF5 format.");
};
diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibCMOps.java b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibCMOps.java
index 3e77aaefb9e..644e73697a4 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibCMOps.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibCMOps.java
@@ -23,7 +23,7 @@
import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
import org.apache.sysds.runtime.compress.colgroup.AColGroup;
-import org.apache.sysds.runtime.instructions.cp.CM_COV_Object;
+import org.apache.sysds.runtime.instructions.cp.CmCovObject;
import org.apache.sysds.runtime.matrix.data.LibMatrixAgg;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
import org.apache.sysds.runtime.matrix.operators.CMOperator;
@@ -34,7 +34,7 @@ private CLALibCMOps() {
// private constructor
}
- public static CM_COV_Object centralMoment(CompressedMatrixBlock cmb, CMOperator op) {
+ public static CmCovObject centralMoment(CompressedMatrixBlock cmb, CMOperator op) {
MatrixBlock.checkCMOperations(cmb, op);
if(cmb.isEmpty())
return LibMatrixAgg.aggregateCmCov(cmb, null, null, op.fn);
diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibUnary.java b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibUnary.java
index f858f15b746..cc0ff901df4 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibUnary.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibUnary.java
@@ -21,10 +21,15 @@
import java.util.ArrayList;
import java.util.List;
+import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
+import org.apache.sysds.runtime.compress.CompressedMatrixBlockFactory;
+import org.apache.sysds.runtime.compress.CompressionSettingsBuilder;
+import org.apache.sysds.runtime.compress.CompressionStatistics;
import org.apache.sysds.runtime.compress.colgroup.AColGroup;
+import org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType;
import org.apache.sysds.runtime.functionobjects.Builtin;
import org.apache.sysds.runtime.functionobjects.Builtin.BuiltinCode;
import org.apache.sysds.runtime.matrix.data.LibMatrixAgg;
@@ -43,10 +48,40 @@ public static MatrixBlock unaryOperations(CompressedMatrixBlock m, UnaryOperator
final boolean overlapping = m.isOverlapping();
final int r = m.getNumRows();
final int c = m.getNumColumns();
+
// early aborts:
if(m.isEmpty())
return new MatrixBlock(r, c, 0).unaryOperations(op, result);
- else if(overlapping) {
+
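+ // The row-to-row deltas of a CUMSUM result are the original (compressible) values, so decompress,
+ // apply the op, and recompress the result with delta encoding.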
+ if(Builtin.isBuiltinCode(op.fn, BuiltinCode.CUMSUM)) {
+ MatrixBlock uncompressed = m.getUncompressed("CUMSUM requires uncompression", op.getNumThreads());
+ MatrixBlock opResult = uncompressed.unaryOperations(op, null);
+
+ CompressionSettingsBuilder csb = new CompressionSettingsBuilder();
+ csb.clearValidCompression();
+ csb.setPreferDeltaEncoding(true);
+ csb.addValidCompression(CompressionType.DeltaDDC);
+ csb.addValidCompression(CompressionType.UNCOMPRESSED);
+ csb.setTransposeInput("false");
+ Pair<MatrixBlock, CompressionStatistics> compressedPair = CompressedMatrixBlockFactory.compress(opResult, op.getNumThreads(), csb);
+ MatrixBlock compressedResult = compressedPair.getLeft();
+
+ if(compressedResult == null) {
+ compressedResult = opResult;
+ }
+
+ CompressedMatrixBlock finalResult;
+ if(compressedResult instanceof CompressedMatrixBlock) {
+ finalResult = (CompressedMatrixBlock) compressedResult;
+ }
+ else {
+ finalResult = CompressedMatrixBlockFactory.genUncompressedCompressedMatrixBlock(compressedResult);
+ }
+
+ return finalResult;
+ }
+
+ if(overlapping) {
// when in overlapping state it is guaranteed that there is no infinites, NA, or NANs.
if(Builtin.isBuiltinCode(op.fn, BuiltinCode.ISINF, BuiltinCode.ISNA, BuiltinCode.ISNAN))
return new MatrixBlock(r, c, 0);
@@ -64,8 +99,9 @@ else if(Builtin.isBuiltinCode(op.fn, BuiltinCode.ISINF, BuiltinCode.ISNAN, Built
return new MatrixBlock(r, c, 0); // avoid unnecessary allocation
else if(LibMatrixAgg.isSupportedUnaryOperator(op)) {
String message = "Unary Op not supported: " + op.fn.getClass().getSimpleName();
- // e.g., cumsum/cumprod/cummin/cumax/cumsumprod
- return m.getUncompressed(message, op.getNumThreads()).unaryOperations(op, null);
+ MatrixBlock uncompressed = m.getUncompressed(message, op.getNumThreads());
+ MatrixBlock opResult = uncompressed.unaryOperations(op, null);
+ return opResult;
}
else {
diff --git a/src/main/java/org/apache/sysds/runtime/compress/readers/ReaderColumnSelection.java b/src/main/java/org/apache/sysds/runtime/compress/readers/ReaderColumnSelection.java
index d6ec60336f0..1734d39f4ce 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/readers/ReaderColumnSelection.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/readers/ReaderColumnSelection.java
@@ -193,13 +193,63 @@ else if(rawBlock.isInSparseFormat()) {
else {
return new ReaderColumnSelectionDenseSingleBlockQuantized(rawBlock, colIndices, rl, ru, scaleFactors);
}
- }
+ }
+
+ /**
+ * Create a reader of the matrix block that computes delta values (current row - previous row) on-the-fly.
+ *
+ * Note that the reader reuses the returned array; copy the returned rows if they are needed beyond the next call.
+ * The first row is returned as-is (no delta computation).
+ *
+ * @param rawBlock The block to iterate through
+ * @param colIndices The column indexes to extract and insert into the double array
+ * @param transposed If the raw block should be treated as transposed
+ * @return A delta reader of the columns specified
+ */
+ public static ReaderColumnSelection createDeltaReader(MatrixBlock rawBlock, IColIndex colIndices, boolean transposed) {
+ final int rl = 0;
+ final int ru = transposed ? rawBlock.getNumColumns() : rawBlock.getNumRows();
+ return createDeltaReader(rawBlock, colIndices, transposed, rl, ru);
+ }
+
+ /**
+ * Create a reader of the matrix block that computes delta values (current row - previous row) on-the-fly.
+ *
+ * Note that the reader reuses the returned array; copy the returned rows if they are needed beyond the next call.
+ * The first row is returned as-is (no delta computation).
+ *
+ * @param rawBlock The block to iterate through
+ * @param colIndices The column indexes to extract and insert into the double array
+ * @param transposed If the raw block should be treated as transposed
+ * @param rl The row to start at
+ * @param ru The row to end at (not inclusive)
+ * @return A delta reader of the columns specified
+ */
+ public static ReaderColumnSelection createDeltaReader(MatrixBlock rawBlock, IColIndex colIndices, boolean transposed,
+ int rl, int ru) {
+ checkInput(rawBlock, colIndices, rl, ru, transposed);
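+ // The readers pre-increment their row cursor, so start one row before rl.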
+ rl = rl - 1;
+ if(rawBlock.isEmpty()) {
+ LOG.warn("Reading an empty block is likely an error, but it is supported!");
+ return new ReaderColumnSelectionEmpty(rawBlock, colIndices, rl, ru, transposed);
+ }
+
+ if(transposed) {
+ throw new NotImplementedException("Delta encoding for transposed matrices not yet implemented");
+ }
+
+ if(rawBlock.isInSparseFormat())
+ return new ReaderColumnSelectionSparseDelta(rawBlock, colIndices, rl, ru);
+ else if(rawBlock.getDenseBlock().numBlocks() > 1)
+ return new ReaderColumnSelectionDenseMultiBlockDelta(rawBlock, colIndices, rl, ru);
+ return new ReaderColumnSelectionDenseSingleBlockDelta(rawBlock, colIndices, rl, ru);
+ }
private static void checkInput(final MatrixBlock rawBlock, final IColIndex colIndices, final int rl, final int ru,
final boolean transposed) {
- if(colIndices.size() <= 1)
+ if(colIndices.size() < 1)
throw new DMLCompressionException(
- "Column selection reader should not be done on single column groups: " + colIndices);
+ "Column selection reader should not be done on empty column groups: " + colIndices);
else if(rl >= ru)
throw new DMLCompressionException("Invalid inverse range for reader " + rl + " to " + ru);
diff --git a/src/main/java/org/apache/sysds/runtime/compress/readers/ReaderColumnSelectionDenseMultiBlockDelta.java b/src/main/java/org/apache/sysds/runtime/compress/readers/ReaderColumnSelectionDenseMultiBlockDelta.java
new file mode 100644
index 00000000000..f700ebd94b7
--- /dev/null
+++ b/src/main/java/org/apache/sysds/runtime/compress/readers/ReaderColumnSelectionDenseMultiBlockDelta.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.compress.readers;
+
+import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
+import org.apache.sysds.runtime.compress.utils.DblArray;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+
+public class ReaderColumnSelectionDenseMultiBlockDelta extends ReaderColumnSelection {
+ private final DenseBlock _data;
+ private final double[] _previousRow;
+ private boolean _isFirstRow;
+
+ protected ReaderColumnSelectionDenseMultiBlockDelta(MatrixBlock data, IColIndex colIndices, int rl, int ru) {
+ super(colIndices, rl, Math.min(ru, data.getNumRows()) - 1);
+ _data = data.getDenseBlock();
+ _previousRow = new double[colIndices.size()];
+ _isFirstRow = true;
+ }
+
+ protected DblArray getNextRow() {
+ _rl++;
+
+ if(_isFirstRow) {
+ for(int i = 0; i < _colIndexes.size(); i++) {
+ final double val = _data.get(_rl, _colIndexes.get(i));
+ _previousRow[i] = val;
+ reusableArr[i] = val;
+ }
+ _isFirstRow = false;
+ }
+ else {
+ for(int i = 0; i < _colIndexes.size(); i++) {
+ final double currentVal = _data.get(_rl, _colIndexes.get(i));
+ reusableArr[i] = currentVal - _previousRow[i];
+ _previousRow[i] = currentVal;
+ }
+ }
+
+ return reusableReturn;
+ }
+}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/readers/ReaderColumnSelectionDenseSingleBlockDelta.java b/src/main/java/org/apache/sysds/runtime/compress/readers/ReaderColumnSelectionDenseSingleBlockDelta.java
new file mode 100644
index 00000000000..65f13343201
--- /dev/null
+++ b/src/main/java/org/apache/sysds/runtime/compress/readers/ReaderColumnSelectionDenseSingleBlockDelta.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.compress.readers;
+
+import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
+import org.apache.sysds.runtime.compress.utils.DblArray;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+
+public class ReaderColumnSelectionDenseSingleBlockDelta extends ReaderColumnSelection {
+ private final double[] _data;
+ private final int _numCols;
+ private final double[] _previousRow;
+ private boolean _isFirstRow;
+
+ protected ReaderColumnSelectionDenseSingleBlockDelta(MatrixBlock data, IColIndex colIndices, int rl, int ru) {
+ super(colIndices, rl, Math.min(ru, data.getNumRows()) - 1);
+ _data = data.getDenseBlockValues();
+ _numCols = data.getNumColumns();
+ _previousRow = new double[colIndices.size()];
+ _isFirstRow = true;
+ }
+
+ protected DblArray getNextRow() {
+ _rl++;
+ final int indexOff = _rl * _numCols;
+
+ if(_isFirstRow) {
+ for(int i = 0; i < _colIndexes.size(); i++) {
+ final double val = _data[indexOff + _colIndexes.get(i)];
+ _previousRow[i] = val;
+ reusableArr[i] = val;
+ }
+ _isFirstRow = false;
+ }
+ else {
+ for(int i = 0; i < _colIndexes.size(); i++) {
+ final double currentVal = _data[indexOff + _colIndexes.get(i)];
+ reusableArr[i] = currentVal - _previousRow[i];
+ _previousRow[i] = currentVal;
+ }
+ }
+
+ return reusableReturn;
+ }
+}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/readers/ReaderColumnSelectionSparseDelta.java b/src/main/java/org/apache/sysds/runtime/compress/readers/ReaderColumnSelectionSparseDelta.java
new file mode 100644
index 00000000000..8ea1fff3396
--- /dev/null
+++ b/src/main/java/org/apache/sysds/runtime/compress/readers/ReaderColumnSelectionSparseDelta.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.compress.readers;
+
+import java.util.Arrays;
+
+import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
+import org.apache.sysds.runtime.compress.utils.DblArray;
+import org.apache.sysds.runtime.data.SparseBlock;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+
+public class ReaderColumnSelectionSparseDelta extends ReaderColumnSelection {
+
+ private final SparseBlock _a;
+ private final double[] _previousRow;
+ private boolean _isFirstRow;
+
+ protected ReaderColumnSelectionSparseDelta(MatrixBlock data, IColIndex colIndexes, int rl, int ru) {
+ super(colIndexes, rl, Math.min(ru, data.getNumRows()) - 1);
+ _a = data.getSparseBlock();
+ _previousRow = new double[colIndexes.size()];
+ _isFirstRow = true;
+ }
+
+ protected final DblArray getNextRow() {
+ _rl++;
+ for(int i = 0; i < _colIndexes.size(); i++)
+ reusableArr[i] = 0.0;
+
+ if(!_a.isEmpty(_rl))
+ processInRange(_rl);
+
+ if(_isFirstRow) {
+ for(int i = 0; i < _colIndexes.size(); i++)
+ _previousRow[i] = reusableArr[i];
+ _isFirstRow = false;
+ }
+ else {
+ for(int i = 0; i < _colIndexes.size(); i++) {
+ final double currentVal = reusableArr[i];
+ reusableArr[i] = currentVal - _previousRow[i];
+ _previousRow[i] = currentVal;
+ }
+ }
+
+ return reusableReturn;
+ }
+
+ final void processInRange(final int r) {
+ final int apos = _a.pos(r);
+ final int alen = _a.size(r) + apos;
+ final int[] aix = _a.indexes(r);
+ final double[] avals = _a.values(r);
+ int skip = 0;
+ int j = Arrays.binarySearch(aix, apos, alen, _colIndexes.get(0));
+ if(j < 0)
+ j = Math.abs(j + 1);
+
+ while(skip < _colIndexes.size() && j < alen) {
+ if(_colIndexes.get(skip) == aix[j]) {
+ reusableArr[skip] = avals[j];
+ skip++;
+ j++;
+ }
+ else if(_colIndexes.get(skip) > aix[j])
+ j++;
+ else
+ skip++;
+ }
+ }
+}
+
+
+
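The sparse reader above first zeroes the reusable row, then fills the selected columns via a two-pointer merge between the sorted selection indexes and the sparse row's index array, starting from a binary search for the first candidate position; only afterwards does it apply the same first-row/difference logic as the dense readers. A simplified, standalone version of that merge (illustrative names, not taken from the patch):

    import java.util.Arrays;

    // Illustrative sketch only: copy the values of the selected (sorted) columns out of one
    // sparse row given its index/value arrays; columns without an entry stay 0 in 'out'.
    public class SparseSelectMergeSketch {
        public static void selectRow(int[] aix, double[] avals, int apos, int alen, int[] cols, double[] out) {
            int j = Arrays.binarySearch(aix, apos, alen, cols[0]);
            if (j < 0)
                j = -j - 1; // insertion point: first entry >= cols[0]
            int k = 0;
            while (k < cols.length && j < alen) {
                if (cols[k] == aix[j])
                    out[k++] = avals[j++]; // selected column present in this row
                else if (cols[k] > aix[j])
                    j++; // skip sparse entries of unselected columns
                else
                    k++; // selected column has no entry, stays 0
            }
        }
    }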
diff --git a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/CacheableData.java b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/CacheableData.java
index 34a8aa18631..36637ee8959 100644
--- a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/CacheableData.java
+++ b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/CacheableData.java
@@ -43,7 +43,6 @@
import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
import org.apache.sysds.runtime.controlprogram.caching.LazyWriteBuffer.RPolicy;
import org.apache.sysds.runtime.controlprogram.federated.FederationMap;
-import org.apache.sysds.runtime.controlprogram.parfor.LocalTaskQueue;
import org.apache.sysds.runtime.controlprogram.parfor.util.IDSequence;
import org.apache.sysds.runtime.instructions.cp.Data;
import org.apache.sysds.runtime.instructions.fed.InitFEDInstruction;
@@ -471,12 +470,12 @@ public boolean hasBroadcastHandle() {
return _bcHandle != null && _bcHandle.hasBackReference();
}
- public OOCStream getStreamHandle() {
+ public synchronized OOCStream getStreamHandle() {
if( !hasStreamHandle() ) {
final SubscribableTaskQueue _mStream = new SubscribableTaskQueue<>();
- _streamHandle = _mStream;
DataCharacteristics dc = getDataCharacteristics();
MatrixBlock src = (MatrixBlock)acquireReadAndRelease();
+ _streamHandle = _mStream;
LongStream.range(0, dc.getNumBlocks())
.mapToObj(i -> UtilFunctions.createIndexedMatrixBlock(src, dc, i))
.forEach( blk -> {
@@ -489,7 +488,14 @@ public OOCStream getStreamHandle() {
_mStream.closeInput();
}
- return _streamHandle.getReadStream();
+ OOCStream stream = _streamHandle.getReadStream();
+ if (!stream.hasStreamCache())
+ _streamHandle = null; // To ensure read once
+ return stream;
+ }
+
+ public OOCStreamable getStreamable() {
+ return _streamHandle;
}
/**
@@ -499,7 +505,7 @@ public OOCStream getStreamHandle() {
* @return true if existing, false otherwise
*/
public boolean hasStreamHandle() {
- return _streamHandle != null && !_streamHandle.isProcessed();
+ return _streamHandle != null;
}
@SuppressWarnings({ "rawtypes", "unchecked" })
@@ -626,7 +632,7 @@ && getRDDHandle() == null) ) {
_requiresLocalWrite = false;
}
else if( hasStreamHandle() ) {
- _data = readBlobFromStream( getStreamHandle().toLocalTaskQueue() );
+ _data = readBlobFromStream( getStreamHandle() );
}
else if( getRDDHandle()==null || getRDDHandle().allowsShortCircuitRead() ) {
if( DMLScript.STATISTICS )
@@ -1161,7 +1167,7 @@ protected abstract T readBlobFromHDFS(String fname, long[] dims)
protected abstract T readBlobFromRDD(RDDObject rdd, MutableBoolean status)
throws IOException;
- protected abstract T readBlobFromStream(LocalTaskQueue stream)
+ protected abstract T readBlobFromStream(OOCStream stream)
throws IOException;
// Federated read
diff --git a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/FrameObject.java b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/FrameObject.java
index f4d20bb55a0..7151d87211c 100644
--- a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/FrameObject.java
+++ b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/FrameObject.java
@@ -33,8 +33,8 @@
import org.apache.sysds.runtime.controlprogram.federated.FederatedRange;
import org.apache.sysds.runtime.controlprogram.federated.FederatedResponse;
import org.apache.sysds.runtime.controlprogram.federated.FederationMap;
-import org.apache.sysds.runtime.controlprogram.parfor.LocalTaskQueue;
import org.apache.sysds.runtime.frame.data.FrameBlock;
+import org.apache.sysds.runtime.instructions.ooc.OOCStream;
import org.apache.sysds.runtime.instructions.spark.data.IndexedMatrixValue;
import org.apache.sysds.runtime.instructions.spark.data.RDDObject;
import org.apache.sysds.runtime.io.FileFormatProperties;
@@ -316,7 +316,7 @@ protected void writeBlobFromRDDtoHDFS(RDDObject rdd, String fname, String ofmt)
}
@Override
- protected FrameBlock readBlobFromStream(LocalTaskQueue stream) throws IOException {
+ protected FrameBlock readBlobFromStream(OOCStream stream) throws IOException {
// TODO Auto-generated method stub
return null;
}
diff --git a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/MatrixObject.java b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/MatrixObject.java
index 8191040eb18..0b1a1ee27cb 100644
--- a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/MatrixObject.java
+++ b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/MatrixObject.java
@@ -45,6 +45,7 @@
import org.apache.sysds.runtime.controlprogram.federated.FederationUtils;
import org.apache.sysds.runtime.controlprogram.parfor.LocalTaskQueue;
import org.apache.sysds.runtime.instructions.fed.InitFEDInstruction;
+import org.apache.sysds.runtime.instructions.ooc.OOCStream;
import org.apache.sysds.runtime.instructions.spark.data.IndexedMatrixValue;
import org.apache.sysds.runtime.instructions.spark.data.RDDObject;
import org.apache.sysds.runtime.io.FileFormatProperties;
@@ -528,17 +529,16 @@ protected MatrixBlock readBlobFromRDD(RDDObject rdd, MutableBoolean writeStatus)
@Override
- protected MatrixBlock readBlobFromStream(LocalTaskQueue stream) throws IOException {
+ protected MatrixBlock readBlobFromStream(OOCStream stream) throws IOException {
boolean dimsUnknown = getNumRows() < 0 || getNumColumns() < 0;
int nrows = (int)getNumRows();
int ncols = (int)getNumColumns();
MatrixBlock ret = dimsUnknown ? null : new MatrixBlock((int)getNumRows(), (int)getNumColumns(), false);
- // TODO if stream is CachingStream, block parts might be evicted resulting in null pointer exceptions
List<IndexedMatrixValue> blockCache = dimsUnknown ? new ArrayList<>() : null;
IndexedMatrixValue tmp = null;
try {
int blen = getBlocksize(), lnnz = 0;
- while( (tmp = stream.dequeueTask()) != LocalTaskQueue.NO_MORE_TASKS ) {
+ while( (tmp = stream.dequeue()) != LocalTaskQueue.NO_MORE_TASKS ) {
// compute row/column block offsets
final int row_offset = (int) (tmp.getIndexes().getRowIndex() - 1) * blen;
final int col_offset = (int) (tmp.getIndexes().getColumnIndex() - 1) * blen;
@@ -559,12 +559,12 @@ protected MatrixBlock readBlobFromStream(LocalTaskQueue stre
if (dimsUnknown) {
ret = new MatrixBlock(nrows, ncols, false);
- for (IndexedMatrixValue _tmp : blockCache) {
+ for (IndexedMatrixValue tmp2 : blockCache) {
// compute row/column block offsets
- final int row_offset = (int) (_tmp.getIndexes().getRowIndex() - 1) * blen;
- final int col_offset = (int) (_tmp.getIndexes().getColumnIndex() - 1) * blen;
+ final int row_offset = (int) (tmp2.getIndexes().getRowIndex() - 1) * blen;
+ final int col_offset = (int) (tmp2.getIndexes().getColumnIndex() - 1) * blen;
- ((MatrixBlock) _tmp.getValue()).putInto(ret, row_offset, col_offset, true);
+ ((MatrixBlock) tmp2.getValue()).putInto(ret, row_offset, col_offset, true);
}
}
@@ -636,7 +636,7 @@ protected long writeStreamToHDFS(String fname, String ofmt, int rep, FileFormatP
MetaDataFormat iimd = (MetaDataFormat) _metaData;
FileFormat fmt = (ofmt != null ? FileFormat.safeValueOf(ofmt) : iimd.getFileFormat());
MatrixWriter writer = MatrixWriterFactory.createMatrixWriter(fmt, rep, fprop);
- return writer.writeMatrixFromStream(fname, getStreamHandle().toLocalTaskQueue(),
+ return writer.writeMatrixFromStream(fname, getStreamHandle(),
getNumRows(), getNumColumns(), ConfigurationManager.getBlocksize());
}
diff --git a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/TensorObject.java b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/TensorObject.java
index d0111a34300..474db9a65fe 100644
--- a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/TensorObject.java
+++ b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/TensorObject.java
@@ -30,9 +30,9 @@
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.controlprogram.context.SparkExecutionContext;
import org.apache.sysds.runtime.controlprogram.federated.FederationMap;
-import org.apache.sysds.runtime.controlprogram.parfor.LocalTaskQueue;
import org.apache.sysds.runtime.data.TensorBlock;
import org.apache.sysds.runtime.data.TensorIndexes;
+import org.apache.sysds.runtime.instructions.ooc.OOCStream;
import org.apache.sysds.runtime.instructions.spark.data.IndexedMatrixValue;
import org.apache.sysds.runtime.instructions.spark.data.RDDObject;
import org.apache.sysds.runtime.io.FileFormatProperties;
@@ -210,7 +210,7 @@ protected void writeBlobFromRDDtoHDFS(RDDObject rdd, String fname, String ofmt)
@Override
- protected TensorBlock readBlobFromStream(LocalTaskQueue stream) throws IOException {
+ protected TensorBlock readBlobFromStream(OOCStream stream) throws IOException {
// TODO Auto-generated method stub
return null;
}
diff --git a/src/main/java/org/apache/sysds/runtime/controlprogram/paramserv/ParamServer.java b/src/main/java/org/apache/sysds/runtime/controlprogram/paramserv/ParamServer.java
index 83b612de054..333c889b7c1 100644
--- a/src/main/java/org/apache/sysds/runtime/controlprogram/paramserv/ParamServer.java
+++ b/src/main/java/org/apache/sysds/runtime/controlprogram/paramserv/ParamServer.java
@@ -448,10 +448,10 @@ protected void updateAndBroadcastModel(ListObject new_model, Timing tAgg, boolea
}
protected ListObject weightModels(ListObject params, int numWorkers) {
- double _averagingFactor = 1d / numWorkers;
+ double averagingFactor = 1d / numWorkers;
- if( _averagingFactor != 1) {
- double final_averagingFactor = _averagingFactor;
+ if( averagingFactor != 1) {
+ double final_averagingFactor = averagingFactor;
params.getData().parallelStream().forEach((matrix) -> {
MatrixObject matrixObject = (MatrixObject) matrix;
MatrixBlock input = matrixObject.acquireReadAndRelease().scalarOperations(
diff --git a/src/main/java/org/apache/sysds/runtime/controlprogram/parfor/LocalTaskQueue.java b/src/main/java/org/apache/sysds/runtime/controlprogram/parfor/LocalTaskQueue.java
index 783981e0f12..1849ad066b3 100644
--- a/src/main/java/org/apache/sysds/runtime/controlprogram/parfor/LocalTaskQueue.java
+++ b/src/main/java/org/apache/sysds/runtime/controlprogram/parfor/LocalTaskQueue.java
@@ -45,7 +45,7 @@ public class LocalTaskQueue
protected LinkedList<T> _data = null;
protected boolean _closedInput = false;
- private DMLRuntimeException _failure = null;
+ protected DMLRuntimeException _failure = null;
private static final Log LOG = LogFactory.getLog(LocalTaskQueue.class.getName());
public LocalTaskQueue()
@@ -103,6 +103,10 @@ public synchronized T dequeueTask()
return t;
}
+ public synchronized boolean hasNext() {
+ return !_data.isEmpty() || _closedInput;
+ }
+
/**
* Synchronized (logical) insert of a NO_MORE_TASKS symbol at the end of the FIFO queue in order to
* mark that no more tasks will be inserted into the queue.
diff --git a/src/main/java/org/apache/sysds/runtime/einsum/EOpNode.java b/src/main/java/org/apache/sysds/runtime/einsum/EOpNode.java
new file mode 100644
index 00000000000..665e5ae5588
--- /dev/null
+++ b/src/main/java/org/apache/sysds/runtime/einsum/EOpNode.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.einsum;
+
+import org.apache.commons.logging.Log;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+public abstract class EOpNode {
+ public Character c1;
+ public Character c2;
+ public Integer dim1;
+ public Integer dim2;
+ public EOpNode(Character c1, Character c2, Integer dim1, Integer dim2) {
+ this.c1 = c1;
+ this.c2 = c2;
+ this.dim1 = dim1;
+ this.dim2 = dim2;
+ }
+
+ public String getOutputString() {
+ if(c1 == null) return "''";
+ if(c2 == null) return c1.toString();
+ return c1.toString() + c2.toString();
+ }
+ public abstract List<EOpNode> getChildren();
+
+ public String[] recursivePrintString(){
+ List<String[]> inpStrings = new ArrayList<>();
+ for (EOpNode node : getChildren()) {
+ inpStrings.add(node.recursivePrintString());
+ }
+ String[] inpRes = inpStrings.stream().flatMap(Arrays::stream).toArray(String[]::new);
+ String[] res = new String[1 + inpRes.length];
+
+ res[0] = this.toString();
+
+ for (int i=0; i<inpRes.length; i++){
+ res[i+1] = inpRes[i];
+ }
+ return res;
+ }
+
+ public abstract MatrixBlock computeEOpNode(List<MatrixBlock> inputs, int numOfThreads, Log LOG);
+
+ public abstract EOpNode reorderChildrenAndOptimize(EOpNode parent, Character outChar1, Character outChar2);
+}
+
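For orientation on the einsum operator tree added below (illustration, not part of the patch): every EOpNode carries at most two subscript characters (c1, c2) and their dimension sizes, so a matrix bound to subscript "ij" becomes a leaf with c1='i', c2='j', while a vector bound to "j" has no second index character. A hypothetical construction using the EOpNodeData leaf class from this patch (sizes made up):

    import org.apache.sysds.runtime.einsum.EOpNode;
    import org.apache.sysds.runtime.einsum.EOpNodeData;

    public class EinsumLeafSketch {
        public static void main(String[] args) {
            // Hypothetical wiring only: a 100x20 matrix bound to "ij" as operand 0,
            // and a length-20 vector bound to "j" as operand 1 (second index char is null).
            EOpNode matrixLeaf = new EOpNodeData('i', 'j', 100, 20, 0);
            EOpNode vectorLeaf = new EOpNodeData('j', null, 20, null, 1);
            System.out.println(matrixLeaf + " | " + vectorLeaf);
        }
    }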
diff --git a/src/main/java/org/apache/sysds/runtime/einsum/EOpNodeBinary.java b/src/main/java/org/apache/sysds/runtime/einsum/EOpNodeBinary.java
new file mode 100644
index 00000000000..071fb6706a2
--- /dev/null
+++ b/src/main/java/org/apache/sysds/runtime/einsum/EOpNodeBinary.java
@@ -0,0 +1,434 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.einsum;
+
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.commons.lang3.tuple.Triple;
+import org.apache.commons.logging.Log;
+import org.apache.sysds.runtime.functionobjects.Multiply;
+import org.apache.sysds.runtime.functionobjects.SwapIndex;
+import org.apache.sysds.runtime.instructions.cp.DoubleObject;
+import org.apache.sysds.runtime.instructions.cp.EinsumCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.ScalarObject;
+import org.apache.sysds.runtime.matrix.data.LibMatrixMult;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
+import org.apache.sysds.runtime.matrix.operators.ReorgOperator;
+import org.apache.sysds.runtime.matrix.operators.SimpleOperator;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.function.Predicate;
+
+import static org.apache.sysds.runtime.instructions.cp.EinsumCPInstruction.ensureMatrixBlockColumnVector;
+import static org.apache.sysds.runtime.instructions.cp.EinsumCPInstruction.ensureMatrixBlockRowVector;
+
+public class EOpNodeBinary extends EOpNode {
+
+ public enum EBinaryOperand { // upper case: char remains, lower case: summed (reduced) dimension
+ ////// mm: //////
+ Ba_aC, // -> BC
+ aB_Ca, // -> CB
+ Ba_Ca, // -> BC
+ aB_aC, // -> BC
+
+ ////// element-wise multiplications and sums //////
+ aB_aB,// elemwise and colsum -> B
+ Ab_Ab, // elemwise and rowsum ->A
+ Ab_bA, // elemwise, either colsum or rowsum -> A
+ aB_Ba,
+ ab_ab,//M-M sum all
+ ab_ba, //M-M.T sum all
+ aB_a,// -> B
+ Ab_b, // -> A
+
+ ////// elementwise, no summations: //////
+ A_A,// v-elemwise -> A
+ AB_AB,// M-M elemwise -> AB
+ AB_BA, // M-M.T elemwise -> AB
+ AB_A, // M-v colwise -> BA!?
+ AB_B, // M-v rowwise -> AB
+
+ ////// other //////
+ a_a,// dot ->
+ A_B, // outer mult -> AB
+ A_scalar, // v-scalar
+ AB_scalar, // m-scalar
+ scalar_scalar
+ }
+ public EOpNode left;
+ public EOpNode right;
+ public EBinaryOperand operand;
+ private boolean transposeResult;
+ public EOpNodeBinary(EOpNode left, EOpNode right, EBinaryOperand operand){
+ super(null,null,null, null);
+ Character c1, c2;
+ Integer dim1, dim2;
+ switch(operand){
+ case Ba_aC -> {
+ c1=left.c1;
+ c2=right.c2;
+ dim1=left.dim1;
+ dim2=right.dim2;
+ }
+ case aB_Ca -> {
+ c1=left.c2;
+ c2=right.c1;
+ dim1=left.dim2;
+ dim2=right.dim1;
+ }
+ case Ba_Ca -> {
+ c1=left.c1;
+ c2=right.c1;
+ dim1=left.dim1;
+ dim2=right.dim1;
+ }
+ case aB_aC -> {
+ c1=left.c2;
+ c2=right.c2;
+ dim1=left.dim2;
+ dim2=right.dim2;
+ }
+ case aB_aB, aB_Ba, aB_a -> {
+ c1=left.c2;
+ c2=null;
+ dim1=left.dim2;
+ dim2=null;
+ }
+ case Ab_Ab, Ab_bA, Ab_b, A_A, A_scalar -> {
+ c1=left.c1;
+ c2=null;
+ dim1=left.dim1;
+ dim2=null;
+ }
+ case ab_ab, ab_ba, a_a, scalar_scalar -> {
+ c1=null;
+ c2=null;
+ dim1=null;
+ dim2=null;
+ }
+ case AB_AB, AB_BA, AB_A, AB_B, AB_scalar ->{
+ c1=left.c1;
+ c2=left.c2;
+ dim1=left.dim1;
+ dim2=left.dim2;
+ }
+ case A_B -> {
+ c1=left.c1;
+ c2=right.c1;
+ dim1=left.dim1;
+ dim2=right.dim1;
+ }
+ default -> throw new IllegalStateException("EOpNodeBinary Unexpected type: " + operand);
+ }
+ // super(c1, c2, dim1, dim2); // unavailable in JDK < 22
+ this.c1 = c1;
+ this.c2 = c2;
+ this.dim1 = dim1;
+ this.dim2 = dim2;
+ this.left = left;
+ this.right = right;
+ this.operand = operand;
+ }
+
+ public void setTransposeResult(boolean transposeResult){
+ this.transposeResult = transposeResult;
+ }
+
+ public static EOpNodeBinary combineMatrixMultiply(EOpNode left, EOpNode right) {
+ if (left.c2 == right.c1) { return new EOpNodeBinary(left, right, EBinaryOperand.Ba_aC); }
+ if (left.c2 == right.c2) { return new EOpNodeBinary(left, right, EBinaryOperand.Ba_Ca); }
+ if (left.c1 == right.c1) { return new EOpNodeBinary(left, right, EBinaryOperand.aB_aC); }
+ if (left.c1 == right.c2) {
+ var res = new EOpNodeBinary(left, right, EBinaryOperand.aB_Ca);
+ res.setTransposeResult(true);
+ return res;
+ }
+ throw new RuntimeException("EOpNodeBinary::combineMatrixMultiply: invalid matrix operation");
+ }
+
+ @Override
+ public List<EOpNode> getChildren() {
+ return List.of(this.left, this.right);
+ }
+ @Override
+ public String toString() {
+ return this.getClass().getSimpleName()+" ("+ operand.toString()+") "+getOutputString();
+ }
+
+ @Override
+ public MatrixBlock computeEOpNode(List<MatrixBlock> inputs, int numThreads, Log LOG) {
+ EOpNodeBinary bin = this;
+ MatrixBlock left = this.left.computeEOpNode(inputs, numThreads, LOG);
+ MatrixBlock right = this.right.computeEOpNode(inputs, numThreads, LOG);
+
+ //AggregateOperator agg = new AggregateOperator(0, Plus.getPlusFnObject());
+
+ MatrixBlock res;
+
+ switch (bin.operand){
+ case AB_AB -> {
+ res = MatrixBlock.naryOperations(new SimpleOperator(Multiply.getMultiplyFnObject()), new MatrixBlock[]{left, right},new ScalarObject[]{}, new MatrixBlock());
+ }
+ case A_A -> {
+ ensureMatrixBlockColumnVector(left);
+ ensureMatrixBlockColumnVector(right);
+ res = MatrixBlock.naryOperations(new SimpleOperator(Multiply.getMultiplyFnObject()), new MatrixBlock[]{left, right},new ScalarObject[]{}, new MatrixBlock());
+ }
+ case a_a -> {
+ ensureMatrixBlockColumnVector(left);
+ ensureMatrixBlockColumnVector(right);
+ res = new MatrixBlock(0.0);
+ res.allocateDenseBlock();
+ res.getDenseBlockValues()[0] = LibMatrixMult.dotProduct(left.getDenseBlockValues(), right.getDenseBlockValues(), 0,0 , left.getNumRows());
+ }
+ case Ab_Ab -> {
+ res = EOpNodeFuse.compute(EOpNodeFuse.EinsumRewriteType.AB_BA_B_A__A, List.of(left, right), new ArrayList<>(), new ArrayList<>(), new ArrayList<>(), new ArrayList<>(),
+ null, numThreads);
+ }
+ case aB_aB -> {
+ res = EOpNodeFuse.compute(EOpNodeFuse.EinsumRewriteType.AB_BA_A__B, List.of(left, right), new ArrayList<>(), new ArrayList<>(), new ArrayList<>(), new ArrayList<>(),
+ null, numThreads);
+ }
+ case ab_ab -> {
+ res = EOpNodeFuse.compute(EOpNodeFuse.EinsumRewriteType.AB_BA_B_A__, List.of(left, right), new ArrayList<>(), new ArrayList<>(), new ArrayList<>(), new ArrayList<>(),
+ null, numThreads);
+ }
+ case ab_ba -> {
+ res = EOpNodeFuse.compute(EOpNodeFuse.EinsumRewriteType.AB_BA_B_A__, List.of(left), List.of(right), new ArrayList<>(), new ArrayList<>(), new ArrayList<>(),
+ null, numThreads);
+ }
+ case Ab_bA -> {
+ res = EOpNodeFuse.compute(EOpNodeFuse.EinsumRewriteType.AB_BA_B_A__A, List.of(left), List.of(right), new ArrayList<>(), new ArrayList<>(), new ArrayList<>(),
+ null, numThreads);
+ }
+ case aB_Ba -> {
+ res = EOpNodeFuse.compute(EOpNodeFuse.EinsumRewriteType.AB_BA_A__B, List.of(left), List.of(right), new ArrayList<>(), new ArrayList<>(), new ArrayList<>(),
+ null, numThreads);
+ }
+ case AB_BA -> {
+ ReorgOperator transpose = new ReorgOperator(SwapIndex.getSwapIndexFnObject(), numThreads);
+ right = right.reorgOperations(transpose, new MatrixBlock(), 0, 0, 0);
+ res = MatrixBlock.naryOperations(new SimpleOperator(Multiply.getMultiplyFnObject()), new MatrixBlock[]{left, right},new ScalarObject[]{}, new MatrixBlock());
+ }
+ case Ba_aC -> {
+ res = LibMatrixMult.matrixMult(left,right, new MatrixBlock(), numThreads);
+ }
+ case aB_Ca -> {
+ res = LibMatrixMult.matrixMult(right,left, new MatrixBlock(), numThreads);
+ }
+ case Ba_Ca -> {
+ ReorgOperator transpose = new ReorgOperator(SwapIndex.getSwapIndexFnObject(), numThreads);
+ right = right.reorgOperations(transpose, new MatrixBlock(), 0, 0, 0);
+ res = LibMatrixMult.matrixMult(left,right, new MatrixBlock(), numThreads);
+ }
+ case aB_aC -> {
+ ReorgOperator transpose = new ReorgOperator(SwapIndex.getSwapIndexFnObject(), numThreads);
+ left = left.reorgOperations(transpose, new MatrixBlock(), 0, 0, 0);
+ res = LibMatrixMult.matrixMult(left, right, new MatrixBlock(), numThreads);
+ }
+ case A_scalar, AB_scalar -> {
+ res = MatrixBlock.naryOperations(new SimpleOperator(Multiply.getMultiplyFnObject()), new MatrixBlock[]{left},new ScalarObject[]{new DoubleObject(right.get(0,0))}, new MatrixBlock());
+ }
+ case AB_B -> {
+ ensureMatrixBlockRowVector(right);
+ res = left.binaryOperations(new BinaryOperator(Multiply.getMultiplyFnObject()), right);
+ }
+ case Ab_b -> {
+ res = EOpNodeFuse.compute(EOpNodeFuse.EinsumRewriteType.AB_BA_B_A__A, List.of(left), new ArrayList<>(), List.of(right), new ArrayList<>(), new ArrayList<>(),
+ null, numThreads);
+ }
+ case AB_A -> {
+ ensureMatrixBlockColumnVector(right);
+ res = left.binaryOperations(new BinaryOperator(Multiply.getMultiplyFnObject()), right);
+ }
+ case aB_a -> {
+ res = EOpNodeFuse.compute(EOpNodeFuse.EinsumRewriteType.AB_BA_A__B, List.of(left), new ArrayList<>(), new ArrayList<>(), List.of(right), new ArrayList<>(),
+ null, numThreads);
+ }
+ case A_B -> {
+ ensureMatrixBlockColumnVector(left);
+ ensureMatrixBlockRowVector(right);
+ res = left.binaryOperations(new BinaryOperator(Multiply.getMultiplyFnObject()), right);
+ }
+ case scalar_scalar -> {
+ return new MatrixBlock(left.get(0,0)*right.get(0,0));
+ }
+ default -> {
+ throw new IllegalArgumentException("Unexpected value: " + bin.operand.toString());
+ }
+
+ }
+ if(transposeResult){
+ ReorgOperator transpose = new ReorgOperator(SwapIndex.getSwapIndexFnObject(), numThreads);
+ res = res.reorgOperations(transpose, new MatrixBlock(), 0, 0, 0);
+ }
+ if(c2 == null) ensureMatrixBlockColumnVector(res);
+ return res;
+ }
+
+ @Override
+ public EOpNode reorderChildrenAndOptimize(EOpNode parent, Character outChar1, Character outChar2) {
+ if (this.operand ==EBinaryOperand.aB_aC){
+ if(this.right.c2 == outChar1) { // result is CB so Swap aB and aC
+ var tmpLeft = left; left = right; right = tmpLeft;
+ var tmpC1 = c1; c1 = c2; c2 = tmpC1;
+ var tmpDim1 = dim1; dim1 = dim2; dim2 = tmpDim1;
+ }
+ if(EinsumCPInstruction.FUSE_OUTER_MULTIPLY && left instanceof EOpNodeFuse fuse && fuse.einsumRewriteType == EOpNodeFuse.EinsumRewriteType.AB_BA_B_A__AB
+ && (!EinsumCPInstruction.FUSE_OUTER_MULTIPLY_EXCEEDS_L2_CACHE_CHECK || ((fuse.dim1 * fuse.dim2 *(fuse.ABs.size()+fuse.BAs.size())) + (right.dim1*right.dim2)) * 8 > 6 * 1024 * 1024)
+ && LibMatrixMult.isSkinnyRightHandSide(left.dim1, left.dim2, right.dim1, right.dim2, false)) {
+ fuse.AZs.add(right);
+ fuse.einsumRewriteType = EOpNodeFuse.EinsumRewriteType.AB_BA_A_AZ__BZ;
+ fuse.c1 = fuse.c2;
+ fuse.c2 = right.c2;
+ return fuse;
+ }
+
+ left = left.reorderChildrenAndOptimize(this, left.c2, left.c1); // maybe can be reordered
+ if(left.c2 == right.c1) { // check if change happened:
+ this.operand = EBinaryOperand.Ba_aC;
+ }
+ right = right.reorderChildrenAndOptimize(this, right.c1, right.c2);
+ }else if (this.operand ==EBinaryOperand.Ba_Ca){
+ if(this.right.c1 == outChar1) { // result is CB so Swap Ba and Ca
+ var tmpLeft = left; left = right; right = tmpLeft;
+ var tmpC1 = c1; c1 = c2; c2 = tmpC1;
+ var tmpDim1 = dim1; dim1 = dim2; dim2 = tmpDim1;
+ }
+
+ right = right.reorderChildrenAndOptimize(this, right.c2, right.c1); // maybe can be reordered
+ if(left.c2 == right.c1) { // check if change happened:
+ this.operand = EBinaryOperand.Ba_aC;
+ }
+ left = left.reorderChildrenAndOptimize(this, left.c1, left.c2);
+ }else {
+ left = left.reorderChildrenAndOptimize(this, left.c1, left.c2); // just recurse
+ right = right.reorderChildrenAndOptimize(this, right.c1, right.c2);
+ }
+ return this;
+ }
+
+ // used in the old approach
+ public static Triple<Integer, EBinaryOperand, Pair<Character, Character>> tryCombineAndCost(EOpNode n1 , EOpNode n2, HashMap<Character, Integer> charToSizeMap, HashMap<Character, Integer> charToOccurences, Character outChar1, Character outChar2){
+ Predicate<Character> cannotBeSummed = (c) ->
+ c == outChar1 || c == outChar2 || charToOccurences.get(c) > 2;
+
+ if(n1.c1 == null) {
+ // n2.c1 also has to be null
+ return Triple.of(1, EBinaryOperand.scalar_scalar, Pair.of(null, null));
+ }
+
+ if(n2.c1 == null) {
+ if(n1.c2 == null)
+ return Triple.of(charToSizeMap.get(n1.c1), EBinaryOperand.A_scalar, Pair.of(n1.c1, null));
+ return Triple.of(charToSizeMap.get(n1.c1)*charToSizeMap.get(n1.c2), EBinaryOperand.AB_scalar, Pair.of(n1.c1, n1.c2));
+ }
+
+ if(n1.c1 == n2.c1){
+ if(n1.c2 != null){
+ if ( n1.c2 == n2.c2){
+ if( cannotBeSummed.test(n1.c1)){
+ if(cannotBeSummed.test(n1.c2)){
+ return Triple.of(charToSizeMap.get(n1.c1)*charToSizeMap.get(n1.c2), EBinaryOperand.AB_AB, Pair.of(n1.c1, n1.c2));
+ }
+ return Triple.of(charToSizeMap.get(n1.c1)*charToSizeMap.get(n1.c2), EBinaryOperand.Ab_Ab, Pair.of(n1.c1, null));
+ }
+
+ if(cannotBeSummed.test(n1.c2)){
+ return Triple.of(charToSizeMap.get(n1.c1)*charToSizeMap.get(n1.c2), EBinaryOperand.aB_aB, Pair.of(n1.c2, null));
+ }
+
+ return Triple.of(charToSizeMap.get(n1.c1)*charToSizeMap.get(n1.c2), EBinaryOperand.ab_ab, Pair.of(null, null));
+
+ }
+
+ else if(n2.c2 == null){
+ if(cannotBeSummed.test(n1.c1)){
+ return Triple.of(charToSizeMap.get(n1.c1)*charToSizeMap.get(n1.c2)*2, EBinaryOperand.AB_A, Pair.of(n1.c1, n1.c2));
+ }
+ return Triple.of(charToSizeMap.get(n1.c1)*charToSizeMap.get(n1.c2)*2, EBinaryOperand.aB_a, Pair.of(n1.c2, null)); // in theory (null, n1.c2)
+ }
+ else if(n1.c1 ==outChar1 || n1.c1==outChar2|| charToOccurences.get(n1.c1) > 2){
+ return null;// AB,AC
+ }
+ else {
+ return Triple.of((charToSizeMap.get(n1.c1)*charToSizeMap.get(n1.c2))+(charToSizeMap.get(n1.c1)*charToSizeMap.get(n1.c2)*charToSizeMap.get(n2.c2)), EBinaryOperand.aB_aC, Pair.of(n1.c2, n2.c2)); // or n2.c2, n1.c2
+ }
+ }else{ // n1.c2 = null -> c2.c2 = null
+ if(n1.c1 ==outChar1 || n1.c1==outChar2 || charToOccurences.get(n1.c1) > 2){
+ return Triple.of(charToSizeMap.get(n1.c1), EBinaryOperand.A_A, Pair.of(n1.c1, null));
+ }
+ return Triple.of(charToSizeMap.get(n1.c1), EBinaryOperand.a_a, Pair.of(null, null));
+ }
+
+
+ }else{ // n1.c1 != n2.c1
+ if(n1.c2 == null) {
+ return Triple.of(charToSizeMap.get(n1.c1)*charToSizeMap.get(n2.c1), EBinaryOperand.A_B, Pair.of(n1.c1, n2.c1));
+ }
+ else if(n2.c2 == null) { // ab,c
+ if (n1.c2 == n2.c1) {
+ if(cannotBeSummed.test(n1.c2)){
+ return Triple.of(charToSizeMap.get(n1.c1)*charToSizeMap.get(n2.c1), EBinaryOperand.AB_B, Pair.of(n1.c1, n1.c2));
+ }
+ return Triple.of(charToSizeMap.get(n1.c1)*charToSizeMap.get(n2.c1), EBinaryOperand.Ab_b, Pair.of(n1.c1, null));
+ }
+ return null; // AB,C
+ }
+ else if (n1.c2 == n2.c1) {
+ if(n1.c1 == n2.c2){ // ab,ba
+ if(cannotBeSummed.test(n1.c1)){
+ if(cannotBeSummed.test(n1.c2)){
+ return Triple.of(charToSizeMap.get(n1.c1)*charToSizeMap.get(n1.c2), EBinaryOperand.AB_BA, Pair.of(n1.c1, n1.c2));
+ }
+ return Triple.of(charToSizeMap.get(n1.c1)*charToSizeMap.get(n1.c2), EBinaryOperand.Ab_bA, Pair.of(n1.c1, null));
+ }
+ if(cannotBeSummed.test(n1.c2)){
+ return Triple.of(charToSizeMap.get(n1.c1)*charToSizeMap.get(n1.c2), EBinaryOperand.aB_Ba, Pair.of(n1.c2, null));
+ }
+ return Triple.of(charToSizeMap.get(n1.c1)*charToSizeMap.get(n1.c2), EBinaryOperand.ab_ba, Pair.of(null, null));
+ }
+ if(cannotBeSummed.test(n1.c2)){
+ return null; // AB_B
+ }else{
+ return Triple.of(charToSizeMap.get(n1.c1)*charToSizeMap.get(n1.c2)*charToSizeMap.get(n2.c2), EBinaryOperand.Ba_aC, Pair.of(n1.c1, n2.c2));
+ }
+ }
+ if(n1.c1 == n2.c2) {
+ if(cannotBeSummed.test(n1.c1)){
+ return null; // AB_B
+ }
+ return Triple.of(charToSizeMap.get(n1.c1)*charToSizeMap.get(n1.c2)*charToSizeMap.get(n2.c1), EBinaryOperand.aB_Ca, Pair.of(n2.c1, n1.c2)); // * its just reorder of mmult
+ }
+ else if (n1.c2 == n2.c2) {
+ if(n1.c2 ==outChar1 || n1.c2==outChar2|| charToOccurences.get(n1.c2) > 2){
+ return null; // BA_CA
+ }else{
+ return Triple.of(charToSizeMap.get(n1.c1)*charToSizeMap.get(n1.c2) +(charToSizeMap.get(n1.c1)*charToSizeMap.get(n1.c2)*charToSizeMap.get(n2.c1)), EBinaryOperand.Ba_Ca, Pair.of(n1.c1, n2.c1)); // or n2.c1, n1.c1
+ }
+ }
+ else { // something like AB,CD
+ return null;
+ }
+ }
+ }
+}
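combineMatrixMultiply above classifies a matrix-matrix product purely by which subscript character the two operands share: "ik" times "kj" maps to Ba_aC (a plain multiply), "ki" times "kj" to aB_aC (left side transposed before the multiply), and so on, with the output indices taken from the non-shared characters. A hypothetical use, continuing the leaf example (sizes made up):

    import org.apache.sysds.runtime.einsum.EOpNode;
    import org.apache.sysds.runtime.einsum.EOpNodeBinary;
    import org.apache.sysds.runtime.einsum.EOpNodeData;

    public class EinsumMMSketch {
        public static void main(String[] args) {
            // Hypothetical only: "ik" (100x50) times "kj" (50x20) -> operand Ba_aC, output indices "ij".
            EOpNode left  = new EOpNodeData('i', 'k', 100, 50, 0);
            EOpNode right = new EOpNodeData('k', 'j', 50, 20, 1);
            EOpNodeBinary mm = EOpNodeBinary.combineMatrixMultiply(left, right);
            System.out.println(mm); // operand Ba_aC, c1 == 'i', c2 == 'j'
        }
    }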
diff --git a/src/main/java/org/apache/sysds/runtime/einsum/EOpNodeData.java b/src/main/java/org/apache/sysds/runtime/einsum/EOpNodeData.java
new file mode 100644
index 00000000000..4906323f8a4
--- /dev/null
+++ b/src/main/java/org/apache/sysds/runtime/einsum/EOpNodeData.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.einsum;
+
+import org.apache.commons.logging.Log;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+
+import java.util.List;
+
+public class EOpNodeData extends EOpNode {
+ public int matrixIdx;
+ public EOpNodeData(Character c1, Character c2, Integer dim1, Integer dim2, int matrixIdx){
+ super(c1,c2,dim1,dim2);
+ this.matrixIdx = matrixIdx;
+ }
+
+ @Override
+ public List<EOpNode> getChildren() {
+ return List.of();
+ }
+ @Override
+ public String toString() {
+ return this.getClass().getSimpleName()+" ("+matrixIdx+") "+getOutputString();
+ }
+ @Override
+ public MatrixBlock computeEOpNode(List<MatrixBlock> inputs, int numOfThreads, Log LOG) {
+ return inputs.get(matrixIdx);
+ }
+
+ @Override
+ public EOpNode reorderChildrenAndOptimize(EOpNode parent, Character outChar1, Character outChar2) {
+ return this;
+ }
+}
diff --git a/src/main/java/org/apache/sysds/runtime/einsum/EOpNodeFuse.java b/src/main/java/org/apache/sysds/runtime/einsum/EOpNodeFuse.java
new file mode 100644
index 00000000000..5accf93bcca
--- /dev/null
+++ b/src/main/java/org/apache/sysds/runtime/einsum/EOpNodeFuse.java
@@ -0,0 +1,465 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.einsum;
+
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.commons.logging.Log;
+import org.apache.sysds.runtime.codegen.SpoofRowwise;
+import org.apache.sysds.runtime.functionobjects.Multiply;
+import org.apache.sysds.runtime.functionobjects.SwapIndex;
+import org.apache.sysds.runtime.instructions.cp.DoubleObject;
+import org.apache.sysds.runtime.instructions.cp.EinsumCPInstruction;
+import org.apache.sysds.runtime.instructions.cp.ScalarObject;
+import org.apache.sysds.runtime.matrix.data.LibMatrixMult;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
+import org.apache.sysds.runtime.matrix.operators.ReorgOperator;
+import org.jetbrains.annotations.NotNull;
+
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.function.Function;
+
+import static org.apache.sysds.runtime.instructions.cp.EinsumCPInstruction.ensureMatrixBlockColumnVector;
+import static org.apache.sysds.runtime.instructions.cp.EinsumCPInstruction.ensureMatrixBlockRowVector;
+
+public class EOpNodeFuse extends EOpNode {
+
+ private EOpNode scalar = null;
+
+ public enum EinsumRewriteType{
+ // B -> row*vec, A -> row*scalar
+ AB_BA_B_A__AB,
+ AB_BA_A__B,
+ AB_BA_B_A__A,
+ AB_BA_B_A__,
+
+ // scalar from row(AB).dot(B) multiplied by row(AZ)
+ AB_BA_B_A_AZ__Z,
+
+ // AZ: last step is outer matrix multiplication using vector Z
+ AB_BA_A_AZ__BZ, AB_BA_A_AZ__ZB,
+ }
+
+ public EinsumRewriteType einsumRewriteType;
+ public List<EOpNode> ABs;
+ public List<EOpNode> BAs;
+ public List<EOpNode> Bs;
+ public List<EOpNode> As;
+ public List<EOpNode> AZs;
+ @Override
+ public List<EOpNode> getChildren(){
+ List<EOpNode> all = new ArrayList<>();
+ all.addAll(ABs);
+ all.addAll(BAs);
+ all.addAll(Bs);
+ all.addAll(As);
+ all.addAll(AZs);
+ if (scalar != null) all.add(scalar);
+ return all;
+ };
+ private EOpNodeFuse(Character c1, Character c2, Integer dim1, Integer dim2, EinsumRewriteType einsumRewriteType, List<EOpNode> ABs, List<EOpNode> BAs, List<EOpNode> Bs, List<EOpNode> As, List<EOpNode> AZs) {
+ super(c1,c2, dim1, dim2);
+ this.einsumRewriteType = einsumRewriteType;
+ this.ABs = ABs;
+ this.BAs = BAs;
+ this.Bs = Bs;
+ this.As = As;
+ this.AZs = AZs;
+ }
+ public EOpNodeFuse(EinsumRewriteType einsumRewriteType, List<EOpNode> ABs, List<EOpNode> BAs, List<EOpNode> Bs, List<EOpNode> As, List<EOpNode> AZs, List<Pair<List<EOpNode>, List<EOpNode>>> AXsAndXs) {
+ super(null,null,null, null);
+ switch(einsumRewriteType) {
+ case AB_BA_B_A__A->{
+ c1 = ABs.get(0).c1;
+ dim1 = ABs.get(0).dim1;
+ }case AB_BA_A__B -> {
+ c1 = ABs.get(0).c2;
+ dim1 = ABs.get(0).dim2;
+ }case AB_BA_B_A__ -> {
+ }case AB_BA_B_A__AB -> {
+ c1 = ABs.get(0).c1;
+ dim1 = ABs.get(0).dim1;
+ c2 = ABs.get(0).c2;
+ dim2 = ABs.get(0).dim2;
+ }case AB_BA_B_A_AZ__Z -> {
+ c1 = AZs.get(0).c1;
+ dim1 = AZs.get(0).dim2;
+ }case AB_BA_A_AZ__BZ ->{
+ c1 = ABs.get(0).c2;
+ dim1 = ABs.get(0).dim2;
+ c2 = AZs.get(0).c2;
+ dim2 = AZs.get(0).dim2;
+ }case AB_BA_A_AZ__ZB ->{
+ c2 = ABs.get(0).c2;
+ dim2 = ABs.get(0).dim2;
+ c1 = AZs.get(0).c2;
+ dim1 = AZs.get(0).dim2;
+ }
+ }
+ this.einsumRewriteType = einsumRewriteType;
+ this.ABs = ABs;
+ this.BAs = BAs;
+ this.Bs = Bs;
+ this.As = As;
+ this.AZs = AZs;
+ }
+
+ @Override
+ public String toString() {
+ return this.getClass().getSimpleName()+" ("+einsumRewriteType.toString()+") "+this.getOutputString();
+ }
+
+ public void addScalarAsIntermediate(EOpNode scalar) {
+ if(einsumRewriteType == EinsumRewriteType.AB_BA_B_A__A || einsumRewriteType == EinsumRewriteType.AB_BA_B_A_AZ__Z)
+ this.scalar = scalar;
+ else
+ throw new RuntimeException("EOpNodeFuse.addScalarAsIntermediate: scalar is undefined for type "+einsumRewriteType.toString());
+ }
+
+ public static List<EOpNodeFuse> findFuseOps(List<EOpNode> operands, Character outChar1, Character outChar2,
+ Map<Character, Integer> charToSize, Map<Character, Integer> charToOccurences, List<EOpNode> ret)
+ {
+ List<EOpNodeFuse> result = new ArrayList<>();
+ Set<String> matricesChars = new HashSet<>();
+ Map<Character, HashSet<String>> matricesCharsStartingWithChar = new HashMap<>();
+ Map<String, ArrayList<EOpNode>> charsToMatrices = new HashMap<>();
+
+ for(EOpNode operand1 : operands) {
+ String k;
+
+ if(operand1.c2 != null) {
+ k = operand1.c1.toString() + operand1.c2;
+ matricesChars.add(k);
+ if(matricesCharsStartingWithChar.containsKey(operand1.c1)) {
+ matricesCharsStartingWithChar.get(operand1.c1).add(k);
+ }
+ else {
+ HashSet<String> set = new HashSet<>();
+ set.add(k);
+ matricesCharsStartingWithChar.put(operand1.c1, set);
+ }
+ }
+ else {
+ k = operand1.c1.toString();
+ }
+
+ if(charsToMatrices.containsKey(k)) {
+ charsToMatrices.get(k).add(operand1);
+ }
+ else {
+ ArrayList<EOpNode> matrices = new ArrayList<>();
+ matrices.add(operand1);
+ charsToMatrices.put(k, matrices);
+ }
+ }
+ ArrayList<Pair<Integer, String>> matricesCharsSorted = new ArrayList<>(matricesChars.stream()
+ .map(x -> Pair.of(charsToMatrices.get(x).get(0).dim1 * charsToMatrices.get(x).get(0).dim2, x)).toList());
+ matricesCharsSorted.sort(Comparator.comparing(Pair::getLeft));
+ ArrayList<EOpNode> AZs = new ArrayList<>();
+
+ HashSet<String> usedMatricesChars = new HashSet<>();
+ HashSet<EOpNode> usedOperands = new HashSet<>();
+
+ for(String ABCandidate : matricesCharsSorted.stream().map(Pair::getRight).toList()) {
+ if(usedMatricesChars.contains(ABCandidate)) continue;
+
+ char a = ABCandidate.charAt(0);
+ char b = ABCandidate.charAt(1);
+ String AB = ABCandidate;
+ String BA = "" + b + a;
+
+ int BAsCount = (charsToMatrices.containsKey(BA) ? charsToMatrices.get(BA).size() : 0);
+ int ABsCount = charsToMatrices.get(AB).size();
+
+ if(BAsCount > ABsCount + 1) {
+ BA = "" + a + b;
+ AB = "" + b + a;
+ char tmp = a;
+ a = b;
+ b = tmp;
+ int tmp2 = ABsCount;
+ ABsCount = BAsCount;
+ BAsCount = tmp2;
+ }
+ String A = "" + a;
+ String B = "" + b;
+ int AsCount = (charsToMatrices.containsKey(A) && !usedMatricesChars.contains(A) ? charsToMatrices.get(A).size() : 0);
+ int BsCount = (charsToMatrices.containsKey(B) && !usedMatricesChars.contains(B) ? charsToMatrices.get(B).size() : 0);
+
+ if(AsCount == 0 && BsCount == 0 && (ABsCount + BAsCount) < 2) { // no elementwise multiplication possible
+ continue;
+ }
+
+ int usedBsCount = BsCount + ABsCount + BAsCount;
+
+ boolean doSumA = false;
+ boolean doSumB = charToOccurences.get(b) == usedBsCount && (outChar1 == null || b != outChar1) && (outChar2 == null || b != outChar2);
+ HashSet<String> AZCandidates = matricesCharsStartingWithChar.get(a);
+
+ String AZ = null;
+ Character z = null;
+ boolean includeAZ = AZCandidates.size() == 2;
+
+ if(includeAZ) {
+ for(var AZCandidate : AZCandidates) {
+ if(AB.equals(AZCandidate)) {continue;}
+ AZs = charsToMatrices.get(AZCandidate);
+ z = AZCandidate.charAt(1);
+ String Z = "" + z;
+ AZ = "" + a + z;
+ int AZsCount= AZs.size();
+ int ZsCount= charsToMatrices.containsKey(Z) ? charsToMatrices.get(Z).size() : 0;
+ doSumA = AZsCount + ABsCount + BAsCount + AsCount == charToOccurences.get(a) && (outChar1 == null || a != outChar1) && (outChar2 == null || a != outChar2);
+ boolean doSumZ = AZsCount + ZsCount == charToOccurences.get(z) && (outChar1 == null || z != outChar1) && (outChar2 == null || z != outChar2);
+ if(!doSumA){
+ includeAZ = false;
+ } else if(!doSumB && doSumZ){ // swap the order, to have only one fusion AB,...,AZ->Z
+ b = z;
+ z = AB.charAt(1);
+ AB = "" + a + b;
+ BA = "" + b + a;
+ A = "" + a;
+ B = "" + b;
+ AZ = "" + a + z;
+ AZs = charsToMatrices.get(AZ);
+ doSumB = true;
+ } else if(!doSumB && !doSumZ){ // outer between B and Z
+ if(!EinsumCPInstruction.FUSE_OUTER_MULTIPLY
+ || (EinsumCPInstruction.FUSE_OUTER_MULTIPLY_EXCEEDS_L2_CACHE_CHECK && ((charToSize.get(a) * charToSize.get(b) *(ABsCount + BAsCount)) + (charToSize.get(a)*charToSize.get(z)*(AZsCount))) * 8 < 6 * 1024 * 1024)
+ || !LibMatrixMult.isSkinnyRightHandSide(charToSize.get(AB.charAt(0)), charToSize.get(AB.charAt(1)), charToSize.get(AB.charAt(0)), charToSize.get(AZCandidates.iterator().next().charAt(1)),false)) {
+ includeAZ = false;
+ }
+ } else if(doSumB && doSumZ){
+ // it will be two separate templates and then multiply the resulting vectors
+ } else if (doSumB && !doSumZ) {
+ // ->Z template OK
+ }
+ break;
+ }
+ }
+
+ if(!includeAZ) {
+ doSumA = charToOccurences.get(a) == AsCount + ABsCount + BAsCount && (outChar1 == null || a != outChar1) && (outChar2 == null || a != outChar2);
+ }
+
+ ArrayList<EOpNode> ABs = charsToMatrices.containsKey(AB) ? charsToMatrices.get(AB) : new ArrayList<>();
+ ArrayList<EOpNode> BAs = charsToMatrices.containsKey(BA) ? charsToMatrices.get(BA) : new ArrayList<>();
+ ArrayList<EOpNode> As = charsToMatrices.containsKey(A) && !usedMatricesChars.contains(A) ? charsToMatrices.get(A) : new ArrayList<>();
+ ArrayList<EOpNode> Bs = charsToMatrices.containsKey(B) && !usedMatricesChars.contains(B) ? charsToMatrices.get(B) : new ArrayList<>();
+ Character c1 = null, c2 = null;
+ Integer dim1 = null, dim2 = null;
+ EinsumRewriteType type = null;
+
+ if(includeAZ) {
+ if(doSumB) {
+ type = EinsumRewriteType.AB_BA_B_A_AZ__Z;
+ c1 = z;
+ }
+ else if((outChar1 != null && outChar2 != null) && outChar1 == z && outChar2 == b) {
+ type = EinsumRewriteType.AB_BA_A_AZ__ZB;
+ c1 = z; c2 = b;
+ }
+ else if((outChar1 != null && outChar2 != null) && outChar1 == b && outChar2 == z) {
+ type = EinsumRewriteType.AB_BA_A_AZ__BZ;
+ c1 = b; c2 = z;
+ }
+ else {
+ type = EinsumRewriteType.AB_BA_A_AZ__ZB;
+ c1 = z; c2 = b;
+ }
+ }
+ else {
+ AZs= new ArrayList<>();
+ if(doSumA) {
+ if(doSumB) {
+ type = EinsumRewriteType.AB_BA_B_A__;
+ }
+ else {
+ type = EinsumRewriteType.AB_BA_A__B;
+ c1 = AB.charAt(1);
+ }
+ }
+ else if(doSumB) {
+ type = EinsumRewriteType.AB_BA_B_A__A;
+ c1 = AB.charAt(0);
+ }
+ else {
+ type = EinsumRewriteType.AB_BA_B_A__AB;
+ c1 = AB.charAt(0); c2 = AB.charAt(1);
+ }
+ }
+
+ if(c1 != null) {
+ charToOccurences.put(c1, charToOccurences.get(c1) + 1);
+ dim1 = charToSize.get(c1);
+ }
+ if(c2 != null) {
+ charToOccurences.put(c2, charToOccurences.get(c2) + 1);
+ dim2 = charToSize.get(c2);
+ }
+ boolean includeB = type != EinsumRewriteType.AB_BA_A__B && type != EinsumRewriteType.AB_BA_A_AZ__BZ && type != EinsumRewriteType.AB_BA_A_AZ__ZB;
+
+ usedOperands.addAll(ABs);
+ usedOperands.addAll(BAs);
+ usedOperands.addAll(As);
+ if (includeB) usedOperands.addAll(Bs);
+ if (includeAZ) usedOperands.addAll(AZs);
+
+ usedMatricesChars.add(AB);
+ usedMatricesChars.add(BA);
+ usedMatricesChars.add(A);
+ if (includeB) usedMatricesChars.add(B);
+ if (includeAZ) usedMatricesChars.add(AZ);
+
+ var e = new EOpNodeFuse(c1, c2, dim1, dim2, type, ABs, BAs, includeB ? Bs : new ArrayList<>(), As, AZs);
+
+ result.add(e);
+ }
+
+ for(EOpNode n : operands) {
+ if(!usedOperands.contains(n)){
+ ret.add(n);
+ } else {
+ charToOccurences.put(n.c1, charToOccurences.get(n.c1) - 1);
+ if(charToOccurences.get(n.c2)!= null)
+ charToOccurences.put(n.c2, charToOccurences.get(n.c2)-1);
+ }
+ }
+
+ return result;
+ }
+ @SuppressWarnings("unused")
+ public static MatrixBlock compute(EinsumRewriteType rewriteType, List<MatrixBlock> ABsInput, List<MatrixBlock> mbBAs, List<MatrixBlock> mbBs, List<MatrixBlock> mbAs, List<MatrixBlock> mbAZs,
+ Double scalar, int numThreads){
+ boolean isResultAB =rewriteType == EOpNodeFuse.EinsumRewriteType.AB_BA_B_A__AB;
+ boolean isResultA = rewriteType == EOpNodeFuse.EinsumRewriteType.AB_BA_B_A__A;
+ boolean isResultB = rewriteType == EOpNodeFuse.EinsumRewriteType.AB_BA_A__B;
+ boolean isResult_ = rewriteType == EOpNodeFuse.EinsumRewriteType.AB_BA_B_A__;
+ boolean isResultZ = rewriteType == EOpNodeFuse.EinsumRewriteType.AB_BA_B_A_AZ__Z;
+ boolean isResultBZ =rewriteType == EOpNodeFuse.EinsumRewriteType.AB_BA_A_AZ__BZ;
+ boolean isResultZB =rewriteType == EOpNodeFuse.EinsumRewriteType.AB_BA_A_AZ__ZB;
+
+ List<MatrixBlock> mbABs = new ArrayList<>(ABsInput);
+ int bSize = mbABs.get(0).getNumColumns();
+ int aSize = mbABs.get(0).getNumRows();
+ if (!mbBAs.isEmpty()) {
+ ReorgOperator transpose = new ReorgOperator(SwapIndex.getSwapIndexFnObject(), numThreads);
+ for(MatrixBlock mb : mbBAs) //BA->AB
+ mbABs.add(mb.reorgOperations(transpose, null, 0, 0, 0));
+ }
+
+ if(mbAs.size() > 1) mbAs = multiplyVectorsIntoOne(mbAs, aSize);
+ if(mbBs.size() > 1) mbBs = multiplyVectorsIntoOne(mbBs, bSize);
+
+ int constDim2 = -1;
+ int zSize = 0;
+ int azCount = 0;
+ switch(rewriteType){
+ case AB_BA_B_A_AZ__Z, AB_BA_A_AZ__BZ, AB_BA_A_AZ__ZB -> {
+ constDim2 = mbAZs.get(0).getNumColumns();
+ zSize = mbAZs.get(0).getNumColumns();
+ azCount = mbAZs.size();
+ }
+ }
+
+ SpoofRowwise.RowType rowType = switch(rewriteType){
+ case AB_BA_B_A__AB -> SpoofRowwise.RowType.NO_AGG;
+ case AB_BA_A__B -> SpoofRowwise.RowType.COL_AGG_T;
+ case AB_BA_B_A__A -> SpoofRowwise.RowType.ROW_AGG;
+ case AB_BA_B_A__ -> SpoofRowwise.RowType.FULL_AGG;
+ case AB_BA_B_A_AZ__Z -> SpoofRowwise.RowType.COL_AGG_CONST;
+ case AB_BA_A_AZ__BZ -> SpoofRowwise.RowType.COL_AGG_B1_T;
+ case AB_BA_A_AZ__ZB -> SpoofRowwise.RowType.COL_AGG_B1;
+ };
+ EinsumSpoofRowwise r = new EinsumSpoofRowwise(rewriteType, rowType, constDim2,
+ mbABs.size()-1, !mbBs.isEmpty() && (!isResultBZ && !isResultZB && !isResultB), mbAs.size(), azCount, zSize);
+
+ ArrayList<MatrixBlock> fuseInputs = new ArrayList<>();
+ fuseInputs.addAll(mbABs);
+ if(!isResultBZ && !isResultZB && !isResultB)
+ fuseInputs.addAll(mbBs);
+ fuseInputs.addAll(mbAs);
+ if (isResultZ || isResultBZ || isResultZB)
+ fuseInputs.addAll(mbAZs);
+
+ ArrayList<ScalarObject> scalarObjects = new ArrayList<>();
+ if(scalar != null){
+ scalarObjects.add(new DoubleObject(scalar));
+ }
+ MatrixBlock out = r.execute(fuseInputs, scalarObjects, new MatrixBlock(), numThreads);
+
+ if(isResultB && !mbBs.isEmpty()){
+ LibMatrixMult.vectMultiply(mbBs.get(0).getDenseBlockValues(), out.getDenseBlockValues(), 0,0, mbABs.get(0).getNumColumns());
+ }
+ if(isResultBZ && !mbBs.isEmpty()){
+ ensureMatrixBlockColumnVector(mbBs.get(0));
+ out = out.binaryOperations(new BinaryOperator(Multiply.getMultiplyFnObject()), mbBs.get(0));
+ }
+ if(isResultZB && !mbBs.isEmpty()){
+ ensureMatrixBlockRowVector(mbBs.get(0));
+ out = out.binaryOperations(new BinaryOperator(Multiply.getMultiplyFnObject()), mbBs.get(0));
+ }
+
+ if( isResultA || isResultB || isResultZ)
+ ensureMatrixBlockColumnVector(out);
+
+ return out;
+ }
+ @Override
+ public MatrixBlock computeEOpNode(List<MatrixBlock> inputs, int numThreads, Log LOG) {
+ final Function<EOpNode, MatrixBlock> eOpNodeToMatrixBlock = n -> n.computeEOpNode(inputs, numThreads, LOG);
+ List<MatrixBlock> mbABs = new ArrayList<>(ABs.stream().map(eOpNodeToMatrixBlock).toList());
+ List<MatrixBlock> mbBAs = BAs.stream().map(eOpNodeToMatrixBlock).toList();
+ List<MatrixBlock> mbBs = Bs.stream().map(eOpNodeToMatrixBlock).toList();
+ List<MatrixBlock> mbAs = As.stream().map(eOpNodeToMatrixBlock).toList();
+ List<MatrixBlock> mbAZs = AZs.stream().map(eOpNodeToMatrixBlock).toList();
+ Double scalar = this.scalar == null ? null : this.scalar.computeEOpNode(inputs, numThreads, LOG).get(0,0);
+ return EOpNodeFuse.compute(this.einsumRewriteType, mbABs, mbBAs, mbBs, mbAs, mbAZs , scalar, numThreads);
+ }
+
+ @Override
+ public EOpNode reorderChildrenAndOptimize(EOpNode parent, Character outChar1, Character outChar2) {
+ ABs.replaceAll(n -> n.reorderChildrenAndOptimize(this, n.c1, n.c2));
+ BAs.replaceAll(n -> n.reorderChildrenAndOptimize(this, n.c1, n.c2));
+ As.replaceAll(n -> n.reorderChildrenAndOptimize(this, n.c1, n.c2));
+ Bs.replaceAll(n -> n.reorderChildrenAndOptimize(this, n.c1, n.c2));
+ AZs.replaceAll(n -> n.reorderChildrenAndOptimize(this, n.c1, n.c2));
+ return this;
+ }
+
+ private static @NotNull List<MatrixBlock> multiplyVectorsIntoOne(List<MatrixBlock> mbs, int size) {
+ MatrixBlock mb = new MatrixBlock(mbs.get(0).getNumRows(), mbs.get(0).getNumColumns(), false);
+ mb.allocateDenseBlock();
+ for(int i = 1; i< mbs.size(); i++) { // multiply Bs
+ if(i==1)
+ LibMatrixMult.vectMultiplyWrite(mbs.get(0).getDenseBlock().values(0), mbs.get(1).getDenseBlock().values(0), mb.getDenseBlock().values(0),0,0,0, size);
+ else
+ LibMatrixMult.vectMultiply(mbs.get(i).getDenseBlock().values(0),mb.getDenseBlock().values(0),0,0, size);
+ }
+ return List.of(mb);
+ }
+}
+
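One small helper worth noting in EOpNodeFuse above: multiplyVectorsIntoOne collapses several equally-sized side vectors into their element-wise product before the fused row-wise template runs, so the template only ever sees one A-vector and one B-vector. Conceptually (plain-array sketch, not the MatrixBlock-based code from the patch):

    // Illustrative sketch only: element-wise product of k dense vectors of equal length.
    public class VectorProductSketch {
        public static double[] multiplyIntoOne(double[][] vecs) {
            double[] out = vecs[0].clone();
            for (int i = 1; i < vecs.length; i++)
                for (int j = 0; j < out.length; j++)
                    out[j] *= vecs[i][j];
            return out;
        }
    }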
diff --git a/src/main/java/org/apache/sysds/runtime/einsum/EOpNodeUnary.java b/src/main/java/org/apache/sysds/runtime/einsum/EOpNodeUnary.java
new file mode 100644
index 00000000000..918f1dd3b24
--- /dev/null
+++ b/src/main/java/org/apache/sysds/runtime/einsum/EOpNodeUnary.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.einsum;
+
+import org.apache.commons.logging.Log;
+import org.apache.sysds.common.Types;
+import org.apache.sysds.runtime.functionobjects.DiagIndex;
+import org.apache.sysds.runtime.functionobjects.KahanPlus;
+import org.apache.sysds.runtime.functionobjects.Plus;
+import org.apache.sysds.runtime.functionobjects.ReduceAll;
+import org.apache.sysds.runtime.functionobjects.ReduceCol;
+import org.apache.sysds.runtime.functionobjects.ReduceDiag;
+import org.apache.sysds.runtime.functionobjects.ReduceRow;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.matrix.operators.AggregateOperator;
+import org.apache.sysds.runtime.matrix.operators.AggregateUnaryOperator;
+import org.apache.sysds.runtime.matrix.operators.ReorgOperator;
+
+import java.util.List;
+
+public class EOpNodeUnary extends EOpNode {
+ private final EUnaryOperand eUnaryOperand;
+ public EOpNode child;
+
+ public enum EUnaryOperand {
+ DIAG, TRACE, SUM, SUM_COLS, SUM_ROWS
+ }
+ public EOpNodeUnary(Character c1, Character c2, Integer dim1, Integer dim2, EOpNode child, EUnaryOperand eUnaryOperand) {
+ super(c1, c2, dim1, dim2);
+ this.child = child;
+ this.eUnaryOperand = eUnaryOperand;
+ }
+
+ @Override
+ public List<EOpNode> getChildren() {
+ return List.of(child);
+ }
+ @Override
+ public String toString() {
+ return this.getClass().getSimpleName()+" ("+eUnaryOperand.toString()+") "+this.getOutputString();
+ }
+
+ @Override
+ public MatrixBlock computeEOpNode(List<MatrixBlock> inputs, int numOfThreads, Log LOG) {
+ MatrixBlock mb = child.computeEOpNode(inputs, numOfThreads, LOG);
+ return switch(eUnaryOperand) {
+ case DIAG->{
+ ReorgOperator op = new ReorgOperator(DiagIndex.getDiagIndexFnObject());
+ yield mb.reorgOperations(op, new MatrixBlock(),0,0,0);
+ }
+ case TRACE -> {
+ AggregateOperator agg = new AggregateOperator(0, KahanPlus.getKahanPlusFnObject(), Types.CorrectionLocationType.LASTCOLUMN);
+ AggregateUnaryOperator aggun = new AggregateUnaryOperator(agg, ReduceDiag.getReduceDiagFnObject(), numOfThreads);
+ MatrixBlock res = new MatrixBlock(10, 10, false);
+ mb.aggregateUnaryOperations(aggun, res,0,null);
+ yield res;
+ }
+ case SUM->{
+ AggregateOperator agg = new AggregateOperator(0, Plus.getPlusFnObject());
+ AggregateUnaryOperator aggun = new AggregateUnaryOperator(agg, ReduceAll.getReduceAllFnObject(), numOfThreads);
+ MatrixBlock res = new MatrixBlock(1, 1, false);
+ mb.aggregateUnaryOperations(aggun, res, 0, null);
+ yield res;
+ }
+ case SUM_COLS ->{
+ AggregateOperator agg = new AggregateOperator(0, Plus.getPlusFnObject());
+ AggregateUnaryOperator aggun = new AggregateUnaryOperator(agg, ReduceRow.getReduceRowFnObject(), numOfThreads);
+ MatrixBlock res = new MatrixBlock(mb.getNumColumns(), 1, false);
+ mb.aggregateUnaryOperations(aggun, res, 0, null);
+ yield res;
+ }
+ case SUM_ROWS ->{
+ AggregateOperator agg = new AggregateOperator(0, Plus.getPlusFnObject());
+ AggregateUnaryOperator aggun = new AggregateUnaryOperator(agg, ReduceCol.getReduceColFnObject(), numOfThreads);
+ MatrixBlock res = new MatrixBlock(mb.getNumRows(), 1, false);
+ mb.aggregateUnaryOperations(aggun, res, 0, null);
+ yield res;
+ }
+ };
+ }
+
+ @Override
+ public EOpNode reorderChildrenAndOptimize(EOpNode parent, Character outChar1, Character outChar2) {
+ return this;
+ }
+}
diff --git a/src/main/java/org/apache/sysds/runtime/einsum/EinsumContext.java b/src/main/java/org/apache/sysds/runtime/einsum/EinsumContext.java
index 6da39e59873..16c67e5399b 100644
--- a/src/main/java/org/apache/sysds/runtime/einsum/EinsumContext.java
+++ b/src/main/java/org/apache/sysds/runtime/einsum/EinsumContext.java
@@ -23,155 +23,100 @@
import java.util.ArrayList;
import java.util.HashMap;
-import java.util.HashSet;
import java.util.Iterator;
-
+import java.util.List;
+import java.util.Map;
public class EinsumContext {
- public enum ContractDimensions {
- CONTRACT_LEFT,
- CONTRACT_RIGHT,
- CONTRACT_BOTH,
- }
- public Integer outRows;
- public Integer outCols;
- public Character outChar1;
- public Character outChar2;
- public HashMap<Character, Integer> charToDimensionSize;
- public String equationString;
- public boolean[] diagonalInputs;
- public HashSet summingChars;
- public HashSet contractDimsSet;
- public ContractDimensions[] contractDims;
- public ArrayList newEquationStringInputsSplit;
- public HashMap> characterAppearanceIndexes; // for each character, this tells in which inputs it appears
-
- private EinsumContext(){};
- public static EinsumContext getEinsumContext(String eqStr, ArrayList<MatrixBlock> inputs){
- EinsumContext res = new EinsumContext();
-
- res.equationString = eqStr;
- res.charToDimensionSize = new HashMap<>();
- HashSet<Character> summingChars = new HashSet<>();
- ContractDimensions[] contractDims = new ContractDimensions[inputs.size()];
- boolean[] diagonalInputs = new boolean[inputs.size()]; // all false by default
- HashSet<Character> contractDimsSet = new HashSet<>();
- HashMap<Character, ArrayList<Integer>> partsCharactersToIndices = new HashMap<>();
- ArrayList<String> newEquationStringSplit = new ArrayList<>();
-
- Iterator<MatrixBlock> it = inputs.iterator();
- MatrixBlock curArr = it.next();
- int arrSizeIterator = 0;
- int arrayIterator = 0;
- int i;
- // first iteration through string: collect information on character-size and what characters are summing characters
- for (i = 0; true; i++) {
- char c = eqStr.charAt(i);
- if(c == '-'){
- i+=2;
- break;
- }
- if(c == ','){
- arrayIterator++;
- curArr = it.next();
- arrSizeIterator = 0;
- }
- else{
- if (res.charToDimensionSize.containsKey(c)) { // sanity check if dims match, this is already checked at validation
- if(arrSizeIterator == 0 && res.charToDimensionSize.get(c) != curArr.getNumRows())
- throw new RuntimeException("Einsum: character "+c+" has multiple conflicting sizes");
- else if(arrSizeIterator == 1 && res.charToDimensionSize.get(c) != curArr.getNumColumns())
- throw new RuntimeException("Einsum: character "+c+" has multiple conflicting sizes");
- summingChars.add(c);
- } else {
- if(arrSizeIterator == 0)
- res.charToDimensionSize.put(c, curArr.getNumRows());
- else if(arrSizeIterator == 1)
- res.charToDimensionSize.put(c, curArr.getNumColumns());
- }
-
- arrSizeIterator++;
- }
- }
-
- int numOfRemainingChars = eqStr.length() - i;
-
- if (numOfRemainingChars > 2)
- throw new RuntimeException("Einsum: dim > 2 not supported");
-
- arrSizeIterator = 0;
-
- Character outChar1 = numOfRemainingChars > 0 ? eqStr.charAt(i) : null;
- Character outChar2 = numOfRemainingChars > 1 ? eqStr.charAt(i+1) : null;
- res.outRows=(numOfRemainingChars > 0 ? res.charToDimensionSize.get(outChar1) : 1);
- res.outCols=(numOfRemainingChars > 1 ? res.charToDimensionSize.get(outChar2) : 1);
-
- arrayIterator=0;
- // second iteration through string: collect remaining information
- for (i = 0; true; i++) {
- char c = eqStr.charAt(i);
- if (c == '-') {
- break;
- }
- if (c == ',') {
- arrayIterator++;
- arrSizeIterator = 0;
- continue;
- }
- String s = "";
-
- if(summingChars.contains(c)) {
- s+=c;
- if(!partsCharactersToIndices.containsKey(c))
- partsCharactersToIndices.put(c, new ArrayList<>());
- partsCharactersToIndices.get(c).add(arrayIterator);
- }
- else if((outChar1 != null && c == outChar1) || (outChar2 != null && c == outChar2)) {
- s+=c;
- }
- else {
- contractDimsSet.add(c);
- contractDims[arrayIterator] = ContractDimensions.CONTRACT_LEFT;
- }
-
- if(i + 1 < eqStr.length()) { // process next character together
- char c2 = eqStr.charAt(i + 1);
- i++;
- if (c2 == '-') { newEquationStringSplit.add(s); break;}
- if (c2 == ',') { arrayIterator++; newEquationStringSplit.add(s); continue; }
-
- if (c2 == c){
- diagonalInputs[arrayIterator] = true;
- if (contractDims[arrayIterator] == ContractDimensions.CONTRACT_LEFT) contractDims[arrayIterator] = ContractDimensions.CONTRACT_BOTH;
- }
- else{
- if(summingChars.contains(c2)) {
- s+=c2;
- if(!partsCharactersToIndices.containsKey(c2))
- partsCharactersToIndices.put(c2, new ArrayList<>());
- partsCharactersToIndices.get(c2).add(arrayIterator);
- }
- else if((outChar1 != null && c2 == outChar1) || (outChar2 != null && c2 == outChar2)) {
- s+=c2;
- }
- else {
- contractDimsSet.add(c2);
- contractDims[arrayIterator] = contractDims[arrayIterator] == ContractDimensions.CONTRACT_LEFT ? ContractDimensions.CONTRACT_BOTH : ContractDimensions.CONTRACT_RIGHT;
- }
- }
- }
- newEquationStringSplit.add(s);
- arrSizeIterator++;
- }
-
- res.contractDims = contractDims;
- res.contractDimsSet = contractDimsSet;
- res.diagonalInputs = diagonalInputs;
- res.summingChars = summingChars;
- res.outChar1 = outChar1;
- res.outChar2 = outChar2;
- res.newEquationStringInputsSplit = newEquationStringSplit;
- res.characterAppearanceIndexes = partsCharactersToIndices;
- return res;
- }
+ public Integer outRows;
+ public Integer outCols;
+ public Character outChar1;
+ public Character outChar2;
+ public Map<Character, Integer> charToDimensionSize;
+ public String equationString;
+ public List<String> newEquationStringInputsSplit;
+ public Map<Character, Integer> characterAppearanceCount;
+
+ private EinsumContext(){};
+ public static EinsumContext getEinsumContext(String eqStr, List<MatrixBlock> inputs){
+ EinsumContext res = new EinsumContext();
+
+ res.equationString = eqStr;
+ Map<Character, Integer> charToDimensionSize = new HashMap<>();
+ Map<Character, Integer> characterAppearanceCount = new HashMap<>();
+ List<String> newEquationStringSplit = new ArrayList<>();
+ Character outChar1 = null;
+ Character outChar2 = null;
+
+ Iterator<MatrixBlock> it = inputs.iterator();
+ MatrixBlock curArr = it.next();
+ int i = 0;
+
+ char c = eqStr.charAt(i);
+ for(i = 0; i < eqStr.length(); i++) {
+ StringBuilder sb = new StringBuilder(2);
+ for(;i < eqStr.length(); i++){
+ c = eqStr.charAt(i);
+ if (c == ' ') continue;
+ if (c == ',' || c == '-' ) break;
+ if (!Character.isAlphabetic(c)) {
+ throw new RuntimeException("Einsum: only alphabetic characters are supported for dimensions: "+c);
+ }
+ sb.append(c);
+ if (characterAppearanceCount.containsKey(c)) characterAppearanceCount.put(c, characterAppearanceCount.get(c) + 1) ;
+ else characterAppearanceCount.put(c, 1);
+ }
+ String s = sb.toString();
+ newEquationStringSplit.add(s);
+
+ if(s.length() > 0){
+ if (charToDimensionSize.containsKey(s.charAt(0)))
+ if (charToDimensionSize.get(s.charAt(0)) != curArr.getNumRows())
+ throw new RuntimeException("Einsum: character "+c+" has multiple conflicting sizes");
+ charToDimensionSize.put(s.charAt(0), curArr.getNumRows());
+ }
+ if(s.length() > 1){
+ if (charToDimensionSize.containsKey(s.charAt(1)))
+ if (charToDimensionSize.get(s.charAt(1)) != curArr.getNumColumns())
+ throw new RuntimeException("Einsum: character "+c+" has multiple conflicting sizes");
+ charToDimensionSize.put(s.charAt(1), curArr.getNumColumns());
+ }
+ if(s.length() > 2) throw new RuntimeException("Einsum: only up-to 2D inputs strings allowed ");
+
+ if( c==','){
+ curArr = it.next();
+ }
+ else if (c=='-') break;
+
+ if (i == eqStr.length() - 1) {throw new RuntimeException("Einsum: missing '->' in equation string");}
+ }
+
+ if (i == eqStr.length() - 1 || eqStr.charAt(i+1) != '>') throw new RuntimeException("Einsum: missing '->' in equation string");
+ i+=2;
+
+ StringBuilder sb = new StringBuilder(2);
+
+ for(;i < eqStr.length(); i++){
+ c = eqStr.charAt(i);
+ if (c == ' ') continue;
+ if (!Character.isAlphabetic(c)) {
+ throw new RuntimeException("Einsum: only alphabetic characters are supported for dimensions: "+c);
+ }
+ sb.append(c);
+ }
+ String s = sb.toString();
+ if(s.length() > 0) outChar1 = s.charAt(0);
+ if(s.length() > 1) outChar2 = s.charAt(1);
+ if(s.length() > 2) throw new RuntimeException("Einsum: only up-to 2D output allowed ");
+
+ res.outRows=(outChar1 == null ? 1 : charToDimensionSize.get(outChar1));
+ res.outCols=(outChar2 == null ? 1 : charToDimensionSize.get(outChar2));
+
+ res.outChar1 = outChar1;
+ res.outChar2 = outChar2;
+ res.newEquationStringInputsSplit = newEquationStringSplit;
+ res.characterAppearanceCount = characterAppearanceCount;
+ res.charToDimensionSize = charToDimensionSize;
+ return res;
+ }
}
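For orientation, a small sketch of what the rewritten parser produces for a plain matrix-multiply equation; the shapes are arbitrary and the commented results follow from the code above.

    // Hypothetical sketch; two dense blocks of shape 2x3 and 3x4.
    List<MatrixBlock> ins = List.of(new MatrixBlock(2, 3, false), new MatrixBlock(3, 4, false));
    EinsumContext ctx = EinsumContext.getEinsumContext("ij,jk->ik", ins);
    // ctx.newEquationStringInputsSplit : ["ij", "jk"]
    // ctx.characterAppearanceCount     : {i=1, j=2, k=1}  (input side only)
    // ctx.charToDimensionSize          : {i=2, j=3, k=4}
    // ctx.outChar1='i', ctx.outChar2='k', ctx.outRows=2, ctx.outCols=4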
diff --git a/src/main/java/org/apache/sysds/runtime/einsum/EinsumEquationValidator.java b/src/main/java/org/apache/sysds/runtime/einsum/EinsumEquationValidator.java
index 5643159ef9a..417a1b760b2 100644
--- a/src/main/java/org/apache/sysds/runtime/einsum/EinsumEquationValidator.java
+++ b/src/main/java/org/apache/sysds/runtime/einsum/EinsumEquationValidator.java
@@ -32,113 +32,117 @@
public class EinsumEquationValidator {
- public static Triple<Long, Long, Types.DataType> validateEinsumEquationAndReturnDimensions(String equationString, List<HopOrIdentifier> expressionsOrIdentifiers) throws LanguageException {
- String[] eqStringParts = equationString.split("->"); // length 2 if "...->..." , length 1 if "...->"
- boolean isResultScalar = eqStringParts.length == 1;
+ public static Triple<Long, Long, Types.DataType> validateEinsumEquationAndReturnDimensions(String equationString, List<HopOrIdentifier> expressionsOrIdentifiers) throws LanguageException {
+ String[] eqStringParts = equationString.split("->"); // length 2 if "...->..." , length 1 if "...->"
+ boolean isResultScalar = eqStringParts.length == 1;
- if(expressionsOrIdentifiers == null)
- throw new RuntimeException("Einsum: called validateEinsumAndReturnDimensions with null list");
+ if(expressionsOrIdentifiers == null)
+ throw new RuntimeException("Einsum: called validateEinsumAndReturnDimensions with null list");
- HashMap<Character, Long> charToDimensionSize = new HashMap<>();
- Iterator<HopOrIdentifier> it = expressionsOrIdentifiers.iterator();
- HopOrIdentifier currArr = it.next();
- int arrSizeIterator = 0;
- int numberOfMatrices = 1;
- for (int i = 0; i < eqStringParts[0].length(); i++) {
- char c = equationString.charAt(i);
- if(c==' ') continue;
- if(c==','){
- if(!it.hasNext())
- throw new LanguageException("Einsum: Provided less operands than specified in equation str");
- currArr = it.next();
- arrSizeIterator = 0;
- numberOfMatrices++;
- } else{
- long thisCharDimension = getThisCharDimension(currArr, arrSizeIterator);
- if (charToDimensionSize.containsKey(c)){
- if (charToDimensionSize.get(c) != thisCharDimension)
- throw new LanguageException("Einsum: Character '" + c + "' expected to be dim " + charToDimensionSize.get(c) + ", but found " + thisCharDimension);
- }else{
- charToDimensionSize.put(c, thisCharDimension);
- }
- arrSizeIterator++;
- }
- }
- if (expressionsOrIdentifiers.size() - 1 > numberOfMatrices)
- throw new LanguageException("Einsum: Provided more operands than specified in equation str");
+ HashMap<Character, Long> charToDimensionSize = new HashMap<>();
+ Iterator<HopOrIdentifier> it = expressionsOrIdentifiers.iterator();
+ HopOrIdentifier currArr = it.next();
+ int arrSizeIterator = 0;
+ int numberOfMatrices = 1;
+ for (int i = 0; i < eqStringParts[0].length(); i++) {
+ char c = equationString.charAt(i);
+ if(c==' ') continue;
+ if(c==','){
+ if(!it.hasNext())
+ throw new LanguageException("Einsum: Provided less operands than specified in equation str");
+ currArr = it.next();
+ arrSizeIterator = 0;
+ numberOfMatrices++;
+ } else{
+ long thisCharDimension = getThisCharDimension(currArr, arrSizeIterator);
+ if (charToDimensionSize.containsKey(c)){
+ if (charToDimensionSize.get(c) != thisCharDimension)
+ throw new LanguageException("Einsum: Character '" + c + "' expected to be dim " + charToDimensionSize.get(c) + ", but found " + thisCharDimension);
+ }else{
+ charToDimensionSize.put(c, thisCharDimension);
+ }
+ arrSizeIterator++;
+ }
+ }
+ if (expressionsOrIdentifiers.size() - 1 > numberOfMatrices)
+ throw new LanguageException("Einsum: Provided more operands than specified in equation str");
- if (isResultScalar)
- return Triple.of(-1l,-1l, Types.DataType.SCALAR);
+ if (isResultScalar)
+ return Triple.of(-1l,-1l, Types.DataType.SCALAR);
- int numberOfOutDimensions = 0;
- Character dim1Char = null;
- long dim1 = 1;
- long dim2 = 1;
- for (int i = 0; i < eqStringParts[1].length(); i++) {
- char c = eqStringParts[1].charAt(i);
- if (c == ' ') continue;
- if (numberOfOutDimensions == 0) {
- dim1Char = c;
- dim1 = charToDimensionSize.get(c);
- } else {
- if(c==dim1Char) throw new LanguageException("Einsum: output character "+c+" provided multiple times");
- dim2 = charToDimensionSize.get(c);
- }
- numberOfOutDimensions++;
- }
- if (numberOfOutDimensions > 2) {
- throw new LanguageException("Einsum: output matrices with with no. dims > 2 not supported");
- } else {
- return Triple.of(dim1, dim2, Types.DataType.MATRIX);
- }
- }
+ int numberOfOutDimensions = 0;
+ Character dim1Char = null;
+ long dim1 = 1;
+ long dim2 = 1;
+ for (int i = 0; i < eqStringParts[1].length(); i++) {
+ char c = eqStringParts[1].charAt(i);
+ if (c == ' ') continue;
+ if (numberOfOutDimensions == 0) {
+ dim1Char = c;
+ if(!charToDimensionSize.containsKey(c))
+ throw new LanguageException("Einsum: Output dimension '"+c+"' not present in input operands");
+ dim1 = charToDimensionSize.get(c);
+ } else {
+ if(c==dim1Char) throw new LanguageException("Einsum: output character "+c+" provided multiple times");
+ if(!charToDimensionSize.containsKey(c))
+ throw new LanguageException("Einsum: Output dimension '"+c+"' not present in input operands");
+ dim2 = charToDimensionSize.get(c);
+ }
+ numberOfOutDimensions++;
+ }
+ if (numberOfOutDimensions > 2) {
+ throw new LanguageException("Einsum: output matrices with with no. dims > 2 not supported");
+ } else {
+ return Triple.of(dim1, dim2, Types.DataType.MATRIX);
+ }
+ }
- public static Types.DataType validateEinsumEquationNoDimensions(String equationString, int numberOfMatrixInputs) throws LanguageException {
- String[] eqStringParts = equationString.split("->"); // length 2 if "...->..." , length 1 if "...->"
- boolean isResultScalar = eqStringParts.length == 1;
+ public static Types.DataType validateEinsumEquationNoDimensions(String equationString, int numberOfMatrixInputs) throws LanguageException {
+ String[] eqStringParts = equationString.split("->"); // length 2 if "...->..." , length 1 if "...->"
+ boolean isResultScalar = eqStringParts.length == 1;
- int numberOfMatrices = 1;
- for (int i = 0; i < eqStringParts[0].length(); i++) {
- char c = eqStringParts[0].charAt(i);
- if(c == ' ') continue;
- if(c == ',')
- numberOfMatrices++;
- }
- if(numberOfMatrixInputs != numberOfMatrices){
- throw new LanguageException("Einsum: Invalid number of parameters, given: " + numberOfMatrixInputs + ", expected: " + numberOfMatrices);
- }
+ int numberOfMatrices = 1;
+ for (int i = 0; i < eqStringParts[0].length(); i++) {
+ char c = eqStringParts[0].charAt(i);
+ if(c == ' ') continue;
+ if(c == ',')
+ numberOfMatrices++;
+ }
+ if(numberOfMatrixInputs != numberOfMatrices){
+ throw new LanguageException("Einsum: Invalid number of parameters, given: " + numberOfMatrixInputs + ", expected: " + numberOfMatrices);
+ }
- if(isResultScalar){
- return Types.DataType.SCALAR;
- }else {
- int numberOfDimensions = 0;
- Character dim1Char = null;
- for (int i = 0; i < eqStringParts[1].length(); i++) {
- char c = eqStringParts[i].charAt(i);
- if(c == ' ') continue;
- numberOfDimensions++;
- if (numberOfDimensions == 1 && c == dim1Char)
- throw new LanguageException("Einsum: output character "+c+" provided multiple times");
- dim1Char = c;
- }
+ if(isResultScalar){
+ return Types.DataType.SCALAR;
+ }else {
+ int numberOfDimensions = 0;
+ Character dim1Char = null;
+ for (int i = 0; i < eqStringParts[1].length(); i++) {
+ char c = eqStringParts[1].charAt(i);
+ if(c == ' ') continue;
+ numberOfDimensions++;
+ if (dim1Char != null && c == dim1Char)
+ throw new LanguageException("Einsum: output character "+c+" provided multiple times");
+ dim1Char = c;
+ }
- if (numberOfDimensions > 2) {
- throw new LanguageException("Einsum: output matrices with with no. dims > 2 not supported");
- } else {
- return Types.DataType.MATRIX;
- }
- }
- }
+ if (numberOfDimensions > 2) {
+ throw new LanguageException("Einsum: output matrices with with no. dims > 2 not supported");
+ } else {
+ return Types.DataType.MATRIX;
+ }
+ }
+ }
- private static long getThisCharDimension(HopOrIdentifier currArr, int arrSizeIterator) {
- long thisCharDimension;
- if(currArr instanceof Hop){
- thisCharDimension = arrSizeIterator == 0 ? ((Hop) currArr).getDim1() : ((Hop) currArr).getDim2();
- } else if(currArr instanceof Identifier){
- thisCharDimension = arrSizeIterator == 0 ? ((Identifier) currArr).getDim1() : ((Identifier) currArr).getDim2();
- } else {
- throw new RuntimeException("validateEinsumAndReturnDimensions called with expressions that are not Hop or Identifier: "+ currArr == null ? "null" : currArr.getClass().toString());
- }
- return thisCharDimension;
- }
+ private static long getThisCharDimension(HopOrIdentifier currArr, int arrSizeIterator) {
+ long thisCharDimension;
+ if(currArr instanceof Hop){
+ thisCharDimension = arrSizeIterator == 0 ? ((Hop) currArr).getDim1() : ((Hop) currArr).getDim2();
+ } else if(currArr instanceof Identifier){
+ thisCharDimension = arrSizeIterator == 0 ? ((Identifier) currArr).getDim1() : ((Identifier) currArr).getDim2();
+ } else {
+ throw new RuntimeException("validateEinsumAndReturnDimensions called with expressions that are not Hop or Identifier: "+ currArr == null ? "null" : currArr.getClass().toString());
+ }
+ return thisCharDimension;
+ }
}
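As a quick illustration of the dimension-less validation path (the operand count is derived purely from the commas on the input side of the equation), a sketch, assuming it runs inside a method that declares throws LanguageException:

    Types.DataType dt = EinsumEquationValidator.validateEinsumEquationNoDimensions("ij,jk->ik", 2); // DataType.MATRIX
    Types.DataType sc = EinsumEquationValidator.validateEinsumEquationNoDimensions("ij,ij->", 2);   // DataType.SCALAR
    // Passing a mismatching operand count, e.g. ("ij,jk->ik", 3), raises a LanguageException.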
diff --git a/src/main/java/org/apache/sysds/runtime/einsum/EinsumSpoofRowwise.java b/src/main/java/org/apache/sysds/runtime/einsum/EinsumSpoofRowwise.java
new file mode 100644
index 00000000000..40b73ea3994
--- /dev/null
+++ b/src/main/java/org/apache/sysds/runtime/einsum/EinsumSpoofRowwise.java
@@ -0,0 +1,265 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.einsum;
+
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.VectorSpecies;
+import org.apache.commons.lang3.NotImplementedException;
+import org.apache.sysds.runtime.codegen.LibSpoofPrimitives;
+import org.apache.sysds.runtime.codegen.SpoofRowwise;
+import org.apache.sysds.runtime.matrix.data.LibMatrixMult;
+
+public final class EinsumSpoofRowwise extends SpoofRowwise {
+ private static final long serialVersionUID = -5957679254041639561L;
+
+ private final int _ABCount;
+ private final boolean _Bsupplied;
+ private final int _ACount;
+ private final int _AZCount;
+ private final int _ZSize;
+ private final int _AZStartIndex;
+ private final EOpNodeFuse.EinsumRewriteType _EinsumRewriteType;
+
+ public EinsumSpoofRowwise(EOpNodeFuse.EinsumRewriteType einsumRewriteType, RowType rowType, long constDim2,
+ int abCount, boolean bSupplied, int aCount, int azCount, int zSize) {
+ super(rowType, constDim2, false, 1);
+ _ABCount = abCount;
+ _Bsupplied = bSupplied;
+ _ACount = aCount;
+ _AZStartIndex = abCount + (_Bsupplied ? 1 : 0) + aCount;
+ _AZCount = azCount;
+ _EinsumRewriteType = einsumRewriteType;
+ _ZSize = zSize;
+ }
+
+ protected void genexec(double[] a, int ai, SideInput[] b, double[] scalars, double[] c, int ci, int len, long grix,
+ int rix) {
+ switch(_EinsumRewriteType) {
+ case AB_BA_B_A__AB -> {
+ genexecAB(a, ai, b, null, c, ci, len, grix, rix);
+ if(scalars.length != 0) { LibMatrixMult.vectMultiplyWrite(scalars[0], c, c, ci, ci, len); }
+ }
+ case AB_BA_A__B -> {
+ genexecB(a, ai, b, null, c, ci, len, grix, rix);
+ }
+ case AB_BA_B_A__A -> {
+ genexecAor(a, ai, b, null, c, ci, len, grix, rix);
+ if(scalars.length != 0) { c[rix] *= scalars[0]; }
+ }
+ case AB_BA_B_A__ -> {
+ genexecAor(a, ai, b, null, c, ci, len, grix, rix);
+ if(scalars.length != 0) { c[0] *= scalars[0]; }
+ }
+ case AB_BA_B_A_AZ__Z -> {
+ double[] temp = {0};
+ genexecAor(a, ai, b, null, temp, 0, len, grix, rix);
+ if(scalars.length != 0) { temp[0] *= scalars[0]; }
+ if(_AZCount > 1) {
+ double[] temp2 = new double[_ZSize];
+ int bi = _AZStartIndex;
+ LibMatrixMult.vectMultiplyWrite(b[bi++].values(0), b[bi++].values(0), temp2, _ZSize * rix,
+ _ZSize * rix, 0, _ZSize);
+ while(bi < _AZStartIndex + _AZCount) {
+ LibMatrixMult.vectMultiplyWrite(temp2, b[bi++].values(0), temp2, 0, _ZSize * rix, 0, _ZSize);
+ }
+ LibMatrixMult.vectMultiplyAdd(temp[0], temp2, c, 0, 0, _ZSize);
+ }
+ else
+ LibMatrixMult.vectMultiplyAdd(temp[0], b[_AZStartIndex].values(rix), c, _ZSize * rix, 0, _ZSize);
+ }
+ case AB_BA_A_AZ__BZ -> {
+ double[] temp = new double[len];
+ genexecB(a, ai, b, null, temp, 0, len, grix, rix);
+ if(scalars.length != 0) {
+ LibMatrixMult.vectMultiplyWrite(scalars[0], temp, temp, 0, 0, len);
+ }
+ if(_AZCount > 1) {
+ double[] temp2 = new double[_ZSize];
+ int bi = _AZStartIndex;
+ LibMatrixMult.vectMultiplyWrite(b[bi++].values(0), b[bi++].values(0), temp2, _ZSize * rix,
+ _ZSize * rix, 0, _ZSize);
+ while(bi < _AZStartIndex + _AZCount) {
+ LibMatrixMult.vectMultiplyWrite(temp2, b[bi++].values(0), temp2, 0, _ZSize * rix, 0, _ZSize);
+ }
+ LibSpoofPrimitives.vectOuterMultAdd(temp, temp2, c, 0, 0, 0, len, _ZSize);
+ }
+ else
+ LibSpoofPrimitives.vectOuterMultAdd(temp, b[_AZStartIndex].values(rix), c, 0, _ZSize * rix, 0, len, _ZSize);
+ }
+ case AB_BA_A_AZ__ZB -> {
+ double[] temp = new double[len];
+ genexecB(a, ai, b, null, temp, 0, len, grix, rix);
+ if(scalars.length != 0) {
+ LibMatrixMult.vectMultiplyWrite(scalars[0], temp, temp, 0, 0, len);
+ }
+ if(_AZCount > 1) {
+ double[] temp2 = new double[_ZSize];
+ int bi = _AZStartIndex;
+ LibMatrixMult.vectMultiplyWrite(b[bi++].values(0), b[bi++].values(0), temp2, _ZSize * rix,
+ _ZSize * rix, 0, _ZSize);
+ while(bi < _AZStartIndex + _AZCount) {
+ LibMatrixMult.vectMultiplyWrite(temp2, b[bi++].values(0), temp2, 0, _ZSize * rix, 0, _ZSize);
+ }
+ LibSpoofPrimitives.vectOuterMultAdd(temp2, temp, c, 0, 0, 0, _ZSize, len);
+ }
+ else
+ LibSpoofPrimitives.vectOuterMultAdd(b[_AZStartIndex].values(rix), temp, c, _ZSize * rix, 0, 0, _ZSize, len);
+ }
+ default -> throw new NotImplementedException();
+ }
+ }
+
+ private void genexecAB(double[] a, int ai, SideInput[] b, double[] scalars, double[] c, int ci, int len, long grix,
+ int rix) {
+ int bi = 0;
+ double[] TMP1 = null;
+ if(_ABCount != 0) {
+ if(_ABCount == 1 && _ACount == 0 && !_Bsupplied) {
+ LibMatrixMult.vectMultiplyWrite(a, b[0].values(rix), c, ai, ai, ci, len);
+ return;
+ }
+ TMP1 = LibSpoofPrimitives.vectMultWrite(a, b[bi++].values(rix), ai, ai, len);
+ while(bi < _ABCount) {
+ if(_ACount == 0 && !_Bsupplied && bi == _ABCount - 1) {
+ LibMatrixMult.vectMultiplyWrite(TMP1, b[bi++].values(rix), c, 0, ai, ci, len);
+ }
+ else {
+ LibMatrixMult.vectMultiplyWrite(TMP1, b[bi++].values(rix), TMP1, 0, ai, 0, len);
+ }
+ }
+ }
+
+ if(_Bsupplied) {
+ if(_ACount == 1)
+ if(TMP1 == null)
+ vectMultiplyWrite(b[bi + 1].values(0)[rix], a, b[bi].values(0), c, ai, 0, ci, len);
+ else
+ vectMultiplyWrite(b[bi + 1].values(0)[rix], TMP1, b[bi].values(0), c, 0, 0, ci, len);
+ else if(TMP1 == null)
+ LibMatrixMult.vectMultiplyWrite(a, b[bi].values(0), c, ai, 0, ci, len);
+ else
+ LibMatrixMult.vectMultiplyWrite(TMP1, b[bi].values(0), c, 0, 0, ci, len);
+ }
+ else if(_ACount == 1) {
+ if(TMP1 == null)
+ LibMatrixMult.vectMultiplyWrite(b[bi].values(0)[rix], a, c, ai, ci, len);
+ else
+ LibMatrixMult.vectMultiplyWrite(b[bi].values(0)[rix], TMP1, c, 0, ci, len);
+ }
+ }
+
+ private void genexecB(double[] a, int ai, SideInput[] b, double[] scalars, double[] c, int ci, int len, long grix,
+ int rix) {
+ int bi = 0;
+ double[] TMP1 = null;
+ if(_ABCount == 1 && _ACount == 0)
+ LibMatrixMult.vectMultiplyAdd(a, b[bi++].values(rix), c, ai, ai, 0, len);
+ else if(_ABCount != 0) {
+ TMP1 = LibSpoofPrimitives.vectMultWrite(a, b[bi++].values(rix), ai, ai, len);
+ while(bi < _ABCount) {
+ if(_ACount == 0 && bi == _ABCount - 1) {
+ LibMatrixMult.vectMultiplyAdd(TMP1, b[bi++].values(rix), c, 0, ai, 0, len);
+ }
+ else {
+ LibMatrixMult.vectMultiplyWrite(TMP1, b[bi++].values(rix), TMP1, 0, ai, 0, len);
+ }
+ }
+ }
+
+ if(_ACount == 1) {
+ if(TMP1 == null)
+ LibMatrixMult.vectMultiplyAdd(b[bi].values(0)[rix], a, c, ai, 0, len);
+ else
+ LibMatrixMult.vectMultiplyAdd(b[bi].values(0)[rix], TMP1, c, 0, 0, len);
+ }
+ }
+
+ private void genexecAor(double[] a, int ai, SideInput[] b, double[] scalars, double[] c, int ci, int len, long grix, int rix) {
+ int bi = 0;
+ double[] TMP1 = null;
+ double TMP2 = 0;
+ if(_ABCount == 1 && !_Bsupplied)
+ TMP2 = LibSpoofPrimitives.dotProduct(a, b[bi++].values(rix), ai, ai, len);
+ else if(_ABCount != 0) {
+ TMP1 = LibSpoofPrimitives.vectMultWrite(a, b[bi++].values(rix), ai, ai, len);
+ while(bi < _ABCount) {
+ if(!_Bsupplied && bi == _ABCount - 1)
+ TMP2 = LibSpoofPrimitives.dotProduct(TMP1, b[bi++].values(rix), 0, ai, len);
+ else
+ LibMatrixMult.vectMultiplyWrite(TMP1, b[bi++].values(rix), TMP1, 0, ai, 0, len);
+ }
+ }
+
+ if(_Bsupplied)
+ if(_ABCount != 0) TMP2 = LibSpoofPrimitives.dotProduct(TMP1, b[bi++].values(0), 0, 0, len);
+ else TMP2 = LibSpoofPrimitives.dotProduct(a, b[bi++].values(0), ai, 0, len);
+ else if(_ABCount == 0) TMP2 = LibSpoofPrimitives.vectSum(a, ai, len);
+
+ if(_ACount == 1) TMP2 *= b[bi].values(0)[rix];
+
+ if(_EinsumRewriteType == EOpNodeFuse.EinsumRewriteType.AB_BA_B_A__A) c[ci] = TMP2;
+ else c[0] += TMP2;
+ }
+
+ protected void genexec(double[] avals, int[] aix, int ai, SideInput[] b, double[] scalars, double[] c, int ci,
+ int alen, int len, long grix, int rix) {
+ throw new RuntimeException("Sparse fused einsum not implemented");
+ }
+
+ // I am not sure if it is worth moving these helpers to LibMatrixMult, so for now they are added here
+ private static final VectorSpecies<Double> SPECIES = DoubleVector.SPECIES_PREFERRED;
+ private static final int vLen = SPECIES.length();
+
+ public static void vectMultiplyWrite(final double aval, double[] a, double[] b, double[] c, int ai, int bi, int ci,
+ final int len) {
+ final int bn = len % vLen;
+
+ //rest, not aligned to vLen-blocks
+ for(int j = 0; j < bn; j++, ai++, bi++, ci++)
+ c[ci] = aval * b[bi] * a[ai];
+
+ //vectorized vLen-blocks (Vector API, for better instruction-level parallelism)
+ DoubleVector avalVec = DoubleVector.broadcast(SPECIES, aval);
+ for(int j = bn; j < len; j += vLen, ai += vLen, bi += vLen, ci += vLen) {
+ DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai);
+ DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi);
+ avalVec.mul(bVec).mul(aVec).intoArray(c, ci);
+ }
+ }
+
+ public static void vectMultiplyAdd(final double aval, double[] a, double[] b, double[] c, int ai, int bi, int ci,
+ final int len) {
+ final int bn = len % vLen;
+
+ //rest, not aligned to vLen-blocks
+ for(int j = 0; j < bn; j++, ai++, bi++, ci++)
+ c[ci] += aval * b[bi] * a[ai];
+
+ //vectorized vLen-blocks (Vector API, for better instruction-level parallelism)
+ DoubleVector avalVec = DoubleVector.broadcast(SPECIES, aval);
+ for(int j = bn; j < len; j += vLen, ai += vLen, bi += vLen, ci += vLen) {
+ DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai);
+ DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi);
+ DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci);
+ DoubleVector tmp = aVec.mul(bVec);
+ tmp.fma(avalVec, cVec).intoArray(c, ci);
+ }
+ }
+}
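The two Vector API helpers above compute c[ci+j] = aval * a[ai+j] * b[bi+j] (write) and c[ci+j] += aval * a[ai+j] * b[bi+j] (add). A scalar reference version, useful for cross-checking the vectorized path in tests, could look like this (sketch):

    // Scalar reference for EinsumSpoofRowwise.vectMultiplyWrite.
    static void vectMultiplyWriteRef(double aval, double[] a, double[] b, double[] c, int ai, int bi, int ci, int len) {
        for(int j = 0; j < len; j++)
            c[ci + j] = aval * a[ai + j] * b[bi + j];
    }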
diff --git a/src/main/java/org/apache/sysds/runtime/frame/data/columns/FloatArray.java b/src/main/java/org/apache/sysds/runtime/frame/data/columns/FloatArray.java
index d0ab7a56305..f838eadc1d2 100644
--- a/src/main/java/org/apache/sysds/runtime/frame/data/columns/FloatArray.java
+++ b/src/main/java/org/apache/sysds/runtime/frame/data/columns/FloatArray.java
@@ -174,7 +174,7 @@ public void reset(int size) {
@Override
public byte[] getAsByteArray() {
- ByteBuffer floatBuffer = ByteBuffer.allocate(8 * _size);
+ ByteBuffer floatBuffer = ByteBuffer.allocate(4 * _size);
floatBuffer.order(ByteOrder.nativeOrder());
for(int i = 0; i < _size; i++)
floatBuffer.putFloat(_data[i]);
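Since a Java float occupies 4 bytes, the corrected allocation is 4 * _size; using Float.BYTES would make that intent explicit (equivalent sketch):

    ByteBuffer floatBuffer = ByteBuffer.allocate(Float.BYTES * _size); // Float.BYTES == 4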
diff --git a/src/main/java/org/apache/sysds/runtime/frame/data/lib/FrameLibAppend.java b/src/main/java/org/apache/sysds/runtime/frame/data/lib/FrameLibAppend.java
index 60c61e8f4af..a0eb9965d43 100644
--- a/src/main/java/org/apache/sysds/runtime/frame/data/lib/FrameLibAppend.java
+++ b/src/main/java/org/apache/sysds/runtime/frame/data/lib/FrameLibAppend.java
@@ -65,16 +65,16 @@ else if(a.getNumColumns() == 0)
else if(b.getNumColumns() == 0)
return a;
- final ValueType[] _schema = addAll(a.getSchema(), b.getSchema());
- final ColumnMetadata[] _colmeta = addAll(a.getColumnMetadata(), b.getColumnMetadata());
- final Array<?>[] _coldata = addAll(a.getColumns(), b.getColumns());
- String[] _colnames = addAll(a.getColumnNames(), b.getColumnNames());
+ final ValueType[] schema = addAll(a.getSchema(), b.getSchema());
+ final ColumnMetadata[] colmeta = addAll(a.getColumnMetadata(), b.getColumnMetadata());
+ final Array<?>[] coldata = addAll(a.getColumns(), b.getColumns());
+ String[] colnames = addAll(a.getColumnNames(), b.getColumnNames());
// check and enforce unique columns names
- if(!Arrays.stream(_colnames).allMatch(new HashSet<>()::add))
- _colnames = null; // set to default of null to allocate on demand
+ if(!Arrays.stream(colnames).allMatch(new HashSet<>()::add))
+ colnames = null; // set to default of null to allocate on demand
- return new FrameBlock(_schema, _colnames, _colmeta, _coldata);
+ return new FrameBlock(schema, colnames, colmeta, coldata);
}
private static FrameBlock appendRbind(FrameBlock a, FrameBlock b) {
diff --git a/src/main/java/org/apache/sysds/runtime/functionobjects/CM.java b/src/main/java/org/apache/sysds/runtime/functionobjects/CM.java
index 54a2d83adf8..f3b137c088d 100644
--- a/src/main/java/org/apache/sysds/runtime/functionobjects/CM.java
+++ b/src/main/java/org/apache/sysds/runtime/functionobjects/CM.java
@@ -20,7 +20,7 @@
package org.apache.sysds.runtime.functionobjects;
import org.apache.sysds.runtime.DMLRuntimeException;
-import org.apache.sysds.runtime.instructions.cp.CM_COV_Object;
+import org.apache.sysds.runtime.instructions.cp.CmCovObject;
import org.apache.sysds.runtime.instructions.cp.Data;
import org.apache.sysds.runtime.instructions.cp.KahanObject;
import org.apache.sysds.runtime.matrix.operators.CMOperator.AggregateOperationTypes;
@@ -88,7 +88,7 @@ public AggregateOperationTypes getAggOpType() {
*/
@Override
public Data execute(Data in1, double in2) {
- CM_COV_Object cm1=(CM_COV_Object) in1;
+ CmCovObject cm1=(CmCovObject) in1;
if(cm1.isCMAllZeros()) {
cm1.w=1;
@@ -203,7 +203,7 @@ public Data execute(Data in1, double in2) {
*/
@Override
public Data execute(Data in1, double in2, double w2) {
- CM_COV_Object cm1=(CM_COV_Object) in1;
+ CmCovObject cm1=(CmCovObject) in1;
if(cm1.isCMAllZeros())
{
@@ -320,8 +320,8 @@ public Data execute(Data in1, double in2, double w2) {
@Override
public Data execute(Data in1, Data in2)
{
- CM_COV_Object cm1=(CM_COV_Object) in1;
- CM_COV_Object cm2=(CM_COV_Object) in2;
+ CmCovObject cm1=(CmCovObject) in1;
+ CmCovObject cm2=(CmCovObject) in2;
if(cm1.isCMAllZeros())
{
diff --git a/src/main/java/org/apache/sysds/runtime/functionobjects/COV.java b/src/main/java/org/apache/sysds/runtime/functionobjects/COV.java
index 89d8b235a05..836bf972ce6 100644
--- a/src/main/java/org/apache/sysds/runtime/functionobjects/COV.java
+++ b/src/main/java/org/apache/sysds/runtime/functionobjects/COV.java
@@ -19,7 +19,7 @@
package org.apache.sysds.runtime.functionobjects;
-import org.apache.sysds.runtime.instructions.cp.CM_COV_Object;
+import org.apache.sysds.runtime.instructions.cp.CmCovObject;
import org.apache.sysds.runtime.instructions.cp.Data;
import org.apache.sysds.runtime.instructions.cp.KahanObject;
@@ -62,7 +62,7 @@ private COV() {
@Override
public Data execute(Data in1, double u, double v, double w2)
{
- CM_COV_Object cov1=(CM_COV_Object) in1;
+ CmCovObject cov1=(CmCovObject) in1;
if(cov1.isCOVAllZeros())
{
cov1.w=w2;
@@ -94,7 +94,7 @@ public Data execute(Data in1, double u, double v, double w2)
@Override
public Data execute(Data in1, double u, double v)
{
- CM_COV_Object cov1=(CM_COV_Object) in1;
+ CmCovObject cov1=(CmCovObject) in1;
if(cov1.isCOVAllZeros())
{
cov1.w=1L;
@@ -118,8 +118,8 @@ public Data execute(Data in1, double u, double v)
@Override
public Data execute(Data in1, Data in2)
{
- CM_COV_Object cov1=(CM_COV_Object) in1;
- CM_COV_Object cov2=(CM_COV_Object) in2;
+ CmCovObject cov1=(CmCovObject) in1;
+ CmCovObject cov2=(CmCovObject) in2;
if(cov1.isCOVAllZeros())
{
cov1.w=cov2.w;
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/OOCInstructionParser.java b/src/main/java/org/apache/sysds/runtime/instructions/OOCInstructionParser.java
index feefe5f63d6..a2e64dd0bac 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/OOCInstructionParser.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/OOCInstructionParser.java
@@ -23,6 +23,7 @@
import org.apache.commons.logging.LogFactory;
import org.apache.sysds.common.InstructionType;
import org.apache.sysds.runtime.DMLRuntimeException;
+import org.apache.sysds.runtime.instructions.ooc.AggregateTernaryOOCInstruction;
import org.apache.sysds.runtime.instructions.ooc.AggregateUnaryOOCInstruction;
import org.apache.sysds.runtime.instructions.ooc.BinaryOOCInstruction;
import org.apache.sysds.runtime.instructions.ooc.CSVReblockOOCInstruction;
@@ -33,6 +34,7 @@
import org.apache.sysds.runtime.instructions.ooc.OOCInstruction;
import org.apache.sysds.runtime.instructions.ooc.ParameterizedBuiltinOOCInstruction;
import org.apache.sysds.runtime.instructions.ooc.ReblockOOCInstruction;
+import org.apache.sysds.runtime.instructions.ooc.TernaryOOCInstruction;
import org.apache.sysds.runtime.instructions.ooc.TSMMOOCInstruction;
import org.apache.sysds.runtime.instructions.ooc.UnaryOOCInstruction;
import org.apache.sysds.runtime.instructions.ooc.MatrixVectorBinaryOOCInstruction;
@@ -64,10 +66,14 @@ public static OOCInstruction parseSingleInstruction(InstructionType ooctype, Str
return CSVReblockOOCInstruction.parseInstruction(str);
case AggregateUnary:
return AggregateUnaryOOCInstruction.parseInstruction(str);
+ case AggregateTernary:
+ return AggregateTernaryOOCInstruction.parseInstruction(str);
case Unary:
return UnaryOOCInstruction.parseInstruction(str);
case Binary:
return BinaryOOCInstruction.parseInstruction(str);
+ case Ternary:
+ return TernaryOOCInstruction.parseInstruction(str);
case AggregateBinary:
case MAPMM:
return MatrixVectorBinaryOOCInstruction.parseInstruction(str);
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/cp/CPInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/cp/CPInstruction.java
index b35ca55dab6..b8d84ca3898 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/cp/CPInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/cp/CPInstruction.java
@@ -34,6 +34,7 @@
import org.apache.sysds.runtime.instructions.Instruction;
import org.apache.sysds.runtime.instructions.fed.FEDInstructionUtils;
import org.apache.sysds.runtime.matrix.operators.Operator;
+import org.apache.sysds.runtime.ooc.stats.OOCEventLog;
public abstract class CPInstruction extends Instruction {
protected static final Log LOG = LogFactory.getLog(CPInstruction.class.getName());
@@ -52,6 +53,7 @@ public enum CPType {
protected final CPType _cptype;
protected final boolean _requiresLabelUpdate;
+ private long nanoTime;
protected CPInstruction(CPType type, String opcode, String istr) {
this(type, null, opcode, istr);
@@ -88,6 +90,8 @@ public String getGraphString() {
@Override
public Instruction preprocessInstruction(ExecutionContext ec) {
+ if (DMLScript.OOC_LOG_EVENTS)
+ nanoTime = System.nanoTime();
//default preprocess behavior (e.g., debug state, lineage)
Instruction tmp = super.preprocessInstruction(ec);
@@ -118,6 +122,10 @@ public Instruction preprocessInstruction(ExecutionContext ec) {
public void postprocessInstruction(ExecutionContext ec) {
if (DMLScript.LINEAGE_DEBUGGER)
ec.maintainLineageDebuggerInfo(this);
+ if (DMLScript.OOC_LOG_EVENTS) {
+ int callerId = OOCEventLog.registerCaller(getExtendedOpcode() + "_" + hashCode());
+ OOCEventLog.onComputeEvent(callerId, nanoTime, System.nanoTime());
+ }
}
/**
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/cp/CentralMomentCPInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/cp/CentralMomentCPInstruction.java
index cca466182da..18fcfe6ae95 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/cp/CentralMomentCPInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/cp/CentralMomentCPInstruction.java
@@ -90,7 +90,7 @@ public void processInstruction( ExecutionContext ec ) {
if ( cm_op.getAggOpType() == AggregateOperationTypes.INVALID )
cm_op = cm_op.setCMAggOp((int)order.getLongValue());
- CM_COV_Object cmobj = null;
+ CmCovObject cmobj = null;
if (input3 == null ) {
cmobj = matBlock.cmOperations(cm_op);
}
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/cp/CM_COV_Object.java b/src/main/java/org/apache/sysds/runtime/instructions/cp/CmCovObject.java
similarity index 95%
rename from src/main/java/org/apache/sysds/runtime/instructions/cp/CM_COV_Object.java
rename to src/main/java/org/apache/sysds/runtime/instructions/cp/CmCovObject.java
index 8591c3b9ca9..09d08e8e448 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/cp/CM_COV_Object.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/cp/CmCovObject.java
@@ -27,7 +27,7 @@
import org.apache.sysds.runtime.matrix.operators.CMOperator.AggregateOperationTypes;
-public class CM_COV_Object extends Data
+public class CmCovObject extends Data
{
private static final long serialVersionUID = -5814207545197934085L;
@@ -48,7 +48,7 @@ public String toString() {
return "weight: "+w+", mean: "+mean+", m2: "+m2+", m3: "+m3+", m4: "+m4+", min: "+min+", max: "+max+", mean2: "+mean_v+", c2: "+c2;
}
- public CM_COV_Object()
+ public CmCovObject()
{
super(DataType.UNKNOWN, ValueType.UNKNOWN);
w=0;
@@ -75,7 +75,7 @@ public void reset()
max=0;
}
- public int compareTo(CM_COV_Object that)
+ public int compareTo(CmCovObject that)
{
if(w!=that.w)
return Double.compare(w, that.w);
@@ -100,10 +100,10 @@ else if(max!=that.max)
@Override
public boolean equals(Object o)
{
- if( o == null || !(o instanceof CM_COV_Object) )
+ if( o == null || !(o instanceof CmCovObject) )
return false;
- CM_COV_Object that = (CM_COV_Object)o;
+ CmCovObject that = (CmCovObject)o;
return (w==that.w && mean.equals(that.mean) && m2.equals(that.m2))
&& m3.equals(that.m3) && m4.equals(that.m4)
&& mean_v.equals(that.mean_v) && c2.equals(that.c2)
@@ -115,7 +115,7 @@ public int hashCode() {
throw new RuntimeException("hashCode() should never be called on instances of this class.");
}
- public void set(CM_COV_Object that)
+ public void set(CmCovObject that)
{
this.w=that.w;
this.mean.set(that.mean);
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/cp/CovarianceCPInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/cp/CovarianceCPInstruction.java
index f0b8b597039..29b74f95741 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/cp/CovarianceCPInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/cp/CovarianceCPInstruction.java
@@ -62,7 +62,7 @@ public void processInstruction(ExecutionContext ec)
MatrixBlock matBlock2 = ec.getMatrixInput(input2.getName());
String output_name = output.getName();
COVOperator cov_op = (COVOperator)_optr;
- CM_COV_Object covobj = null;
+ CmCovObject covobj = null;
if ( input3 == null ) {
// Unweighted: cov.mvar0.mvar1.out
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/cp/EinsumCPInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/cp/EinsumCPInstruction.java
index c67dd290799..2b1074c80c9 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/cp/EinsumCPInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/cp/EinsumCPInstruction.java
@@ -19,7 +19,6 @@
package org.apache.sysds.runtime.instructions.cp;
-import org.apache.commons.lang3.NotImplementedException;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.lang3.tuple.Triple;
import org.apache.commons.logging.Log;
@@ -29,31 +28,49 @@
import org.apache.sysds.common.Types.DataType;
import org.apache.sysds.hops.LiteralOp;
import org.apache.sysds.hops.OptimizerUtils;
-import org.apache.sysds.hops.codegen.SpoofCompiler;
import org.apache.sysds.hops.codegen.cplan.CNode;
-import org.apache.sysds.hops.codegen.cplan.CNodeBinary;
import org.apache.sysds.hops.codegen.cplan.CNodeCell;
import org.apache.sysds.hops.codegen.cplan.CNodeData;
-import org.apache.sysds.hops.codegen.cplan.CNodeRow;
-import org.apache.sysds.runtime.codegen.*;
+import org.apache.sysds.runtime.codegen.CodegenUtils;
+import org.apache.sysds.runtime.codegen.SpoofOperator;
import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysds.runtime.einsum.EOpNode;
+import org.apache.sysds.runtime.einsum.EOpNodeBinary;
+import org.apache.sysds.runtime.einsum.EOpNodeBinary.EBinaryOperand;
+import org.apache.sysds.runtime.einsum.EOpNodeData;
+import org.apache.sysds.runtime.einsum.EOpNodeFuse;
+import org.apache.sysds.runtime.einsum.EOpNodeUnary;
import org.apache.sysds.runtime.einsum.EinsumContext;
-import org.apache.sysds.runtime.functionobjects.*;
-import org.apache.sysds.runtime.matrix.data.LibMatrixMult;
+import org.apache.sysds.runtime.functionobjects.SwapIndex;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
-import org.apache.sysds.runtime.matrix.operators.AggregateOperator;
-import org.apache.sysds.runtime.matrix.operators.AggregateUnaryOperator;
-import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
import org.apache.sysds.runtime.matrix.operators.Operator;
import org.apache.sysds.runtime.matrix.operators.ReorgOperator;
-import org.apache.sysds.runtime.matrix.operators.SimpleOperator;
+import org.apache.sysds.utils.Explain;
-import java.util.*;
-import java.util.function.Predicate;
+
+import static org.apache.sysds.api.DMLScript.EXPLAIN;
+import static org.apache.sysds.hops.rewrite.RewriteMatrixMultChainOptimization.mmChainDP;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
public class EinsumCPInstruction extends BuiltinNaryCPInstruction {
- public static boolean FORCE_CELL_TPL = false;
+ public static final boolean FORCE_CELL_TPL = false; // naive looped solution
+
+ public static final boolean FUSE_OUTER_MULTIPLY = true;
+ public static final boolean FUSE_OUTER_MULTIPLY_EXCEEDS_L2_CACHE_CHECK = true;
+
+ public static final boolean PRINT_TRACE = true;
+
protected static final Log LOG = LogFactory.getLog(EinsumCPInstruction.class.getName());
public String eqStr;
private final int _numThreads;
@@ -62,15 +79,13 @@ public class EinsumCPInstruction extends BuiltinNaryCPInstruction {
public EinsumCPInstruction(Operator op, String opcode, String istr, CPOperand out, CPOperand... inputs)
{
super(op, opcode, istr, out, inputs);
- _numThreads = OptimizerUtils.getConstrainedNumThreads(-1);
+ _numThreads = OptimizerUtils.getConstrainedNumThreads(-1)/2;
_in = inputs;
this.eqStr = inputs[0].getName();
- Logger.getLogger(EinsumCPInstruction.class).setLevel(Level.TRACE);
+ if (PRINT_TRACE)
+ Logger.getLogger(EinsumCPInstruction.class).setLevel(Level.TRACE);
}
- @SuppressWarnings("unused")
- private EinsumContext einc = null;
-
@Override
public void processInstruction(ExecutionContext ec) {
//get input matrices and scalars, incl pinning of matrices
@@ -81,119 +96,159 @@ public void processInstruction(ExecutionContext ec) {
if(mb instanceof CompressedMatrixBlock){
mb = ((CompressedMatrixBlock) mb).getUncompressed("Spoof instruction");
}
+ if(mb.getNumRows() == 1){
+ ensureMatrixBlockColumnVector(mb);
+ }
inputs.add(mb);
}
}
EinsumContext einc = EinsumContext.getEinsumContext(eqStr, inputs);
- this.einc = einc;
String resultString = einc.outChar2 != null ? String.valueOf(einc.outChar1) + einc.outChar2 : einc.outChar1 != null ? String.valueOf(einc.outChar1) : "";
- if( LOG.isDebugEnabled() ) LOG.trace("outrows:"+einc.outRows+", outcols:"+einc.outCols);
+ if( LOG.isTraceEnabled() ) LOG.trace("output: "+resultString +" "+einc.outRows+"x"+einc.outCols);
- ArrayList<String> inputsChars = einc.newEquationStringInputsSplit;
+ List<String> inputsChars = einc.newEquationStringInputsSplit;
if(LOG.isTraceEnabled()) LOG.trace(String.join(",",einc.newEquationStringInputsSplit));
-
- contractDimensionsAndComputeDiagonals(einc, inputs);
+ List<EOpNode> eOpNodes = new ArrayList<>(inputsChars.size());
+ List<EOpNode> eOpNodesScalars = new ArrayList<>(inputsChars.size()); // computed separately and only merged into the plan once it has been created
//make all vectors column vectors
for(int i = 0; i < inputs.size(); i++){
- if(inputs.get(i) != null && inputsChars.get(i).length() == 1) EnsureMatrixBlockColumnVector(inputs.get(i));
+ if(inputsChars.get(i).length() == 1) ensureMatrixBlockColumnVector(inputs.get(i));
}
- if(LOG.isTraceEnabled()) for(Character c : einc.characterAppearanceIndexes.keySet()){
- ArrayList<Integer> a = einc.characterAppearanceIndexes.get(c);
- LOG.trace(c+" count= "+a.size());
+ addSumDimensionsDiagonalsAndScalars(einc, inputsChars, eOpNodes, eOpNodesScalars, einc.charToDimensionSize);
+
+ Map<Character, Integer> characterToOccurences = einc.characterAppearanceCount;
+
+ for (int i = 0; i < inputsChars.size(); i++) {
+ if (inputsChars.get(i) == null) continue;
+ Character c1 = inputsChars.get(i).isEmpty() ? null : inputsChars.get(i).charAt(0);
+ Character c2 = inputsChars.get(i).length() > 1 ? inputsChars.get(i).charAt(1) : null;
+ Integer dim1 = c1 == null ? null : einc.charToDimensionSize.get(c1);
+ Integer dim2 = c2 == null ? null : einc.charToDimensionSize.get(c2);
+ EOpNodeData n = new EOpNodeData(c1,c2,dim1,dim2, i);
+ eOpNodes.add(n);
}
- // compute scalar by suming-all matrices:
- Double scalar = null;
- for(int i=0;i< inputs.size(); i++){
- String s = inputsChars.get(i);
- if(s.equals("")){
- MatrixBlock mb = inputs.get(i);
- if (scalar == null) scalar = mb.get(0,0);
- else scalar*= mb.get(0,0);
- inputs.set(i,null);
- inputsChars.set(i,null);
+ List<EOpNode> ret = new ArrayList<>();
+ addVectorMultiplies(eOpNodes, eOpNodesScalars,characterToOccurences, einc.outChar1, einc.outChar2, ret);
+ eOpNodes = ret;
+
+ List<EOpNode> plan;
+ ArrayList<MatrixBlock> remainingMatrices;
+
+ if(!FORCE_CELL_TPL) {
+ plan = generateGreedyPlan(eOpNodes, eOpNodesScalars,
+ einc.charToDimensionSize, characterToOccurences, einc.outChar1, einc.outChar2);
+ if(!eOpNodesScalars.isEmpty()){
+ EOpNode l = eOpNodesScalars.get(0);
+ for(int i = 1; i < eOpNodesScalars.size(); i++){
+ l = new EOpNodeBinary(l, eOpNodesScalars.get(i), EBinaryOperand.scalar_scalar);
+ }
+
+ if(plan.isEmpty()) plan.add(l);
+ else {
+ int minCost = Integer.MAX_VALUE;
+ EOpNode addToNode = null;
+ int minIdx = -1;
+ for(int i = 0; i < plan.size(); i++) {
+ EOpNode n = plan.get(i);
+ Pair<Integer, EOpNode> costAndNode = addScalarToPlanFindMinCost(n, einc.charToDimensionSize);
+ if(costAndNode.getLeft() < minCost) {
+ minCost = costAndNode.getLeft();
+ addToNode = costAndNode.getRight();
+ minIdx = i;
+ }
+ }
+ plan.set(minIdx, mergeEOpNodeWithScalar(addToNode, l));
+ }
+
}
- }
- if (scalar != null) {
- inputsChars.add("");
- inputs.add(new MatrixBlock(scalar));
- }
+ if(plan.size() == 2 && plan.get(0).c2 == null && plan.get(1).c2 == null){
+ if (plan.get(0).c1 == einc.outChar1 && plan.get(1).c1 == einc.outChar2)
+ plan.set(0, new EOpNodeBinary(plan.get(0), plan.get(1), EBinaryOperand.A_B));
+ if (plan.get(0).c1 == einc.outChar2 && plan.get(1).c1 == einc.outChar1)
+ plan.set(0, new EOpNodeBinary(plan.get(1), plan.get(0), EBinaryOperand.A_B));
+ plan.remove(1);
+ }
- HashMap<Character, Integer> characterToOccurences = new HashMap<>();
- for (Character key :einc.characterAppearanceIndexes.keySet()) {
- characterToOccurences.put(key, einc.characterAppearanceIndexes.get(key).size());
- }
- for (Character key :einc.charToDimensionSize.keySet()) {
- if(!characterToOccurences.containsKey(key))
- characterToOccurences.put(key, 1);
- }
+ if(plan.size() == 1)
+ plan.set(0,plan.get(0).reorderChildrenAndOptimize(null, einc.outChar1, einc.outChar2));
- ArrayList<EOpNode> eOpNodes = new ArrayList<>(inputsChars.size());
- for (int i = 0; i < inputsChars.size(); i++) {
- if (inputsChars.get(i) == null) continue;
- EOpNodeData n = new EOpNodeData(inputsChars.get(i).length() > 0 ? inputsChars.get(i).charAt(0) : null, inputsChars.get(i).length() > 1 ? inputsChars.get(i).charAt(1) : null, i);
- eOpNodes.add(n);
+ if (EXPLAIN != Explain.ExplainType.NONE ) {
+ System.out.println("Einsum plan:");
+ for(int i = 0; i < plan.size(); i++) {
+ System.out.println((i + 1) + ".");
+ System.out.println("- " + String.join("\n- ", plan.get(i).recursivePrintString()));
+ }
+ }
+
+ remainingMatrices = executePlan(plan, inputs);
+ }else{
+ plan = eOpNodes;
+ remainingMatrices = inputs;
}
- Pair<Integer, ArrayList<EOpNode>> plan = FORCE_CELL_TPL ? null : generatePlan(0, eOpNodes, einc.charToDimensionSize, characterToOccurences, einc.outChar1, einc.outChar2);
- ArrayList<MatrixBlock> resMatrices = FORCE_CELL_TPL ? null : executePlan(plan.getRight(), inputs);
-// ArrayList<MatrixBlock> resMatrices = executePlan(plan.getRight(), inputs, true);
- if(!FORCE_CELL_TPL && resMatrices.size() == 1){
- EOpNode resNode = plan.getRight().get(0);
+ if(!FORCE_CELL_TPL && remainingMatrices.size() == 1){
+ EOpNode resNode = plan.get(0);
if (einc.outChar1 != null && einc.outChar2 != null){
if(resNode.c1 == einc.outChar1 && resNode.c2 == einc.outChar2){
- ec.setMatrixOutput(output.getName(), resMatrices.get(0));
+ ec.setMatrixOutput(output.getName(), remainingMatrices.get(0));
}
else if(resNode.c1 == einc.outChar2 && resNode.c2 == einc.outChar1){
+ if( LOG.isTraceEnabled()) LOG.trace("Transposing the final result");
+
ReorgOperator transpose = new ReorgOperator(SwapIndex.getSwapIndexFnObject(), _numThreads);
- MatrixBlock resM = resMatrices.get(0).reorgOperations(transpose, new MatrixBlock(),0,0,0);
+ MatrixBlock resM = remainingMatrices.get(0).reorgOperations(transpose, new MatrixBlock(),0,0,0);
ec.setMatrixOutput(output.getName(), resM);
}else{
- if(LOG.isTraceEnabled()) LOG.trace("Einsum expected: "+resultString + ", got: "+resNode.c1+resNode.c2);
- throw new RuntimeException("Einsum plan produced different result");
+ if(LOG.isTraceEnabled()) LOG.trace("Einsum error, expected: "+resultString + ", got: "+resNode.c1+resNode.c2);
+ throw new RuntimeException("Einsum plan produced different result, expected: "+resultString + ", got: "+resNode.c1+resNode.c2);
}
}else if (einc.outChar1 != null){
if(resNode.c1 == einc.outChar1 && resNode.c2 == null){
- ec.setMatrixOutput(output.getName(), resMatrices.get(0));
+ ensureMatrixBlockColumnVector(remainingMatrices.get(0));
+ ec.setMatrixOutput(output.getName(), remainingMatrices.get(0));
}else{
if(LOG.isTraceEnabled()) LOG.trace("Einsum expected: "+resultString + ", got: "+resNode.c1+resNode.c2);
throw new RuntimeException("Einsum plan produced different result");
}
}else{
if(resNode.c1 == null && resNode.c2 == null){
- ec.setScalarOutput(output.getName(), new DoubleObject(resMatrices.get(0).get(0, 0)));;
+ ec.setScalarOutput(output.getName(), new DoubleObject(remainingMatrices.get(0).get(0, 0)));
}
}
}else{
// use cell template with loops for remaining
- ArrayList<MatrixBlock> mbs = resMatrices;
- ArrayList<String> chars = new ArrayList<>();
+ ArrayList<MatrixBlock> mbs = remainingMatrices;
+ List<String> chars = new ArrayList<>();
- for (int i = 0; i < plan.getRight().size(); i++) {
+ for (int i = 0; i < plan.size(); i++) {
String s;
- if(plan.getRight().get(i).c1 == null) s = "";
- else if(plan.getRight().get(i).c2 == null) s = plan.getRight().get(i).c1.toString();
- else s = plan.getRight().get(i).c1.toString() + plan.getRight().get(i).c2;
+ if(plan.get(i).c1 == null) s = "";
+ else if(plan.get(i).c2 == null) s = plan.get(i).c1.toString();
+ else s = plan.get(i).c1.toString() + plan.get(i).c2;
chars.add(s);
}
- ArrayList<Character> summingChars = new ArrayList<>();
- for (Character c : einc.characterAppearanceIndexes.keySet()) {
+ List