45 changes: 45 additions & 0 deletions .github/workflows/publish.yml
@@ -48,6 +48,15 @@ jobs:
run: |
./build.sh --mode=release --clean --py_ver ${{ env.PYTHON_VERSION }}

- name: Install auditwheel
run: pip install auditwheel patchelf

- name: Repair wheel (linux → manylinux)
run: |
auditwheel repair python/dist/*-linux_x86_64.whl \
--wheel-dir python/dist/
rm -f python/dist/*-linux_x86_64.whl

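As an optional aside (not part of this diff), a follow-up step could confirm that the repaired wheel now carries a manylinux platform tag before it is uploaded. The step name below is hypothetical and assumes `auditwheel` is still installed from the previous step:

```yaml
# Hypothetical sanity check (not in this PR): print the wheel's
# platform tags so the log shows the manylinux retag succeeded.
- name: Show repaired wheel tags
  run: auditwheel show python/dist/*.whl
```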
- name: Upload wheel artifact
uses: actions/upload-artifact@v4
with:
@@ -154,3 +163,39 @@ jobs:
packages-dir: python/dist/
verbose: true
skip-existing: true

# ─────────────────────────────────────────────────────────────
# Job 4: upload wheel to PyPI
# runs when a tag matching v*.*.* is pushed
# requires publish-pypi-test to succeed first
# ─────────────────────────────────────────────────────────────
publish-pypi:
name: Publish to PyPI
runs-on: ubuntu-22.04
needs: publish-pypi-test
if: startsWith(github.ref, 'refs/tags/v')

environment:
name: pypi
url: https://pypi.org/project/simm/

permissions:
id-token: write

steps:
- name: Download wheel artifact
uses: actions/download-artifact@v4
with:
name: simm-wheel-py${{ env.PYTHON_VERSION }}
path: python/dist/

- name: List wheel files
run: ls -lh python/dist/

- name: Upload to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
password: ${{ secrets.PYPI_TOKEN }}
packages-dir: python/dist/
verbose: true
skip-existing: true
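Since this job already declares `permissions: id-token: write` and a `pypi` environment, a possible simplification (not made in this PR) would be to rely on PyPI Trusted Publishing and drop the token. A sketch, assuming a trusted publisher is configured for the project on PyPI:

```yaml
# Sketch only: with Trusted Publishing configured on PyPI, the
# action mints an OIDC token itself, so no `password` input is needed.
- name: Upload to PyPI
  uses: pypa/gh-action-pypi-publish@release/v1
  with:
    packages-dir: python/dist/
    skip-existing: true
```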
2 changes: 1 addition & 1 deletion README.md
@@ -16,7 +16,7 @@ Seamlessly integrated with leading inference engines like SGLang and vLLM, enabl
- **High Bandwidth**: Maximizes I/O bandwidth by fully utilizing **ALL RDMA NICs** of client nodes (effectively eliminating the bottlenecks exposed in [DualPath](https://arxiv.org/abs/2602.21548))
- **Ease of Use**: Offers seamless integration with popular inference engines, with deployment orchestrated via **Kubernetes (K8s)** for production-grade reliability

- Under multi-turn long-context LLM workloads with significant KV cache reuse, **SiMM drastically reduces prefill latency (TTFT)** by transforming the prefill phase from a compute-heavy task into a high-speed I/O retrieval operation. Under 32K context length, SiMM achieves **3.1x** speedup over "No Cache" configuration and **2.1x** speedup over local CPU caching, **1.2x** outperforming industry-leading alternatives [[details](#integration-with-vllmlmcache)].
+ Under multi-turn long-context LLM workloads with significant KV cache reuse, **SiMM drastically reduces prefill latency (TTFT)** by transforming the prefill phase from a compute-heavy task into a high-speed I/O retrieval operation. Under 32K context length, SiMM achieves **3.1x** speedup over "No Cache" configuration and **2.1x** speedup over local CPU caching, **1.2x** outperforming industry-leading alternatives [[details](#benchmark-with-vllmlmcache)].
<div align="center">
<img src="docs/images/SiMM_LLM_Benchmark_Results.png" alt="SiMM LLM Benchmark results" width="75%" />
</div>