Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[flake8]

max-line-length = 120
per-file-ignores =
kerops/kernels/*: B007
__init__.py: F401
23 changes: 23 additions & 0 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
name: Lint

on: [ pull_request ]

env:
MODULE_NAME: kerops

jobs:
lint:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.12'

- name: Check python code style
run: |
pip install -r requirements-dev.txt
flake8 .
isort --check .
black --check .
30 changes: 28 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,28 @@
# kerops
Efficient and fast algorithms on the GPU
# Kerops
Fast algorithms for the GPU

# Install
*pip is not available right now*
```shell
pip install kerops
```

# How fast is it?
Time comparison (ms) on an NVIDIA RTX 3090. Input is an array of size (1, channels, 350, 350, 128); float16; <b>channels_last_3d</b>. Compared to the usual 3D convolution from torch (kernel_size=3, padding=1, stride=1, bias=False, in_channels=channels, out_channels=channels). Slowdown relative to copying is shown in parentheses.

| channels |torch.clone| kerops.ops.DWConv |torch.nn.Conv3d(C->C)|
|:--------------------:|:---------:|:--------------------:|:-------------------:|
| 8 | 0.61 | 0.79 (x1.30) | 2.45 (x4.00) |
| 16 | 1.21 | 1.41 (x1.17) | 4.48 (x3.70) |
| 32 | 2.40 | 2.99 (x1.25) | 15.3 (x6.38) |
| 64 | 4.78 | 6.29 (x1.32) | 52.0 (x10.89) |
| 128 | 9.55 | 12.8 (x1.34) | 195.0 (x20.44) |


| channels |torch.clone|kerops.ops.DWConvWGRAD|torch.nn.Conv3d(C->C)|
|:--------------------:|:---------:|:--------------------:|:-------------------:|
| 8 | 0.61 | 2.55 (x4.18) | 7.14 (x11.70) |
| 16 | 1.21 | 3.01 (x2.49) | 12.1 (x10.00) |
| 32 | 2.40 | 4.80 (x2.00) | 24.6 (x10.25) |
| 64 | 4.78 | 8.72 (x1.82) | 71.3 (x14.91) |
| 128 | 9.55 | 17.9 (x1.87) | 245.0 (x25.65) |
2 changes: 1 addition & 1 deletion kerops/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.0.1'
__version__ = '0.0.2'
265 changes: 130 additions & 135 deletions kerops/kernels/dw_conv.py

Large diffs are not rendered by default.

101 changes: 0 additions & 101 deletions kerops/ops/_settings.py

This file was deleted.

16 changes: 6 additions & 10 deletions kerops/ops/addition.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,10 @@
from triton import next_power_of_2

from ..kernels.addition import _AddStats_cl3d_backward_impl, _AddStats_cl3d_impl
from ._settings import configure, get_l1_cache, ConfigurableArg
from ..settings import ConfigurableArg, configure, get_l1_cache


@configure(
_l1_cache_bytes=lambda: get_l1_cache(),
_num_warps=lambda: 8
)
@configure(_l1_cache_bytes=get_l1_cache, _num_warps=8)
def AddStats(x, y, inplace=False, *, _l1_cache_bytes: ConfigurableArg, _num_warps: ConfigurableArg):
num_channels = x.shape[1]
numel = x.numel()
Expand Down Expand Up @@ -52,11 +49,10 @@ def AddStats(x, y, inplace=False, *, _l1_cache_bytes: ConfigurableArg, _num_warp
return output, mean, sqmean


@configure(
_l1_cache_bytes=lambda: get_l1_cache(),
_num_warps=lambda: 8
)
def AddStatsBackward(add_grad, mean_grad, sqmean_grad, add_result, *, _l1_cache_bytes: ConfigurableArg, _num_warps: ConfigurableArg):
@configure(_l1_cache_bytes=get_l1_cache, _num_warps=8)
def AddStatsBackward(
add_grad, mean_grad, sqmean_grad, add_result, *, _l1_cache_bytes: ConfigurableArg, _num_warps: ConfigurableArg
):
num_channels = add_grad.shape[1]
numel = add_grad.numel()
assert add_result.shape == add_grad.shape
Expand Down
22 changes: 14 additions & 8 deletions kerops/ops/avgpool.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@
from triton import next_power_of_2

from ..kernels.avgpool import _AvgPoolCeilStats_cl3d_backward_impl, _AvgPoolCeilStats_cl3d_impl
from ._settings import configure, get_l1_cache, ConfigurableArg
from ..settings import ConfigurableArg, configure, get_l1_cache


@configure(
_l1_cache_bytes=lambda: get_l1_cache(),
_num_warps=lambda: 2,
_l1_cache_bytes=get_l1_cache,
_num_warps=2,
)
def AvgPoolCeilStats(x, *, _l1_cache_bytes: ConfigurableArg, _num_warps: ConfigurableArg):
num_channels = x.shape[1]
Expand Down Expand Up @@ -60,11 +60,17 @@ def AvgPoolCeilStats(x, *, _l1_cache_bytes: ConfigurableArg, _num_warps: Configu
return output, mean, sqmean


@configure(
_l1_cache_bytes=lambda: get_l1_cache(),
_num_warps=lambda: 4
)
def AvgPoolCeilStatsBackward(inpgrad, meangrad, sqmeangrad, output, outgrad_shape, *, _l1_cache_bytes: ConfigurableArg, _num_warps: ConfigurableArg):
@configure(_l1_cache_bytes=get_l1_cache, _num_warps=4)
def AvgPoolCeilStatsBackward(
inpgrad,
meangrad,
sqmeangrad,
output,
outgrad_shape,
*,
_l1_cache_bytes: ConfigurableArg,
_num_warps: ConfigurableArg,
):
MAX_SIZE = _l1_cache_bytes // inpgrad.element_size() # 32768 for fp16
bsize, num_channels, h_outgrad, w_outgrad, d_outgrad = outgrad_shape
d_inpgrad = inpgrad.shape[-1]
Expand Down
12 changes: 3 additions & 9 deletions kerops/ops/bnrelu.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,10 @@
from triton import next_power_of_2

from ..kernels.bnrelu import _ApplyBNReLU_cl3d_backward_impl, _ApplyBNReLU_cl3d_impl
from ._settings import configure, get_l1_cache, ConfigurableArg
from ..settings import ConfigurableArg, configure, get_l1_cache


@configure(
_l1_cache_bytes=lambda: get_l1_cache(),
_num_warps=lambda: 8
)
@configure(_l1_cache_bytes=get_l1_cache, _num_warps=8)
def ApplyBNReLU(x, weight, bias, *, _l1_cache_bytes: ConfigurableArg, _num_warps: ConfigurableArg):
num_channels = x.shape[1]
numel = x.numel()
Expand Down Expand Up @@ -44,10 +41,7 @@ def ApplyBNReLU(x, weight, bias, *, _l1_cache_bytes: ConfigurableArg, _num_warps
return output


@configure(
_l1_cache_bytes=lambda: get_l1_cache(),
_num_warps=lambda: 8
)
@configure(_l1_cache_bytes=get_l1_cache, _num_warps=8)
def ApplyBNReLUBackward(x, weight, bias, grad, *, _l1_cache_bytes: ConfigurableArg, _num_warps: ConfigurableArg):
num_channels = x.shape[1]
numel = x.numel()
Expand Down
63 changes: 32 additions & 31 deletions kerops/ops/conv.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,40 +4,35 @@
from triton import language as tl, next_power_of_2

from ..kernels.dw_conv import _DWConv_cl3d_impl, _DWConv_wgrad_cl3d_impl
from ._settings import configure, ConfigurableArg
from ..settings import ConfigurableArg, configure


def configure_dwconv(channels):
"""
Hardcoded, benchmarked on RTX 3090, mb should be generated automatically
H, W, D = [350, 350, 128]
def dwconv_warps(channels):
    """Return the benchmarked warp count for the DWConv forward kernel.

    Raises KeyError for channel counts without a tuned configuration.
    """
    warps_for = {8: 1, 16: 2, 32: 2, 64: 2, 128: 4}
    return warps_for[channels]

channels: [[num_warps, D_block], [num_warps, D_block]] one for fwd another for bwd
"""

"""
TODO
More geeky solution is to compare performances with respect to splitting axis D
to N * D_block with padding
"""
def dwconv_dblock(channels):
    """Return the benchmarked D-axis block size for the DWConv forward kernel.

    Raises KeyError for channel counts without a tuned configuration.
    """
    dblock_for = {8: 32, 16: 32, 32: 16, 64: 8, 128: 8}
    return dblock_for[channels]

HARDCODED_CONFIG = {
8: [[1, 32], [1, 32]],
16: [[2, 32], [1, 32]],
32: [[2, 16], [1, 32]],
64: [[2, 8], [1, 16]],
128: [[4, 8], [2, 16]],
}

return HARDCODED_CONFIG.get(channels, None)
def dwconv_wgrad_warps(channels):
    """Return the benchmarked warp count for the DWConv weight-gradient kernel.

    Raises KeyError for channel counts without a tuned configuration.
    """
    warps_for = {8: 1, 16: 1, 32: 1, 64: 1, 128: 2}
    return warps_for[channels]


def dwconv_wgrad_dblock(channels):
    """Return the benchmarked D-axis block size for the DWConv weight-gradient kernel.

    Raises KeyError for channel counts without a tuned configuration.
    """
    dblock_for = {8: 32, 16: 32, 32: 16, 64: 8, 128: 8}
    return dblock_for[channels]


def dwconv_wgrad_ilp(channels):
    """Return the benchmarked ILP factor for the DWConv weight-gradient kernel.

    Raises KeyError for channel counts without a tuned configuration.
    """
    ilp_for = {8: 1, 16: 1, 32: 2, 64: 3, 128: 3}
    return ilp_for[channels]


@configure(
ACCTYPE=lambda: 'float32',
_num_warps=lambda weight: configure_dwconv(weight.shape[-1])[0][0],
D_block=lambda weight: configure_dwconv(weight.shape[-1])[0][1],
ACCTYPE='float32',
_num_warps=lambda x: dwconv_warps(x.shape[1]),
D_block=lambda x: dwconv_dblock(x.shape[1]),
)
def DWConv(x, weight, *, ACCTYPE: ConfigurableArg = 'float32', _num_warps: ConfigurableArg = 2, D_block: ConfigurableArg = 32):
def DWConv(x, weight, *, ACCTYPE: ConfigurableArg, _num_warps: ConfigurableArg, D_block: ConfigurableArg):
channels = x.shape[1]

assert x.ndim == 5
Expand Down Expand Up @@ -79,11 +74,14 @@ def DWConv(x, weight, *, ACCTYPE: ConfigurableArg = 'float32', _num_warps: Confi


@configure(
_num_warps=lambda x: configure_dwconv(x.shape[1])[1][0],
ACCTYPE=lambda: 'float32',
D_block=lambda x: configure_dwconv(x.shape[1])[1][1],
ACCTYPE='float32',
_num_warps=lambda x: dwconv_wgrad_warps(x.shape[1]),
D_block=lambda x: dwconv_wgrad_dblock(x.shape[1]),
ILP=lambda x: dwconv_wgrad_ilp(x.shape[1]),
)
def DWConvWGRAD(x, grad, *, ACCTYPE: ConfigurableArg = 'float32', _num_warps: ConfigurableArg=2, D_block: ConfigurableArg = 32):
def DWConvWGRAD(
x, grad, *, ACCTYPE: ConfigurableArg, _num_warps: ConfigurableArg, D_block: ConfigurableArg, ILP: ConfigurableArg
):
channels = x.shape[1]

assert x.ndim == grad.ndim == 5
Expand All @@ -99,10 +97,10 @@ def DWConvWGRAD(x, grad, *, ACCTYPE: ConfigurableArg = 'float32', _num_warps: Co
bsize, _, H, W, D = x.shape
batch_stride, _, H_stride, W_stride, _ = x.stride()

H_grid = ceil(H / 2)
H_grid = ceil(H / (2 * ILP))
W_grid = ceil(W / 2)
D_grid = ceil(D / D_block)
grid = (H_grid, W_grid * D_grid)
grid = (H_grid, W_grid, D_grid)

grad_w = torch.zeros([bsize, H_grid * W_grid * D_grid, 3, 3, 3, channels], device=x.device, dtype=torch.float16)
WD_grid = W_grid * D_grid # TODO: mb implement in another way
Expand All @@ -121,9 +119,12 @@ def DWConvWGRAD(x, grad, *, ACCTYPE: ConfigurableArg = 'float32', _num_warps: Co
channels,
D_block,
WD_grid,
D_grid,
H_grid,
ILP,
num_warps=_num_warps,
)

grad_w = torch.flip(grad_w.sum(dim=(0, 1)), dims=(2,))
grad_w = grad_w.sum(dim=(0, 1))

return grad_w
Loading