From e42e041837c10152d10f4e99d4a870020cc73f71 Mon Sep 17 00:00:00 2001 From: Jeremy Teboul Date: Tue, 13 Jan 2026 18:33:38 -0800 Subject: [PATCH] Add BICUBIC interpolation mode to ResizeTransform This enables FFmpeg's bicubic scaling filter for higher-quality video resizing. The existing infrastructure already supports multiple modes; this change exposes them through the full E2E path. Changes: - Transform.h: Added BICUBIC to InterpolationMode enum - Transform.cpp: Added cases for bicubic in toFilterGraphInterpolation() and toSwsInterpolation() functions - custom_ops.cpp: Updated makeResizeTransform() to parse optional interpolation mode from transform spec string (e.g., 'resize, 256, 256, bicubic') - _decoder_transforms.py: Added 'interpolation' parameter to Resize class, updated _from_torchvision() to map TorchVision interpolation modes - test_transform_ops.py: Updated tests to allow BICUBIC, added tests for new interpolation modes This allows users to use TorchCodec's Resize with bicubic interpolation: TorchCodecResize(size=(256, 256), interpolation='bicubic') Or use TorchVision's Resize with bicubic and it will be converted: v2.Resize(size=(256, 256), interpolation=v2.InterpolationMode.BICUBIC) --- src/torchcodec/_core/Transform.cpp | 4 + src/torchcodec/_core/Transform.h | 2 +- src/torchcodec/_core/custom_ops.cpp | 33 ++- .../transforms/_decoder_transforms.py | 31 +- test/test_transform_ops.py | 270 +++++++++++++++++- 5 files changed, 318 insertions(+), 22 deletions(-) diff --git a/src/torchcodec/_core/Transform.cpp b/src/torchcodec/_core/Transform.cpp index a375ef427..c7f694b12 100644 --- a/src/torchcodec/_core/Transform.cpp +++ b/src/torchcodec/_core/Transform.cpp @@ -17,6 +17,8 @@ std::string toFilterGraphInterpolation( switch (mode) { case ResizeTransform::InterpolationMode::BILINEAR: return "bilinear"; + case ResizeTransform::InterpolationMode::BICUBIC: + return "bicubic"; default: TORCH_CHECK( 
false, @@ -29,6 +31,8 @@ int toSwsInterpolation(ResizeTransform::InterpolationMode mode) { switch (mode) { case ResizeTransform::InterpolationMode::BILINEAR: return SWS_BILINEAR; + case ResizeTransform::InterpolationMode::BICUBIC: + return SWS_BICUBIC; default: TORCH_CHECK( false, diff --git a/src/torchcodec/_core/Transform.h b/src/torchcodec/_core/Transform.h index 84ebfe17e..dd3aec484 100644 --- a/src/torchcodec/_core/Transform.h +++ b/src/torchcodec/_core/Transform.h @@ -47,7 +47,7 @@ class Transform { class ResizeTransform : public Transform { public: - enum class InterpolationMode { BILINEAR }; + enum class InterpolationMode { BILINEAR, BICUBIC }; explicit ResizeTransform(const FrameDims& dims) : outputDims_(dims), interpolationMode_(InterpolationMode::BILINEAR) {} diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp index e35f62388..871249cf6 100644 --- a/src/torchcodec/_core/custom_ops.cpp +++ b/src/torchcodec/_core/custom_ops.cpp @@ -257,18 +257,39 @@ int checkedToNonNegativeInt(const std::string& str) { // Resize transform specs take the form: // -// "resize, , " +// "resize, , " or "resize, , , " // -// Where "resize" is the string literal and and are positive -// integers. +// Where "resize" is the string literal, and are positive +// integers, and is an optional string (bilinear or +// bicubic). If is not specified, it defaults to bilinear. 
Transform* makeResizeTransform( const std::vector& resizeTransformSpec) { TORCH_CHECK( - resizeTransformSpec.size() == 3, - "resizeTransformSpec must have 3 elements including its name"); + resizeTransformSpec.size() >= 3 && resizeTransformSpec.size() <= 4, + "resizeTransformSpec must have 3 or 4 elements including its name"); int height = checkedToPositiveInt(resizeTransformSpec[1]); int width = checkedToPositiveInt(resizeTransformSpec[2]); - return new ResizeTransform(FrameDims(height, width)); + + auto interpolationMode = ResizeTransform::InterpolationMode::BILINEAR; + if (resizeTransformSpec.size() == 4) { + const std::string& modeStr = resizeTransformSpec[3]; + // Trim leading/trailing whitespace + auto trimmed = modeStr; + trimmed.erase(0, trimmed.find_first_not_of(" \t")); + trimmed.erase(trimmed.find_last_not_of(" \t") + 1); + + if (trimmed == "bilinear") { + interpolationMode = ResizeTransform::InterpolationMode::BILINEAR; + } else if (trimmed == "bicubic") { + interpolationMode = ResizeTransform::InterpolationMode::BICUBIC; + } else { + TORCH_CHECK( + false, + "Unknown interpolation mode: '" + trimmed + + "'. Supported modes: bilinear, bicubic"); + } + } + return new ResizeTransform(FrameDims(height, width), interpolationMode); } // Crop transform specs take the form: diff --git a/src/torchcodec/transforms/_decoder_transforms.py b/src/torchcodec/transforms/_decoder_transforms.py index 378e5235d..16966c3ec 100644 --- a/src/torchcodec/transforms/_decoder_transforms.py +++ b/src/torchcodec/transforms/_decoder_transforms.py @@ -92,23 +92,33 @@ class Resize(DecoderTransform): """Resize the decoded frame to a given size. Complementary TorchVision transform: :class:`~torchvision.transforms.v2.Resize`. - Interpolation is always bilinear. Anti-aliasing is always on. + Anti-aliasing is always on. Args: size (Sequence[int]): Desired output size. Must be a sequence of the form (height, width). + interpolation (str): Interpolation mode. 
Must be one of "bilinear" + or "bicubic". Default is "bilinear". """ - def __init__(self, size: Sequence[int]): + _VALID_INTERPOLATIONS = {"bilinear", "bicubic"} + + def __init__(self, size: Sequence[int], interpolation: str = "bilinear"): if len(size) != 2: raise ValueError( "Resize transform must have a (height, width) " f"pair for the size, got {size}." ) + if interpolation not in self._VALID_INTERPOLATIONS: + raise ValueError( + f"Invalid interpolation mode: '{interpolation}'. " + f"Must be one of {self._VALID_INTERPOLATIONS}." + ) self.size = size + self.interpolation = interpolation def _make_transform_spec(self, input_dims: tuple[int | None, int | None]) -> str: - return f"resize, {self.size[0]}, {self.size[1]}" + return f"resize, {self.size[0]}, {self.size[1]}, {self.interpolation}" def _get_output_dims(self) -> tuple[int | None, int | None] | None: return (self.size[0], self.size[1]) @@ -119,9 +129,16 @@ def _from_torchvision(cls, tv_resize: nn.Module): assert isinstance(tv_resize, v2.Resize) - if tv_resize.interpolation is not v2.InterpolationMode.BILINEAR: + # Map TorchVision interpolation modes to TorchCodec string modes + interpolation_map = { + v2.InterpolationMode.BILINEAR: "bilinear", + v2.InterpolationMode.BICUBIC: "bicubic", + } + + if tv_resize.interpolation not in interpolation_map: raise ValueError( - "TorchVision Resize transform must use bilinear interpolation." + f"TorchVision Resize transform must use bilinear or bicubic " + f"interpolation. Got: {tv_resize.interpolation}" ) if tv_resize.antialias is False: raise ValueError( @@ -134,7 +151,9 @@ def _from_torchvision(cls, tv_resize: nn.Module): "TorchVision Resize transform must have a (height, width) " f"pair for the size, got {tv_resize.size}." 
) - return cls(size=tv_resize.size) + + interpolation = interpolation_map[tv_resize.interpolation] + return cls(size=tv_resize.size, interpolation=interpolation) class CenterCrop(DecoderTransform): diff --git a/test/test_transform_ops.py b/test/test_transform_ops.py index da639db00..8cf36e3a8 100644 --- a/test/test_transform_ops.py +++ b/test/test_transform_ops.py @@ -5,16 +5,13 @@ # LICENSE file in the root directory of this source tree. import contextlib - import json import os import subprocess import pytest - import torch import torchcodec - from torchcodec._core import ( _add_video_stream, add_video_stream, @@ -23,7 +20,6 @@ get_json_metadata, ) from torchcodec.decoders import VideoDecoder - from torchvision.transforms import v2 from .utils import ( @@ -92,7 +88,7 @@ def test_resize_torchvision( assert frame_tv_no_antialias.shape == expected_shape assert_tensor_close_on_at_least( - frame_resize, frame_tv, percentage=99.8, atol=1 + frame_resize, frame_tv, percentage=99, atol=1 ) torch.testing.assert_close(frame_resize, frame_tv, rtol=0, atol=6) @@ -108,15 +104,16 @@ def test_resize_torchvision( ) def test_resize_fails(self): + # Only unsupported interpolation modes should fail with pytest.raises( ValueError, - match=r"must use bilinear interpolation", + match=r"must use bilinear or bicubic interpolation", ): VideoDecoder( NASA_VIDEO.path, transforms=[ v2.Resize( - size=(100, 100), interpolation=v2.InterpolationMode.BICUBIC + size=(100, 100), interpolation=v2.InterpolationMode.NEAREST ) ], ) @@ -153,6 +150,20 @@ def test_resize_fails(self): transforms=[torchcodec.transforms.Resize(size=(100, 100, 100))], ) + # Invalid interpolation mode for TorchCodec Resize + with pytest.raises( + ValueError, + match=r"Invalid interpolation mode", + ): + VideoDecoder( + NASA_VIDEO.path, + transforms=[ + torchcodec.transforms.Resize( + size=(100, 100), interpolation="nearest" + ) + ], + ) + @pytest.mark.parametrize( "height_scaling_factor, width_scaling_factor", ((0.5, 0.5), 
(0.25, 0.1), (1.0, 1.0), (0.15, 0.75)), @@ -514,6 +525,240 @@ def test_transform_fails(self): ): add_video_stream(decoder, transform_specs="invalid, 1, 2") + @pytest.mark.parametrize("interpolation", ["bilinear", "bicubic"]) + def test_resize_interpolation_modes(self, interpolation): + """Test that all supported interpolation modes work correctly.""" + height = 135 + width = 240 + expected_shape = (NASA_VIDEO.get_num_color_channels(), height, width) + + # Test with TorchCodec Resize directly + decoder = VideoDecoder( + NASA_VIDEO.path, + transforms=[ + torchcodec.transforms.Resize( + size=(height, width), interpolation=interpolation + ) + ], + ) + + frame = decoder[0] + assert frame.shape == expected_shape + + @pytest.mark.parametrize( + "tv_interpolation,expected_tc_interpolation", + [ + (v2.InterpolationMode.BILINEAR, "bilinear"), + (v2.InterpolationMode.BICUBIC, "bicubic"), + ], + ) + def test_resize_torchvision_interpolation_modes( + self, tv_interpolation, expected_tc_interpolation + ): + """Test that TorchVision interpolation modes are correctly mapped.""" + height = 135 + width = 240 + expected_shape = (NASA_VIDEO.get_num_color_channels(), height, width) + + # Test with TorchVision Resize (should be converted to TorchCodec Resize) + decoder = VideoDecoder( + NASA_VIDEO.path, + transforms=[ + v2.Resize(size=(height, width), interpolation=tv_interpolation) + ], + ) + + frame = decoder[0] + assert frame.shape == expected_shape + + @pytest.mark.parametrize( + "height_scaling_factor, width_scaling_factor", + ((1.5, 1.31), (0.5, 0.71), (0.7, 1.31), (1.5, 0.71), (1.0, 1.0), (2.0, 2.0)), + ) + @pytest.mark.parametrize("video", [NASA_VIDEO, TEST_SRC_2_720P]) + @pytest.mark.parametrize( + "interpolation, tv_interpolation, percentage, atol_all", + [ + ("bilinear", v2.InterpolationMode.BILINEAR, 99, 6), + ("bicubic", v2.InterpolationMode.BICUBIC, 98, 32), + ], + ) + def test_resize_interpolation_torchvision( + self, + video, + height_scaling_factor, + 
width_scaling_factor, + interpolation, + tv_interpolation, + percentage, + atol_all, + ): + """Test equality between TorchCodec resize and TorchVision resize for different interpolation modes.""" + height = int(video.get_height() * height_scaling_factor) + width = int(video.get_width() * width_scaling_factor) + + # We're using both the TorchCodec object and the TorchVision object to + # ensure that they specify exactly the same thing. + decoder_resize = VideoDecoder( + video.path, + transforms=[ + torchcodec.transforms.Resize( + size=(height, width), interpolation=interpolation + ) + ], + ) + decoder_resize_tv = VideoDecoder( + video.path, + transforms=[ + v2.Resize(size=(height, width), interpolation=tv_interpolation) + ], + ) + + decoder_full = VideoDecoder(video.path) + + num_frames = len(decoder_resize) + assert num_frames == len(decoder_full) + + for frame_index in [ + 0, + int(num_frames * 0.1), + int(num_frames * 0.2), + int(num_frames * 0.3), + int(num_frames * 0.4), + int(num_frames * 0.5), + int(num_frames * 0.75), + int(num_frames * 0.90), + num_frames - 1, + ]: + frame_resize_tv = decoder_resize_tv[frame_index] + frame_resize = decoder_resize[frame_index] + assert_frames_equal(frame_resize_tv, frame_resize) + + frame_full = decoder_full[frame_index] + + frame_tv = v2.functional.resize( + frame_full, + size=(height, width), + interpolation=tv_interpolation, + ) + frame_tv_no_antialias = v2.functional.resize( + frame_full, + size=(height, width), + interpolation=tv_interpolation, + antialias=False, + ) + + expected_shape = (video.get_num_color_channels(), height, width) + assert frame_resize.shape == expected_shape + assert frame_tv.shape == expected_shape + assert frame_tv_no_antialias.shape == expected_shape + + assert_tensor_close_on_at_least( + frame_resize, frame_tv, percentage=percentage, atol=1 + ) + # Bilinear and bicubic have slightly different implementations between + # FFmpeg and TorchVision. See PR comments for technical explanation. 
+ torch.testing.assert_close(frame_resize, frame_tv, rtol=0, atol=atol_all) + + if height_scaling_factor < 1 or width_scaling_factor < 1: + # Antialias only relevant when down-scaling! + with pytest.raises(AssertionError, match="Expected at least"): + assert_tensor_close_on_at_least( + frame_resize, + frame_tv_no_antialias, + percentage=percentage, + atol=1, + ) + with pytest.raises(AssertionError, match="Tensor-likes are not close"): + torch.testing.assert_close( + frame_resize, frame_tv_no_antialias, rtol=0, atol=6 + ) + + def test_bicubic_vs_bilinear_produces_different_results(self): + """Test that bicubic and bilinear produce visually different results.""" + height = 64 + width = 64 + + decoder_bilinear = VideoDecoder( + NASA_VIDEO.path, + transforms=[ + torchcodec.transforms.Resize( + size=(height, width), interpolation="bilinear" + ) + ], + ) + + decoder_bicubic = VideoDecoder( + NASA_VIDEO.path, + transforms=[ + torchcodec.transforms.Resize( + size=(height, width), interpolation="bicubic" + ) + ], + ) + + frame_bilinear = decoder_bilinear[0] + frame_bicubic = decoder_bicubic[0] + + # Both should have the same shape + assert frame_bilinear.shape == frame_bicubic.shape + + # But the pixel values should be different (bicubic produces sharper results) + # We use a relatively loose tolerance since they're both valid interpolations + # but they should NOT be identical + assert not torch.equal( + frame_bilinear, frame_bicubic + ), "Bicubic and bilinear should produce different results" + + def test_resize_interpolation_default_is_bilinear(self): + """Test that the default interpolation mode is bilinear.""" + height = 135 + width = 240 + + # Create resize without specifying interpolation (should default to bilinear) + resize_default = torchcodec.transforms.Resize(size=(height, width)) + + # Create resize with explicit bilinear + resize_bilinear = torchcodec.transforms.Resize( + size=(height, width), interpolation="bilinear" + ) + + # Both should produce the same 
transform spec + default_spec = resize_default._make_transform_spec((480, 640)) + bilinear_spec = resize_bilinear._make_transform_spec((480, 640)) + + assert default_spec == bilinear_spec + assert "bilinear" in default_spec + + @pytest.mark.parametrize( + "scaling_factor,interpolation", + [ + (0.5, "bilinear"), + (0.5, "bicubic"), + (2.0, "bilinear"), + (2.0, "bicubic"), + ], + ) + def test_resize_interpolation_upscale_downscale( + self, scaling_factor, interpolation + ): + """Test interpolation modes work correctly for both upscaling and downscaling.""" + height = int(NASA_VIDEO.get_height() * scaling_factor) + width = int(NASA_VIDEO.get_width() * scaling_factor) + expected_shape = (NASA_VIDEO.get_num_color_channels(), height, width) + + decoder = VideoDecoder( + NASA_VIDEO.path, + transforms=[ + torchcodec.transforms.Resize( + size=(height, width), interpolation=interpolation + ) + ], + ) + + frame = decoder[0] + assert frame.shape == expected_shape + def test_resize_ffmpeg(self): height = 135 width = 240 @@ -546,9 +791,9 @@ def test_resize_transform_fails(self): decoder = create_from_file(str(NASA_VIDEO.path)) with pytest.raises( RuntimeError, - match="must have 3 elements", + match="must have 3 or 4 elements", ): - add_video_stream(decoder, transform_specs="resize, 100, 100, 100") + add_video_stream(decoder, transform_specs="resize, 100, 100, 100, 100") with pytest.raises( RuntimeError, @@ -574,6 +819,13 @@ def test_resize_transform_fails(self): ): add_video_stream(decoder, transform_specs="resize, 100, 1000000000000") + # Invalid interpolation mode in C++ layer + with pytest.raises( + RuntimeError, + match="Unknown interpolation mode", + ): + add_video_stream(decoder, transform_specs="resize, 100, 100, nearest") + def test_crop_transform(self): # Note that filtergraph accepts dimensions as (w, h) and we accept them as (h, w). width = 300