From 138e22ed3b5169a837fd7c9b31a61539fcc562a7 Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Fri, 5 Dec 2025 16:58:50 -0800 Subject: [PATCH 1/7] Initial sketch --- docs/source/conf.py | 1 + examples/decoding/transforms.py | 141 ++++++++++++++++++++++++++++++++ 2 files changed, 142 insertions(+) create mode 100644 examples/decoding/transforms.py diff --git a/docs/source/conf.py b/docs/source/conf.py index b26108561..b683fda64 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -83,6 +83,7 @@ def __call__(self, filename): "sampling.py", "parallel_decoding.py", "custom_frame_mappings.py", + "transforms.py", ] else: assert "examples/encoding" in self.src_dir diff --git a/examples/decoding/transforms.py b/examples/decoding/transforms.py new file mode 100644 index 000000000..695b3dd18 --- /dev/null +++ b/examples/decoding/transforms.py @@ -0,0 +1,141 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +=================================================================== +Decoder Transforms: Needs a tagline +=================================================================== + +In this example, we will describe the ``transforms`` parameter of the +:class:`~torchcodec.decoders.VideoDecoder` class. +""" + +# %% +# First, a bit of boilerplate and definitions. + + +import torch +import requests +import tempfile +from pathlib import Path +import shutil +import subprocess +from time import perf_counter_ns +from IPython.display import Video + +def store_video_to(url: str, local_video_path: Path): + response = requests.get(url, headers={"User-Agent": ""}) + if response.status_code != 200: + raise RuntimeError(f"Failed to download video. {response.status_code = }.") + + with open(local_video_path, 'wb') as f: + for chunk in response.iter_content(): + f.write(chunk) + +def plot(frames: torch.Tensor, title : str | None = None): + try: + from torchvision.utils import make_grid + from torchvision.transforms.v2.functional import to_pil_image + import matplotlib.pyplot as plt + except ImportError: + print("Cannot plot, please run `pip install torchvision matplotlib`") + return + + plt.rcParams["savefig.bbox"] = "tight" + dpi = 300 + fig, ax = plt.subplots(figsize=(800/dpi, 600/dpi), dpi=dpi) + ax.imshow(to_pil_image(make_grid(frames))) + ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) + if title is not None: + ax.set_title(title, fontsize=6) + plt.tight_layout() + +# %% +# Our example video +# ----------------- +# +# We'll download a video from the internet and store it locally. We're +# purposefully retrieving a high resolution video to demonstrate using +# transforms to modify dimensions. + +# Video source: https://www.pexels.com/video/an-african-penguin-at-the-beach-9140346/ +# Author: Taryn Elliott. +url = "https://videos.pexels.com/video-files/9140346/9140346-uhd_3840_2160_25fps.mp4" + +temp_dir = tempfile.mkdtemp() +penguin_video_path = Path(temp_dir) / "penguin.mp4" +store_video_to(url, penguin_video_path) + +from torchcodec.decoders import VideoDecoder +print(f"Penguin video metadata: {VideoDecoder(penguin_video_path).metadata}") + +# %% +# Some stuff about the video itself, including its resolution of 3840x2160. + +# %% +# Applying transforms during pre-processing +# ----------------------------------------- +# +# There are lots of reasons to apply transforms to video frames during pre-proc +# (list them). 
A typical example might look like: + +from torchvision.transforms import v2 + +full_decoder = VideoDecoder(penguin_video_path) +full_mid_frame = full_decoder[465] # mid-point of the video +resized_post_mid_frame = v2.Resize(size=(360, 640))(full_mid_frame) + +plot(resized_post_mid_frame, title="Resized to 360x640 after decoding") + +# %% +# But we can now do it: +resize_decoder = VideoDecoder( + penguin_video_path, + transforms=[v2.Resize(size=(360, 640))] +) +resized_during_mid_frame = resize_decoder[465] + +plot(resized_during_mid_frame, title="Resized to 360x640 during decoding") + +# %% +# TorchCodec's relationship with TorchVision transforms +# ----------------------------------------------------- +# Talk about the relationship between TorchVision transforms and TorchCodec +# decoder transforms. Importantly, they're not identical: +abs_diff = (resized_post_mid_frame.float() - resized_during_mid_frame.float()).abs() +(abs_diff == 0).all() + +# %% +# But they're close enough that models won't be able to tell a difference: +(abs_diff <= 1).float().mean() >= 0.998 + +# %% +# Transform pipelines +# ------------------- +# But wait - there's more! + +crop_resize_decoder = VideoDecoder( + penguin_video_path, + transforms = [ + v2.Resize(size=(360, 640)), + v2.CenterCrop(size=(300, 200)) + ] +) +crop_resized_during_mid_frame = crop_resize_decoder[465] +plot(crop_resized_during_mid_frame, title="Resized to 360x640 during decoding then center cropped") + +# %% +# We also support `RandomCrop`. Reach out if there are particular transforms you want! + +# %% +# Performance +# ----------- +# +# The main motivation for doing this is performance. + +# %% +shutil.rmtree(temp_dir) +# %% From 408ac57754bb967d9b8d69bcd8fa53f2f2591bd1 Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Tue, 9 Dec 2025 11:40:52 -0800 Subject: [PATCH 2/7] Commit to move on --- examples/decoding/transforms.py | 80 ++++++++++++++++++++++++++++++--- 1 file changed, 73 insertions(+), 7 deletions(-) diff --git a/examples/decoding/transforms.py b/examples/decoding/transforms.py index 695b3dd18..36093b3e9 100644 --- a/examples/decoding/transforms.py +++ b/examples/decoding/transforms.py @@ -24,7 +24,6 @@ import shutil import subprocess from time import perf_counter_ns -from IPython.display import Video def store_video_to(url: str, local_video_path: Path): response = requests.get(url, headers={"User-Agent": ""}) @@ -86,19 +85,19 @@ def plot(frames: torch.Tensor, title : str | None = None): full_decoder = VideoDecoder(penguin_video_path) full_mid_frame = full_decoder[465] # mid-point of the video -resized_post_mid_frame = v2.Resize(size=(360, 640))(full_mid_frame) +resized_post_mid_frame = v2.Resize(size=(480, 640))(full_mid_frame) -plot(resized_post_mid_frame, title="Resized to 360x640 after decoding") +plot(resized_post_mid_frame, title="Resized to 480x640 after decoding") # %% # But we can now do it: resize_decoder = VideoDecoder( penguin_video_path, - transforms=[v2.Resize(size=(360, 640))] + transforms=[v2.Resize(size=(480, 640))] ) resized_during_mid_frame = resize_decoder[465] -plot(resized_during_mid_frame, title="Resized to 360x640 during decoding") +plot(resized_during_mid_frame, title="Resized to 480x640 during decoding") # %% # TorchCodec's relationship with TorchVision transforms @@ -120,12 +119,12 @@ def plot(frames: torch.Tensor, title : str | None = None): crop_resize_decoder = VideoDecoder( penguin_video_path, transforms = [ - v2.Resize(size=(360, 640)), + v2.Resize(size=(480, 640)), v2.CenterCrop(size=(300, 200)) ] 
) crop_resized_during_mid_frame = crop_resize_decoder[465] -plot(crop_resized_during_mid_frame, title="Resized to 360x640 during decoding then center cropped") +plot(crop_resized_during_mid_frame, title="Resized to 480x640 during decoding then center cropped") # %% # We also support `RandomCrop`. Reach out if there are particular transforms you want! @@ -136,6 +135,73 @@ def plot(frames: torch.Tensor, title : str | None = None): # # The main motivation for doing this is performance. +import os +import psutil + +def bench(f, average_over=5, warmup=2, **f_kwargs): + + for _ in range(warmup): + f(**f_kwargs) + + process = psutil.Process(os.getpid()) + times = [] + memory = [] + for _ in range(average_over): + #start_rss = process.memory_info().rss / 1024 / 1024 + start_time = perf_counter_ns() + f(**f_kwargs) + end_time = perf_counter_ns() + #end_rss = process.memory_info().rss / 1024 / 1024 + times.append(end_time - start_time) + #memory.append(end_rss - start_rss) + + times = torch.tensor(times) * 1e-6 # ns to ms + times_std = times.std().item() + times_med = times.median().item() + #memory = torch.tensor(memory) + #memory_std = memory.std().item() + #memory_med = memory.median().item() + print(f"{times_med = :.2f}ms +- {times_std:.2f}") + #print(f"{memory_med = :.2f}MB +- {memory_std:.2f}") + +from torchcodec import samplers + +def sample_decoder_transforms(): + decoder = VideoDecoder( + penguin_video_path, + transforms = [ + #v2.Resize(size=(480, 640)), + v2.CenterCrop(size=(300, 200)) + ], + seek_mode="approximate", + ) + transformed_frames = samplers.clips_at_regular_indices( + decoder, + num_clips=1, + num_frames_per_clip=200 + ) + assert len(transformed_frames.data[0]) == 200 + +def sample_torchvision_transforms(): + decoder = VideoDecoder( + penguin_video_path, + seek_mode="approximate" + ) + frames = samplers.clips_at_regular_indices( + decoder, + num_clips=1, + num_frames_per_clip=200 + ) + transformed_frames = [] + for frame in frames.data[0]: + #frame = v2.Resize(size=(480, 640))(frame) + frame = v2.CenterCrop(size=(300, 200))(frame) + transformed_frames.append(frame) + assert len(transformed_frames) == 200 + +bench(sample_decoder_transforms) +bench(sample_torchvision_transforms) + # %% shutil.rmtree(temp_dir) # %% From 2a16bbd8b70d8d25a94d8ee20da9c1d5be9d50b3 Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Wed, 10 Dec 2025 20:38:18 -0800 Subject: [PATCH 3/7] First draft --- examples/decoding/transforms.py | 217 +++++++++++++++++++++++--------- 1 file changed, 161 insertions(+), 56 deletions(-) diff --git a/examples/decoding/transforms.py b/examples/decoding/transforms.py index 36093b3e9..815796e91 100644 --- a/examples/decoding/transforms.py +++ b/examples/decoding/transforms.py @@ -5,16 +5,20 @@ # LICENSE file in the root directory of this source tree. """ -=================================================================== -Decoder Transforms: Needs a tagline -=================================================================== - -In this example, we will describe the ``transforms`` parameter of the -:class:`~torchcodec.decoders.VideoDecoder` class. +======================================================= +Decoder Transforms: Applying transforms during decoding +======================================================= + +In this example, we will demonstrate how to use the ``transforms`` parameter of +the :class:`~torchcodec.decoders.VideoDecoder` class. 
This parameter allows us +to specify a list of :class:`~torchcodec.transforms.DecoderTransform` or +:class:`~torchvision.transforms.v2.Transform` objects. These objects serve as +transform specificiations that the :class:`~torchcodec.decoders.VideoDecoder` +will apply during the decoding process. """ # %% -# First, a bit of boilerplate and definitions. +# First, a bit of boilerplate and definitions that we will use later: import torch @@ -58,7 +62,7 @@ def plot(frames: torch.Tensor, title : str | None = None): # # We'll download a video from the internet and store it locally. We're # purposefully retrieving a high resolution video to demonstrate using -# transforms to modify dimensions. +# transforms to reduce the dimensions. # Video source: https://www.pexels.com/video/an-african-penguin-at-the-beach-9140346/ # Author: Taryn Elliott. @@ -72,39 +76,74 @@ def plot(frames: torch.Tensor, title : str | None = None): print(f"Penguin video metadata: {VideoDecoder(penguin_video_path).metadata}") # %% -# Some stuff about the video itself, including its resolution of 3840x2160. +# As shown above, the video is 37 seconds long and has a height of 2160 pixels +# and a width of 3840 pixels. +# +# .. note:: +# +# The colloquial way to report the dimensions of this video would be as +# 3840x2160; that is, (`width`, `height`). In the PyTorch ecosystem, image +# dimensions are typically expressed as (`height`, `width`). The remainder +# of this tutorial uses the PyTorch convention of (`height`, `width`) to +# specify image dimensions. # %% # Applying transforms during pre-processing # ----------------------------------------- # -# There are lots of reasons to apply transforms to video frames during pre-proc -# (list them). A typical example might look like: +# A pre-processing pipeline for videos during training will typically apply a +# set of transforms for three main reasons: +# +# 1. **Normalization**: Videos can have many different lengths, resolutions, +# and frame rates. Normalizing all videos to the same characteristics +# leads to better model performance. +# 2. **Data reduction**: Training on higher resolution frames may lead to better +# model performance, but it will be more expensive both at training and +# inference time. As a consequence, many video pre-processing pipelines reduce +# frame dimensions through resizing and cropping. +# 3. **Variety**: Applying random transforms (flips, crops, perspective shifts) +# to the same frames during training can improve model performance. +# +# Below is a simple example of applying the +# :class:`~torchvision.transforms.v2.Resize` transform to a single frame: from torchvision.transforms import v2 full_decoder = VideoDecoder(penguin_video_path) -full_mid_frame = full_decoder[465] # mid-point of the video -resized_post_mid_frame = v2.Resize(size=(480, 640))(full_mid_frame) +frame = full_decoder[5] +resized_after = v2.Resize(size=(480, 640))(frame) -plot(resized_post_mid_frame, title="Resized to 480x640 after decoding") +plot(resized_after, title="Resized to 480x640 after decoding") # %% -# But we can now do it: +# In the example above, ``full_decoder`` returns a video frame that has the +# dimensions (2160, 3840) which is then resized down to (480, 640). 
But with the +# ``transforms`` parameter of :class:`~torchcodec.decoders.VideoDecoder` we can +# specify for the resize to happen during decoding: + resize_decoder = VideoDecoder( penguin_video_path, transforms=[v2.Resize(size=(480, 640))] ) -resized_during_mid_frame = resize_decoder[465] +resized_during = resize_decoder[5] -plot(resized_during_mid_frame, title="Resized to 480x640 during decoding") +plot(resized_during, title="Resized to 480x640 during decoding") # %% -# TorchCodec's relationship with TorchVision transforms +# TorchCodec's relationship to TorchVision transforms # ----------------------------------------------------- -# Talk about the relationship between TorchVision transforms and TorchCodec -# decoder transforms. Importantly, they're not identical: -abs_diff = (resized_post_mid_frame.float() - resized_during_mid_frame.float()).abs() +# Notably, in our examples we are passing in TorchVision +# :class:`~torchvision.transforms.v2.Transform` objects as our transforms. We +# would have gotten equivalent behavior if we had passed in the +# :class:`~torchcodec.transforms.Resize` object that is a part of TorchCodec. +# :class:`~torchcodec.decoders.VideoDecoder` accepts both objects as a matter of +# convenience and to clarify the relationship between the transforms that TorchCodec +# applies and the transforms that TorchVision offers. +# +# Importantly, the two frames are not identical, even though we can see they +# *look* very similar: + +abs_diff = (resized_after.float() - resized_during.float()).abs() (abs_diff == 0).all() # %% @@ -112,68 +151,107 @@ def plot(frames: torch.Tensor, title : str | None = None): (abs_diff <= 1).float().mean() >= 0.998 # %% -# Transform pipelines -# ------------------- -# But wait - there's more! +# While :class:`~torchcodec.decoders.VideoDecoder` accepts TorchVision transforms as +# *specifications*, it is not actually using the TorchVision implementation of these +# transforms. Instead, it is mapping them to equivalent +# `FFmpeg filters `_. That is, +# :class:`torchvision.transforms.v2.Resize` is mapped to +# `scale `_ and +# :class:`torchvision.transforms.v2.CenterCrop` is mapped to +# `crop `_. +# +# The relationships we ensure between TorchCodec :class:`~torchcodec.transforms.DecoderTransform` objects +# and TorchVision :class:`~torchvision.transforms.v2.Transform` objects are: +# +# 1. The names are the same. +# 2. Default behaviors are the same. +# 3. The parameters for the :class:`~torchcodec.transforms.DecoderTransform` object are a subset of the +# TorchVision :class:`~torchvision.transforms.v2.Transform` object. +# 4. Parameters with the same name control the same behavior and accept a +# subset of the same types. +# 5. The difference between the frames returned by a decoder transform and +# the complementary TorchVision transform are such that a model should +# not be able to tell the difference. +# +# .. note:: +# +# We do not encourage *intentionally* mixing usage of TorchCodec's decoder +# transforms and TorchVision transforms. That is, if you use TorchCodec's +# decoder transforms during training, you should also use them during +# inference. And if you decode full frames and apply TorchVision's +# transforms to those fully decoded frames during training, you should also +# do the same during inference. We provide the similarity guarantees to mitigate +# the harm when the two techniques are *unintentionally* mixed. 
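
# %%
# To make the relationship concrete, here is a minimal sketch of the earlier
# resize example written against TorchCodec's own transform object rather than
# TorchVision's. It assumes, per the guarantees above, that
# :class:`torchcodec.transforms.Resize` accepts the same ``size`` parameter as
# its TorchVision counterpart:

from torchcodec import transforms as tc_transforms

# Assumption: torchcodec.transforms.Resize mirrors TorchVision's
# Resize(size=...) signature, as the guarantees above describe.
tc_resize_decoder = VideoDecoder(
    penguin_video_path,
    transforms=[tc_transforms.Resize(size=(480, 640))],
)
tc_resized_during = tc_resize_decoder[5]
assert tc_resized_during.shape[-2:] == (480, 640)

plot(tc_resized_during, title="Resized to 480x640 with torchcodec.transforms.Resize")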
+ +# %% +# Decoder transform pipelines +# --------------------------- +# So far, we've only provided a single transform to the `transform` parameter to +# :class:`~torchcodec.decoders.VideoDecoder`. But it +# actually accepts a list of transforms, which become a pipeline of transforms. +# The order of the list matters: the first transform in the list will receive +# the originally decoded frame. The output of that transform becomes the input +# to the next transform in the list, and so on. +# +# A simple example: crop_resize_decoder = VideoDecoder( penguin_video_path, transforms = [ v2.Resize(size=(480, 640)), - v2.CenterCrop(size=(300, 200)) + v2.CenterCrop(size=(315, 220)) ] ) -crop_resized_during_mid_frame = crop_resize_decoder[465] -plot(crop_resized_during_mid_frame, title="Resized to 480x640 during decoding then center cropped") +crop_resized_during = crop_resize_decoder[5] +plot(crop_resized_during, title="Resized to 480x640 during decoding then center cropped") # %% -# We also support `RandomCrop`. Reach out if there are particular transforms you want! - -# %% -# Performance -# ----------- +# Performance: memory efficiency and speed +# ---------------------------------------- # -# The main motivation for doing this is performance. - -import os -import psutil - -def bench(f, average_over=5, warmup=2, **f_kwargs): +# The main motivation for decoder transforms is *memory efficiency*, +# particularly when applying transforms that reduce the size of a frame, such +# as resize and crop. Because the transforms are applied during decoding, the +# full frame is never returned to the Python layer. As a result, there is +# significantly less pressure on the Python gargabe collector. +# +# In `benchmarks `_ +# reducing frames from (1080, 1920) down to (135, 240), we have observed a +# reduction in peak resident set size from 4.3 GB to 0.4 MB. +# +# There is sometimes a runtime benefit, but it is dependent on the number of +# threads that the :class:`~torchcodec.decoders.VideoDecoder` tells FFmpeg +# to use. 
We define the following benchmark function, as well as the functions +# to benchmark: +def bench(f, average_over=3, warmup=1, **f_kwargs): for _ in range(warmup): f(**f_kwargs) - process = psutil.Process(os.getpid()) times = [] memory = [] for _ in range(average_over): - #start_rss = process.memory_info().rss / 1024 / 1024 start_time = perf_counter_ns() f(**f_kwargs) end_time = perf_counter_ns() - #end_rss = process.memory_info().rss / 1024 / 1024 times.append(end_time - start_time) - #memory.append(end_rss - start_rss) times = torch.tensor(times) * 1e-6 # ns to ms times_std = times.std().item() times_med = times.median().item() - #memory = torch.tensor(memory) - #memory_std = memory.std().item() - #memory_med = memory.median().item() - print(f"{times_med = :.2f}ms +- {times_std:.2f}") - #print(f"{memory_med = :.2f}MB +- {memory_std:.2f}") + return f"{times_med = :.2f}ms +- {times_std:.2f}" from torchcodec import samplers -def sample_decoder_transforms(): +def sample_decoder_transforms(num_threads: int): decoder = VideoDecoder( penguin_video_path, transforms = [ - #v2.Resize(size=(480, 640)), - v2.CenterCrop(size=(300, 200)) + v2.Resize(size=(480, 640)), + v2.CenterCrop(size=(315, 220)) ], seek_mode="approximate", + num_ffmpeg_threads=num_threads, ) transformed_frames = samplers.clips_at_regular_indices( decoder, @@ -182,10 +260,11 @@ def sample_decoder_transforms(): ) assert len(transformed_frames.data[0]) == 200 -def sample_torchvision_transforms(): +def sample_torchvision_transforms(num_threads: int): decoder = VideoDecoder( penguin_video_path, - seek_mode="approximate" + seek_mode="approximate", + num_ffmpeg_threads=num_threads, ) frames = samplers.clips_at_regular_indices( decoder, @@ -194,14 +273,40 @@ def sample_torchvision_transforms(): ) transformed_frames = [] for frame in frames.data[0]: - #frame = v2.Resize(size=(480, 640))(frame) - frame = v2.CenterCrop(size=(300, 200))(frame) + frame = v2.Resize(size=(480, 640))(frame) + frame = v2.CenterCrop(size=(315, 220))(frame) transformed_frames.append(frame) assert len(transformed_frames) == 200 -bench(sample_decoder_transforms) -bench(sample_torchvision_transforms) +# %% +# When the :class:`~torchcodec.decoders.VideoDecoder` object sets the number of +# FFmpeg threads to 0, that tells FFmpeg to determine how many threads to use +# based on what is available on the current system. In such cases, decoder transforms +# will tend to outperform getting back a full frame and applying TorchVision transforms +# sequentially: + +print(f"decoder transforms: {bench(sample_decoder_transforms, num_threads=0)}") +print(f"torchvision transform: {bench(sample_torchvision_transforms, num_threads=0)}") # %% +# The reason is that FFmpeg is applying the decoder transforms in parallel. +# However, if the number of threads is 1 (as is the default), then there often is no +# runtime benefit to using decoder transforms. Using the TorchVision transforms may +# even be faster! + +print(f"decoder transforms: {bench(sample_decoder_transforms, num_threads=1)}") +print(f"torchvision transform: {bench(sample_torchvision_transforms, num_threads=1)}") + +# %% +# In brief, our performance guidance is: +# +# 1. If you are applying a transform pipeline that signficantly reduces +# the dimensions of your input frames and memory efficiency matters, use +# decoder transforms. +# 2. If you are using multiple FFmpeg threads, decoder transforms may be +# faster. Experiment with your setup to verify. +# 3. 
If you are using a single FFmpeg thread, then decoder transforms may +# be slower. Experiment with your setup to verify. + shutil.rmtree(temp_dir) # %% From ba993ade50aed1bb637622f20647020fb5df6787 Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Wed, 10 Dec 2025 20:50:27 -0800 Subject: [PATCH 4/7] Lint --- examples/decoding/transforms.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/examples/decoding/transforms.py b/examples/decoding/transforms.py index 815796e91..95201e859 100644 --- a/examples/decoding/transforms.py +++ b/examples/decoding/transforms.py @@ -26,9 +26,9 @@ import tempfile from pathlib import Path import shutil -import subprocess from time import perf_counter_ns + def store_video_to(url: str, local_video_path: Path): response = requests.get(url, headers={"User-Agent": ""}) if response.status_code != 200: @@ -38,6 +38,7 @@ def store_video_to(url: str, local_video_path: Path): for chunk in response.iter_content(): f.write(chunk) + def plot(frames: torch.Tensor, title : str | None = None): try: from torchvision.utils import make_grid @@ -49,7 +50,7 @@ def plot(frames: torch.Tensor, title : str | None = None): plt.rcParams["savefig.bbox"] = "tight" dpi = 300 - fig, ax = plt.subplots(figsize=(800/dpi, 600/dpi), dpi=dpi) + fig, ax = plt.subplots(figsize=(800 / dpi, 600 / dpi), dpi=dpi) ax.imshow(to_pil_image(make_grid(frames))) ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) if title is not None: @@ -64,6 +65,7 @@ def plot(frames: torch.Tensor, title : str | None = None): # purposefully retrieving a high resolution video to demonstrate using # transforms to reduce the dimensions. + # Video source: https://www.pexels.com/video/an-african-penguin-at-the-beach-9140346/ # Author: Taryn Elliott. url = "https://videos.pexels.com/video-files/9140346/9140346-uhd_3840_2160_25fps.mp4" @@ -224,12 +226,12 @@ def plot(frames: torch.Tensor, title : str | None = None): # to use. 
We define the following benchmark function, as well as the functions # to benchmark: + def bench(f, average_over=3, warmup=1, **f_kwargs): for _ in range(warmup): f(**f_kwargs) times = [] - memory = [] for _ in range(average_over): start_time = perf_counter_ns() f(**f_kwargs) @@ -241,8 +243,10 @@ def bench(f, average_over=3, warmup=1, **f_kwargs): times_med = times.median().item() return f"{times_med = :.2f}ms +- {times_std:.2f}" + from torchcodec import samplers + def sample_decoder_transforms(num_threads: int): decoder = VideoDecoder( penguin_video_path, @@ -260,6 +264,7 @@ def sample_decoder_transforms(num_threads: int): ) assert len(transformed_frames.data[0]) == 200 + def sample_torchvision_transforms(num_threads: int): decoder = VideoDecoder( penguin_video_path, @@ -285,6 +290,7 @@ def sample_torchvision_transforms(num_threads: int): # will tend to outperform getting back a full frame and applying TorchVision transforms # sequentially: + print(f"decoder transforms: {bench(sample_decoder_transforms, num_threads=0)}") print(f"torchvision transform: {bench(sample_torchvision_transforms, num_threads=0)}") From 1a37b74b5ef2a766d4d274e2a6e9007efe4ce7b6 Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Thu, 11 Dec 2025 20:49:33 -0800 Subject: [PATCH 5/7] Apply edits from review --- docs/source/api_ref_transforms.rst | 4 +- examples/decoding/transforms.py | 79 +++++++++++++++--------------- 2 files changed, 43 insertions(+), 40 deletions(-) diff --git a/docs/source/api_ref_transforms.rst b/docs/source/api_ref_transforms.rst index dee3f0c3a..18bffabae 100644 --- a/docs/source/api_ref_transforms.rst +++ b/docs/source/api_ref_transforms.rst @@ -4,9 +4,11 @@ torchcodec.transforms ===================== +.. automodule:: torchcodec.transforms + .. currentmodule:: torchcodec.transforms -For a tutorial, see: TODO_DECODER_TRANSFORMS_TUTORIAL. +For a tutorial, see: :ref:`sphx_glr_generated_examples_decoding_transforms.py`. .. autosummary:: :toctree: generated/ diff --git a/examples/decoding/transforms.py b/examples/decoding/transforms.py index 95201e859..268a857c3 100644 --- a/examples/decoding/transforms.py +++ b/examples/decoding/transforms.py @@ -11,8 +11,8 @@ In this example, we will demonstrate how to use the ``transforms`` parameter of the :class:`~torchcodec.decoders.VideoDecoder` class. This parameter allows us -to specify a list of :class:`~torchcodec.transforms.DecoderTransform` or -:class:`~torchvision.transforms.v2.Transform` objects. These objects serve as +to specify a list of :class:`torchcodec.transforms.DecoderTransform` or +:class:`torchvision.transforms.v2.Transform` objects. These objects serve as transform specificiations that the :class:`~torchcodec.decoders.VideoDecoder` will apply during the decoding process. """ @@ -94,20 +94,9 @@ def plot(frames: torch.Tensor, title : str | None = None): # ----------------------------------------- # # A pre-processing pipeline for videos during training will typically apply a -# set of transforms for three main reasons: -# -# 1. **Normalization**: Videos can have many different lengths, resolutions, -# and frame rates. Normalizing all videos to the same characteristics -# leads to better model performance. -# 2. **Data reduction**: Training on higher resolution frames may lead to better -# model performance, but it will be more expensive both at training and -# inference time. As a consequence, many video pre-processing pipelines reduce -# frame dimensions through resizing and cropping. -# 3. 
**Variety**: Applying random transforms (flips, crops, perspective shifts) -# to the same frames during training can improve model performance. -# -# Below is a simple example of applying the -# :class:`~torchvision.transforms.v2.Resize` transform to a single frame: +# set of transforms for a variety of reasons. Below is a simple example of +# applying TorchVision's :class:`~torchvision.transforms.v2.Resize` transform to a single +# frame **after** the decoder returns it: from torchvision.transforms import v2 @@ -121,7 +110,7 @@ def plot(frames: torch.Tensor, title : str | None = None): # In the example above, ``full_decoder`` returns a video frame that has the # dimensions (2160, 3840) which is then resized down to (480, 640). But with the # ``transforms`` parameter of :class:`~torchcodec.decoders.VideoDecoder` we can -# specify for the resize to happen during decoding: +# specify for the resize to happen **during** decoding! resize_decoder = VideoDecoder( penguin_video_path, @@ -135,9 +124,15 @@ def plot(frames: torch.Tensor, title : str | None = None): # TorchCodec's relationship to TorchVision transforms # ----------------------------------------------------- # Notably, in our examples we are passing in TorchVision -# :class:`~torchvision.transforms.v2.Transform` objects as our transforms. We +# :class:`~torchvision.transforms.v2.Transform` objects as our transforms. +# However, :class:`~torchcodec.decoders.VideoDecoder` accepts TorchVision +# transforms as a matter of convenience. TorchVision is **not required** to use +# decoder transforms. +# +# Every TorchVision transform that :class:`~torchcodec.decoders.VideoDecoder` accepts +# has a complementary transform defined in :mod:`torchcodec.transforms`. We # would have gotten equivalent behavior if we had passed in the -# :class:`~torchcodec.transforms.Resize` object that is a part of TorchCodec. +# :class:`torchcodec.transforms.Resize` object that is a part of TorchCodec. # :class:`~torchcodec.decoders.VideoDecoder` accepts both objects as a matter of # convenience and to clarify the relationship between the transforms that TorchCodec # applies and the transforms that TorchVision offers. @@ -150,16 +145,16 @@ def plot(frames: torch.Tensor, title : str | None = None): # %% # But they're close enough that models won't be able to tell a difference: -(abs_diff <= 1).float().mean() >= 0.998 +assert (abs_diff <= 1).float().mean() >= 0.998 # %% # While :class:`~torchcodec.decoders.VideoDecoder` accepts TorchVision transforms as # *specifications*, it is not actually using the TorchVision implementation of these # transforms. Instead, it is mapping them to equivalent # `FFmpeg filters `_. That is, -# :class:`torchvision.transforms.v2.Resize` is mapped to -# `scale `_ and -# :class:`torchvision.transforms.v2.CenterCrop` is mapped to +# :class:`torchvision.transforms.v2.Resize` and :class:`torchcodec.transforms.Resize` are mapped to +# `scale `_; and +# :class:`torchvision.transforms.v2.CenterCrop` and :class:`torchcodec.transforms.CenterCrop` are mapped to # `crop `_. # # The relationships we ensure between TorchCodec :class:`~torchcodec.transforms.DecoderTransform` objects @@ -188,7 +183,7 @@ def plot(frames: torch.Tensor, title : str | None = None): # %% # Decoder transform pipelines # --------------------------- -# So far, we've only provided a single transform to the `transform` parameter to +# So far, we've only provided a single transform to the ``transform`` parameter to # :class:`~torchcodec.decoders.VideoDecoder`. 
But it # actually accepts a list of transforms, which become a pipeline of transforms. # The order of the list matters: the first transform in the list will receive @@ -200,12 +195,12 @@ def plot(frames: torch.Tensor, title : str | None = None): crop_resize_decoder = VideoDecoder( penguin_video_path, transforms = [ + v2.CenterCrop(size=(1280, 1664)), v2.Resize(size=(480, 640)), - v2.CenterCrop(size=(315, 220)) ] ) crop_resized_during = crop_resize_decoder[5] -plot(crop_resized_during, title="Resized to 480x640 during decoding then center cropped") +plot(crop_resized_during, title="Center cropped then resized to 480x640") # %% # Performance: memory efficiency and speed @@ -213,13 +208,15 @@ def plot(frames: torch.Tensor, title : str | None = None): # # The main motivation for decoder transforms is *memory efficiency*, # particularly when applying transforms that reduce the size of a frame, such -# as resize and crop. Because the transforms are applied during decoding, the -# full frame is never returned to the Python layer. As a result, there is -# significantly less pressure on the Python gargabe collector. +# as resize and crop. Because the FFmpeg layer knows all of the transforms it +# needs to apply during decoding, it's able to efficiently reuse memory. +# Further, full resolution frames are never returned to the Python layer. As a +# result, there is significantly less total memory needed and less pressure on +# the Python garbage collector. # # In `benchmarks `_ # reducing frames from (1080, 1920) down to (135, 240), we have observed a -# reduction in peak resident set size from 4.3 GB to 0.4 MB. +# reduction in peak resident set size from 4.3 GB to 0.4 GB. # # There is sometimes a runtime benefit, but it is dependent on the number of # threads that the :class:`~torchcodec.decoders.VideoDecoder` tells FFmpeg @@ -251,8 +248,8 @@ def sample_decoder_transforms(num_threads: int): decoder = VideoDecoder( penguin_video_path, transforms = [ + v2.CenterCrop(size=(1280, 1664)), v2.Resize(size=(480, 640)), - v2.CenterCrop(size=(315, 220)) ], seek_mode="approximate", num_ffmpeg_threads=num_threads, @@ -266,6 +263,8 @@ def sample_decoder_transforms(num_threads: int): def sample_torchvision_transforms(num_threads: int): + if num_threads > 0: + torch.set_num_threads(num_threads) decoder = VideoDecoder( penguin_video_path, seek_mode="approximate", @@ -276,12 +275,14 @@ def sample_torchvision_transforms(num_threads: int): num_clips=1, num_frames_per_clip=200 ) - transformed_frames = [] - for frame in frames.data[0]: - frame = v2.Resize(size=(480, 640))(frame) - frame = v2.CenterCrop(size=(315, 220))(frame) - transformed_frames.append(frame) - assert len(transformed_frames) == 200 + transforms = v2.Compose( + [ + v2.CenterCrop(size=(1280, 1664)), + v2.Resize(size=(480, 640)), + ] + ) + transformed_frames = transforms(frames.data) + assert transformed_frames.shape[1] == 200 # %% # When the :class:`~torchcodec.decoders.VideoDecoder` object sets the number of @@ -296,8 +297,8 @@ def sample_torchvision_transforms(num_threads: int): # %% # The reason is that FFmpeg is applying the decoder transforms in parallel. -# However, if the number of threads is 1 (as is the default), then there often is no -# runtime benefit to using decoder transforms. Using the TorchVision transforms may +# However, if the number of threads is 1 (as is the default), then there is often +# less benefit to using decoder transforms. Using the TorchVision transforms may # even be faster! 
print(f"decoder transforms: {bench(sample_decoder_transforms, num_threads=1)}") From ade732caadb313bba3570a20b8863f8e6d1a281f Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Fri, 12 Dec 2025 08:29:30 -0800 Subject: [PATCH 6/7] Address more review comments --- docs/source/index.rst | 8 ++++++++ examples/decoding/transforms.py | 9 +++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 4311d2e0d..5d9d58bcb 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -84,6 +84,14 @@ Decoding How to sample regular and random clips from a video + .. grid-item-card:: :octicon:`file-code;1em` + Decoder transforms + :img-top: _static/img/card-background.svg + :link: generated_examples/decoding/transforms.html + :link-type: url + + How to apply transforms while decoding + Encoding ^^^^^^^^ diff --git a/examples/decoding/transforms.py b/examples/decoding/transforms.py index 268a857c3..f1fbb2a13 100644 --- a/examples/decoding/transforms.py +++ b/examples/decoding/transforms.py @@ -13,12 +13,13 @@ the :class:`~torchcodec.decoders.VideoDecoder` class. This parameter allows us to specify a list of :class:`torchcodec.transforms.DecoderTransform` or :class:`torchvision.transforms.v2.Transform` objects. These objects serve as -transform specificiations that the :class:`~torchcodec.decoders.VideoDecoder` +transform specifications that the :class:`~torchcodec.decoders.VideoDecoder` will apply during the decoding process. """ # %% -# First, a bit of boilerplate and definitions that we will use later: +# First, a bit of boilerplate, definitions that we will use later. You can skip +# ahead to our :ref:`example_video` or :ref:`applying_transforms`. import torch @@ -58,6 +59,8 @@ def plot(frames: torch.Tensor, title : str | None = None): plt.tight_layout() # %% +# .. _example_video: +# # Our example video # ----------------- # @@ -90,6 +93,8 @@ def plot(frames: torch.Tensor, title : str | None = None): # specify image dimensions. # %% +# .. _applying_transforms: +# # Applying transforms during pre-processing # ----------------------------------------- # From 8ea538470696bff64c41cee28d05fba0f0ddf90b Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Fri, 12 Dec 2025 13:32:00 -0800 Subject: [PATCH 7/7] More clarifications --- examples/decoding/transforms.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/examples/decoding/transforms.py b/examples/decoding/transforms.py index f1fbb2a13..1c3920915 100644 --- a/examples/decoding/transforms.py +++ b/examples/decoding/transforms.py @@ -5,6 +5,9 @@ # LICENSE file in the root directory of this source tree. """ +.. meta:: + :description: Learn how to apply transforms during video decoding for improved memory efficiency and performance. + ======================================================= Decoder Transforms: Applying transforms during decoding ======================================================= @@ -167,8 +170,9 @@ def plot(frames: torch.Tensor, title : str | None = None): # # 1. The names are the same. # 2. Default behaviors are the same. -# 3. The parameters for the :class:`~torchcodec.transforms.DecoderTransform` object are a subset of the -# TorchVision :class:`~torchvision.transforms.v2.Transform` object. +# 3. The parameters for the :class:`~torchcodec.transforms.DecoderTransform` +# object are a subset of the TorchVision :class:`~torchvision.transforms.v2.Transform` +# object. # 4. 
Parameters with the same name control the same behavior and accept a
# subset of the same types.
# 5. The difference between the frames returned by a decoder transform and
# the complementary TorchVision transform are such that a model should
# not be able to tell the difference.
#
# .. note::
#
-# We do not encourage *intentionally* mixing usage of TorchCodec's decoder
-# transforms and TorchVision transforms. That is, if you use TorchCodec's
-# decoder transforms during training, you should also use them during
-# inference. And if you decode full frames and apply TorchVision's
-# transforms to those fully decoded frames during training, you should also
-# do the same during inference. We provide the similarity guarantees to mitigate
-# the harm when the two techniques are *unintentionally* mixed.
+# Applying the exact same transforms during training and inference is
+# important for model performance. For example, if you use decoder
+# transforms to resize frames during training, you should also use decoder
+# transforms to resize frames during inference. We provide the similarity
+# guarantees to mitigate the harm when the two techniques are
+# *unintentionally* mixed. That is, if you use decoder transforms to resize
+# frames during training, but use TorchVision's
+# :class:`~torchvision.transforms.v2.Resize` during inference, our guarantees
+# mitigate the harm to model performance. But we **recommend against** this kind of
+# mixing.
+#
+# It is appropriate and expected to use some decoder transforms and some TorchVision
+# transforms, as long as the exact same pre-processing operations are performed during
+# training and inference.

# %%
# Decoder transform pipelines