From 138e22ed3b5169a837fd7c9b31a61539fcc562a7 Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Fri, 5 Dec 2025 16:58:50 -0800 Subject: [PATCH 1/7] Initial sketch --- docs/source/conf.py | 1 + examples/decoding/transforms.py | 141 ++++++++++++++++++++++++++++++++ 2 files changed, 142 insertions(+) create mode 100644 examples/decoding/transforms.py diff --git a/docs/source/conf.py b/docs/source/conf.py index b26108561..b683fda64 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -83,6 +83,7 @@ def __call__(self, filename): "sampling.py", "parallel_decoding.py", "custom_frame_mappings.py", + "transforms.py", ] else: assert "examples/encoding" in self.src_dir diff --git a/examples/decoding/transforms.py b/examples/decoding/transforms.py new file mode 100644 index 000000000..695b3dd18 --- /dev/null +++ b/examples/decoding/transforms.py @@ -0,0 +1,141 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +=================================================================== +Decoder Transforms: Needs a tagline +=================================================================== + +In this example, we will describe the ``transforms`` parameter of the +:class:`~torchcodec.decoders.VideoDecoder` class. +""" + +# %% +# First, a bit of boilerplate and definitions. + + +import torch +import requests +import tempfile +from pathlib import Path +import shutil +import subprocess +from time import perf_counter_ns +from IPython.display import Video + +def store_video_to(url: str, local_video_path: Path): + response = requests.get(url, headers={"User-Agent": ""}) + if response.status_code != 200: + raise RuntimeError(f"Failed to download video. {response.status_code = }.") + + with open(local_video_path, 'wb') as f: + for chunk in response.iter_content(): + f.write(chunk) + +def plot(frames: torch.Tensor, title : str | None = None): + try: + from torchvision.utils import make_grid + from torchvision.transforms.v2.functional import to_pil_image + import matplotlib.pyplot as plt + except ImportError: + print("Cannot plot, please run `pip install torchvision matplotlib`") + return + + plt.rcParams["savefig.bbox"] = "tight" + dpi = 300 + fig, ax = plt.subplots(figsize=(800/dpi, 600/dpi), dpi=dpi) + ax.imshow(to_pil_image(make_grid(frames))) + ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) + if title is not None: + ax.set_title(title, fontsize=6) + plt.tight_layout() + +# %% +# Our example video +# ----------------- +# +# We'll download a video from the internet and store it locally. We're +# purposefully retrieving a high resolution video to demonstrate using +# transforms to modify dimensions. + +# Video source: https://www.pexels.com/video/an-african-penguin-at-the-beach-9140346/ +# Author: Taryn Elliott. +url = "https://videos.pexels.com/video-files/9140346/9140346-uhd_3840_2160_25fps.mp4" + +temp_dir = tempfile.mkdtemp() +penguin_video_path = Path(temp_dir) / "penguin.mp4" +store_video_to(url, penguin_video_path) + +from torchcodec.decoders import VideoDecoder +print(f"Penguin video metadata: {VideoDecoder(penguin_video_path).metadata}") + +# %% +# Some stuff about the video itself, including its resolution of 3840x2160. + +# %% +# Applying transforms during pre-processing +# ----------------------------------------- +# +# There are lots of reasons to apply transforms to video frames during pre-proc +# (list them). 
A typical example might look like: + +from torchvision.transforms import v2 + +full_decoder = VideoDecoder(penguin_video_path) +full_mid_frame = full_decoder[465] # mid-point of the video +resized_post_mid_frame = v2.Resize(size=(360, 640))(full_mid_frame) + +plot(resized_post_mid_frame, title="Resized to 360x640 after decoding") + +# %% +# But we can now do it: +resize_decoder = VideoDecoder( + penguin_video_path, + transforms=[v2.Resize(size=(360, 640))] +) +resized_during_mid_frame = resize_decoder[465] + +plot(resized_during_mid_frame, title="Resized to 360x640 during decoding") + +# %% +# TorchCodec's relationship with TorchVision transforms +# ----------------------------------------------------- +# Talk about the relationship between TorchVision transforms and TorchCodec +# decoder transforms. Importantly, they're not identical: +abs_diff = (resized_post_mid_frame.float() - resized_during_mid_frame.float()).abs() +(abs_diff == 0).all() + +# %% +# But they're close enough that models won't be able to tell a difference: +(abs_diff <= 1).float().mean() >= 0.998 + +# %% +# Transform pipelines +# ------------------- +# But wait - there's more! + +crop_resize_decoder = VideoDecoder( + penguin_video_path, + transforms = [ + v2.Resize(size=(360, 640)), + v2.CenterCrop(size=(300, 200)) + ] +) +crop_resized_during_mid_frame = crop_resize_decoder[465] +plot(crop_resized_during_mid_frame, title="Resized to 360x640 during decoding then center cropped") + +# %% +# We also support `RandomCrop`. Reach out if there are particular transforms you want! + +# %% +# Performance +# ----------- +# +# The main motivation for doing this is performance. + +# %% +shutil.rmtree(temp_dir) +# %% From 408ac57754bb967d9b8d69bcd8fa53f2f2591bd1 Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Tue, 9 Dec 2025 11:40:52 -0800 Subject: [PATCH 2/7] Commit to move on --- examples/decoding/transforms.py | 80 ++++++++++++++++++++++++++++++--- 1 file changed, 73 insertions(+), 7 deletions(-) diff --git a/examples/decoding/transforms.py b/examples/decoding/transforms.py index 695b3dd18..36093b3e9 100644 --- a/examples/decoding/transforms.py +++ b/examples/decoding/transforms.py @@ -24,7 +24,6 @@ import shutil import subprocess from time import perf_counter_ns -from IPython.display import Video def store_video_to(url: str, local_video_path: Path): response = requests.get(url, headers={"User-Agent": ""}) @@ -86,19 +85,19 @@ def plot(frames: torch.Tensor, title : str | None = None): full_decoder = VideoDecoder(penguin_video_path) full_mid_frame = full_decoder[465] # mid-point of the video -resized_post_mid_frame = v2.Resize(size=(360, 640))(full_mid_frame) +resized_post_mid_frame = v2.Resize(size=(480, 640))(full_mid_frame) -plot(resized_post_mid_frame, title="Resized to 360x640 after decoding") +plot(resized_post_mid_frame, title="Resized to 480x640 after decoding") # %% # But we can now do it: resize_decoder = VideoDecoder( penguin_video_path, - transforms=[v2.Resize(size=(360, 640))] + transforms=[v2.Resize(size=(480, 640))] ) resized_during_mid_frame = resize_decoder[465] -plot(resized_during_mid_frame, title="Resized to 360x640 during decoding") +plot(resized_during_mid_frame, title="Resized to 480x640 during decoding") # %% # TorchCodec's relationship with TorchVision transforms @@ -120,12 +119,12 @@ def plot(frames: torch.Tensor, title : str | None = None): crop_resize_decoder = VideoDecoder( penguin_video_path, transforms = [ - v2.Resize(size=(360, 640)), + v2.Resize(size=(480, 640)), v2.CenterCrop(size=(300, 200)) ] 
) crop_resized_during_mid_frame = crop_resize_decoder[465] -plot(crop_resized_during_mid_frame, title="Resized to 360x640 during decoding then center cropped") +plot(crop_resized_during_mid_frame, title="Resized to 480x640 during decoding then center cropped") # %% # We also support `RandomCrop`. Reach out if there are particular transforms you want! @@ -136,6 +135,73 @@ def plot(frames: torch.Tensor, title : str | None = None): # # The main motivation for doing this is performance. +import os +import psutil + +def bench(f, average_over=5, warmup=2, **f_kwargs): + + for _ in range(warmup): + f(**f_kwargs) + + process = psutil.Process(os.getpid()) + times = [] + memory = [] + for _ in range(average_over): + #start_rss = process.memory_info().rss / 1024 / 1024 + start_time = perf_counter_ns() + f(**f_kwargs) + end_time = perf_counter_ns() + #end_rss = process.memory_info().rss / 1024 / 1024 + times.append(end_time - start_time) + #memory.append(end_rss - start_rss) + + times = torch.tensor(times) * 1e-6 # ns to ms + times_std = times.std().item() + times_med = times.median().item() + #memory = torch.tensor(memory) + #memory_std = memory.std().item() + #memory_med = memory.median().item() + print(f"{times_med = :.2f}ms +- {times_std:.2f}") + #print(f"{memory_med = :.2f}MB +- {memory_std:.2f}") + +from torchcodec import samplers + +def sample_decoder_transforms(): + decoder = VideoDecoder( + penguin_video_path, + transforms = [ + #v2.Resize(size=(480, 640)), + v2.CenterCrop(size=(300, 200)) + ], + seek_mode="approximate", + ) + transformed_frames = samplers.clips_at_regular_indices( + decoder, + num_clips=1, + num_frames_per_clip=200 + ) + assert len(transformed_frames.data[0]) == 200 + +def sample_torchvision_transforms(): + decoder = VideoDecoder( + penguin_video_path, + seek_mode="approximate" + ) + frames = samplers.clips_at_regular_indices( + decoder, + num_clips=1, + num_frames_per_clip=200 + ) + transformed_frames = [] + for frame in frames.data[0]: + #frame = v2.Resize(size=(480, 640))(frame) + frame = v2.CenterCrop(size=(300, 200))(frame) + transformed_frames.append(frame) + assert len(transformed_frames) == 200 + +bench(sample_decoder_transforms) +bench(sample_torchvision_transforms) + # %% shutil.rmtree(temp_dir) # %% From 2a16bbd8b70d8d25a94d8ee20da9c1d5be9d50b3 Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Wed, 10 Dec 2025 20:38:18 -0800 Subject: [PATCH 3/7] First draft --- examples/decoding/transforms.py | 217 +++++++++++++++++++++++--------- 1 file changed, 161 insertions(+), 56 deletions(-) diff --git a/examples/decoding/transforms.py b/examples/decoding/transforms.py index 36093b3e9..815796e91 100644 --- a/examples/decoding/transforms.py +++ b/examples/decoding/transforms.py @@ -5,16 +5,20 @@ # LICENSE file in the root directory of this source tree. """ -=================================================================== -Decoder Transforms: Needs a tagline -=================================================================== - -In this example, we will describe the ``transforms`` parameter of the -:class:`~torchcodec.decoders.VideoDecoder` class. +======================================================= +Decoder Transforms: Applying transforms during decoding +======================================================= + +In this example, we will demonstrate how to use the ``transforms`` parameter of +the :class:`~torchcodec.decoders.VideoDecoder` class. 
This parameter allows us +to specify a list of :class:`~torchcodec.transforms.DecoderTransform` or +:class:`~torchvision.transforms.v2.Transform` objects. These objects serve as +transform specificiations that the :class:`~torchcodec.decoders.VideoDecoder` +will apply during the decoding process. """ # %% -# First, a bit of boilerplate and definitions. +# First, a bit of boilerplate and definitions that we will use later: import torch @@ -58,7 +62,7 @@ def plot(frames: torch.Tensor, title : str | None = None): # # We'll download a video from the internet and store it locally. We're # purposefully retrieving a high resolution video to demonstrate using -# transforms to modify dimensions. +# transforms to reduce the dimensions. # Video source: https://www.pexels.com/video/an-african-penguin-at-the-beach-9140346/ # Author: Taryn Elliott. @@ -72,39 +76,74 @@ def plot(frames: torch.Tensor, title : str | None = None): print(f"Penguin video metadata: {VideoDecoder(penguin_video_path).metadata}") # %% -# Some stuff about the video itself, including its resolution of 3840x2160. +# As shown above, the video is 37 seconds long and has a height of 2160 pixels +# and a width of 3840 pixels. +# +# .. note:: +# +# The colloquial way to report the dimensions of this video would be as +# 3840x2160; that is, (`width`, `height`). In the PyTorch ecosystem, image +# dimensions are typically expressed as (`height`, `width`). The remainder +# of this tutorial uses the PyTorch convention of (`height`, `width`) to +# specify image dimensions. # %% # Applying transforms during pre-processing # ----------------------------------------- # -# There are lots of reasons to apply transforms to video frames during pre-proc -# (list them). A typical example might look like: +# A pre-processing pipeline for videos during training will typically apply a +# set of transforms for three main reasons: +# +# 1. **Normalization**: Videos can have many different lengths, resolutions, +# and frame rates. Normalizing all videos to the same characteristics +# leads to better model performance. +# 2. **Data reduction**: Training on higher resolution frames may lead to better +# model performance, but it will be more expensive both at training and +# inference time. As a consequence, many video pre-processing pipelines reduce +# frame dimensions through resizing and cropping. +# 3. **Variety**: Applying random transforms (flips, crops, perspective shifts) +# to the same frames during training can improve model performance. +# +# Below is a simple example of applying the +# :class:`~torchvision.transforms.v2.Resize` transform to a single frame: from torchvision.transforms import v2 full_decoder = VideoDecoder(penguin_video_path) -full_mid_frame = full_decoder[465] # mid-point of the video -resized_post_mid_frame = v2.Resize(size=(480, 640))(full_mid_frame) +frame = full_decoder[5] +resized_after = v2.Resize(size=(480, 640))(frame) -plot(resized_post_mid_frame, title="Resized to 480x640 after decoding") +plot(resized_after, title="Resized to 480x640 after decoding") # %% -# But we can now do it: +# In the example above, ``full_decoder`` returns a video frame that has the +# dimensions (2160, 3840) which is then resized down to (480, 640). 
But with the +# ``transforms`` parameter of :class:`~torchcodec.decoders.VideoDecoder` we can +# specify for the resize to happen during decoding: + resize_decoder = VideoDecoder( penguin_video_path, transforms=[v2.Resize(size=(480, 640))] ) -resized_during_mid_frame = resize_decoder[465] +resized_during = resize_decoder[5] -plot(resized_during_mid_frame, title="Resized to 480x640 during decoding") +plot(resized_during, title="Resized to 480x640 during decoding") # %% -# TorchCodec's relationship with TorchVision transforms +# TorchCodec's relationship to TorchVision transforms # ----------------------------------------------------- -# Talk about the relationship between TorchVision transforms and TorchCodec -# decoder transforms. Importantly, they're not identical: -abs_diff = (resized_post_mid_frame.float() - resized_during_mid_frame.float()).abs() +# Notably, in our examples we are passing in TorchVision +# :class:`~torchvision.transforms.v2.Transform` objects as our transforms. We +# would have gotten equivalent behavior if we had passed in the +# :class:`~torchcodec.transforms.Resize` object that is a part of TorchCodec. +# :class:`~torchcodec.decoders.VideoDecoder` accepts both objects as a matter of +# convenience and to clarify the relationship between the transforms that TorchCodec +# applies and the transforms that TorchVision offers. +# +# Importantly, the two frames are not identical, even though we can see they +# *look* very similar: + +abs_diff = (resized_after.float() - resized_during.float()).abs() (abs_diff == 0).all() # %% @@ -112,68 +151,107 @@ def plot(frames: torch.Tensor, title : str | None = None): (abs_diff <= 1).float().mean() >= 0.998 # %% -# Transform pipelines -# ------------------- -# But wait - there's more! +# While :class:`~torchcodec.decoders.VideoDecoder` accepts TorchVision transforms as +# *specifications*, it is not actually using the TorchVision implementation of these +# transforms. Instead, it is mapping them to equivalent +# `FFmpeg filters `_. That is, +# :class:`torchvision.transforms.v2.Resize` is mapped to +# `scale `_ and +# :class:`torchvision.transforms.v2.CenterCrop` is mapped to +# `crop `_. +# +# The relationships we ensure between TorchCodec :class:`~torchcodec.transforms.DecoderTransform` objects +# and TorchVision :class:`~torchvision.transforms.v2.Transform` objects are: +# +# 1. The names are the same. +# 2. Default behaviors are the same. +# 3. The parameters for the :class:`~torchcodec.transforms.DecoderTransform` object are a subset of the +# TorchVision :class:`~torchvision.transforms.v2.Transform` object. +# 4. Parameters with the same name control the same behavior and accept a +# subset of the same types. +# 5. The difference between the frames returned by a decoder transform and +# the complementary TorchVision transform are such that a model should +# not be able to tell the difference. +# +# .. note:: +# +# We do not encourage *intentionally* mixing usage of TorchCodec's decoder +# transforms and TorchVision transforms. That is, if you use TorchCodec's +# decoder transforms during training, you should also use them during +# inference. And if you decode full frames and apply TorchVision's +# transforms to those fully decoded frames during training, you should also +# do the same during inference. We provide the similarity guarantees to mitigate +# the harm when the two techniques are *unintentionally* mixed. 
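
# %%
# To make the relationship concrete, here is a minimal sketch of the earlier
# resize example written against TorchCodec's own transform object rather than
# TorchVision's. It assumes, per the guarantees above, that
# :class:`torchcodec.transforms.Resize` accepts the same ``size`` parameter as
# its TorchVision counterpart:

from torchcodec import transforms as tc_transforms

# Assumption: torchcodec.transforms.Resize mirrors TorchVision's
# Resize(size=...) signature, as the guarantees above describe.
tc_resize_decoder = VideoDecoder(
    penguin_video_path,
    transforms=[tc_transforms.Resize(size=(480, 640))],
)
tc_resized_during = tc_resize_decoder[5]
assert tc_resized_during.shape[-2:] == (480, 640)

plot(tc_resized_during, title="Resized to 480x640 with torchcodec.transforms.Resize")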
+ +# %% +# Decoder transform pipelines +# --------------------------- +# So far, we've only provided a single transform to the `transform` parameter to +# :class:`~torchcodec.decoders.VideoDecoder`. But it +# actually accepts a list of transforms, which become a pipeline of transforms. +# The order of the list matters: the first transform in the list will receive +# the originally decoded frame. The output of that transform becomes the input +# to the next transform in the list, and so on. +# +# A simple example: crop_resize_decoder = VideoDecoder( penguin_video_path, transforms = [ v2.Resize(size=(480, 640)), - v2.CenterCrop(size=(300, 200)) + v2.CenterCrop(size=(315, 220)) ] ) -crop_resized_during_mid_frame = crop_resize_decoder[465] -plot(crop_resized_during_mid_frame, title="Resized to 480x640 during decoding then center cropped") +crop_resized_during = crop_resize_decoder[5] +plot(crop_resized_during, title="Resized to 480x640 during decoding then center cropped") # %% -# We also support `RandomCrop`. Reach out if there are particular transforms you want! - -# %% -# Performance -# ----------- +# Performance: memory efficiency and speed +# ---------------------------------------- # -# The main motivation for doing this is performance. - -import os -import psutil - -def bench(f, average_over=5, warmup=2, **f_kwargs): +# The main motivation for decoder transforms is *memory efficiency*, +# particularly when applying transforms that reduce the size of a frame, such +# as resize and crop. Because the transforms are applied during decoding, the +# full frame is never returned to the Python layer. As a result, there is +# significantly less pressure on the Python gargabe collector. +# +# In `benchmarks `_ +# reducing frames from (1080, 1920) down to (135, 240), we have observed a +# reduction in peak resident set size from 4.3 GB to 0.4 MB. +# +# There is sometimes a runtime benefit, but it is dependent on the number of +# threads that the :class:`~torchcodec.decoders.VideoDecoder` tells FFmpeg +# to use. 
We define the following benchmark function, as well as the functions +# to benchmark: +def bench(f, average_over=3, warmup=1, **f_kwargs): for _ in range(warmup): f(**f_kwargs) - process = psutil.Process(os.getpid()) times = [] memory = [] for _ in range(average_over): - #start_rss = process.memory_info().rss / 1024 / 1024 start_time = perf_counter_ns() f(**f_kwargs) end_time = perf_counter_ns() - #end_rss = process.memory_info().rss / 1024 / 1024 times.append(end_time - start_time) - #memory.append(end_rss - start_rss) times = torch.tensor(times) * 1e-6 # ns to ms times_std = times.std().item() times_med = times.median().item() - #memory = torch.tensor(memory) - #memory_std = memory.std().item() - #memory_med = memory.median().item() - print(f"{times_med = :.2f}ms +- {times_std:.2f}") - #print(f"{memory_med = :.2f}MB +- {memory_std:.2f}") + return f"{times_med = :.2f}ms +- {times_std:.2f}" from torchcodec import samplers -def sample_decoder_transforms(): +def sample_decoder_transforms(num_threads: int): decoder = VideoDecoder( penguin_video_path, transforms = [ - #v2.Resize(size=(480, 640)), - v2.CenterCrop(size=(300, 200)) + v2.Resize(size=(480, 640)), + v2.CenterCrop(size=(315, 220)) ], seek_mode="approximate", + num_ffmpeg_threads=num_threads, ) transformed_frames = samplers.clips_at_regular_indices( decoder, @@ -182,10 +260,11 @@ def sample_decoder_transforms(): ) assert len(transformed_frames.data[0]) == 200 -def sample_torchvision_transforms(): +def sample_torchvision_transforms(num_threads: int): decoder = VideoDecoder( penguin_video_path, - seek_mode="approximate" + seek_mode="approximate", + num_ffmpeg_threads=num_threads, ) frames = samplers.clips_at_regular_indices( decoder, @@ -194,14 +273,40 @@ def sample_torchvision_transforms(): ) transformed_frames = [] for frame in frames.data[0]: - #frame = v2.Resize(size=(480, 640))(frame) - frame = v2.CenterCrop(size=(300, 200))(frame) + frame = v2.Resize(size=(480, 640))(frame) + frame = v2.CenterCrop(size=(315, 220))(frame) transformed_frames.append(frame) assert len(transformed_frames) == 200 -bench(sample_decoder_transforms) -bench(sample_torchvision_transforms) +# %% +# When the :class:`~torchcodec.decoders.VideoDecoder` object sets the number of +# FFmpeg threads to 0, that tells FFmpeg to determine how many threads to use +# based on what is available on the current system. In such cases, decoder transforms +# will tend to outperform getting back a full frame and applying TorchVision transforms +# sequentially: + +print(f"decoder transforms: {bench(sample_decoder_transforms, num_threads=0)}") +print(f"torchvision transform: {bench(sample_torchvision_transforms, num_threads=0)}") # %% +# The reason is that FFmpeg is applying the decoder transforms in parallel. +# However, if the number of threads is 1 (as is the default), then there often is no +# runtime benefit to using decoder transforms. Using the TorchVision transforms may +# even be faster! + +print(f"decoder transforms: {bench(sample_decoder_transforms, num_threads=1)}") +print(f"torchvision transform: {bench(sample_torchvision_transforms, num_threads=1)}") + +# %% +# In brief, our performance guidance is: +# +# 1. If you are applying a transform pipeline that signficantly reduces +# the dimensions of your input frames and memory efficiency matters, use +# decoder transforms. +# 2. If you are using multiple FFmpeg threads, decoder transforms may be +# faster. Experiment with your setup to verify. +# 3. 
If you are using a single FFmpeg thread, then decoder transforms may +# be slower. Experiment with your setup to verify. + shutil.rmtree(temp_dir) # %% From ba993ade50aed1bb637622f20647020fb5df6787 Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Wed, 10 Dec 2025 20:50:27 -0800 Subject: [PATCH 4/7] Lint --- examples/decoding/transforms.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/examples/decoding/transforms.py b/examples/decoding/transforms.py index 815796e91..95201e859 100644 --- a/examples/decoding/transforms.py +++ b/examples/decoding/transforms.py @@ -26,9 +26,9 @@ import tempfile from pathlib import Path import shutil -import subprocess from time import perf_counter_ns + def store_video_to(url: str, local_video_path: Path): response = requests.get(url, headers={"User-Agent": ""}) if response.status_code != 200: @@ -38,6 +38,7 @@ def store_video_to(url: str, local_video_path: Path): for chunk in response.iter_content(): f.write(chunk) + def plot(frames: torch.Tensor, title : str | None = None): try: from torchvision.utils import make_grid @@ -49,7 +50,7 @@ def plot(frames: torch.Tensor, title : str | None = None): plt.rcParams["savefig.bbox"] = "tight" dpi = 300 - fig, ax = plt.subplots(figsize=(800/dpi, 600/dpi), dpi=dpi) + fig, ax = plt.subplots(figsize=(800 / dpi, 600 / dpi), dpi=dpi) ax.imshow(to_pil_image(make_grid(frames))) ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) if title is not None: @@ -64,6 +65,7 @@ def plot(frames: torch.Tensor, title : str | None = None): # purposefully retrieving a high resolution video to demonstrate using # transforms to reduce the dimensions. + # Video source: https://www.pexels.com/video/an-african-penguin-at-the-beach-9140346/ # Author: Taryn Elliott. url = "https://videos.pexels.com/video-files/9140346/9140346-uhd_3840_2160_25fps.mp4" @@ -224,12 +226,12 @@ def plot(frames: torch.Tensor, title : str | None = None): # to use. 
We define the following benchmark function, as well as the functions # to benchmark: + def bench(f, average_over=3, warmup=1, **f_kwargs): for _ in range(warmup): f(**f_kwargs) times = [] - memory = [] for _ in range(average_over): start_time = perf_counter_ns() f(**f_kwargs) @@ -241,8 +243,10 @@ def bench(f, average_over=3, warmup=1, **f_kwargs): times_med = times.median().item() return f"{times_med = :.2f}ms +- {times_std:.2f}" + from torchcodec import samplers + def sample_decoder_transforms(num_threads: int): decoder = VideoDecoder( penguin_video_path, @@ -260,6 +264,7 @@ def sample_decoder_transforms(num_threads: int): ) assert len(transformed_frames.data[0]) == 200 + def sample_torchvision_transforms(num_threads: int): decoder = VideoDecoder( penguin_video_path, @@ -285,6 +290,7 @@ def sample_torchvision_transforms(num_threads: int): # will tend to outperform getting back a full frame and applying TorchVision transforms # sequentially: + print(f"decoder transforms: {bench(sample_decoder_transforms, num_threads=0)}") print(f"torchvision transform: {bench(sample_torchvision_transforms, num_threads=0)}") From 1a37b74b5ef2a766d4d274e2a6e9007efe4ce7b6 Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Thu, 11 Dec 2025 20:49:33 -0800 Subject: [PATCH 5/7] Apply edits from review --- docs/source/api_ref_transforms.rst | 4 +- examples/decoding/transforms.py | 79 +++++++++++++++--------------- 2 files changed, 43 insertions(+), 40 deletions(-) diff --git a/docs/source/api_ref_transforms.rst b/docs/source/api_ref_transforms.rst index dee3f0c3a..18bffabae 100644 --- a/docs/source/api_ref_transforms.rst +++ b/docs/source/api_ref_transforms.rst @@ -4,9 +4,11 @@ torchcodec.transforms ===================== +.. automodule:: torchcodec.transforms + .. currentmodule:: torchcodec.transforms -For a tutorial, see: TODO_DECODER_TRANSFORMS_TUTORIAL. +For a tutorial, see: :ref:`sphx_glr_generated_examples_decoding_transforms.py`. .. autosummary:: :toctree: generated/ diff --git a/examples/decoding/transforms.py b/examples/decoding/transforms.py index 95201e859..268a857c3 100644 --- a/examples/decoding/transforms.py +++ b/examples/decoding/transforms.py @@ -11,8 +11,8 @@ In this example, we will demonstrate how to use the ``transforms`` parameter of the :class:`~torchcodec.decoders.VideoDecoder` class. This parameter allows us -to specify a list of :class:`~torchcodec.transforms.DecoderTransform` or -:class:`~torchvision.transforms.v2.Transform` objects. These objects serve as +to specify a list of :class:`torchcodec.transforms.DecoderTransform` or +:class:`torchvision.transforms.v2.Transform` objects. These objects serve as transform specificiations that the :class:`~torchcodec.decoders.VideoDecoder` will apply during the decoding process. """ @@ -94,20 +94,9 @@ def plot(frames: torch.Tensor, title : str | None = None): # ----------------------------------------- # # A pre-processing pipeline for videos during training will typically apply a -# set of transforms for three main reasons: -# -# 1. **Normalization**: Videos can have many different lengths, resolutions, -# and frame rates. Normalizing all videos to the same characteristics -# leads to better model performance. -# 2. **Data reduction**: Training on higher resolution frames may lead to better -# model performance, but it will be more expensive both at training and -# inference time. As a consequence, many video pre-processing pipelines reduce -# frame dimensions through resizing and cropping. -# 3. 
**Variety**: Applying random transforms (flips, crops, perspective shifts) -# to the same frames during training can improve model performance. -# -# Below is a simple example of applying the -# :class:`~torchvision.transforms.v2.Resize` transform to a single frame: +# set of transforms for a variety of reasons. Below is a simple example of +# applying TorchVision's :class:`~torchvision.transforms.v2.Resize` transform to a single +# frame **after** the decoder returns it: from torchvision.transforms import v2 @@ -121,7 +110,7 @@ def plot(frames: torch.Tensor, title : str | None = None): # In the example above, ``full_decoder`` returns a video frame that has the # dimensions (2160, 3840) which is then resized down to (480, 640). But with the # ``transforms`` parameter of :class:`~torchcodec.decoders.VideoDecoder` we can -# specify for the resize to happen during decoding: +# specify for the resize to happen **during** decoding! resize_decoder = VideoDecoder( penguin_video_path, @@ -135,9 +124,15 @@ def plot(frames: torch.Tensor, title : str | None = None): # TorchCodec's relationship to TorchVision transforms # ----------------------------------------------------- # Notably, in our examples we are passing in TorchVision -# :class:`~torchvision.transforms.v2.Transform` objects as our transforms. We +# :class:`~torchvision.transforms.v2.Transform` objects as our transforms. +# However, :class:`~torchcodec.decoders.VideoDecoder` accepts TorchVision +# transforms as a matter of convenience. TorchVision is **not required** to use +# decoder transforms. +# +# Every TorchVision transform that :class:`~torchcodec.decoders.VideoDecoder` accepts +# has a complementary transform defined in :mod:`torchcodec.transforms`. We # would have gotten equivalent behavior if we had passed in the -# :class:`~torchcodec.transforms.Resize` object that is a part of TorchCodec. +# :class:`torchcodec.transforms.Resize` object that is a part of TorchCodec. # :class:`~torchcodec.decoders.VideoDecoder` accepts both objects as a matter of # convenience and to clarify the relationship between the transforms that TorchCodec # applies and the transforms that TorchVision offers. @@ -150,16 +145,16 @@ def plot(frames: torch.Tensor, title : str | None = None): # %% # But they're close enough that models won't be able to tell a difference: -(abs_diff <= 1).float().mean() >= 0.998 +assert (abs_diff <= 1).float().mean() >= 0.998 # %% # While :class:`~torchcodec.decoders.VideoDecoder` accepts TorchVision transforms as # *specifications*, it is not actually using the TorchVision implementation of these # transforms. Instead, it is mapping them to equivalent # `FFmpeg filters `_. That is, -# :class:`torchvision.transforms.v2.Resize` is mapped to -# `scale `_ and -# :class:`torchvision.transforms.v2.CenterCrop` is mapped to +# :class:`torchvision.transforms.v2.Resize` and :class:`torchcodec.transforms.Resize` are mapped to +# `scale `_; and +# :class:`torchvision.transforms.v2.CenterCrop` and :class:`torchcodec.transforms.CenterCrop` are mapped to # `crop `_. # # The relationships we ensure between TorchCodec :class:`~torchcodec.transforms.DecoderTransform` objects @@ -188,7 +183,7 @@ def plot(frames: torch.Tensor, title : str | None = None): # %% # Decoder transform pipelines # --------------------------- -# So far, we've only provided a single transform to the `transform` parameter to +# So far, we've only provided a single transform to the ``transform`` parameter to # :class:`~torchcodec.decoders.VideoDecoder`. 
But it # actually accepts a list of transforms, which become a pipeline of transforms. # The order of the list matters: the first transform in the list will receive @@ -200,12 +195,12 @@ def plot(frames: torch.Tensor, title : str | None = None): crop_resize_decoder = VideoDecoder( penguin_video_path, transforms = [ + v2.CenterCrop(size=(1280, 1664)), v2.Resize(size=(480, 640)), - v2.CenterCrop(size=(315, 220)) ] ) crop_resized_during = crop_resize_decoder[5] -plot(crop_resized_during, title="Resized to 480x640 during decoding then center cropped") +plot(crop_resized_during, title="Center cropped then resized to 480x640") # %% # Performance: memory efficiency and speed @@ -213,13 +208,15 @@ def plot(frames: torch.Tensor, title : str | None = None): # # The main motivation for decoder transforms is *memory efficiency*, # particularly when applying transforms that reduce the size of a frame, such -# as resize and crop. Because the transforms are applied during decoding, the -# full frame is never returned to the Python layer. As a result, there is -# significantly less pressure on the Python gargabe collector. +# as resize and crop. Because the FFmpeg layer knows all of the transforms it +# needs to apply during decoding, it's able to efficiently reuse memory. +# Further, full resolution frames are never returned to the Python layer. As a +# result, there is significantly less total memory needed and less pressure on +# the Python garbage collector. # # In `benchmarks `_ # reducing frames from (1080, 1920) down to (135, 240), we have observed a -# reduction in peak resident set size from 4.3 GB to 0.4 MB. +# reduction in peak resident set size from 4.3 GB to 0.4 GB. # # There is sometimes a runtime benefit, but it is dependent on the number of # threads that the :class:`~torchcodec.decoders.VideoDecoder` tells FFmpeg @@ -251,8 +248,8 @@ def sample_decoder_transforms(num_threads: int): decoder = VideoDecoder( penguin_video_path, transforms = [ + v2.CenterCrop(size=(1280, 1664)), v2.Resize(size=(480, 640)), - v2.CenterCrop(size=(315, 220)) ], seek_mode="approximate", num_ffmpeg_threads=num_threads, @@ -266,6 +263,8 @@ def sample_decoder_transforms(num_threads: int): def sample_torchvision_transforms(num_threads: int): + if num_threads > 0: + torch.set_num_threads(num_threads) decoder = VideoDecoder( penguin_video_path, seek_mode="approximate", @@ -276,12 +275,14 @@ def sample_torchvision_transforms(num_threads: int): num_clips=1, num_frames_per_clip=200 ) - transformed_frames = [] - for frame in frames.data[0]: - frame = v2.Resize(size=(480, 640))(frame) - frame = v2.CenterCrop(size=(315, 220))(frame) - transformed_frames.append(frame) - assert len(transformed_frames) == 200 + transforms = v2.Compose( + [ + v2.CenterCrop(size=(1280, 1664)), + v2.Resize(size=(480, 640)), + ] + ) + transformed_frames = transforms(frames.data) + assert transformed_frames.shape[1] == 200 # %% # When the :class:`~torchcodec.decoders.VideoDecoder` object sets the number of @@ -296,8 +297,8 @@ def sample_torchvision_transforms(num_threads: int): # %% # The reason is that FFmpeg is applying the decoder transforms in parallel. -# However, if the number of threads is 1 (as is the default), then there often is no -# runtime benefit to using decoder transforms. Using the TorchVision transforms may +# However, if the number of threads is 1 (as is the default), then there is often +# less benefit to using decoder transforms. Using the TorchVision transforms may # even be faster! 
print(f"decoder transforms: {bench(sample_decoder_transforms, num_threads=1)}") From ade732caadb313bba3570a20b8863f8e6d1a281f Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Fri, 12 Dec 2025 08:29:30 -0800 Subject: [PATCH 6/7] Address more review comments --- docs/source/index.rst | 8 ++++++++ examples/decoding/transforms.py | 9 +++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 4311d2e0d..5d9d58bcb 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -84,6 +84,14 @@ Decoding How to sample regular and random clips from a video + .. grid-item-card:: :octicon:`file-code;1em` + Decoder transforms + :img-top: _static/img/card-background.svg + :link: generated_examples/decoding/transforms.html + :link-type: url + + How to apply transforms while decoding + Encoding ^^^^^^^^ diff --git a/examples/decoding/transforms.py b/examples/decoding/transforms.py index 268a857c3..f1fbb2a13 100644 --- a/examples/decoding/transforms.py +++ b/examples/decoding/transforms.py @@ -13,12 +13,13 @@ the :class:`~torchcodec.decoders.VideoDecoder` class. This parameter allows us to specify a list of :class:`torchcodec.transforms.DecoderTransform` or :class:`torchvision.transforms.v2.Transform` objects. These objects serve as -transform specificiations that the :class:`~torchcodec.decoders.VideoDecoder` +transform specifications that the :class:`~torchcodec.decoders.VideoDecoder` will apply during the decoding process. """ # %% -# First, a bit of boilerplate and definitions that we will use later: +# First, a bit of boilerplate, definitions that we will use later. You can skip +# ahead to our :ref:`example_video` or :ref:`applying_transforms`. import torch @@ -58,6 +59,8 @@ def plot(frames: torch.Tensor, title : str | None = None): plt.tight_layout() # %% +# .. _example_video: +# # Our example video # ----------------- # @@ -90,6 +93,8 @@ def plot(frames: torch.Tensor, title : str | None = None): # specify image dimensions. # %% +# .. _applying_transforms: +# # Applying transforms during pre-processing # ----------------------------------------- # From 8ea538470696bff64c41cee28d05fba0f0ddf90b Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Fri, 12 Dec 2025 13:32:00 -0800 Subject: [PATCH 7/7] More clarifications --- examples/decoding/transforms.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/examples/decoding/transforms.py b/examples/decoding/transforms.py index f1fbb2a13..1c3920915 100644 --- a/examples/decoding/transforms.py +++ b/examples/decoding/transforms.py @@ -5,6 +5,9 @@ # LICENSE file in the root directory of this source tree. """ +.. meta:: + :description: Learn how to apply transforms during video decoding for improved memory efficiency and performance. + ======================================================= Decoder Transforms: Applying transforms during decoding ======================================================= @@ -167,8 +170,9 @@ def plot(frames: torch.Tensor, title : str | None = None): # # 1. The names are the same. # 2. Default behaviors are the same. -# 3. The parameters for the :class:`~torchcodec.transforms.DecoderTransform` object are a subset of the -# TorchVision :class:`~torchvision.transforms.v2.Transform` object. +# 3. The parameters for the :class:`~torchcodec.transforms.DecoderTransform` +# object are a subset of the TorchVision :class:`~torchvision.transforms.v2.Transform` +# object. # 4. 
Parameters with the same name control the same behavior and accept a
# subset of the same types.
# 5. The difference between the frames returned by a decoder transform and
# the complementary TorchVision transform are such that a model should
# not be able to tell the difference.
#
# .. note::
#
-# We do not encourage *intentionally* mixing usage of TorchCodec's decoder
-# transforms and TorchVision transforms. That is, if you use TorchCodec's
-# decoder transforms during training, you should also use them during
-# inference. And if you decode full frames and apply TorchVision's
-# transforms to those fully decoded frames during training, you should also
-# do the same during inference. We provide the similarity guarantees to mitigate
-# the harm when the two techniques are *unintentionally* mixed.
+# Applying the exact same transforms during training and inference is
+# important for model performance. For example, if you use decoder
+# transforms to resize frames during training, you should also use decoder
+# transforms to resize frames during inference. We provide the similarity
+# guarantees to mitigate the harm when the two techniques are
+# *unintentionally* mixed. That is, if you use decoder transforms to resize
+# frames during training, but use TorchVision's
+# :class:`~torchvision.transforms.v2.Resize` during inference, our guarantees
+# mitigate the harm to model performance. But we **recommend against** this kind of
+# mixing.
+#
+# It is appropriate and expected to use some decoder transforms and some TorchVision
+# transforms, as long as the exact same pre-processing operations are performed during
+# training and inference.

# %%
# Decoder transform pipelines