55
66import torch
77from torch import Tensor
8- from torchcodec ._core import add_video_stream , create_from_file , get_frames_by_pts
98from torchcodec .decoders import VideoDecoder
109from torchvision .transforms import v2
1110
12- DEFAULT_NUM_EXP = 20
1311
14-
15- def bench (f , * args , num_exp = DEFAULT_NUM_EXP , warmup = 1 ) -> Tensor :
12+ def bench (f , * args , num_exp , warmup = 1 ) -> Tensor :
1613
1714 for _ in range (warmup ):
1815 f (* args )
@@ -45,37 +42,55 @@ def report_stats(times: Tensor, unit: str = "ms", prefix: str = "") -> float:
4542
4643
def torchvision_resize(
    path: Path, pts_seconds: list[float], dims: tuple[int, int], num_threads: int
) -> Tensor:
    """Decode frames, then resize them on the CPU with torchvision.

    Benchmark baseline: decoding and resizing are two separate passes, so
    every frame is first materialized at full resolution.

    Args:
        path: Video file to decode.
        pts_seconds: Presentation timestamps (seconds) of the frames to fetch.
        dims: Target (height, width) for the resize.
        num_threads: FFmpeg thread count; 0 lets FFmpeg decide.

    Returns:
        A batched uint8 tensor of the resized frames, one per timestamp.
    """
    video = VideoDecoder(
        path, seek_mode="approximate", num_ffmpeg_threads=num_threads
    )
    frame_batch = video.get_frames_played_at(pts_seconds)
    resized = v2.Resize(size=dims)(frame_batch.data)
    # Sanity check: the decoder must hand back exactly one frame per timestamp.
    assert len(resized) == len(pts_seconds)
    return resized
5555
def torchvision_crop(
    path: Path, pts_seconds: list[float], dims: tuple[int, int], num_threads: int
) -> Tensor:
    """Decode frames, then center-crop them on the CPU with torchvision.

    Benchmark baseline: the crop runs as a second pass over fully decoded
    frames, mirroring :func:`torchvision_resize`.

    Args:
        path: Video file to decode.
        pts_seconds: Presentation timestamps (seconds) of the frames to fetch.
        dims: Crop size as (height, width), taken from the frame center.
        num_threads: FFmpeg thread count; 0 lets FFmpeg decide.

    Returns:
        A batched uint8 tensor of the cropped frames, one per timestamp.
    """
    video = VideoDecoder(
        path, seek_mode="approximate", num_ffmpeg_threads=num_threads
    )
    frame_batch = video.get_frames_played_at(pts_seconds)
    cropped = v2.CenterCrop(size=dims)(frame_batch.data)
    # Sanity check: one output frame per requested timestamp.
    assert len(cropped) == len(pts_seconds)
    return cropped
66+
67+
def decoder_resize(
    path: Path, pts_seconds: list[float], dims: tuple[int, int], num_threads: int
) -> Tensor:
    """Decode frames with the resize applied inside the decoder itself.

    The ``transforms=[v2.Resize(...)]`` argument pushes the resize into the
    decode pipeline, so frames never exist at full resolution in Python —
    this is the variant being compared against :func:`torchvision_resize`.

    Args:
        path: Video file to decode.
        pts_seconds: Presentation timestamps (seconds) of the frames to fetch.
        dims: Target (height, width) for the in-decoder resize.
        num_threads: FFmpeg thread count; 0 lets FFmpeg decide.

    Returns:
        A batched uint8 tensor of the resized frames, one per timestamp.
    """
    decoder = VideoDecoder(
        path,
        transforms=[v2.Resize(size=dims)],
        seek_mode="approximate",
        num_ffmpeg_threads=num_threads,
    )
    transformed_frames = decoder.get_frames_played_at(pts_seconds).data
    assert len(transformed_frames) == len(pts_seconds)
    # Fix: `transformed_frames` is already the `.data` tensor; the original
    # returned `transformed_frames.data`, a redundant second `.data` access
    # that was also inconsistent with `decoder_crop`.
    return transformed_frames
80+
81+
def decoder_crop(
    path: Path, pts_seconds: list[float], dims: tuple[int, int], num_threads: int
) -> Tensor:
    """Decode frames with the center-crop applied inside the decoder itself.

    Passing ``transforms=[v2.CenterCrop(...)]`` lets the decoder crop during
    decode — the variant being compared against :func:`torchvision_crop`.

    Args:
        path: Video file to decode.
        pts_seconds: Presentation timestamps (seconds) of the frames to fetch.
        dims: Crop size as (height, width), taken from the frame center.
        num_threads: FFmpeg thread count; 0 lets FFmpeg decide.

    Returns:
        A batched uint8 tensor of the cropped frames, one per timestamp.
    """
    video = VideoDecoder(
        path,
        transforms=[v2.CenterCrop(size=dims)],
        seek_mode="approximate",
        num_ffmpeg_threads=num_threads,
    )
    cropped = video.get_frames_played_at(pts_seconds).data
    # One output frame per requested timestamp.
    assert len(cropped) == len(pts_seconds)
    return cropped
7994
8095
8196def main ():
@@ -84,9 +99,27 @@ def main():
8499 parser .add_argument (
85100 "--num-exp" ,
86101 type = int ,
87- default = DEFAULT_NUM_EXP ,
102+ default = 5 ,
88103 help = "number of runs to average over" ,
89104 )
105+ parser .add_argument (
106+ "--num-threads" ,
107+ type = int ,
108+ default = 1 ,
109+ help = "number of threads to use; 0 means FFmpeg decides" ,
110+ )
111+ parser .add_argument (
112+ "--total-frame-fractions" ,
113+ nargs = "+" ,
114+ type = float ,
115+ default = [0.005 , 0.01 , 0.05 , 0.1 ],
116+ )
117+ parser .add_argument (
118+ "--input-dimension-fractions" ,
119+ nargs = "+" ,
120+ type = float ,
121+ default = [0.5 , 0.25 , 0.125 ],
122+ )
90123
91124 args = parser .parse_args ()
92125 path = Path (args .path )
@@ -100,10 +133,7 @@ def main():
100133
101134 input_height = metadata .height
102135 input_width = metadata .width
103- fraction_of_total_frames_to_sample = [0.005 , 0.01 , 0.05 , 0.1 ]
104- fraction_of_input_dimensions = [0.5 , 0.25 , 0.125 ]
105-
106- for num_fraction in fraction_of_total_frames_to_sample :
136+ for num_fraction in args .total_frame_fractions :
107137 num_frames_to_sample = math .ceil (metadata .num_frames * num_fraction )
108138 print (
109139 f"Sampling { num_fraction * 100 } %, { num_frames_to_sample } , of { metadata .num_frames } frames"
@@ -112,51 +142,49 @@ def main():
112142 i * duration / num_frames_to_sample for i in range (num_frames_to_sample )
113143 ]
114144
115- for dims_fraction in fraction_of_input_dimensions :
145+ for dims_fraction in args . input_dimension_fractions :
116146 dims = (int (input_height * dims_fraction ), int (input_width * dims_fraction ))
117147
118148 times = bench (
119- torchvision_resize , path , uniform_timestamps , dims , num_exp = args .num_exp
149+ torchvision_resize ,
150+ path ,
151+ uniform_timestamps ,
152+ dims ,
153+ args .num_threads ,
154+ num_exp = args .num_exp ,
120155 )
121156 report_stats (times , prefix = f"torchvision_resize({ dims } )" )
122157
123158 times = bench (
124- decoder_native_resize ,
159+ decoder_resize ,
125160 path ,
126161 uniform_timestamps ,
127162 dims ,
163+ args .num_threads ,
128164 num_exp = args .num_exp ,
129165 )
130- report_stats (times , prefix = f"decoder_native_resize({ dims } )" )
131- print ()
166+ report_stats (times , prefix = f"decoder_resize({ dims } )" )
132167
133- center_x = (input_height - dims [0 ]) // 2
134- center_y = (input_width - dims [1 ]) // 2
135168 times = bench (
136169 torchvision_crop ,
137170 path ,
138171 uniform_timestamps ,
139172 dims ,
140- center_x ,
141- center_y ,
173+ args .num_threads ,
142174 num_exp = args .num_exp ,
143175 )
144- report_stats (
145- times , prefix = f"torchvision_crop({ dims } , { center_x } , { center_y } )"
146- )
176+ report_stats (times , prefix = f"torchvision_crop({ dims } )" )
147177
148178 times = bench (
149- decoder_native_crop ,
179+ decoder_crop ,
150180 path ,
151181 uniform_timestamps ,
152182 dims ,
153- center_x ,
154- center_y ,
183+ args .num_threads ,
155184 num_exp = args .num_exp ,
156185 )
157- report_stats (
158- times , prefix = f"decoder_native_crop({ dims } , { center_x } , { center_y } )"
159- )
186+ report_stats (times , prefix = f"decoder_crop({ dims } )" )
187+
160188 print ()
161189
162190
0 commit comments