214 changes: 140 additions & 74 deletions src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -53,74 +53,6 @@ pfnDisplayPictureCallback(void* pUserData, CUVIDPARSERDISPINFO* dispInfo) {
}

static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
Comment (Contributor Author):
All checks below have been moved into the new bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) function.

// Check decoder capabilities - same checks as DALI
auto caps = CUVIDDECODECAPS{};
caps.eCodecType = videoFormat->codec;
caps.eChromaFormat = videoFormat->chroma_format;
caps.nBitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
CUresult result = cuvidGetDecoderCaps(&caps);
TORCH_CHECK(result == CUDA_SUCCESS, "Failed to get decoder caps: ", result);

TORCH_CHECK(
caps.bIsSupported,
"Codec configuration not supported on this GPU. "
"Codec: ",
static_cast<int>(videoFormat->codec),
", chroma format: ",
static_cast<int>(videoFormat->chroma_format),
", bit depth: ",
videoFormat->bit_depth_luma_minus8 + 8);

TORCH_CHECK(
videoFormat->coded_width >= caps.nMinWidth &&
videoFormat->coded_height >= caps.nMinHeight,
"Video is too small in at least one dimension. Provided: ",
videoFormat->coded_width,
"x",
videoFormat->coded_height,
" vs supported:",
caps.nMinWidth,
"x",
caps.nMinHeight);

TORCH_CHECK(
videoFormat->coded_width <= caps.nMaxWidth &&
videoFormat->coded_height <= caps.nMaxHeight,
"Video is too large in at least one dimension. Provided: ",
videoFormat->coded_width,
"x",
videoFormat->coded_height,
" vs supported:",
caps.nMaxWidth,
"x",
caps.nMaxHeight);

// See nMaxMBCount in cuviddec.h
constexpr unsigned int macroblockConstant = 256;
TORCH_CHECK(
videoFormat->coded_width * videoFormat->coded_height /
macroblockConstant <=
caps.nMaxMBCount,
"Video is too large (too many macroblocks). "
"Provided (width * height / ",
macroblockConstant,
"): ",
videoFormat->coded_width * videoFormat->coded_height / macroblockConstant,
" vs supported:",
caps.nMaxMBCount);

// Below we'll set the decoderParams.OutputFormat to NV12, so we need to make
// sure it's actually supported.
TORCH_CHECK(
(caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1,
"NV12 output format is not supported for this configuration. ",
"Codec: ",
static_cast<int>(videoFormat->codec),
", chroma format: ",
static_cast<int>(videoFormat->chroma_format),
", bit depth: ",
videoFormat->bit_depth_luma_minus8 + 8);

// Decoder creation parameters, most are taken from DALI
CUVIDDECODECREATEINFO decoderParams = {};
decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
@@ -157,13 +89,39 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
decoderParams.display_area.bottom = videoFormat->display_area.bottom;

CUvideodecoder* decoder = new CUvideodecoder();
-  result = cuvidCreateDecoder(decoder, &decoderParams);
+  CUresult result = cuvidCreateDecoder(decoder, &decoderParams);
TORCH_CHECK(
result == CUDA_SUCCESS, "Failed to create NVDEC decoder: ", result);
return UniqueCUvideodecoder(decoder, CUvideoDecoderDeleter{});
}
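
As an aside, the UniqueCUvideodecoder returned above is the usual unique_ptr-with-custom-deleter pattern for C handles. A minimal sketch of what such a wrapper typically looks like; the real CUvideoDecoderDeleter and UniqueCUvideodecoder definitions are not shown in this diff and may differ:

// Plausible shape of the RAII wrapper used above (sketch, not the actual
// torchcodec definitions). The deleter destroys the NVDEC handle, then
// frees the heap-allocated handle holder created in createDecoder().
struct CUvideoDecoderDeleter {
  void operator()(CUvideodecoder* decoder) const {
    if (decoder != nullptr) {
      cuvidDestroyDecoder(*decoder); // release the NVDEC decoder handle
      delete decoder; // free the holder allocated with new CUvideodecoder()
    }
  }
};
using UniqueCUvideodecoder =
    std::unique_ptr<CUvideodecoder, CUvideoDecoderDeleter>;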

-cudaVideoCodec validateCodecSupport(AVCodecID codecId) {
std::optional<cudaVideoChromaFormat> validateChromaSupport(
const AVPixFmtDescriptor* desc) {
// Return the corresponding cudaVideoChromaFormat if supported, std::nullopt
// otherwise.
TORCH_CHECK(desc != nullptr, "desc can't be null");

if (desc->nb_components == 1) {
return cudaVideoChromaFormat_Monochrome;
} else if (desc->nb_components >= 3 && !(desc->flags & AV_PIX_FMT_FLAG_RGB)) {
// Make sure it's YUV: has chroma planes and isn't RGB
if (desc->log2_chroma_w == 0 && desc->log2_chroma_h == 0) {
return cudaVideoChromaFormat_444; // 1x1 subsampling = 4:4:4
} else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 1) {
return cudaVideoChromaFormat_420; // 2x2 subsampling = 4:2:0
} else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 0) {
return cudaVideoChromaFormat_422; // 2x1 subsampling = 4:2:2
}
}

return std::nullopt;
}

std::optional<cudaVideoCodec> validateCodecSupport(AVCodecID codecId) {
// Return the corresponding cudaVideoCodec if supported, std::nullopt
// otherwise.
// Note that we currently return nullopt (and thus fall back to CPU) for some
// codecs that are technically supported by NVDEC, see comment below.
switch (codecId) {
case AV_CODEC_ID_H264:
return cudaVideoCodec_H264;
@@ -189,10 +147,69 @@ cudaVideoCodec validateCodecSupport(AVCodecID codecId) {
// return cudaVideoCodec_JPEG;
// case AV_CODEC_ID_VC1:
// return cudaVideoCodec_VC1;
-    default: {
-      TORCH_CHECK(false, "Unsupported codec type: ", avcodec_get_name(codecId));
-    }
+    default:
+      return std::nullopt;
}
}
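
To make the subsampling mapping above concrete, here is a small standalone sketch (not part of the diff; it assumes FFmpeg's libavutil development headers are installed) that prints the log2 chroma shifts validateChromaSupport() keys off of:

// Standalone illustration of the log2_chroma_w/h -> chroma format mapping
// used by validateChromaSupport() above.
// Build (assumption): g++ demo.cpp $(pkg-config --cflags --libs libavutil)
extern "C" {
#include <libavutil/pixdesc.h>
}
#include <cstdio>

int main() {
  // (log2_chroma_w, log2_chroma_h): (1, 1) -> 4:2:0, (1, 0) -> 4:2:2,
  // (0, 0) -> 4:4:4, matching the branches in validateChromaSupport().
  for (AVPixelFormat fmt :
       {AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV444P}) {
    const AVPixFmtDescriptor* desc = av_pix_fmt_desc_get(fmt);
    std::printf(
        "%s: components=%d log2_chroma_w=%d log2_chroma_h=%d\n",
        av_get_pix_fmt_name(fmt),
        desc->nb_components,
        desc->log2_chroma_w,
        desc->log2_chroma_h);
  }
  return 0;
}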

Comment (Contributor Author):

These are the same checks as before, but with a crucial difference: the source of truth that we use to get the stream properties is now FFmpeg's AVCodecContext. Previously, it was NVCUVID's CUVIDEOFORMAT.
I think that's fine. I can't imagine NVCUVID being more accurate than FFmpeg.

bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) {
// Return true iff the input video stream is supported by our NVDEC
// implementation.
auto codecType = validateCodecSupport(codecContext->codec_id);
if (!codecType.has_value()) {
return false;
}

const AVPixFmtDescriptor* desc = av_pix_fmt_desc_get(codecContext->pix_fmt);
if (!desc) {
return false;
}

auto chromaFormat = validateChromaSupport(desc);
if (!chromaFormat.has_value()) {
return false;
}

auto caps = CUVIDDECODECAPS{};
caps.eCodecType = codecType.value();
caps.eChromaFormat = chromaFormat.value();
caps.nBitDepthMinus8 = desc->comp[0].depth - 8;

CUresult result = cuvidGetDecoderCaps(&caps);
if (result != CUDA_SUCCESS) {
return false;
}

if (!caps.bIsSupported) {
return false;
}

auto coded_width = static_cast<unsigned int>(codecContext->coded_width);
auto coded_height = static_cast<unsigned int>(codecContext->coded_height);
if (coded_width < static_cast<unsigned int>(caps.nMinWidth) ||
coded_height < static_cast<unsigned int>(caps.nMinHeight) ||
coded_width > caps.nMaxWidth || coded_height > caps.nMaxHeight) {
return false;
}

// See nMaxMBCount in cuviddec.h
constexpr unsigned int macroblockConstant = 256;
if (coded_width * coded_height / macroblockConstant > caps.nMaxMBCount) {
return false;
}

// We'll set the decoderParams.OutputFormat to NV12, so we need to make
// sure it's actually supported.
  // TODO: If this fails, we could consider decoding to something other than
  // NV12 (like cudaVideoSurfaceFormat_P016) instead of falling back to CPU.
  // This is what FFmpeg does.
bool supportsNV12Output =
(caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1;
if (!supportsNV12Output) {
return false;
}

return true;
}
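
Two of the checks above are worth unpacking with concrete numbers. The following standalone sketch uses illustrative, made-up capability values; the real ones come from cuvidGetDecoderCaps():

#include <cstdio>

// Standalone illustration of two checks in nativeNVDECSupport() above,
// with hypothetical values standing in for a real CUVIDDECODECAPS.
int main() {
  // 1) Macroblock bound: nMaxMBCount is expressed in 16x16-pixel
  //    macroblocks, hence the division by 256 (= 16 * 16).
  constexpr unsigned int macroblockConstant = 256;
  constexpr unsigned int codedWidth = 3840;
  constexpr unsigned int codedHeight = 2160;
  std::printf(
      "4K frame: %u macroblocks\n",
      codedWidth * codedHeight / macroblockConstant); // prints 32400

  // 2) Output format mask: bit i of nOutputFormatMask is set when the
  //    surface format with enum value i is supported. In cuviddec.h,
  //    cudaVideoSurfaceFormat_NV12 is 0, so the code above tests bit 0.
  unsigned int nOutputFormatMask = 0b11; // hypothetical: NV12 and P016
  constexpr unsigned int nv12 = 0; // cudaVideoSurfaceFormat_NV12
  std::printf("NV12 supported: %u\n", (nOutputFormatMask >> nv12) & 1);
  return 0;
}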

} // namespace
Expand Down Expand Up @@ -232,6 +249,19 @@ void BetaCudaDeviceInterface::initialize(
const AVStream* avStream,
const UniqueDecodingAVFormatContext& avFormatCtx,
[[maybe_unused]] const SharedAVCodecContext& codecContext) {
if (!nativeNVDECSupport(codecContext)) {
cpuFallback_ = createDeviceInterface(torch::kCPU);
TORCH_CHECK(
cpuFallback_ != nullptr, "Failed to create CPU device interface");
cpuFallback_->initialize(avStream, avFormatCtx, codecContext);
cpuFallback_->initializeVideo(
VideoStreamOptions(),
{},
/*resizedOutputDims=*/std::nullopt);
// We'll always use the CPU fallback from now on, so we can return early.
return;
}

Comment (@NicolasHug, Contributor Author, Oct 17, 2025):

Above: we create the fallback CPU interface only if we need it.

I'm a bit sad that the BETA Interface needs to store a CPU interface. I have TODOs in other places basically saying "An interface shouldn't rely on another interface".

But, I'm not sure we have a better way to implement this fallback mechanism, so I think our principle should be something less strict along those lines: "It's OK for interface A to use interface B, but interface B shouldn't need to worry about being called by interface A". I think we're respecting this principle so far.

The main design alternative would be for the fallback to be implemented within SingleStreamDecoder rather than in the interface itself. But I'm not sure we want this.
Specifically for this fallback, we'd have to create a third hybrid interface (the fallback one) that runs all the decoding on the CPU but runs the color conversion (convertAVFrameToFrameOutput) on the GPU. Sounds like a hassle.

Comment (Contributor):

"It's OK for interface A to use interface B, but interface B shouldn't need to worry about being called by interface A". I think we're respecting this principle so far.

Agreed and agreed.

In my previous refactoring of the FFmpeg CUDA device interface, I believe I had always created the CPU device for fallback. But in that device, I also think we only knew we would need the fallback when we discovered that FFmpeg had decoded the frame on the CPU and we then needed to do CPU color conversion. If we could discover that earlier in the FFmpeg CUDA device interface, then doing this pattern would be good. But I doubt we can.

TORCH_CHECK(avStream != nullptr, "AVStream cannot be null");
timeBase_ = avStream->time_base;
frameRateAvgFromFFmpeg_ = avStream->r_frame_rate;
@@ -243,7 +273,11 @@ void BetaCudaDeviceInterface::initialize(

// Create parser. Default values that aren't obvious are taken from DALI.
CUVIDPARSERPARAMS parserParams = {};
-  parserParams.CodecType = validateCodecSupport(codecPar->codec_id);
+  auto codecType = validateCodecSupport(codecPar->codec_id);
+  TORCH_CHECK(
+      codecType.has_value(),
+      "This should never happen, we should be using the CPU fallback by now. Please report a bug.");
+  parserParams.CodecType = codecType.value();
parserParams.ulMaxNumDecodeSurfaces = 8;
parserParams.ulMaxDisplayDelay = 0;
// Callback setup, all are triggered by the parser within a call
@@ -383,6 +417,10 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) {
// Moral equivalent of avcodec_send_packet(). Here, we pass the AVPacket down to
// the NVCUVID parser.
int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) {
if (cpuFallback_) {
return cpuFallback_->sendPacket(packet);
}

TORCH_CHECK(
packet.get() && packet->data && packet->size > 0,
"sendPacket received an empty packet, this is unexpected, please report.");
@@ -406,6 +444,10 @@ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) {
}

int BetaCudaDeviceInterface::sendEOFPacket() {
if (cpuFallback_) {
return cpuFallback_->sendEOFPacket();
}

CUVIDSOURCEDATAPACKET cuvidPacket = {};
cuvidPacket.flags = CUVID_PKT_ENDOFSTREAM;
eofSent_ = true;
@@ -467,6 +509,10 @@ int BetaCudaDeviceInterface::frameReadyInDisplayOrder(

// Moral equivalent of avcodec_receive_frame().
int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
if (cpuFallback_) {
return cpuFallback_->receiveFrame(avFrame);
}

if (readyFrames_.empty()) {
// No frame found, instruct caller to try again later after sending more
// packets, or to stop if EOF was already sent.
@@ -601,6 +647,11 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
}

void BetaCudaDeviceInterface::flush() {
if (cpuFallback_) {
cpuFallback_->flush();
return;
}

// The NVCUVID docs mention that after seeking, i.e. when flush() is called,
// we should send a packet with the CUVID_PKT_DISCONTINUITY flag. The docs
// don't say whether this should be an empty packet, or whether it should be a
@@ -618,6 +669,21 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
UniqueAVFrame& avFrame,
FrameOutput& frameOutput,
std::optional<torch::Tensor> preAllocatedOutputTensor) {
if (cpuFallback_) {
// CPU-decoded frame: do CPU color conversion, then transfer to the GPU.
FrameOutput cpuFrameOutput;
cpuFallback_->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput);

// Transfer CPU frame to GPU
if (preAllocatedOutputTensor.has_value()) {
preAllocatedOutputTensor.value().copy_(cpuFrameOutput.data);
frameOutput.data = preAllocatedOutputTensor.value();
} else {
frameOutput.data = cpuFrameOutput.data.to(device_);
}
return;
Comment (Contributor):
We do this dance in at least one other place in the FFmpeg CUDA device. Maybe a part of #943 could also be trying to pull this into a utils function.

}

Comment (Contributor Author):

Above is essentially the same as the FFmpeg interface logic:

if (avFrame->format != AV_PIX_FMT_CUDA) {
// The frame's format is AV_PIX_FMT_CUDA if and only if its content is on
// the GPU. In this branch, the frame is on the CPU. There are two possible
// reasons:
//
// 1. During maybeConvertAVFrameToNV12OrRGB24(), we had a non-NV12 format
// frame and we're on FFmpeg 4.4 or earlier. In such cases, we had to
// use CPU filters and we just converted the frame to RGB24.
// 2. This is what NVDEC gave us if it wasn't able to decode a frame, for
// whatever reason. Typically that happens if the video's encoder isn't
// supported by NVDEC.
//
// In both cases, we have a frame on the CPU. We send the frame back to the
// CUDA device when we're done.
enum AVPixelFormat frameFormat =
static_cast<enum AVPixelFormat>(avFrame->format);
FrameOutput cpuFrameOutput;
if (frameFormat == AV_PIX_FMT_RGB24) {
// Reason 1 above. The frame is already in RGB24, we just need to convert
// it to a tensor.
cpuFrameOutput.data = rgbAVFrameToTensor(avFrame);
} else {
// Reason 2 above. We need to do a full conversion which requires an
// actual CPU device.
cpuInterface_->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput);
}
// Finally, we need to send the frame back to the GPU. Note that the
// pre-allocated tensor is on the GPU, so we can't send that to the CPU
// device interface. We copy it over here.
if (preAllocatedOutputTensor.has_value()) {
preAllocatedOutputTensor.value().copy_(cpuFrameOutput.data);
frameOutput.data = preAllocatedOutputTensor.value();
} else {
frameOutput.data = cpuFrameOutput.data.to(device_);
}
return;
}

There are still minor differences that we should eventually align as part of #943.
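
For reference, a minimal sketch of the shared helper suggested in the review comment above. The name and signature are hypothetical, not part of this PR:

// Hypothetical utility both CUDA interfaces could call after a CPU-side
// decode or color conversion: copy the CPU tensor into the pre-allocated
// GPU tensor when one is provided, otherwise move it to the target device.
#include <optional>
#include <torch/types.h>

static torch::Tensor cpuFrameToDevice(
    const torch::Tensor& cpuFrame,
    const torch::Device& device,
    std::optional<torch::Tensor> preAllocatedOutputTensor) {
  if (preAllocatedOutputTensor.has_value()) {
    preAllocatedOutputTensor.value().copy_(cpuFrame);
    return preAllocatedOutputTensor.value();
  }
  return cpuFrame.to(device);
}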

// TODONVDEC P2: we may need to handle 10bit videos the same way the CUDA
// ffmpeg interface does it with maybeConvertAVFrameToNV12OrRGB24().
TORCH_CHECK(
2 changes: 2 additions & 0 deletions src/torchcodec/_core/BetaCudaDeviceInterface.h
@@ -94,6 +94,8 @@ class BetaCudaDeviceInterface : public DeviceInterface {

// NPP context for color conversion
UniqueNppContext nppCtx_;

std::unique_ptr<DeviceInterface> cpuFallback_;
};

} // namespace facebook::torchcodec
40 changes: 22 additions & 18 deletions test/test_decoders.py
@@ -1701,20 +1701,19 @@ def test_beta_cuda_interface_backwards(self, asset, seek_mode):
assert beta_frame.duration_seconds == ref_frame.duration_seconds

@needs_cuda
-def test_beta_cuda_interface_small_h265(self):
-    # Test to illustrate current difference in behavior between the BETA and
-    # the ffmpeg interface: this video isn't supported by NVDEC, but in the
-    # ffmpeg interface, FFMPEG fallsback to the CPU while we don't.
-
-    VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0)
-
+def test_beta_cuda_interface_cpu_fallback(self):
+    # Non-regression test for the CPU fallback behavior of the BETA CUDA
+    # interface.
+    # We know that the H265_VIDEO asset isn't supported by NVDEC: its
+    # dimensions are too small. We also know that the FFmpeg CUDA interface
+    # falls back to the CPU path in such cases. We assert that we fall back
+    # to the CPU path, too.
+
+    ffmpeg = VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0)
     with set_cuda_backend("beta"):
-        dec = VideoDecoder(H265_VIDEO.path, device="cuda")
-        with pytest.raises(
-            RuntimeError,
-            match="Video is too small in at least one dimension. Provided: 128x128 vs supported:144x144",
-        ):
-            dec.get_frame_at(0)
+        beta = VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0)
+
+    torch.testing.assert_close(ffmpeg.data, beta.data, rtol=0, atol=0)
Comment (Contributor Author):

Above: we used to assert an error, now we just assert that the video can be decoded properly. Strictly speaking, we're not asserting that there is a fallback. (That will probably come in follow-ups).

There's another important thing to check that we're currently not enforcing in the tests: we need to make sure that we don't have false positives, i.e. that we're not falling back to the CPU when we don't need to. To check that, I added a TORCH_CHECK(false) in the CPU fallback path. All the tests are passing, except for these two:

FAILED test/test_decoders.py::TestVideoDecoder::test_get_key_frame_indices[cuda:beta] - RuntimeError: Falling back to CPU decoder.
FAILED test/test_decoders.py::TestVideoDecoder::test_beta_cuda_interface_cpu_fallback - RuntimeError: Falling back to CPU decoder.

This is the expected result: these two tests are using H265_VIDEO, which is the only video that needs a fallback.

I will try to follow up with an official and robust way to assert the backend / fallback of the VideoDecoder object (see other comment below).
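
For the record, a sketch of the temporary guard described above. This is debugging scaffolding only, not code in this PR:

// Hypothetical temporary guard in BetaCudaDeviceInterface::initialize():
// making the CPU fallback path loud proves that the remaining tests never
// take it, i.e. that there are no false-positive fallbacks.
if (!nativeNVDECSupport(codecContext)) {
  TORCH_CHECK(false, "Falling back to CPU decoder.");
  // ... fallback setup below is unreachable while the guard is in place ...
}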


@needs_cuda
def test_beta_cuda_interface_error(self):
@@ -1740,15 +1739,20 @@ def test_set_cuda_backend(self):
assert _get_cuda_backend() == "beta"

def assert_decoder_uses(decoder, *, expected_backend):
# TODO: This doesn't work anymore after
# https://github.com/meta-pytorch/torchcodec/pull/977
# We need to define a better way to assert which backend a decoder
# is using.
return
Comment (Contributor Author):
It's OK to deactivate this test for now, we know it's working fine.

I think we should have a private way to query the VideoDecoder object so that it can tell us which interface / backend it's currently using. I will follow up on this.

# Assert that a decoder instance is using a given backend.
#
# We know H265_VIDEO fails on the BETA backend while it works on the
# ffmpeg one.
-if expected_backend == "ffmpeg":
-    decoder.get_frame_at(0)  # this would fail if this was BETA
-else:
-    with pytest.raises(RuntimeError, match="Video is too small"):
-        decoder.get_frame_at(0)
+# if expected_backend == "ffmpeg":
+#     decoder.get_frame_at(0) # this would fail if this was BETA
+# else:
+#     with pytest.raises(RuntimeError, match="Video is too small"):
+#         decoder.get_frame_at(0)

# Check that the default is the ffmpeg backend
assert _get_cuda_backend() == "ffmpeg"