214 changes: 140 additions & 74 deletions src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -53,74 +53,6 @@ pfnDisplayPictureCallback(void* pUserData, CUVIDPARSERDISPINFO* dispInfo) {
}

static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
Comment (Contributor Author):
All checks below have been moved into the new bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) function.

// Check decoder capabilities - same checks as DALI
auto caps = CUVIDDECODECAPS{};
caps.eCodecType = videoFormat->codec;
caps.eChromaFormat = videoFormat->chroma_format;
caps.nBitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
CUresult result = cuvidGetDecoderCaps(&caps);
TORCH_CHECK(result == CUDA_SUCCESS, "Failed to get decoder caps: ", result);

TORCH_CHECK(
caps.bIsSupported,
"Codec configuration not supported on this GPU. "
"Codec: ",
static_cast<int>(videoFormat->codec),
", chroma format: ",
static_cast<int>(videoFormat->chroma_format),
", bit depth: ",
videoFormat->bit_depth_luma_minus8 + 8);

TORCH_CHECK(
videoFormat->coded_width >= caps.nMinWidth &&
videoFormat->coded_height >= caps.nMinHeight,
"Video is too small in at least one dimension. Provided: ",
videoFormat->coded_width,
"x",
videoFormat->coded_height,
" vs supported:",
caps.nMinWidth,
"x",
caps.nMinHeight);

TORCH_CHECK(
videoFormat->coded_width <= caps.nMaxWidth &&
videoFormat->coded_height <= caps.nMaxHeight,
"Video is too large in at least one dimension. Provided: ",
videoFormat->coded_width,
"x",
videoFormat->coded_height,
" vs supported:",
caps.nMaxWidth,
"x",
caps.nMaxHeight);

// See nMaxMBCount in cuviddec.h
constexpr unsigned int macroblockConstant = 256;
TORCH_CHECK(
videoFormat->coded_width * videoFormat->coded_height /
macroblockConstant <=
caps.nMaxMBCount,
"Video is too large (too many macroblocks). "
"Provided (width * height / ",
macroblockConstant,
"): ",
videoFormat->coded_width * videoFormat->coded_height / macroblockConstant,
" vs supported:",
caps.nMaxMBCount);

// Below we'll set the decoderParams.OutputFormat to NV12, so we need to make
// sure it's actually supported.
TORCH_CHECK(
(caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1,
"NV12 output format is not supported for this configuration. ",
"Codec: ",
static_cast<int>(videoFormat->codec),
", chroma format: ",
static_cast<int>(videoFormat->chroma_format),
", bit depth: ",
videoFormat->bit_depth_luma_minus8 + 8);

// Decoder creation parameters, most are taken from DALI
CUVIDDECODECREATEINFO decoderParams = {};
decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
@@ -157,13 +89,39 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
decoderParams.display_area.bottom = videoFormat->display_area.bottom;

CUvideodecoder* decoder = new CUvideodecoder();
-  result = cuvidCreateDecoder(decoder, &decoderParams);
+  CUresult result = cuvidCreateDecoder(decoder, &decoderParams);
TORCH_CHECK(
result == CUDA_SUCCESS, "Failed to create NVDEC decoder: ", result);
return UniqueCUvideodecoder(decoder, CUvideoDecoderDeleter{});
}
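
As an aside, the UniqueCUvideodecoder returned above is the usual unique_ptr-with-custom-deleter pattern for C handles. A minimal sketch of what such a wrapper typically looks like; the real CUvideoDecoderDeleter and UniqueCUvideodecoder definitions are not shown in this diff and may differ:

// Plausible shape of the RAII wrapper used above (sketch, not the actual
// torchcodec definitions). The deleter destroys the NVDEC handle, then
// frees the heap-allocated handle holder created in createDecoder().
struct CUvideoDecoderDeleter {
  void operator()(CUvideodecoder* decoder) const {
    if (decoder != nullptr) {
      cuvidDestroyDecoder(*decoder); // release the NVDEC decoder handle
      delete decoder; // free the holder allocated with new CUvideodecoder()
    }
  }
};
using UniqueCUvideodecoder =
    std::unique_ptr<CUvideodecoder, CUvideoDecoderDeleter>;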

-cudaVideoCodec validateCodecSupport(AVCodecID codecId) {
std::optional<cudaVideoChromaFormat> validateChromaSupport(
const AVPixFmtDescriptor* desc) {
// Return the corresponding cudaVideoChromaFormat if supported, std::nullopt
// otherwise.
TORCH_CHECK(desc != nullptr, "desc can't be null");

if (desc->nb_components == 1) {
return cudaVideoChromaFormat_Monochrome;
} else if (desc->nb_components >= 3 && !(desc->flags & AV_PIX_FMT_FLAG_RGB)) {
// Make sure it's YUV: has chroma planes and isn't RGB
if (desc->log2_chroma_w == 0 && desc->log2_chroma_h == 0) {
return cudaVideoChromaFormat_444; // 1x1 subsampling = 4:4:4
} else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 1) {
return cudaVideoChromaFormat_420; // 2x2 subsampling = 4:2:0
} else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 0) {
return cudaVideoChromaFormat_422; // 2x1 subsampling = 4:2:2
}
}

return std::nullopt;
}

std::optional<cudaVideoCodec> validateCodecSupport(AVCodecID codecId) {
// Return the corresponding cudaVideoCodec if supported, std::nullopt
// otherwise.
// Note that we currently return nullopt (and thus fall back to CPU) for some
// codecs that are technically supported by NVDEC, see comment below.
switch (codecId) {
case AV_CODEC_ID_H264:
return cudaVideoCodec_H264;
@@ -189,10 +147,69 @@ cudaVideoCodec validateCodecSupport(AVCodecID codecId) {
// return cudaVideoCodec_JPEG;
// case AV_CODEC_ID_VC1:
// return cudaVideoCodec_VC1;
-    default: {
-      TORCH_CHECK(false, "Unsupported codec type: ", avcodec_get_name(codecId));
-    }
+    default:
+      return std::nullopt;
}
}
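
To make the subsampling mapping above concrete, here is a small standalone sketch (not part of the diff; it assumes FFmpeg's libavutil development headers are installed) that prints the log2 chroma shifts validateChromaSupport() keys off of:

// Standalone illustration of the log2_chroma_w/h -> chroma format mapping
// used by validateChromaSupport() above.
// Build (assumption): g++ demo.cpp $(pkg-config --cflags --libs libavutil)
extern "C" {
#include <libavutil/pixdesc.h>
}
#include <cstdio>

int main() {
  // (log2_chroma_w, log2_chroma_h): (1, 1) -> 4:2:0, (1, 0) -> 4:2:2,
  // (0, 0) -> 4:4:4, matching the branches in validateChromaSupport().
  for (AVPixelFormat fmt :
       {AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV444P}) {
    const AVPixFmtDescriptor* desc = av_pix_fmt_desc_get(fmt);
    std::printf(
        "%s: components=%d log2_chroma_w=%d log2_chroma_h=%d\n",
        av_get_pix_fmt_name(fmt),
        desc->nb_components,
        desc->log2_chroma_w,
        desc->log2_chroma_h);
  }
  return 0;
}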

Comment (Contributor Author):

These are the same checks as before, but with a crucial difference: the source of truth that we use to get the stream properties is now FFmpeg's AVCodecContext. Previously, it was NVCUVID's CUVIDEOFORMAT.
I think that's fine. I can't imagine NVCUVID being more accurate than FFmpeg.

bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) {
// Return true iff the input video stream is supported by our NVDEC
// implementation.
auto codecType = validateCodecSupport(codecContext->codec_id);
if (!codecType.has_value()) {
return false;
}

const AVPixFmtDescriptor* desc = av_pix_fmt_desc_get(codecContext->pix_fmt);
if (!desc) {
return false;
}

auto chromaFormat = validateChromaSupport(desc);
if (!chromaFormat.has_value()) {
return false;
}

auto caps = CUVIDDECODECAPS{};
caps.eCodecType = codecType.value();
caps.eChromaFormat = chromaFormat.value();
caps.nBitDepthMinus8 = desc->comp[0].depth - 8;

CUresult result = cuvidGetDecoderCaps(&caps);
if (result != CUDA_SUCCESS) {
return false;
}

if (!caps.bIsSupported) {
return false;
}

auto coded_width = static_cast<unsigned int>(codecContext->coded_width);
auto coded_height = static_cast<unsigned int>(codecContext->coded_height);
if (coded_width < static_cast<unsigned int>(caps.nMinWidth) ||
coded_height < static_cast<unsigned int>(caps.nMinHeight) ||
coded_width > caps.nMaxWidth || coded_height > caps.nMaxHeight) {
return false;
}

// See nMaxMBCount in cuviddec.h
constexpr unsigned int macroblockConstant = 256;
if (coded_width * coded_height / macroblockConstant > caps.nMaxMBCount) {
return false;
}

// We'll set the decoderParams.OutputFormat to NV12, so we need to make
// sure it's actually supported.
  // TODO: If this fails, we could consider decoding to something other than
  // NV12 (like cudaVideoSurfaceFormat_P016) instead of falling back to CPU.
  // This is what FFmpeg does.
bool supportsNV12Output =
(caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1;
if (!supportsNV12Output) {
return false;
}

return true;
}
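
Two of the checks above are worth unpacking with concrete numbers. The following standalone sketch uses illustrative, made-up capability values; the real ones come from cuvidGetDecoderCaps():

#include <cstdio>

// Standalone illustration of two checks in nativeNVDECSupport() above,
// with hypothetical values standing in for a real CUVIDDECODECAPS.
int main() {
  // 1) Macroblock bound: nMaxMBCount is expressed in 16x16-pixel
  //    macroblocks, hence the division by 256 (= 16 * 16).
  constexpr unsigned int macroblockConstant = 256;
  constexpr unsigned int codedWidth = 3840;
  constexpr unsigned int codedHeight = 2160;
  std::printf(
      "4K frame: %u macroblocks\n",
      codedWidth * codedHeight / macroblockConstant); // prints 32400

  // 2) Output format mask: bit i of nOutputFormatMask is set when the
  //    surface format with enum value i is supported. In cuviddec.h,
  //    cudaVideoSurfaceFormat_NV12 is 0, so the code above tests bit 0.
  unsigned int nOutputFormatMask = 0b11; // hypothetical: NV12 and P016
  constexpr unsigned int nv12 = 0; // cudaVideoSurfaceFormat_NV12
  std::printf("NV12 supported: %u\n", (nOutputFormatMask >> nv12) & 1);
  return 0;
}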

} // namespace
Expand Down Expand Up @@ -232,6 +249,19 @@ void BetaCudaDeviceInterface::initialize(
const AVStream* avStream,
const UniqueDecodingAVFormatContext& avFormatCtx,
[[maybe_unused]] const SharedAVCodecContext& codecContext) {
if (!nativeNVDECSupport(codecContext)) {
cpuFallback_ = createDeviceInterface(torch::kCPU);
TORCH_CHECK(
cpuFallback_ != nullptr, "Failed to create CPU device interface");
cpuFallback_->initialize(avStream, avFormatCtx, codecContext);
cpuFallback_->initializeVideo(
VideoStreamOptions(),
{},
/*resizedOutputDims=*/std::nullopt);
// We'll always use the CPU fallback from now on, so we can return early.
return;
}

Comment (@NicolasHug, Contributor Author, Oct 17, 2025):

Above: we create the fallback CPU interface only if we need it.

I'm a bit sad that the BETA Interface needs to store a CPU interface. I have TODOs in other places basically saying "An interface shouldn't rely on another interface".

But, I'm not sure we have a better way to implement this fallback mechanism, so I think our principle should be something less strict along those lines: "It's OK for interface A to use interface B, but interface B shouldn't need to worry about being called by interface A". I think we're respecting this principle so far.

The main design alternative would be for the fallback to be implemented within SingleStreamDecoder rather than in the interface itself. But I'm not sure we want this.
Specifically for this fallback, we'd have to create a third hybrid interface (the fallback one) that runs all the decoding on the CPU but runs the color conversion (convertAVFrameToFrameOutput) on the GPU. Sounds like a hassle.

Comment (Contributor):

"It's OK for interface A to use interface B, but interface B shouldn't need to worry about being called by interface A". I think we're respecting this principle so far.

Agreed and agreed.

In my previous refactoring of the FFmpeg CUDA device interface, I believe I had always created the CPU device for fallback. But in that device, I also think we only knew we would need the fallback when we discovered that FFmpeg had decoded the frame on the CPU and we then needed to do CPU color conversion. If we could discover that earlier in the FFmpeg CUDA device interface, then doing this pattern would be good. But I doubt we can.

TORCH_CHECK(avStream != nullptr, "AVStream cannot be null");
timeBase_ = avStream->time_base;
frameRateAvgFromFFmpeg_ = avStream->r_frame_rate;
@@ -243,7 +273,11 @@ void BetaCudaDeviceInterface::initialize(

// Create parser. Default values that aren't obvious are taken from DALI.
CUVIDPARSERPARAMS parserParams = {};
-  parserParams.CodecType = validateCodecSupport(codecPar->codec_id);
+  auto codecType = validateCodecSupport(codecPar->codec_id);
+  TORCH_CHECK(
+      codecType.has_value(),
+      "This should never happen, we should be using the CPU fallback by now. Please report a bug.");
+  parserParams.CodecType = codecType.value();
parserParams.ulMaxNumDecodeSurfaces = 8;
parserParams.ulMaxDisplayDelay = 0;
// Callback setup, all are triggered by the parser within a call
@@ -383,6 +417,10 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) {
// Moral equivalent of avcodec_send_packet(). Here, we pass the AVPacket down to
// the NVCUVID parser.
int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) {
if (cpuFallback_) {
return cpuFallback_->sendPacket(packet);
}

TORCH_CHECK(
packet.get() && packet->data && packet->size > 0,
"sendPacket received an empty packet, this is unexpected, please report.");
@@ -406,6 +444,10 @@ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) {
}

int BetaCudaDeviceInterface::sendEOFPacket() {
if (cpuFallback_) {
return cpuFallback_->sendEOFPacket();
}

CUVIDSOURCEDATAPACKET cuvidPacket = {};
cuvidPacket.flags = CUVID_PKT_ENDOFSTREAM;
eofSent_ = true;
@@ -467,6 +509,10 @@ int BetaCudaDeviceInterface::frameReadyInDisplayOrder(

// Moral equivalent of avcodec_receive_frame().
int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
if (cpuFallback_) {
return cpuFallback_->receiveFrame(avFrame);
}

if (readyFrames_.empty()) {
// No frame found, instruct caller to try again later after sending more
// packets, or to stop if EOF was already sent.
@@ -601,6 +647,11 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
}

void BetaCudaDeviceInterface::flush() {
if (cpuFallback_) {
cpuFallback_->flush();
return;
}

// The NVCUVID docs mention that after seeking, i.e. when flush() is called,
// we should send a packet with the CUVID_PKT_DISCONTINUITY flag. The docs
// don't say whether this should be an empty packet, or whether it should be a
@@ -618,6 +669,21 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
UniqueAVFrame& avFrame,
FrameOutput& frameOutput,
std::optional<torch::Tensor> preAllocatedOutputTensor) {
if (cpuFallback_) {
// CPU-decoded frame: do CPU color conversion, then transfer to the GPU.
FrameOutput cpuFrameOutput;
cpuFallback_->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput);

// Transfer CPU frame to GPU
if (preAllocatedOutputTensor.has_value()) {
preAllocatedOutputTensor.value().copy_(cpuFrameOutput.data);
frameOutput.data = preAllocatedOutputTensor.value();
} else {
frameOutput.data = cpuFrameOutput.data.to(device_);
}
return;
Comment (Contributor):
We do this dance in at least one other place in the FFmpeg CUDA device. Maybe a part of #943 could also be trying to pull this into a utils function.

}

Comment (Contributor Author):

Above is essentially the same as the FFmpeg interface logic:

if (avFrame->format != AV_PIX_FMT_CUDA) {
// The frame's format is AV_PIX_FMT_CUDA if and only if its content is on
// the GPU. In this branch, the frame is on the CPU. There are two possible
// reasons:
//
// 1. During maybeConvertAVFrameToNV12OrRGB24(), we had a non-NV12 format
// frame and we're on FFmpeg 4.4 or earlier. In such cases, we had to
// use CPU filters and we just converted the frame to RGB24.
// 2. This is what NVDEC gave us if it wasn't able to decode a frame, for
// whatever reason. Typically that happens if the video's encoder isn't
// supported by NVDEC.
//
// In both cases, we have a frame on the CPU. We send the frame back to the
// CUDA device when we're done.
enum AVPixelFormat frameFormat =
static_cast<enum AVPixelFormat>(avFrame->format);
FrameOutput cpuFrameOutput;
if (frameFormat == AV_PIX_FMT_RGB24) {
// Reason 1 above. The frame is already in RGB24, we just need to convert
// it to a tensor.
cpuFrameOutput.data = rgbAVFrameToTensor(avFrame);
} else {
// Reason 2 above. We need to do a full conversion which requires an
// actual CPU device.
cpuInterface_->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput);
}
// Finally, we need to send the frame back to the GPU. Note that the
// pre-allocated tensor is on the GPU, so we can't send that to the CPU
// device interface. We copy it over here.
if (preAllocatedOutputTensor.has_value()) {
preAllocatedOutputTensor.value().copy_(cpuFrameOutput.data);
frameOutput.data = preAllocatedOutputTensor.value();
} else {
frameOutput.data = cpuFrameOutput.data.to(device_);
}
return;
}

There are still minor differences that we should eventually align as part of #943.
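
For reference, a minimal sketch of the shared helper suggested in the review comment above. The name and signature are hypothetical, not part of this PR:

// Hypothetical utility both CUDA interfaces could call after a CPU-side
// decode or color conversion: copy the CPU tensor into the pre-allocated
// GPU tensor when one is provided, otherwise move it to the target device.
#include <optional>
#include <torch/types.h>

static torch::Tensor cpuFrameToDevice(
    const torch::Tensor& cpuFrame,
    const torch::Device& device,
    std::optional<torch::Tensor> preAllocatedOutputTensor) {
  if (preAllocatedOutputTensor.has_value()) {
    preAllocatedOutputTensor.value().copy_(cpuFrame);
    return preAllocatedOutputTensor.value();
  }
  return cpuFrame.to(device);
}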

// TODONVDEC P2: we may need to handle 10bit videos the same way the CUDA
// ffmpeg interface does it with maybeConvertAVFrameToNV12OrRGB24().
TORCH_CHECK(
2 changes: 2 additions & 0 deletions src/torchcodec/_core/BetaCudaDeviceInterface.h
@@ -94,6 +94,8 @@ class BetaCudaDeviceInterface : public DeviceInterface {

// NPP context for color conversion
UniqueNppContext nppCtx_;

std::unique_ptr<DeviceInterface> cpuFallback_;
};

} // namespace facebook::torchcodec
40 changes: 22 additions & 18 deletions test/test_decoders.py
@@ -1701,20 +1701,19 @@ def test_beta_cuda_interface_backwards(self, asset, seek_mode):
assert beta_frame.duration_seconds == ref_frame.duration_seconds

@needs_cuda
-def test_beta_cuda_interface_small_h265(self):
-    # Test to illustrate current difference in behavior between the BETA and
-    # the ffmpeg interface: this video isn't supported by NVDEC, but in the
-    # ffmpeg interface, FFMPEG fallsback to the CPU while we don't.
-
-    VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0)
-
+def test_beta_cuda_interface_cpu_fallback(self):
+    # Non-regression test for the CPU fallback behavior of the BETA CUDA
+    # interface.
+    # We know that the H265_VIDEO asset isn't supported by NVDEC: its
+    # dimensions are too small. We also know that the FFmpeg CUDA interface
+    # falls back to the CPU path in such cases. We assert that we fall back
+    # to the CPU path, too.
+
+    ffmpeg = VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0)
     with set_cuda_backend("beta"):
-        dec = VideoDecoder(H265_VIDEO.path, device="cuda")
-        with pytest.raises(
-            RuntimeError,
-            match="Video is too small in at least one dimension. Provided: 128x128 vs supported:144x144",
-        ):
-            dec.get_frame_at(0)
+        beta = VideoDecoder(H265_VIDEO.path, device="cuda").get_frame_at(0)
+
+    torch.testing.assert_close(ffmpeg.data, beta.data, rtol=0, atol=0)
Comment (Contributor Author):

Above: we used to assert an error, now we just assert that the video can be decoded properly. Strictly speaking, we're not asserting that there is a fallback. (That will probably come in follow-ups).

There's another important thing to check that we're currently not enforcing in the tests: we need to make sure that we don't have false positives, i.e. that we're not falling back to the CPU when we don't need to. To check that, I added a TORCH_CHECK(false) in the CPU fallback path. All the tests are passing, except for these two:

FAILED test/test_decoders.py::TestVideoDecoder::test_get_key_frame_indices[cuda:beta] - RuntimeError: Falling back to CPU decoder.
FAILED test/test_decoders.py::TestVideoDecoder::test_beta_cuda_interface_cpu_fallback - RuntimeError: Falling back to CPU decoder.

This is the expected result: these two tests are using H265_VIDEO, which is the only video that needs a fallback.

I will try to follow up with an official and robust way to assert the backend / fallback of the VideoDecoder object (see other comment below).
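
For the record, a sketch of the temporary guard described above. This is debugging scaffolding only, not code in this PR:

// Hypothetical temporary guard in BetaCudaDeviceInterface::initialize():
// making the CPU fallback path loud proves that the remaining tests never
// take it, i.e. that there are no false-positive fallbacks.
if (!nativeNVDECSupport(codecContext)) {
  TORCH_CHECK(false, "Falling back to CPU decoder.");
  // ... fallback setup below is unreachable while the guard is in place ...
}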


@needs_cuda
def test_beta_cuda_interface_error(self):
@@ -1740,15 +1739,20 @@ def test_set_cuda_backend(self):
assert _get_cuda_backend() == "beta"

def assert_decoder_uses(decoder, *, expected_backend):
# TODO: This doesn't work anymore after
# https://github.com/meta-pytorch/torchcodec/pull/977
# We need to define a better way to assert which backend a decoder
# is using.
return
Comment (Contributor Author):
It's OK to deactivate this test for now, we know it's working fine.

I think we should have a private way to query the VideoDecoder object so that it can tell us which interface / backend it's currently using. I will follow up on this.

# Assert that a decoder instance is using a given backend.
#
# We know H265_VIDEO fails on the BETA backend while it works on the
# ffmpeg one.
-if expected_backend == "ffmpeg":
-    decoder.get_frame_at(0)  # this would fail if this was BETA
-else:
-    with pytest.raises(RuntimeError, match="Video is too small"):
-        decoder.get_frame_at(0)
+# if expected_backend == "ffmpeg":
+#     decoder.get_frame_at(0) # this would fail if this was BETA
+# else:
+#     with pytest.raises(RuntimeError, match="Video is too small"):
+#         decoder.get_frame_at(0)

# Check that the default is the ffmpeg backend
assert _get_cuda_backend() == "ffmpeg"