Skip to content
Draft
125 changes: 96 additions & 29 deletions src/torchcodec/_core/CudaDeviceInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -380,22 +380,76 @@ std::string CudaDeviceInterface::getDetails() {
// Below are methods exclusive to video encoding:
// --------------------------------------------------------------------------
namespace {
// RGB to NV12 color conversion matrix for BT.601 limited range.
// NPP ColorTwist function used below expects the limited range
// color conversion matrix, and this matches FFmpeg's default behavior.
const Npp32f defaultLimitedRangeRgbToNv12[3][4] = {
// Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B)
{0.257f, 0.504f, 0.098f, 16.0f},
// U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients)
{-0.148f, -0.291f, 0.439f, 128.0f},
// V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients)
{0.439f, -0.368f, -0.071f, 128.0f}};
// For background on these matrices, see the note:
// [YUV -> RGB Color Conversion, color space and color range]
// https://github.com/meta-pytorch/torchcodec/blob/main/src/torchcodec/_core/CUDACommon.cpp#L63-L65
// TODO Video-Encoder: Extend note to explain limited vs full range
// RGB to YUV conversion matrices to use in NPP color conversion functions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you share how these were derived? What were the original values that were used as reference?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These follow the pattern described in the note in CudaCommon, I can add a comment referencing that note here

// Color space and color range
// ---------------------------

Copy link
Contributor

@NicolasHug NicolasHug Dec 12, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The note is about YUV -> RGB so it's not 100% targeted to what the matrices are doing. But yes, add a ref to that note, it's still useful.

You asked me offline whether we should update the note to explain limited range: yes, we should :)
There's a TODO in the note for that, but I never had the chance to do it - and frankly I forgot the underlying logic lol. If you'd like to give it a go, please do it - in a follow-up PR.

struct ColorConversionMatrices {
static constexpr Npp32f BT601_LIMITED[3][4] = {
{0.257f, 0.504f, 0.098f, 16.0f},
{-0.148f, -0.291f, 0.439f, 128.0f},
{0.439f, -0.368f, -0.071f, 128.0f}};

static constexpr Npp32f BT601_FULL[3][4] = {
{0.299f, 0.587f, 0.114f, 0.0f},
{-0.168736f, -0.331264f, 0.5f, 128.0f},
{0.5f, -0.418688f, -0.081312f, 128.0f}};

static constexpr Npp32f BT709_LIMITED[3][4] = {
{0.183f, 0.614f, 0.062f, 16.0f},
{-0.101f, -0.338f, 0.439f, 128.0f},
{0.439f, -0.399f, -0.040f, 128.0f}};

static constexpr Npp32f BT709_FULL[3][4] = {
{0.2126f, 0.7152f, 0.0722f, 0.0f},
{-0.114572f, -0.385428f, 0.5f, 128.0f},
{0.5f, -0.454153f, -0.045847f, 128.0f}};

static constexpr Npp32f BT2020_LIMITED[3][4] = {
{0.2256f, 0.5823f, 0.0509f, 16.0f},
{-0.122f, -0.315f, 0.439f, 128.0f},
{0.439f, -0.403f, -0.036f, 128.0f}};

static constexpr Npp32f BT2020_FULL[3][4] = {
{0.2627f, 0.6780f, 0.0593f, 0.0f},
{-0.139630f, -0.360370f, 0.5f, 128.0f},
{0.5f, -0.459786f, -0.040214f, 128.0f}};
};

// Returns conversion matrix based on codec context color space and range
const Npp32f (*getConversionMatrix(AVCodecContext* codecContext))[4] {
if (codecContext->color_range == AVCOL_RANGE_MPEG || // limited range
codecContext->color_range == AVCOL_RANGE_UNSPECIFIED) {
if (codecContext->colorspace == AVCOL_SPC_BT470BG) {
return ColorConversionMatrices::BT601_LIMITED;
} else if (codecContext->colorspace == AVCOL_SPC_BT709) {
return ColorConversionMatrices::BT709_LIMITED;
} else if (codecContext->colorspace == AVCOL_SPC_BT2020_NCL) {
return ColorConversionMatrices::BT2020_LIMITED;
} else { // default to BT.601
return ColorConversionMatrices::BT601_LIMITED;
}
} else if (codecContext->color_range == AVCOL_RANGE_JPEG) { // full range
if (codecContext->colorspace == AVCOL_SPC_BT470BG) {
return ColorConversionMatrices::BT601_FULL;
} else if (codecContext->colorspace == AVCOL_SPC_BT709) {
return ColorConversionMatrices::BT709_FULL;
} else if (codecContext->colorspace == AVCOL_SPC_BT2020_NCL) {
return ColorConversionMatrices::BT2020_FULL;
} else { // default to BT.601
return ColorConversionMatrices::BT601_FULL;
}
}
return ColorConversionMatrices::BT601_LIMITED;
}
} // namespace

UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding(
const torch::Tensor& tensor,
int frameIndex,
AVCodecContext* codecContext) {
AVCodecContext* codecContext,
AVPixelFormat targetPixelFormat) {
TORCH_CHECK(
tensor.dim() == 3 && tensor.size(0) == 3,
"Expected 3D RGB tensor (CHW format), got shape: ",
Expand Down Expand Up @@ -434,25 +488,39 @@ UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding(
torch::Tensor hwcFrame = tensor.permute({1, 2, 0}).contiguous();

NppiSize oSizeROI = {width, height};
NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
static_cast<const Npp8u*>(hwcFrame.data_ptr()),
validateInt64ToInt(
hwcFrame.stride(0) * hwcFrame.element_size(), "nSrcStep"),
avFrame->data,
avFrame->linesize,
oSizeROI,
defaultLimitedRangeRgbToNv12,
*nppCtx_);
NppStatus status;
switch (targetPixelFormat) {
case AV_PIX_FMT_NV12:
status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
static_cast<const Npp8u*>(hwcFrame.data_ptr()),
hwcFrame.stride(0) * hwcFrame.element_size(),
avFrame->data,
avFrame->linesize,
oSizeROI,
getConversionMatrix(codecContext),
*nppCtx_);
break;
default:
TORCH_CHECK(
false,
"GPU encoding expected to encode into nv12 pixel format, but got ",
av_get_pix_fmt_name(targetPixelFormat),
". This should not happen, please report this to the TorchCodec repo");
}

TORCH_CHECK(
status == NPP_SUCCESS,
"Failed to convert RGB to NV12: NPP error code ",
"Failed to convert RGB to ",
av_get_pix_fmt_name(targetPixelFormat),
": NPP error code ",
status);

// TODO-VideoEncoder: Enable configuration of color properties, similar to
// FFmpeg. Below are the default color properties used by FFmpeg.
avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601
avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range
avFrame->colorspace = codecContext->colorspace != AVCOL_SPC_UNSPECIFIED
? codecContext->colorspace
: AVCOL_SPC_BT470BG; // BT.601
avFrame->color_range = codecContext->color_range != AVCOL_RANGE_UNSPECIFIED
? codecContext->color_range
: AVCOL_RANGE_MPEG; // limited range

return avFrame;
}
Expand All @@ -461,7 +529,8 @@ UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding(
// to enable encoding with CUDA device. The hw_frames_ctx field is needed by
// FFmpeg to allocate frames on GPU's memory.
void CudaDeviceInterface::setupHardwareFrameContextForEncoding(
AVCodecContext* codecContext) {
AVCodecContext* codecContext,
AVPixelFormat targetPixelFormat) {
TORCH_CHECK(codecContext != nullptr, "codecContext is null");
TORCH_CHECK(
hardwareDeviceCtx_, "Hardware device context has not been initialized");
Expand All @@ -471,9 +540,7 @@ void CudaDeviceInterface::setupHardwareFrameContextForEncoding(
hwFramesCtxRef != nullptr,
"Failed to allocate hardware frames context for codec");

// TODO-VideoEncoder: Enable user set pixel formats to be set
// (outPixelFormat_) and handled with the appropriate NPP function
codecContext->sw_pix_fmt = AV_PIX_FMT_NV12;
codecContext->sw_pix_fmt = targetPixelFormat;
// Always set pixel format to support CUDA encoding.
codecContext->pix_fmt = AV_PIX_FMT_CUDA;

Expand Down
6 changes: 4 additions & 2 deletions src/torchcodec/_core/CudaDeviceInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,12 @@ class CudaDeviceInterface : public DeviceInterface {
UniqueAVFrame convertCUDATensorToAVFrameForEncoding(
const torch::Tensor& tensor,
int frameIndex,
AVCodecContext* codecContext) override;
AVCodecContext* codecContext,
AVPixelFormat targetPixelFormat) override;

void setupHardwareFrameContextForEncoding(
AVCodecContext* codecContext) override;
AVCodecContext* codecContext,
AVPixelFormat targetPixelFormat) override;

private:
// Our CUDA decoding code assumes NV12 format. In order to handle other
Expand Down
6 changes: 4 additions & 2 deletions src/torchcodec/_core/DeviceInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -146,14 +146,16 @@ class DeviceInterface {
virtual UniqueAVFrame convertCUDATensorToAVFrameForEncoding(
[[maybe_unused]] const torch::Tensor& tensor,
[[maybe_unused]] int frameIndex,
[[maybe_unused]] AVCodecContext* codecContext) {
[[maybe_unused]] AVCodecContext* codecContext,
[[maybe_unused]] AVPixelFormat targetPixelFormat) {
TORCH_CHECK(false);
}

// Function used for video encoding, only implemented in CudaDeviceInterface.
// It is here to isolate CUDA dependencies from CPU builds
virtual void setupHardwareFrameContextForEncoding(
[[maybe_unused]] AVCodecContext* codecContext) {
[[maybe_unused]] AVCodecContext* codecContext,
[[maybe_unused]] AVPixelFormat targetPixelFormat) {
TORCH_CHECK(false);
}

Expand Down
31 changes: 19 additions & 12 deletions src/torchcodec/_core/Encoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -790,23 +790,30 @@ void VideoEncoder::initializeEncoder(
outHeight_ = inHeight_;

if (videoStreamOptions.pixelFormat.has_value()) {
// TODO-VideoEncoder: Enable pixel formats to be set by user
// and handled with the appropriate NPP function on GPU.
if (frames_.device().is_cuda()) {
TORCH_CHECK(
false,
"GPU Video encoding currently only supports the NV12 pixel format. "
"Do not set pixel_format to use NV12.");
"Video encoding on GPU currently only supports the nv12 pixel format. "
"Do not set pixel_format to use nv12 by default.");
}
outPixelFormat_ =
validatePixelFormat(*avCodec, videoStreamOptions.pixelFormat.value());
} else {
const AVPixelFormat* formats = getSupportedPixelFormats(*avCodec);
// Use first listed pixel format as default (often yuv420p).
// This is similar to FFmpeg's logic:
// https://www.ffmpeg.org/doxygen/4.0/decode_8c_source.html#l01087
// If pixel formats are undefined for some reason, try yuv420p
outPixelFormat_ = (formats && formats[0] != AV_PIX_FMT_NONE)
? formats[0]
: AV_PIX_FMT_YUV420P;
if (frames_.device().is_cuda()) {
// Default to nv12 pixel format when encoding on GPU.
outPixelFormat_ = AV_PIX_FMT_NV12;
} else {
const AVPixelFormat* formats = getSupportedPixelFormats(*avCodec);
// Use first listed pixel format as default (often yuv420p).
// This is similar to FFmpeg's logic:
// https://www.ffmpeg.org/doxygen/4.0/decode_8c_source.html#l01087
// If pixel formats are undefined for some reason, try yuv420p
outPixelFormat_ = (formats && formats[0] != AV_PIX_FMT_NONE)
? formats[0]
: AV_PIX_FMT_YUV420P;
}
}

// Configure codec parameters
Expand Down Expand Up @@ -852,7 +859,7 @@ void VideoEncoder::initializeEncoder(
if (frames_.device().is_cuda() && deviceInterface_) {
deviceInterface_->registerHardwareDeviceWithCodec(avCodecContext_.get());
deviceInterface_->setupHardwareFrameContextForEncoding(
avCodecContext_.get());
avCodecContext_.get(), outPixelFormat_);
}

int status = avcodec_open2(avCodecContext_.get(), avCodec, &avCodecOptions);
Expand Down Expand Up @@ -898,7 +905,7 @@ void VideoEncoder::encode() {
UniqueAVFrame avFrame;
if (frames_.device().is_cuda() && deviceInterface_) {
auto cudaFrame = deviceInterface_->convertCUDATensorToAVFrameForEncoding(
currFrame, i, avCodecContext_.get());
currFrame, i, avCodecContext_.get(), outPixelFormat_);
TORCH_CHECK(
cudaFrame != nullptr,
"convertCUDATensorToAVFrameForEncoding failed for frame ",
Expand Down
74 changes: 62 additions & 12 deletions test/test_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -780,9 +780,9 @@ def test_pixel_format_errors(self, method, device, tmp_path):
if device == "cuda":
with pytest.raises(
RuntimeError,
match="GPU Video encoding currently only supports the NV12 pixel format. Do not set pixel_format to use NV12",
match="Video encoding on GPU currently only supports the nv12 pixel format. Do not set pixel_format to use nv12 by default.",
):
getattr(encoder, method)(**valid_params, pixel_format="yuv420p")
getattr(encoder, method)(**valid_params, pixel_format="yuv444p")
return

with pytest.raises(
Expand Down Expand Up @@ -1354,7 +1354,24 @@ def test_extra_options_utilized(self, tmp_path, profile, colorspace, color_range
),
],
)
def test_nvenc_against_ffmpeg_cli(self, tmp_path, method, format, codec):
# BT.601, BT.709, BT.2020
@pytest.mark.parametrize("color_space", ("bt470bg", "bt709", "bt2020nc"))
# Full/PC range, Limited/TV range
@pytest.mark.parametrize("color_range", ("pc", "tv"))
def test_nvenc_against_ffmpeg_cli(
self, tmp_path, method, format, codec, color_space, color_range
):
ffmpeg_version = get_ffmpeg_major_version()
# TODO-VideoEncoder: Investigate why FFmpeg 4 and 6 fail with non-default color space and range.
# See https://github.com/meta-pytorch/torchcodec/issues/1140
if ffmpeg_version in (4, 6) and not (
color_space == "bt470bg" and color_range == "tv"
):
pytest.skip(
"Non-default color space and range have lower accuracy on FFmpeg 4 and 6"
)
if ffmpeg_version == 4 and codec == "av1_nvenc":
pytest.skip("av1_nvenc is not supported on FFmpeg 4")
# Encode with FFmpeg CLI using nvenc codecs
device = "cuda"
qp = 1 # Use near lossless encoding to reduce noise and support av1_nvenc
Expand Down Expand Up @@ -1382,16 +1399,23 @@ def test_nvenc_against_ffmpeg_cli(self, tmp_path, method, format, codec):
temp_raw_path,
]
# CLI requires explicit codec for nvenc
# VideoEncoder will default to h264_nvenc since the frames are on GPU.
ffmpeg_cmd.extend(["-c:v", codec if codec is not None else "h264_nvenc"])
# VideoEncoder will select an NVENC encoder by default since the frames are on GPU.

ffmpeg_cmd.extend(["-pix_fmt", "nv12"]) # Output format is always NV12
ffmpeg_cmd.extend(["-qp", str(qp)])
ffmpeg_cmd.extend(["-qp", str(qp)]) # Use lossless qp for other codecs
if color_space:
ffmpeg_cmd.extend(["-colorspace", color_space])
if color_range:
ffmpeg_cmd.extend(["-color_range", color_range])
ffmpeg_cmd.extend([ffmpeg_encoded_path])
subprocess.run(ffmpeg_cmd, check=True, capture_output=True)

encoder = VideoEncoder(frames=source_frames, frame_rate=frame_rate)
encoder_extra_options = {"qp": qp}
if color_space:
encoder_extra_options["colorspace"] = color_space
if color_range:
encoder_extra_options["color_range"] = color_range
if method == "to_file":
encoder_output_path = str(tmp_path / f"nvenc_output.{format}")
encoder.to_file(
Expand Down Expand Up @@ -1422,13 +1446,39 @@ def test_nvenc_against_ffmpeg_cli(self, tmp_path, method, format, codec):
encoder_frames = self.decode(encoder_output).data

assert ffmpeg_frames.shape[0] == encoder_frames.shape[0]
# The combination of full range + bt709 results in worse accuracy
percentage = 91 if color_range == "full" and color_space == "bt709" else 96
for ff_frame, enc_frame in zip(ffmpeg_frames, encoder_frames):
assert psnr(ff_frame, enc_frame) > 25
assert_tensor_close_on_at_least(ff_frame, enc_frame, percentage=96, atol=2)
assert_tensor_close_on_at_least(
ff_frame, enc_frame, percentage=percentage, atol=2
)

if method == "to_file":
ffmpeg_metadata = self._get_video_metadata(ffmpeg_encoded_path, ["pix_fmt"])
encoder_metadata = self._get_video_metadata(encoder_output, ["pix_fmt"])
# pix_fmt nv12 is stored as yuv420p in metadata
assert encoder_metadata["pix_fmt"] == "yuv420p"
assert ffmpeg_metadata["pix_fmt"] == "yuv420p"
Comment on lines -1433 to -1434
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like we're not assert pix_fmt anymore, which makes it hard to verify that the changes in this PR are correct. IIRC, passing NV12 actually resulted in a yuv420p format at the end. We should try to undertand why that was the case. It may not add a lot of value to support both nv12 and yuv420p as parameter values if they're both the same thing (and if they both end-up being yuv420 anyway).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suspect the format changes occur based on codec implementation. By adding back this assertion, I observed a deprecated pixel format yuvj420p is set when pc (full) color range is used by h264_nvenc and hevc_nvenc, but not av1_nvenc.
I can incorporate pixel formats into my benchmarking PR, to see if there is some optimization to using nv12.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new assertions you added are good. But I personally still do not understand why passing NV12 actually ends up being reported yuv420.
Is it actually still NV12, and it's just FFmpeg that can't tell the difference? Or is it indeed yuv420?

Until we get a good understanding on that, I think we should refrain from allowing pixel_format with CUDA encoding. We have a surprising behavior that we cannot explain right now: passing NV12 leads to yuv420. If we're surprised, our users will be surprised too, and we won't have a good explanation to give them. It is safer to simply not expose this functionality just yet, and let them rely on the default behavior.

metadata_fields = ["pix_fmt", "color_range", "color_space"]
ffmpeg_metadata = self._get_video_metadata(
ffmpeg_encoded_path, metadata_fields
)
encoder_metadata = self._get_video_metadata(encoder_output, metadata_fields)
# pix_fmt nv12 is stored as yuv420p in metadata, unless full range (pc)is used
# In that case, h264 and hevc NVENC codecs will use yuvj420p automatically.
if color_range == "pc" and codec != "av1_nvenc":
expected_pix_fmt = "yuvj420p"
else:
# av1_nvenc does not utilize the yuvj420p pixel format
expected_pix_fmt = "yuv420p"
assert (
encoder_metadata["pix_fmt"]
== ffmpeg_metadata["pix_fmt"]
== expected_pix_fmt
)
assert (
encoder_metadata["color_range"]
== ffmpeg_metadata["color_range"]
== color_range
)
assert (
encoder_metadata["color_space"]
== ffmpeg_metadata["color_space"]
== color_space
)
Loading