meta-pytorch · Dan-Flores · Dec 11, 2025 · Dec 11, 2025 · Dec 11, 2025 · Dec 11, 2025
diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -380,22 +380,76 @@ std::string CudaDeviceInterface::getDetails() {
 // Below are methods exclusive to video encoding:
 // --------------------------------------------------------------------------
 namespace {
-// RGB to NV12 color conversion matrix for BT.601 limited range.
-// NPP ColorTwist function used below expects the limited range
-// color conversion matrix, and this matches FFmpeg's default behavior.
-const Npp32f defaultLimitedRangeRgbToNv12[3][4] = {
-    // Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B)
-    {0.257f, 0.504f, 0.098f, 16.0f},
-    // U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients)
-    {-0.148f, -0.291f, 0.439f, 128.0f},
-    // V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients)
-    {0.439f, -0.368f, -0.071f, 128.0f}};
+// For background on these matrices, see the note:
+// [YUV -> RGB Color Conversion, color space and color range]
+// https://github.com/meta-pytorch/torchcodec/blob/main/src/torchcodec/_core/CUDACommon.cpp#L63-L65
+// TODO-VideoEncoder: Extend note to explain limited vs full range
+// RGB to YUV conversion matrices to use in NPP color conversion functions
 // Color space and color range
 // ---------------------------
+// RGB -> YUV conversion matrices in the 3x4 layout passed to NPP's ColorTwist
+// conversion: one row per output channel (Y, U, V), each holding the R, G and
+// B coefficients followed by a constant offset.
+// *_LIMITED matrices use a luma offset of 16, *_FULL matrices a luma offset
+// of 0. Every chroma row uses an offset of 128 and its RGB coefficients sum to
+// zero, so neutral (R == G == B) input maps to a chroma value of exactly 128.
+struct ColorConversionMatrices {
+  static constexpr Npp32f BT601_LIMITED[3][4] = {
+      {0.257f, 0.504f, 0.098f, 16.0f},
+      {-0.148f, -0.291f, 0.439f, 128.0f},
+      {0.439f, -0.368f, -0.071f, 128.0f}};
+
+  static constexpr Npp32f BT601_FULL[3][4] = {
+      {0.299f, 0.587f, 0.114f, 0.0f},
+      {-0.168736f, -0.331264f, 0.5f, 128.0f},
+      {0.5f, -0.418688f, -0.081312f, 128.0f}};
+
+  static constexpr Npp32f BT709_LIMITED[3][4] = {
+      {0.183f, 0.614f, 0.062f, 16.0f},
+      {-0.101f, -0.338f, 0.439f, 128.0f},
+      {0.439f, -0.399f, -0.040f, 128.0f}};
+
+  static constexpr Npp32f BT709_FULL[3][4] = {
+      {0.2126f, 0.7152f, 0.0722f, 0.0f},
+      {-0.114572f, -0.385428f, 0.5f, 128.0f},
+      {0.5f, -0.454153f, -0.045847f, 128.0f}};
+
+  static constexpr Npp32f BT2020_LIMITED[3][4] = {
+      {0.2256f, 0.5823f, 0.0509f, 16.0f},
+      // G coefficient of U: full-range -0.360370 scaled by 224/255 is
+      // -0.3166, which also keeps the row summing to zero.
+      {-0.122f, -0.317f, 0.439f, 128.0f},
+      {0.439f, -0.403f, -0.036f, 128.0f}};
+
+  static constexpr Npp32f BT2020_FULL[3][4] = {
+      {0.2627f, 0.6780f, 0.0593f, 0.0f},
+      {-0.139630f, -0.360370f, 0.5f, 128.0f},
+      {0.5f, -0.459786f, -0.040214f, 128.0f}};
+};
+
+// Picks the RGB -> YUV ColorTwist matrix that matches the encoder's configured
+// color space and color range. An unspecified range is treated as limited
+// (MPEG) range and any color space without a dedicated matrix uses BT.601.
+const Npp32f (*getConversionMatrix(AVCodecContext* codecContext))[4] {
+  const auto range = codecContext->color_range;
+  const bool fullRange = (range == AVCOL_RANGE_JPEG);
+  const bool limitedRange =
+      (range == AVCOL_RANGE_MPEG || range == AVCOL_RANGE_UNSPECIFIED);
+  if (!fullRange && !limitedRange) {
+    // Unrecognized range value: use the BT.601 limited-range matrix.
+    return ColorConversionMatrices::BT601_LIMITED;
+  }
+
+  switch (codecContext->colorspace) {
+    case AVCOL_SPC_BT709:
+      return fullRange ? ColorConversionMatrices::BT709_FULL
+                       : ColorConversionMatrices::BT709_LIMITED;
+    case AVCOL_SPC_BT2020_NCL:
+      return fullRange ? ColorConversionMatrices::BT2020_FULL
+                       : ColorConversionMatrices::BT2020_LIMITED;
+    case AVCOL_SPC_BT470BG:
+    default: // everything else is converted with BT.601
+      return fullRange ? ColorConversionMatrices::BT601_FULL
+                       : ColorConversionMatrices::BT601_LIMITED;
+  }
+}
 } // namespace
 
 UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding(
     const torch::Tensor& tensor,
     int frameIndex,
-    AVCodecContext* codecContext) {
+    AVCodecContext* codecContext,
+    AVPixelFormat targetPixelFormat) {
   TORCH_CHECK(
       tensor.dim() == 3 && tensor.size(0) == 3,
       "Expected 3D RGB tensor (CHW format), got shape: ",
@@ -434,25 +488,39 @@ UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding(
   torch::Tensor hwcFrame = tensor.permute({1, 2, 0}).contiguous();
 
   NppiSize oSizeROI = {width, height};
-  NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
-      static_cast<const Npp8u*>(hwcFrame.data_ptr()),
-      validateInt64ToInt(
-          hwcFrame.stride(0) * hwcFrame.element_size(), "nSrcStep"),
-      avFrame->data,
-      avFrame->linesize,
-      oSizeROI,
-      defaultLimitedRangeRgbToNv12,
-      *nppCtx_);
+  // Convert the contiguous HWC RGB frame into the requested target pixel
+  // format. Only NV12 is implemented; any other format is reported as an
+  // internal error.
+  NppStatus status;
+  switch (targetPixelFormat) {
+    case AV_PIX_FMT_NV12:
+      status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
+          static_cast<const Npp8u*>(hwcFrame.data_ptr()),
+          // The source step is computed in 64 bits; validate it rather than
+          // letting it narrow implicitly to the int that NPP expects.
+          validateInt64ToInt(
+              hwcFrame.stride(0) * hwcFrame.element_size(), "nSrcStep"),
+          avFrame->data,
+          avFrame->linesize,
+          oSizeROI,
+          getConversionMatrix(codecContext),
+          *nppCtx_);
+      break;
+    default: {
+      // av_get_pix_fmt_name returns NULL for unknown formats; never stream a
+      // null C string into the error message.
+      const char* formatName = av_get_pix_fmt_name(targetPixelFormat);
+      TORCH_CHECK(
+          false,
+          "GPU encoding expected to encode into nv12 pixel format, but got ",
+          formatName != nullptr ? formatName : "unknown",
+          ". This should not happen, please report this to the TorchCodec repo");
+    }
+  }
 
   TORCH_CHECK(
       status == NPP_SUCCESS,
-      "Failed to convert RGB to NV12: NPP error code ",
+      "Failed to convert RGB to ",
+      av_get_pix_fmt_name(targetPixelFormat),
+      ": NPP error code ",
       status);
 
-  // TODO-VideoEncoder: Enable configuration of color properties, similar to
-  // FFmpeg. Below are the default color properties used by FFmpeg.
-  avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601
-  avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range
+  avFrame->colorspace = codecContext->colorspace != AVCOL_SPC_UNSPECIFIED
+      ? codecContext->colorspace
+      : AVCOL_SPC_BT470BG; // BT.601
+  avFrame->color_range = codecContext->color_range != AVCOL_RANGE_UNSPECIFIED
+      ? codecContext->color_range
+      : AVCOL_RANGE_MPEG; // limited range
 
   return avFrame;
 }
@@ -461,7 +529,8 @@ UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding(
 // to enable encoding with CUDA device. The hw_frames_ctx field is needed by
 // FFmpeg to allocate frames on GPU's memory.
 void CudaDeviceInterface::setupHardwareFrameContextForEncoding(
-    AVCodecContext* codecContext) {
+    AVCodecContext* codecContext,
+    AVPixelFormat targetPixelFormat) {
   TORCH_CHECK(codecContext != nullptr, "codecContext is null");
   TORCH_CHECK(
       hardwareDeviceCtx_, "Hardware device context has not been initialized");
@@ -471,9 +540,7 @@ void CudaDeviceInterface::setupHardwareFrameContextForEncoding(
       hwFramesCtxRef != nullptr,
       "Failed to allocate hardware frames context for codec");
 
-  // TODO-VideoEncoder: Enable user set pixel formats to be set
-  // (outPixelFormat_) and handled with the appropriate NPP function
-  codecContext->sw_pix_fmt = AV_PIX_FMT_NV12;
+  codecContext->sw_pix_fmt = targetPixelFormat;
   // Always set pixel format to support CUDA encoding.
   codecContext->pix_fmt = AV_PIX_FMT_CUDA;
 

diff --git a/src/torchcodec/_core/CudaDeviceInterface.h b/src/torchcodec/_core/CudaDeviceInterface.h
@@ -46,10 +46,12 @@ class CudaDeviceInterface : public DeviceInterface {
   UniqueAVFrame convertCUDATensorToAVFrameForEncoding(
       const torch::Tensor& tensor,
       int frameIndex,
-      AVCodecContext* codecContext) override;
+      AVCodecContext* codecContext,
+      AVPixelFormat targetPixelFormat) override;
 
   void setupHardwareFrameContextForEncoding(
-      AVCodecContext* codecContext) override;
+      AVCodecContext* codecContext,
+      AVPixelFormat targetPixelFormat) override;
 
  private:
   // Our CUDA decoding code assumes NV12 format. In order to handle other

diff --git a/src/torchcodec/_core/DeviceInterface.h b/src/torchcodec/_core/DeviceInterface.h
@@ -146,14 +146,16 @@ class DeviceInterface {
   virtual UniqueAVFrame convertCUDATensorToAVFrameForEncoding(
       [[maybe_unused]] const torch::Tensor& tensor,
       [[maybe_unused]] int frameIndex,
-      [[maybe_unused]] AVCodecContext* codecContext) {
+      [[maybe_unused]] AVCodecContext* codecContext,
+      [[maybe_unused]] AVPixelFormat targetPixelFormat) {
     TORCH_CHECK(false);
   }
 
   // Function used for video encoding, only implemented in CudaDeviceInterface.
   // It is here to isolate CUDA dependencies from CPU builds
   virtual void setupHardwareFrameContextForEncoding(
-      [[maybe_unused]] AVCodecContext* codecContext) {
+      [[maybe_unused]] AVCodecContext* codecContext,
+      [[maybe_unused]] AVPixelFormat targetPixelFormat) {
     TORCH_CHECK(false);
   }
 

diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -790,23 +790,30 @@ void VideoEncoder::initializeEncoder(
   outHeight_ = inHeight_;
 
   if (videoStreamOptions.pixelFormat.has_value()) {
+    // TODO-VideoEncoder: Enable pixel formats to be set by user
+    // and handled with the appropriate NPP function on GPU.
     if (frames_.device().is_cuda()) {
       TORCH_CHECK(
           false,
-          "GPU Video encoding currently only supports the NV12 pixel format. "
-          "Do not set pixel_format to use NV12.");
+          "Video encoding on GPU currently only supports the nv12 pixel format. "
+          "Do not set pixel_format to use nv12 by default.");
     }
     outPixelFormat_ =
         validatePixelFormat(*avCodec, videoStreamOptions.pixelFormat.value());
   } else {
-    const AVPixelFormat* formats = getSupportedPixelFormats(*avCodec);
-    // Use first listed pixel format as default (often yuv420p).
-    // This is similar to FFmpeg's logic:
-    // https://www.ffmpeg.org/doxygen/4.0/decode_8c_source.html#l01087
-    // If pixel formats are undefined for some reason, try yuv420p
-    outPixelFormat_ = (formats && formats[0] != AV_PIX_FMT_NONE)
-        ? formats[0]
-        : AV_PIX_FMT_YUV420P;
+    if (frames_.device().is_cuda()) {
+      // Default to nv12 pixel format when encoding on GPU.
+      outPixelFormat_ = AV_PIX_FMT_NV12;
+    } else {
+      const AVPixelFormat* formats = getSupportedPixelFormats(*avCodec);
+      // Use first listed pixel format as default (often yuv420p).
+      // This is similar to FFmpeg's logic:
+      // https://www.ffmpeg.org/doxygen/4.0/decode_8c_source.html#l01087
+      // If pixel formats are undefined for some reason, try yuv420p
+      outPixelFormat_ = (formats && formats[0] != AV_PIX_FMT_NONE)
+          ? formats[0]
+          : AV_PIX_FMT_YUV420P;
+    }
   }
 
   // Configure codec parameters
@@ -852,7 +859,7 @@ void VideoEncoder::initializeEncoder(
   if (frames_.device().is_cuda() && deviceInterface_) {
     deviceInterface_->registerHardwareDeviceWithCodec(avCodecContext_.get());
     deviceInterface_->setupHardwareFrameContextForEncoding(
-        avCodecContext_.get());
+        avCodecContext_.get(), outPixelFormat_);
   }
 
   int status = avcodec_open2(avCodecContext_.get(), avCodec, &avCodecOptions);
@@ -898,7 +905,7 @@ void VideoEncoder::encode() {
     UniqueAVFrame avFrame;
     if (frames_.device().is_cuda() && deviceInterface_) {
       auto cudaFrame = deviceInterface_->convertCUDATensorToAVFrameForEncoding(
-          currFrame, i, avCodecContext_.get());
+          currFrame, i, avCodecContext_.get(), outPixelFormat_);
       TORCH_CHECK(
           cudaFrame != nullptr,
           "convertCUDATensorToAVFrameForEncoding failed for frame ",

diff --git a/test/test_encoders.py b/test/test_encoders.py
@@ -780,9 +780,9 @@ def test_pixel_format_errors(self, method, device, tmp_path):
         if device == "cuda":
             with pytest.raises(
                 RuntimeError,
-                match="GPU Video encoding currently only supports the NV12 pixel format. Do not set pixel_format to use NV12",
+                match="Video encoding on GPU currently only supports the nv12 pixel format. Do not set pixel_format to use nv12 by default.",
             ):
-                getattr(encoder, method)(**valid_params, pixel_format="yuv420p")
+                getattr(encoder, method)(**valid_params, pixel_format="yuv444p")
             return
 
         with pytest.raises(
@@ -1354,7 +1354,24 @@ def test_extra_options_utilized(self, tmp_path, profile, colorspace, color_range
             ),
         ],
     )
-    def test_nvenc_against_ffmpeg_cli(self, tmp_path, method, format, codec):
+    # BT.601, BT.709, BT.2020
+    @pytest.mark.parametrize("color_space", ("bt470bg", "bt709", "bt2020nc"))
+    # Full/PC range, Limited/TV range
+    @pytest.mark.parametrize("color_range", ("pc", "tv"))
+    def test_nvenc_against_ffmpeg_cli(
+        self, tmp_path, method, format, codec, color_space, color_range
+    ):
+        ffmpeg_version = get_ffmpeg_major_version()
+        # TODO-VideoEncoder: Investigate why FFmpeg 4 and 6 fail with non-default color space and range.
+        # See https://github.com/meta-pytorch/torchcodec/issues/1140
+        if ffmpeg_version in (4, 6) and not (
+            color_space == "bt470bg" and color_range == "tv"
+        ):
+            pytest.skip(
+                "Non-default color space and range have lower accuracy on FFmpeg 4 and 6"
+            )
+        if ffmpeg_version == 4 and codec == "av1_nvenc":
+            pytest.skip("av1_nvenc is not supported on FFmpeg 4")
         # Encode with FFmpeg CLI using nvenc codecs
         device = "cuda"
         qp = 1  # Use near lossless encoding to reduce noise and support av1_nvenc
@@ -1382,16 +1399,23 @@ def test_nvenc_against_ffmpeg_cli(self, tmp_path, method, format, codec):
             temp_raw_path,
         ]
         # CLI requires explicit codec for nvenc
+        # VideoEncoder will default to h264_nvenc since the frames are on GPU.
         ffmpeg_cmd.extend(["-c:v", codec if codec is not None else "h264_nvenc"])
-        # VideoEncoder will select an NVENC encoder by default since the frames are on GPU.
-
         ffmpeg_cmd.extend(["-pix_fmt", "nv12"])  # Output format is always NV12
-        ffmpeg_cmd.extend(["-qp", str(qp)])
+        ffmpeg_cmd.extend(["-qp", str(qp)])  # Use lossless qp for other codecs
+        if color_space:
+            ffmpeg_cmd.extend(["-colorspace", color_space])
+        if color_range:
+            ffmpeg_cmd.extend(["-color_range", color_range])
         ffmpeg_cmd.extend([ffmpeg_encoded_path])
         subprocess.run(ffmpeg_cmd, check=True, capture_output=True)
 
         encoder = VideoEncoder(frames=source_frames, frame_rate=frame_rate)
         encoder_extra_options = {"qp": qp}
+        if color_space:
+            encoder_extra_options["colorspace"] = color_space
+        if color_range:
+            encoder_extra_options["color_range"] = color_range
         if method == "to_file":
             encoder_output_path = str(tmp_path / f"nvenc_output.{format}")
             encoder.to_file(
@@ -1422,13 +1446,39 @@ def test_nvenc_against_ffmpeg_cli(self, tmp_path, method, format, codec):
         encoder_frames = self.decode(encoder_output).data
 
         assert ffmpeg_frames.shape[0] == encoder_frames.shape[0]
+        # Full range ("pc") combined with bt709 results in worse accuracy
+        percentage = 91 if color_range == "pc" and color_space == "bt709" else 96
         for ff_frame, enc_frame in zip(ffmpeg_frames, encoder_frames):
             assert psnr(ff_frame, enc_frame) > 25
-            assert_tensor_close_on_at_least(ff_frame, enc_frame, percentage=96, atol=2)
+            assert_tensor_close_on_at_least(
+                ff_frame, enc_frame, percentage=percentage, atol=2
+            )
 
         if method == "to_file":
-            ffmpeg_metadata = self._get_video_metadata(ffmpeg_encoded_path, ["pix_fmt"])
-            encoder_metadata = self._get_video_metadata(encoder_output, ["pix_fmt"])
-            # pix_fmt nv12 is stored as yuv420p in metadata
-            assert encoder_metadata["pix_fmt"] == "yuv420p"
-            assert ffmpeg_metadata["pix_fmt"] == "yuv420p"
+            metadata_fields = ["pix_fmt", "color_range", "color_space"]
+            ffmpeg_metadata = self._get_video_metadata(
+                ffmpeg_encoded_path, metadata_fields
+            )
+            encoder_metadata = self._get_video_metadata(encoder_output, metadata_fields)
+            # pix_fmt nv12 is stored as yuv420p in metadata, unless full range (pc) is used.
+            # In that case, h264 and hevc NVENC codecs will use yuvj420p automatically.
+            if color_range == "pc" and codec != "av1_nvenc":
+                expected_pix_fmt = "yuvj420p"
+            else:
+                # av1_nvenc does not utilize the yuvj420p pixel format
+                expected_pix_fmt = "yuv420p"
+            assert (
+                encoder_metadata["pix_fmt"]
+                == ffmpeg_metadata["pix_fmt"]
+                == expected_pix_fmt
+            )
+            assert (
+                encoder_metadata["color_range"]
+                == ffmpeg_metadata["color_range"]
+                == color_range
+            )
+            assert (
+                encoder_metadata["color_space"]
+                == ffmpeg_metadata["color_space"]
+                == color_space
+            )