2 changes: 1 addition & 1 deletion src/torchcodec/_core/CMakeLists.txt
@@ -100,7 +100,7 @@ function(make_torchcodec_libraries
)

if(ENABLE_CUDA)
list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp NVCUVIDRuntimeLoader.cpp)
list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp NVCUVIDRuntimeLoader.cpp GpuEncoder.cpp)
endif()

set(core_library_dependencies
44 changes: 40 additions & 4 deletions src/torchcodec/_core/Encoder.cpp
@@ -523,7 +523,9 @@ void AudioEncoder::flushBuffers() {

namespace {

torch::Tensor validateFrames(const torch::Tensor& frames) {
torch::Tensor validateFrames(
const torch::Tensor& frames,
const torch::Device& device) {
TORCH_CHECK(
frames.dtype() == torch::kUInt8,
"frames must have uint8 dtype, got ",
@@ -536,6 +538,15 @@ torch::Tensor validateFrames(const torch::Tensor& frames) {
frames.sizes()[1] == 3,
"frame must have 3 channels (R, G, B), got ",
frames.sizes()[1]);
if (device.type() != torch::kCPU) {
TORCH_CHECK(
frames.is_cuda(),
"When using CUDA encoding (device=",
device.str(),
"), frames must be on a CUDA device. Got frames on ",
frames.device().str(),
". Please move frames to a CUDA device: frames.to('cuda')");
}
Contributor:
The error above should be an internal bug: if the frames are on CUDA while the device parameter is not, it means it wasn't set properly in the custom ops. So this should be an "internal bug" error message rather than a user-facing thing, since we assume users aren't using the C++ APIs.

But, I think we should push the logic from https://github.com/meta-pytorch/torchcodec/pull/1008/files#r2565059217 further: we don't need a device parameter at all, anywhere. Having a device parameter duplicates the source of truth of the device and leads to potential bugs (like the one above that the TORCH_CHECK is preventing). So I think we shouldn't set options.device at all for encoding, and always rely on the frames. We can add a comment in src/torchcodec/_core/StreamOptions.h to indicate that.
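For illustration, a minimal sketch of the device-parameter-free shape suggested above (our wording, not the PR's final code):

torch::Tensor validateFrames(const torch::Tensor& frames) {
  TORCH_CHECK(
      frames.dtype() == torch::kUInt8,
      "frames must have uint8 dtype, got ",
      frames.dtype());
  TORCH_CHECK(
      frames.sizes()[1] == 3,
      "frame must have 3 channels (R, G, B), got ",
      frames.sizes()[1]);
  // No device parameter: callers branch on frames.device() directly, so the
  // frames tensor stays the single source of truth for the device.
  return frames.contiguous();
}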

return frames.contiguous();
}

@@ -665,7 +676,8 @@ VideoEncoder::VideoEncoder(
double frameRate,
std::string_view fileName,
const VideoStreamOptions& videoStreamOptions)
: frames_(validateFrames(frames)), inFrameRate_(frameRate) {
: frames_(validateFrames(frames, videoStreamOptions.device)),
inFrameRate_(frameRate) {
setFFmpegLogLevel();

// Allocate output format context
@@ -698,7 +710,7 @@ VideoEncoder::VideoEncoder(
std::string_view formatName,
std::unique_ptr<AVIOContextHolder> avioContextHolder,
const VideoStreamOptions& videoStreamOptions)
: frames_(validateFrames(frames)),
: frames_(validateFrames(frames, videoStreamOptions.device)),
inFrameRate_(frameRate),
avioContextHolder_(std::move(avioContextHolder)) {
setFFmpegLogLevel();
@@ -724,6 +736,10 @@ VideoEncoder::VideoEncoder(

void VideoEncoder::initializeEncoder(
const VideoStreamOptions& videoStreamOptions) {
if (videoStreamOptions.device.is_cuda()) {
Contributor:
I was hoping we wouldn't need to support a device parameter anywhere. Any reason we can't just rely on the input frames' device?

Contributor Author:
As per our offline discussion, I've updated this PR to not have an explicit device param, and instead use whichever device the frames Tensor is on.
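A rough sketch of that direction (hypothetical code condensed from the discussion above, not the final diff): the encoder keys off the frames tensor rather than an options field.

// Hypothetical: derive the encoding device from the validated frames tensor.
if (frames_.is_cuda()) {
  gpuEncoder_ = std::make_unique<GpuEncoder>(frames_.device());
}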

gpuEncoder_ = std::make_unique<GpuEncoder>(videoStreamOptions.device);
}

const AVCodec* avCodec = nullptr;
// If codec arg is provided, find codec using logic similar to FFmpeg:
// https://github.com/FFmpeg/FFmpeg/blob/master/fftools/ffmpeg_opt.c#L804-L835
@@ -748,7 +764,12 @@ void VideoEncoder::initializeEncoder(
TORCH_CHECK(
avFormatContext_->oformat != nullptr,
"Output format is null, unable to find default codec.");
// Try to find a hardware-accelerated encoder if not using CPU
avCodec = avcodec_find_encoder(avFormatContext_->oformat->video_codec);
if (gpuEncoder_) {
avCodec = gpuEncoder_->findEncoder(avFormatContext_->oformat->video_codec)
.value_or(avCodec);
}
Contributor:
Is there actual value in having findEncoder? Did you notice any problem if we didn't use it? If not, then let's remove it.

If yes, then let's rename it to findCodec and add a comment similar to this one:

// inspired by https://github.com/FFmpeg/FFmpeg/commit/ad67ea9
// we have to do this because of an FFmpeg bug where hardware decoding is not
// appropriately set, so we just go off and find the matching codec for the CUDA
// device
std::optional<const AVCodec*> CudaDeviceInterface::findCodec(

and also indicate that this findCodec function exists for similar reasons as the one I linked to.

Contributor Author:
Let's delete it. The intention was to find a hardware-enabled encoder if one was not specified, but as far as I can tell, the FFmpeg CLI does not support that for encoding, only for decoding via the -hwaccel flag.

TORCH_CHECK(avCodec != nullptr, "Video codec not found");
}

@@ -820,6 +841,14 @@ void VideoEncoder::initializeEncoder(
videoStreamOptions.preset.value().c_str(),
0);
}

// Register the hardware device context with the codec
// context before calling avcodec_open2().
if (gpuEncoder_) {
gpuEncoder_->registerHardwareDeviceWithCodec(avCodecContext_.get());
gpuEncoder_->setupEncodingContext(avCodecContext_.get());
}

int status = avcodec_open2(avCodecContext_.get(), avCodec, &avCodecOptions);
av_dict_free(&avCodecOptions);

@@ -860,7 +889,13 @@ void VideoEncoder::encode() {
int numFrames = static_cast<int>(frames_.sizes()[0]);
for (int i = 0; i < numFrames; ++i) {
torch::Tensor currFrame = frames_[i];
UniqueAVFrame avFrame = convertTensorToAVFrame(currFrame, i);
UniqueAVFrame avFrame;
if (gpuEncoder_) {
avFrame = gpuEncoder_->convertTensorToAVFrame(
currFrame, outPixelFormat_, i, avCodecContext_.get());
} else {
avFrame = convertTensorToAVFrame(currFrame, i);
}
encodeFrame(autoAVPacket, avFrame);
}

@@ -876,6 +911,7 @@ void VideoEncoder::encode() {
UniqueAVFrame VideoEncoder::convertTensorToAVFrame(
const torch::Tensor& frame,
int frameIndex) {
TORCH_CHECK(frame.is_cpu(), "CPU encoder requires CPU tensors");
// Initialize and cache scaling context if it does not exist
if (!swsContext_) {
swsContext_.reset(sws_getContext(
3 changes: 3 additions & 0 deletions src/torchcodec/_core/Encoder.h
@@ -3,7 +3,9 @@
#include <map>
#include <string>
#include "AVIOContextHolder.h"
#include "DeviceInterface.h"
#include "FFMPEGCommon.h"
#include "GpuEncoder.h"
#include "StreamOptions.h"

extern "C" {
@@ -183,6 +185,7 @@ class VideoEncoder {
AVPixelFormat outPixelFormat_ = AV_PIX_FMT_NONE;

std::unique_ptr<AVIOContextHolder> avioContextHolder_;
std::unique_ptr<GpuEncoder> gpuEncoder_;

bool encodeWasCalled_ = false;
AVDictionary* avFormatOptions_ = nullptr;
242 changes: 242 additions & 0 deletions src/torchcodec/_core/GpuEncoder.cpp
@@ -0,0 +1,242 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// All rights reserved.
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include "GpuEncoder.h"

#include <ATen/cuda/CUDAEvent.h>
#include <c10/cuda/CUDAStream.h>
#include <cuda_runtime.h>
#include <torch/types.h>

#include "CUDACommon.h"
#include "FFMPEGCommon.h"

extern "C" {
#include <libavutil/hwcontext_cuda.h>
#include <libavutil/pixdesc.h>
}

namespace facebook::torchcodec {
namespace {

// Redefinition from CudaDeviceInterface.cpp anonymous namespace
int getFlagsAVHardwareDeviceContextCreate() {
#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(58, 26, 100)
return AV_CUDA_USE_CURRENT_CONTEXT;
#else
return 0;
#endif
}

// Redefinition from CudaDeviceInterface.cpp anonymous namespace
// TODO-VideoEncoder: unify device context creation, add caching to encoder
Contributor (on lines +34 to +35):
Agreed on the TODOs. Seeing this makes me think we will probably want to follow up quickly and implement GpuEncoder.cpp as part of CudaDeviceInterface.cpp. But this is better done as a follow-up considering this encoder itself is already a big change.
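One possible shape for that follow-up, sketched with hypothetical names (assumes <map> and <mutex> are included, and reuses the getDeviceIndex and createHardwareDeviceContext helpers below):

// Hypothetical follow-up: share one hardware device context per CUDA device
// between the decoder and encoder paths.
AVBufferRef* getCachedHardwareDeviceContext(const torch::Device& device) {
  static std::mutex cacheMutex;
  static std::map<int, UniqueAVBufferRef> cache;
  std::lock_guard<std::mutex> lock(cacheMutex);
  int deviceIndex = getDeviceIndex(device);
  auto it = cache.find(deviceIndex);
  if (it == cache.end()) {
    it = cache.emplace(deviceIndex, createHardwareDeviceContext(device)).first;
  }
  // Callers own the returned reference; the cache keeps the original alive.
  return av_buffer_ref(it->second.get());
}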

UniqueAVBufferRef createHardwareDeviceContext(const torch::Device& device) {
enum AVHWDeviceType type = av_hwdevice_find_type_by_name("cuda");
TORCH_CHECK(type != AV_HWDEVICE_TYPE_NONE, "Failed to find cuda device");

int deviceIndex = getDeviceIndex(device);

c10::cuda::CUDAGuard deviceGuard(device);
// We set the device because we may be called from a different thread than
// the one that initialized the cuda context.
TORCH_CHECK(
cudaSetDevice(deviceIndex) == cudaSuccess, "Failed to set CUDA device");

AVBufferRef* hardwareDeviceCtxRaw = nullptr;
std::string deviceOrdinal = std::to_string(deviceIndex);

int err = av_hwdevice_ctx_create(
&hardwareDeviceCtxRaw,
type,
deviceOrdinal.c_str(),
nullptr,
getFlagsAVHardwareDeviceContextCreate());

if (err < 0) {
/* clang-format off */
TORCH_CHECK(
false,
"Failed to create specified HW device. This typically happens when ",
"your installed FFmpeg doesn't support CUDA (see ",
"https://github.com/pytorch/torchcodec#installing-cuda-enabled-torchcodec",
"). FFmpeg error: ", getFFMPEGErrorStringFromErrorCode(err));
/* clang-format on */
}

return UniqueAVBufferRef(hardwareDeviceCtxRaw);
}

// RGB to NV12 color conversion matrices (inverse of YUV to RGB)
// Note: NPP's ColorTwist function apparently expects "limited range"
// coefficient format even when producing full range output. All matrices below
// use the limited range coefficient format (Y with +16 offset) for NPP
// compatibility.
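// For reference: the 0.859 factor is 219/255, because limited-range luma
// spans 16..235 (a span of 219 out of 255). Hence 0.257 ≈ (219/255) * 0.299,
// 0.504 ≈ (219/255) * 0.587, and 0.098 ≈ (219/255) * 0.114.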

// BT.601 limited range (matches FFmpeg default behavior)
const Npp32f defaultLimitedRangeRgbToNv12[3][4] = {
// Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B)
{0.257f, 0.504f, 0.098f, 16.0f},
// U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients)
{-0.148f, -0.291f, 0.439f, 128.0f},
// V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients)
{0.439f, -0.368f, -0.071f, 128.0f}};
} // anonymous namespace

GpuEncoder::GpuEncoder(const torch::Device& device) : device_(device) {
TORCH_CHECK(
device_.type() == torch::kCUDA, "Unsupported device: ", device_.str());

initializeCudaContextWithPytorch(device_);
initializeHardwareContext();
}

GpuEncoder::~GpuEncoder() {}

void GpuEncoder::initializeHardwareContext() {
hardwareDeviceCtx_ = createHardwareDeviceContext(device_);
nppCtx_ = getNppStreamContext(device_);
}

std::optional<const AVCodec*> GpuEncoder::findEncoder(
const AVCodecID& codecId) {
void* i = nullptr;
const AVCodec* codec = nullptr;
while ((codec = av_codec_iterate(&i)) != nullptr) {
if (codec->id != codecId || !av_codec_is_encoder(codec)) {
continue;
}

const AVCodecHWConfig* config = nullptr;
for (int j = 0; (config = avcodec_get_hw_config(codec, j)) != nullptr;
++j) {
if (config->device_type == AV_HWDEVICE_TYPE_CUDA) {
return codec;
}
}
}
return std::nullopt;
}

void GpuEncoder::registerHardwareDeviceWithCodec(AVCodecContext* codecContext) {
TORCH_CHECK(
hardwareDeviceCtx_, "Hardware device context has not been initialized");
TORCH_CHECK(codecContext != nullptr, "codecContext is null");
codecContext->hw_device_ctx = av_buffer_ref(hardwareDeviceCtx_.get());
}

void GpuEncoder::setupEncodingContext(AVCodecContext* codecContext) {
Contributor:
This needs a comment to explain what this function does, and why.

My understanding is that it's mostly setting the AVHWFramesContext field of the AVFrame. And that's needed because this is how FFmpeg knows to allocate frames in the GPU's memory. I could be wrong, check me on this. This should be documented :)

Contributor Author:
Thanks for the feedback, I added a comment and renamed it to setupHardwareFrameContext to be more clear.
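For reference, a sketch of roughly what such a comment could say (our wording, not necessarily the PR's):

// Attaches an AVHWFramesContext to the codec context before avcodec_open2().
// The frames context describes a pool of CUDA-backed NV12 frames; FFmpeg
// uses it to allocate encoder input frames in GPU memory (e.g. via
// av_hwframe_get_buffer) instead of system RAM.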

TORCH_CHECK(
hardwareDeviceCtx_, "Hardware device context has not been initialized");
TORCH_CHECK(codecContext != nullptr, "codecContext is null");

codecContext->sw_pix_fmt = AV_PIX_FMT_NV12;
codecContext->pix_fmt = AV_PIX_FMT_CUDA;

AVBufferRef* hwFramesCtxRef = av_hwframe_ctx_alloc(hardwareDeviceCtx_.get());
TORCH_CHECK(
hwFramesCtxRef != nullptr,
"Failed to allocate hardware frames context for codec");

AVHWFramesContext* hwFramesCtx =
reinterpret_cast<AVHWFramesContext*>(hwFramesCtxRef->data);
hwFramesCtx->format = codecContext->pix_fmt;
hwFramesCtx->sw_format = codecContext->sw_pix_fmt;
hwFramesCtx->width = codecContext->width;
hwFramesCtx->height = codecContext->height;

int ret = av_hwframe_ctx_init(hwFramesCtxRef);
if (ret < 0) {
av_buffer_unref(&hwFramesCtxRef);
TORCH_CHECK(
false,
"Failed to initialize CUDA frames context for codec: ",
getFFMPEGErrorStringFromErrorCode(ret));
}

codecContext->hw_frames_ctx = hwFramesCtxRef;
}

UniqueAVFrame GpuEncoder::convertTensorToAVFrame(
const torch::Tensor& tensor,
[[maybe_unused]] AVPixelFormat targetFormat,
int frameIndex,
AVCodecContext* codecContext) {
TORCH_CHECK(tensor.is_cuda(), "GpuEncoder requires CUDA tensors");
TORCH_CHECK(
tensor.dim() == 3 && tensor.size(0) == 3,
"Expected 3D RGB tensor (CHW format), got shape: ",
tensor.sizes());
UniqueAVFrame avFrame(av_frame_alloc());
TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame");

avFrame->format = AV_PIX_FMT_CUDA;
avFrame->width = static_cast<int>(tensor.size(2));
avFrame->height = static_cast<int>(tensor.size(1));
avFrame->pts = frameIndex;
Contributor (on lines 164 to 167):
For the 4 lines above, let's add a TODO to unify this with the logic that's already in the CPU version of convertTensorToAVFrame. We should make sure frames use the exact same code path for setting the PTS, width, and height, with the same sources of truth.
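One way to enforce that single code path, with a hypothetical helper (names are ours):

// Hypothetical helper shared by the CPU and GPU conversion paths, so width,
// height, and PTS always come from the same sources of truth.
static void setFrameDimensionsAndPts(
    AVFrame* avFrame,
    const torch::Tensor& frame, // CHW uint8
    int frameIndex) {
  avFrame->width = static_cast<int>(frame.size(2));
  avFrame->height = static_cast<int>(frame.size(1));
  avFrame->pts = frameIndex;
}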


int ret = av_hwframe_get_buffer(
Contributor:
Add a note here that we're letting FFmpeg allocate the CUDA memory. I think we should explore allocating the memory with pytorch instead, so that we can automatically rely on pytorch's CUDA memory allocator, which should be more efficient. There could be a TODO to investigate how to do that (this is related to my comment about setupEncodingContext above).

Contributor Author:
Would an example of this be allocateEmptyHWCTensor?
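For context, a sketch of the PyTorch-side allocation being discussed, assuming an allocateEmptyHWCTensor-style helper (how to hand the resulting pointer to the AVFrame's data planes is the open question):

// Allocate HWC uint8 memory through PyTorch's caching CUDA allocator.
torch::Tensor allocateEmptyHWCTensor(
    int height,
    int width,
    const torch::Device& device) {
  return torch::empty(
      {height, width, 3},
      torch::TensorOptions().dtype(torch::kUInt8).device(device));
}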

codecContext ? codecContext->hw_frames_ctx : nullptr, avFrame.get(), 0);
TORCH_CHECK(
ret >= 0,
"Failed to allocate hardware frame: ",
getFFMPEGErrorStringFromErrorCode(ret));

// Validate that avFrame was properly allocated with CUDA memory
TORCH_CHECK(
avFrame != nullptr && avFrame->data[0] != nullptr,
"avFrame must be pre-allocated with CUDA memory");

// Convert CHW to HWC for NPP processing
int height = static_cast<int>(tensor.size(1));
int width = static_cast<int>(tensor.size(2));
Contributor:
  1. Make sure to use the same height and width that were used to set the AVFrame properties - use the same variables!
  2. Define those where they're needed.
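A reordered sketch of that suggestion (definitions hoisted so the AVFrame fields and the NPP ROI share the same variables):

// Define the dimensions once, then reuse them for avFrame->width/height
// above and for the NppiSize ROI below.
const int height = static_cast<int>(tensor.size(1));
const int width = static_cast<int>(tensor.size(2));
avFrame->width = width;
avFrame->height = height;
NppiSize oSizeROI = {width, height};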

torch::Tensor hwcFrame = tensor.permute({1, 2, 0}).contiguous();

// Get current CUDA stream for NPP operations
at::cuda::CUDAStream currentStream =
at::cuda::getCurrentCUDAStream(device_.index());

// Setup NPP context with current stream
nppCtx_->hStream = currentStream.stream();
cudaError_t cudaErr =
cudaStreamGetFlags(nppCtx_->hStream, &nppCtx_->nStreamFlags);
TORCH_CHECK(
cudaErr == cudaSuccess,
"cudaStreamGetFlags failed: ",
cudaGetErrorString(cudaErr));

// Always use FFmpeg's default behavior: BT.601 limited range
NppiSize oSizeROI = {width, height};

NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
static_cast<const Npp8u*>(hwcFrame.data_ptr()),
hwcFrame.stride(0) * hwcFrame.element_size(),
avFrame->data,
avFrame->linesize,
oSizeROI,
defaultLimitedRangeRgbToNv12,
*nppCtx_);

TORCH_CHECK(
status == NPP_SUCCESS,
"Failed to convert RGB to NV12: NPP error code ",
status);

// Validate CUDA operations completed successfully
cudaError_t memCheck = cudaGetLastError();
TORCH_CHECK(
memCheck == cudaSuccess,
"CUDA error detected: ",
cudaGetErrorString(memCheck));

// TODO-VideoEncoder: Enable configuration of color properties, similar to
// FFmpeg. Set color properties to FFmpeg defaults.
avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601
avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range

return avFrame;
}

} // namespace facebook::torchcodec