2 changes: 1 addition & 1 deletion src/torchcodec/_core/CMakeLists.txt
@@ -100,7 +100,7 @@ function(make_torchcodec_libraries
)

if(ENABLE_CUDA)
list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp NVCUVIDRuntimeLoader.cpp)
list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp NVCUVIDRuntimeLoader.cpp GpuEncoder.cpp)
endif()

set(core_library_dependencies
44 changes: 40 additions & 4 deletions src/torchcodec/_core/Encoder.cpp
@@ -523,7 +523,9 @@ void AudioEncoder::flushBuffers() {

namespace {

torch::Tensor validateFrames(const torch::Tensor& frames) {
torch::Tensor validateFrames(
const torch::Tensor& frames,
const torch::Device& device) {
TORCH_CHECK(
frames.dtype() == torch::kUInt8,
"frames must have uint8 dtype, got ",
@@ -536,6 +538,15 @@ torch::Tensor validateFrames(const torch::Tensor& frames) {
frames.sizes()[1] == 3,
"frame must have 3 channels (R, G, B), got ",
frames.sizes()[1]);
if (device.type() != torch::kCPU) {
TORCH_CHECK(
frames.is_cuda(),
"When using CUDA encoding (device=",
device.str(),
"), frames must be on a CUDA device. Got frames on ",
frames.device().str(),
". Please move frames to a CUDA device: frames.to('cuda')");
}
Contributor:
The error above should be an internal bug: if the frames are on CUDA while the device parameter is not, it means it wasn't set properly in the custom ops. So this should be an "internal bug" error message rather than a user-facing thing, since we assume users aren't using the C++ APIs.

But, I think we should push the logic from https://github.com/meta-pytorch/torchcodec/pull/1008/files#r2565059217 further: we don't need a device parameter at all, anywhere. Having a device parameter duplicates the source of truth of the device and leads to potential bugs (like the one above that the TORCH_CHECK is preventing). So I think we shouldn't set options.device at all for encoding, and always rely on the frames. We can add a comment in src/torchcodec/_core/StreamOptions.h to indicate that.
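For illustration, a minimal sketch of the device-parameter-free shape suggested above (our wording, not the PR's final code):

torch::Tensor validateFrames(const torch::Tensor& frames) {
  TORCH_CHECK(
      frames.dtype() == torch::kUInt8,
      "frames must have uint8 dtype, got ",
      frames.dtype());
  TORCH_CHECK(
      frames.sizes()[1] == 3,
      "frame must have 3 channels (R, G, B), got ",
      frames.sizes()[1]);
  // No device parameter: callers branch on frames.device() directly, so the
  // frames tensor stays the single source of truth for the device.
  return frames.contiguous();
}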

return frames.contiguous();
}

@@ -665,7 +676,8 @@ VideoEncoder::VideoEncoder(
double frameRate,
std::string_view fileName,
const VideoStreamOptions& videoStreamOptions)
: frames_(validateFrames(frames)), inFrameRate_(frameRate) {
: frames_(validateFrames(frames, videoStreamOptions.device)),
inFrameRate_(frameRate) {
setFFmpegLogLevel();

// Allocate output format context
@@ -698,7 +710,7 @@ VideoEncoder::VideoEncoder(
std::string_view formatName,
std::unique_ptr<AVIOContextHolder> avioContextHolder,
const VideoStreamOptions& videoStreamOptions)
: frames_(validateFrames(frames)),
: frames_(validateFrames(frames, videoStreamOptions.device)),
inFrameRate_(frameRate),
avioContextHolder_(std::move(avioContextHolder)) {
setFFmpegLogLevel();
@@ -724,6 +736,10 @@ VideoEncoder::VideoEncoder(

void VideoEncoder::initializeEncoder(
const VideoStreamOptions& videoStreamOptions) {
if (videoStreamOptions.device.is_cuda()) {
Contributor:
I was hoping we wouldn't need to support a device parameter anywhere. Any reason we can't just rely on the input frames' device?

Contributor Author:
As per our offline discussion, I've updated this PR to not have an explicit device param, and instead use whichever device the frames Tensor is on.
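A rough sketch of that direction (hypothetical code condensed from the discussion above, not the final diff): the encoder keys off the frames tensor rather than an options field.

// Hypothetical: derive the encoding device from the validated frames tensor.
if (frames_.is_cuda()) {
  gpuEncoder_ = std::make_unique<GpuEncoder>(frames_.device());
}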

gpuEncoder_ = std::make_unique<GpuEncoder>(videoStreamOptions.device);
}

const AVCodec* avCodec = nullptr;
// If codec arg is provided, find codec using logic similar to FFmpeg:
// https://github.com/FFmpeg/FFmpeg/blob/master/fftools/ffmpeg_opt.c#L804-L835
@@ -748,7 +764,12 @@ void VideoEncoder::initializeEncoder(
TORCH_CHECK(
avFormatContext_->oformat != nullptr,
"Output format is null, unable to find default codec.");
// Try to find a hardware-accelerated encoder if not using CPU
avCodec = avcodec_find_encoder(avFormatContext_->oformat->video_codec);
if (gpuEncoder_) {
avCodec = gpuEncoder_->findEncoder(avFormatContext_->oformat->video_codec)
.value_or(avCodec);
}
Contributor:
Is there actual value in having findEncoder? Did you notice any problem if we didn't use it? If not, then let's remove it.

If yes, then let's rename it to findCodec and add a comment similar to this one:

// inspired by https://github.com/FFmpeg/FFmpeg/commit/ad67ea9
// we have to do this because of an FFmpeg bug where hardware decoding is not
// appropriately set, so we just go off and find the matching codec for the CUDA
// device
std::optional<const AVCodec*> CudaDeviceInterface::findCodec(

and also indicate that this findCodec function exists for similar reasons as the one I linked to.

Contributor Author:
Let's delete it. The intention was to find a hardware-enabled encoder if one was not specified, but as far as I can tell, the FFmpeg CLI does not support that for encoding, only for decoding via the -hwaccel flag.

TORCH_CHECK(avCodec != nullptr, "Video codec not found");
}

@@ -820,6 +841,14 @@ void VideoEncoder::initializeEncoder(
videoStreamOptions.preset.value().c_str(),
0);
}

// Register the hardware device context with the codec
// context before calling avcodec_open2().
if (gpuEncoder_) {
gpuEncoder_->registerHardwareDeviceWithCodec(avCodecContext_.get());
gpuEncoder_->setupEncodingContext(avCodecContext_.get());
}

int status = avcodec_open2(avCodecContext_.get(), avCodec, &avCodecOptions);
av_dict_free(&avCodecOptions);

@@ -860,7 +889,13 @@ void VideoEncoder::encode() {
int numFrames = static_cast<int>(frames_.sizes()[0]);
for (int i = 0; i < numFrames; ++i) {
torch::Tensor currFrame = frames_[i];
UniqueAVFrame avFrame = convertTensorToAVFrame(currFrame, i);
UniqueAVFrame avFrame;
if (gpuEncoder_) {
avFrame = gpuEncoder_->convertTensorToAVFrame(
currFrame, outPixelFormat_, i, avCodecContext_.get());
} else {
avFrame = convertTensorToAVFrame(currFrame, i);
}
encodeFrame(autoAVPacket, avFrame);
}

@@ -876,6 +911,7 @@ void VideoEncoder::encode() {
UniqueAVFrame VideoEncoder::convertTensorToAVFrame(
const torch::Tensor& frame,
int frameIndex) {
TORCH_CHECK(frame.is_cpu(), "CPU encoder requires CPU tensors");
// Initialize and cache scaling context if it does not exist
if (!swsContext_) {
swsContext_.reset(sws_getContext(
3 changes: 3 additions & 0 deletions src/torchcodec/_core/Encoder.h
@@ -3,7 +3,9 @@
#include <map>
#include <string>
#include "AVIOContextHolder.h"
#include "DeviceInterface.h"
#include "FFMPEGCommon.h"
#include "GpuEncoder.h"
#include "StreamOptions.h"

extern "C" {
@@ -183,6 +185,7 @@ class VideoEncoder {
AVPixelFormat outPixelFormat_ = AV_PIX_FMT_NONE;

std::unique_ptr<AVIOContextHolder> avioContextHolder_;
std::unique_ptr<GpuEncoder> gpuEncoder_;

bool encodeWasCalled_ = false;
AVDictionary* avFormatOptions_ = nullptr;
242 changes: 242 additions & 0 deletions src/torchcodec/_core/GpuEncoder.cpp
@@ -0,0 +1,242 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// All rights reserved.
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include "GpuEncoder.h"

#include <ATen/cuda/CUDAEvent.h>
#include <c10/cuda/CUDAStream.h>
#include <cuda_runtime.h>
#include <torch/types.h>

#include "CUDACommon.h"
#include "FFMPEGCommon.h"

extern "C" {
#include <libavutil/hwcontext_cuda.h>
#include <libavutil/pixdesc.h>
}

namespace facebook::torchcodec {
namespace {

// Redefinition from CudaDeviceInterface.cpp anonymous namespace
int getFlagsAVHardwareDeviceContextCreate() {
#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(58, 26, 100)
return AV_CUDA_USE_CURRENT_CONTEXT;
#else
return 0;
#endif
}

// Redefinition from CudaDeviceInterface.cpp anonymous namespace
// TODO-VideoEncoder: unify device context creation, add caching to encoder
Contributor (on lines +34 to +35):
Agreed on the TODOs. Seeing this makes me think we will probably want to follow up quickly and implement GpuEncoder.cpp as part of CudaDeviceInterface.cpp. But this is better done as a follow-up considering this encoder itself is already a big change.
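One possible shape for that follow-up, sketched with hypothetical names (assumes <map> and <mutex> are included, and reuses the getDeviceIndex and createHardwareDeviceContext helpers below):

// Hypothetical follow-up: share one hardware device context per CUDA device
// between the decoder and encoder paths.
AVBufferRef* getCachedHardwareDeviceContext(const torch::Device& device) {
  static std::mutex cacheMutex;
  static std::map<int, UniqueAVBufferRef> cache;
  std::lock_guard<std::mutex> lock(cacheMutex);
  int deviceIndex = getDeviceIndex(device);
  auto it = cache.find(deviceIndex);
  if (it == cache.end()) {
    it = cache.emplace(deviceIndex, createHardwareDeviceContext(device)).first;
  }
  // Callers own the returned reference; the cache keeps the original alive.
  return av_buffer_ref(it->second.get());
}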

UniqueAVBufferRef createHardwareDeviceContext(const torch::Device& device) {
enum AVHWDeviceType type = av_hwdevice_find_type_by_name("cuda");
TORCH_CHECK(type != AV_HWDEVICE_TYPE_NONE, "Failed to find cuda device");

int deviceIndex = getDeviceIndex(device);

c10::cuda::CUDAGuard deviceGuard(device);
// We set the device because we may be called from a different thread than
// the one that initialized the cuda context.
TORCH_CHECK(
cudaSetDevice(deviceIndex) == cudaSuccess, "Failed to set CUDA device");

AVBufferRef* hardwareDeviceCtxRaw = nullptr;
std::string deviceOrdinal = std::to_string(deviceIndex);

int err = av_hwdevice_ctx_create(
&hardwareDeviceCtxRaw,
type,
deviceOrdinal.c_str(),
nullptr,
getFlagsAVHardwareDeviceContextCreate());

if (err < 0) {
/* clang-format off */
TORCH_CHECK(
false,
"Failed to create specified HW device. This typically happens when ",
"your installed FFmpeg doesn't support CUDA (see ",
"https://github.com/pytorch/torchcodec#installing-cuda-enabled-torchcodec",
"). FFmpeg error: ", getFFMPEGErrorStringFromErrorCode(err));
/* clang-format on */
}

return UniqueAVBufferRef(hardwareDeviceCtxRaw);
}

// RGB to NV12 color conversion matrices (inverse of YUV to RGB)
// Note: NPP's ColorTwist function apparently expects "limited range"
// coefficient format even when producing full range output. All matrices below
// use the limited range coefficient format (Y with +16 offset) for NPP
// compatibility.
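// For reference: the 0.859 factor is 219/255, because limited-range luma
// spans 16..235 (a span of 219 out of 255). Hence 0.257 ≈ (219/255) * 0.299,
// 0.504 ≈ (219/255) * 0.587, and 0.098 ≈ (219/255) * 0.114.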

// BT.601 limited range (matches FFmpeg default behavior)
const Npp32f defaultLimitedRangeRgbToNv12[3][4] = {
// Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B)
{0.257f, 0.504f, 0.098f, 16.0f},
// U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients)
{-0.148f, -0.291f, 0.439f, 128.0f},
// V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients)
{0.439f, -0.368f, -0.071f, 128.0f}};
} // anonymous namespace

GpuEncoder::GpuEncoder(const torch::Device& device) : device_(device) {
TORCH_CHECK(
device_.type() == torch::kCUDA, "Unsupported device: ", device_.str());

initializeCudaContextWithPytorch(device_);
initializeHardwareContext();
}

GpuEncoder::~GpuEncoder() {}

void GpuEncoder::initializeHardwareContext() {
hardwareDeviceCtx_ = createHardwareDeviceContext(device_);
nppCtx_ = getNppStreamContext(device_);
}

std::optional<const AVCodec*> GpuEncoder::findEncoder(
const AVCodecID& codecId) {
void* i = nullptr;
const AVCodec* codec = nullptr;
while ((codec = av_codec_iterate(&i)) != nullptr) {
if (codec->id != codecId || !av_codec_is_encoder(codec)) {
continue;
}

const AVCodecHWConfig* config = nullptr;
for (int j = 0; (config = avcodec_get_hw_config(codec, j)) != nullptr;
++j) {
if (config->device_type == AV_HWDEVICE_TYPE_CUDA) {
return codec;
}
}
}
return std::nullopt;
}

void GpuEncoder::registerHardwareDeviceWithCodec(AVCodecContext* codecContext) {
TORCH_CHECK(
hardwareDeviceCtx_, "Hardware device context has not been initialized");
TORCH_CHECK(codecContext != nullptr, "codecContext is null");
codecContext->hw_device_ctx = av_buffer_ref(hardwareDeviceCtx_.get());
}

void GpuEncoder::setupEncodingContext(AVCodecContext* codecContext) {
Contributor:
This needs a comment to explain what this function does, and why.

My understanding is that it's mostly setting the AVHWFramesContext field of the AVFrame. And that's needed because this is how FFmpeg knows to allocate frames in the GPU's memory. I could be wrong, check me on this. This should be documented :)

Contributor Author:
Thanks for the feedback, I added a comment and renamed it to setupHardwareFrameContext to be more clear.
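For reference, a sketch of roughly what such a comment could say (our wording, not necessarily the PR's):

// Attaches an AVHWFramesContext to the codec context before avcodec_open2().
// The frames context describes a pool of CUDA-backed NV12 frames; FFmpeg
// uses it to allocate encoder input frames in GPU memory (e.g. via
// av_hwframe_get_buffer) instead of system RAM.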

TORCH_CHECK(
hardwareDeviceCtx_, "Hardware device context has not been initialized");
TORCH_CHECK(codecContext != nullptr, "codecContext is null");

codecContext->sw_pix_fmt = AV_PIX_FMT_NV12;
codecContext->pix_fmt = AV_PIX_FMT_CUDA;

AVBufferRef* hwFramesCtxRef = av_hwframe_ctx_alloc(hardwareDeviceCtx_.get());
TORCH_CHECK(
hwFramesCtxRef != nullptr,
"Failed to allocate hardware frames context for codec");

AVHWFramesContext* hwFramesCtx =
reinterpret_cast<AVHWFramesContext*>(hwFramesCtxRef->data);
hwFramesCtx->format = codecContext->pix_fmt;
hwFramesCtx->sw_format = codecContext->sw_pix_fmt;
hwFramesCtx->width = codecContext->width;
hwFramesCtx->height = codecContext->height;

int ret = av_hwframe_ctx_init(hwFramesCtxRef);
if (ret < 0) {
av_buffer_unref(&hwFramesCtxRef);
TORCH_CHECK(
false,
"Failed to initialize CUDA frames context for codec: ",
getFFMPEGErrorStringFromErrorCode(ret));
}

codecContext->hw_frames_ctx = hwFramesCtxRef;
}

UniqueAVFrame GpuEncoder::convertTensorToAVFrame(
const torch::Tensor& tensor,
[[maybe_unused]] AVPixelFormat targetFormat,
int frameIndex,
AVCodecContext* codecContext) {
TORCH_CHECK(tensor.is_cuda(), "GpuEncoder requires CUDA tensors");
TORCH_CHECK(
tensor.dim() == 3 && tensor.size(0) == 3,
"Expected 3D RGB tensor (CHW format), got shape: ",
tensor.sizes());
UniqueAVFrame avFrame(av_frame_alloc());
TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame");

avFrame->format = AV_PIX_FMT_CUDA;
avFrame->width = static_cast<int>(tensor.size(2));
avFrame->height = static_cast<int>(tensor.size(1));
avFrame->pts = frameIndex;
Contributor (on lines 164 to 167):
For the 4 lines above, let's add a TODO to unify this with the logic that's already in the CPU version of convertTensorToAVFrame. We should make sure frames use the exact same code path for setting the PTS, width, and height, with the same sources of truth.
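One way to enforce that single code path, with a hypothetical helper (names are ours):

// Hypothetical helper shared by the CPU and GPU conversion paths, so width,
// height, and PTS always come from the same sources of truth.
static void setFrameDimensionsAndPts(
    AVFrame* avFrame,
    const torch::Tensor& frame, // CHW uint8
    int frameIndex) {
  avFrame->width = static_cast<int>(frame.size(2));
  avFrame->height = static_cast<int>(frame.size(1));
  avFrame->pts = frameIndex;
}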


int ret = av_hwframe_get_buffer(
Contributor:
Add a note here that we're letting FFmpeg allocate the CUDA memory. I think we should explore allocating the memory with pytorch instead, so that we can automatically rely on pytorch's CUDA memory allocator, which should be more efficient. There could be a TODO to investigate how to do that (this is related to my comment about setupEncodingContext above).

Contributor Author:
Would an example of this be allocateEmptyHWCTensor?
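For context, a sketch of the PyTorch-side allocation being discussed, assuming an allocateEmptyHWCTensor-style helper (how to hand the resulting pointer to the AVFrame's data planes is the open question):

// Allocate HWC uint8 memory through PyTorch's caching CUDA allocator.
torch::Tensor allocateEmptyHWCTensor(
    int height,
    int width,
    const torch::Device& device) {
  return torch::empty(
      {height, width, 3},
      torch::TensorOptions().dtype(torch::kUInt8).device(device));
}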

codecContext ? codecContext->hw_frames_ctx : nullptr, avFrame.get(), 0);
TORCH_CHECK(
ret >= 0,
"Failed to allocate hardware frame: ",
getFFMPEGErrorStringFromErrorCode(ret));

// Validate that avFrame was properly allocated with CUDA memory
TORCH_CHECK(
avFrame != nullptr && avFrame->data[0] != nullptr,
"avFrame must be pre-allocated with CUDA memory");

// Convert CHW to HWC for NPP processing
int height = static_cast<int>(tensor.size(1));
int width = static_cast<int>(tensor.size(2));
Contributor:
  1. Make sure to use the same height and width that were used to set the AVFrame properties - use the same variables!
  2. Define those where they're needed.
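A reordered sketch of that suggestion (definitions hoisted so the AVFrame fields and the NPP ROI share the same variables):

// Define the dimensions once, then reuse them for avFrame->width/height
// above and for the NppiSize ROI below.
const int height = static_cast<int>(tensor.size(1));
const int width = static_cast<int>(tensor.size(2));
avFrame->width = width;
avFrame->height = height;
NppiSize oSizeROI = {width, height};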

torch::Tensor hwcFrame = tensor.permute({1, 2, 0}).contiguous();

// Get current CUDA stream for NPP operations
at::cuda::CUDAStream currentStream =
at::cuda::getCurrentCUDAStream(device_.index());

// Setup NPP context with current stream
nppCtx_->hStream = currentStream.stream();
cudaError_t cudaErr =
cudaStreamGetFlags(nppCtx_->hStream, &nppCtx_->nStreamFlags);
TORCH_CHECK(
cudaErr == cudaSuccess,
"cudaStreamGetFlags failed: ",
cudaGetErrorString(cudaErr));

// Always use FFmpeg's default behavior: BT.601 limited range
NppiSize oSizeROI = {width, height};

NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
static_cast<const Npp8u*>(hwcFrame.data_ptr()),
hwcFrame.stride(0) * hwcFrame.element_size(),
avFrame->data,
avFrame->linesize,
oSizeROI,
defaultLimitedRangeRgbToNv12,
*nppCtx_);

TORCH_CHECK(
status == NPP_SUCCESS,
"Failed to convert RGB to NV12: NPP error code ",
status);

// Validate CUDA operations completed successfully
cudaError_t memCheck = cudaGetLastError();
TORCH_CHECK(
memCheck == cudaSuccess,
"CUDA error detected: ",
cudaGetErrorString(memCheck));

// TODO-VideoEncoder: Enable configuration of color properties, similar to
// FFmpeg. Set color properties to FFmpeg defaults.
avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601
avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range

return avFrame;
}

} // namespace facebook::torchcodec