Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 113 additions & 0 deletions src/torchcodec/_core/CudaDeviceInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -362,4 +362,117 @@ std::string CudaDeviceInterface::getDetails() {
(usingCPUFallback_ ? "CPU fallback." : "NVDEC.");
}

// --------------------------------------------------------------------------
// Below are methods exclusive to video encoding:
// --------------------------------------------------------------------------
namespace {
// Color-twist coefficients handed to NPP when converting packed RGB to NV12.
// They encode the BT.601 limited-range transform, which is the form the NPP
// ColorTwist function used below expects and which matches FFmpeg's default
// behavior.
//
// Each row produces one output component (Y, U, V in that order). The first
// three columns are the weights applied to R, G and B; the fourth column is
// the constant offset added to the result.
constexpr Npp32f defaultLimitedRangeRgbToNv12[3][4] = {
    // Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B)
    {0.257f, 0.504f, 0.098f, 16.0f},
    // U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients)
    {-0.148f, -0.291f, 0.439f, 128.0f},
    // V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients)
    {0.439f, -0.368f, -0.071f, 128.0f},
};
} // namespace

// Converts one RGB frame held in a CUDA tensor into an NV12 AVFrame backed by
// CUDA memory, ready to be sent to a hardware encoder.
//
// Parameters:
//   tensor       - a single frame in CHW layout with 3 channels. It must be a
//                  uint8 tensor located on a CUDA device, and its height and
//                  width must match the codec context.
//   frameIndex   - index of the frame in the video; stored as the frame's pts.
//   codecContext - encoder context whose hw_frames_ctx provides the buffer for
//                  the output frame (see setupHardwareFrameContextForEncoding).
//
// Returns the populated AVFrame (format AV_PIX_FMT_CUDA), tagged as BT.601
// limited range.
//
// Fails through TORCH_CHECK when an argument is invalid, when the hardware
// frame cannot be allocated, or when the NPP conversion reports an error.
UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding(
    const torch::Tensor& tensor,
    int frameIndex,
    AVCodecContext* codecContext) {
  TORCH_CHECK(codecContext != nullptr, "codecContext is null");
  TORCH_CHECK(
      codecContext->hw_frames_ctx != nullptr,
      "codecContext has no hardware frames context; "
      "setupHardwareFrameContextForEncoding must be called first");
  TORCH_CHECK(
      tensor.dim() == 3 && tensor.size(0) == 3,
      "Expected 3D RGB tensor (CHW format), got shape: ",
      tensor.sizes());
  // The raw data pointer is handed to NPP as a device pointer to 8-bit
  // samples, so anything else would be misinterpreted or crash.
  TORCH_CHECK(
      tensor.is_cuda(),
      "Expected tensor on CUDA device, got: ",
      tensor.device());
  TORCH_CHECK(
      tensor.scalar_type() == torch::kUInt8,
      "Expected uint8 tensor, got: ",
      tensor.scalar_type());

  const int height = static_cast<int>(tensor.size(1));
  const int width = static_cast<int>(tensor.size(2));
  // The hardware frame pool is sized from the codec context, so a frame of a
  // different size would not fit the buffer the conversion writes into.
  TORCH_CHECK(
      height == codecContext->height && width == codecContext->width,
      "Frame size (",
      height,
      "x",
      width,
      ") does not match encoder size (",
      codecContext->height,
      "x",
      codecContext->width,
      ")");

  UniqueAVFrame avFrame(av_frame_alloc());
  TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame");

  // TODO-VideoEncoder: Unify AVFrame creation with CPU version of this method
  avFrame->format = AV_PIX_FMT_CUDA;
  avFrame->height = height;
  avFrame->width = width;
  avFrame->pts = frameIndex;

  // FFmpeg's av_hwframe_get_buffer is used to allocate memory on CUDA device.
  // TODO-VideoEncoder: Consider using pytorch to allocate CUDA memory for
  // efficiency
  const int ret =
      av_hwframe_get_buffer(codecContext->hw_frames_ctx, avFrame.get(), 0);
  TORCH_CHECK(
      ret >= 0,
      "Failed to allocate hardware frame: ",
      getFFMPEGErrorStringFromErrorCode(ret));

  // avFrame itself was already checked above; only the plane pointer can
  // still be missing here.
  TORCH_CHECK(
      avFrame->data[0] != nullptr,
      "avFrame must be pre-allocated with CUDA memory");

  // NPP consumes packed (interleaved) RGB, so go from CHW to contiguous HWC.
  torch::Tensor hwcFrame = tensor.permute({1, 2, 0}).contiguous();

  // Source row pitch in bytes; NPP takes it as an int.
  const int srcStepBytes =
      static_cast<int>(hwcFrame.stride(0) * hwcFrame.element_size());

  NppiSize oSizeROI = {width, height};
  NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
      static_cast<const Npp8u*>(hwcFrame.data_ptr()),
      srcStepBytes,
      avFrame->data,
      avFrame->linesize,
      oSizeROI,
      defaultLimitedRangeRgbToNv12,
      *nppCtx_);

  TORCH_CHECK(
      status == NPP_SUCCESS,
      "Failed to convert RGB to NV12: NPP error code ",
      status);

  // TODO-VideoEncoder: Enable configuration of color properties, similar to
  // FFmpeg. Below are the default color properties used by FFmpeg.
  avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601
  avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range

  return avFrame;
}

// Prepares `codecContext` for encoding frames that live in CUDA memory.
//
// A new AVHWFramesContext is allocated from this interface's hardware device
// context, configured with the codec's dimensions and pixel formats,
// initialized, and attached to `codecContext->hw_frames_ctx`. FFmpeg needs
// that field in order to allocate frames in GPU memory. The codec's pix_fmt
// and sw_pix_fmt are overwritten as part of the setup.
//
// Fails through TORCH_CHECK when `codecContext` is null, when the hardware
// device context is missing, or when FFmpeg cannot allocate or initialize the
// frames context.
void CudaDeviceInterface::setupHardwareFrameContextForEncoding(
    AVCodecContext* codecContext) {
  TORCH_CHECK(codecContext != nullptr, "codecContext is null");
  TORCH_CHECK(
      hardwareDeviceCtx_, "Hardware device context has not been initialized");

  AVBufferRef* framesRef = av_hwframe_ctx_alloc(hardwareDeviceCtx_.get());
  TORCH_CHECK(
      framesRef != nullptr,
      "Failed to allocate hardware frames context for codec");

  // The codec always receives CUDA frames.
  codecContext->pix_fmt = AV_PIX_FMT_CUDA;
  // TODO-VideoEncoder: Enable user set pixel formats to be set
  // (outPixelFormat_) and handled with the appropriate NPP function
  codecContext->sw_pix_fmt = AV_PIX_FMT_NV12;

  // Mirror the codec's geometry and formats into the frames context.
  auto* framesCtx = reinterpret_cast<AVHWFramesContext*>(framesRef->data);
  framesCtx->width = codecContext->width;
  framesCtx->height = codecContext->height;
  framesCtx->format = codecContext->pix_fmt;
  framesCtx->sw_format = codecContext->sw_pix_fmt;

  const int initStatus = av_hwframe_ctx_init(framesRef);
  if (initStatus < 0) {
    // Release our reference before reporting the failure below.
    av_buffer_unref(&framesRef);
  }
  TORCH_CHECK(
      initStatus >= 0,
      "Failed to initialize CUDA frames context for codec: ",
      getFFMPEGErrorStringFromErrorCode(initStatus));

  // The codec context now holds the reference to the frames context.
  codecContext->hw_frames_ctx = framesRef;
}

} // namespace facebook::torchcodec
8 changes: 8 additions & 0 deletions src/torchcodec/_core/CudaDeviceInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,14 @@ class CudaDeviceInterface : public DeviceInterface {

std::string getDetails() override;

// Video encoding: converts a 3-channel RGB frame tensor in CHW layout into an
// NV12 AVFrame with format AV_PIX_FMT_CUDA. The frame buffer is allocated from
// codecContext->hw_frames_ctx, and frameIndex is stored as the frame's pts.
UniqueAVFrame convertCUDATensorToAVFrameForEncoding(
const torch::Tensor& tensor,
int frameIndex,
AVCodecContext* codecContext) override;

// Video encoding: allocates and initializes an AVHWFramesContext using the
// codec's width and height, sets the codec's pix_fmt to AV_PIX_FMT_CUDA and
// sw_pix_fmt to AV_PIX_FMT_NV12, and attaches the frames context to
// codecContext->hw_frames_ctx.
void setupHardwareFrameContextForEncoding(
AVCodecContext* codecContext) override;

private:
// Our CUDA decoding code assumes NV12 format. In order to handle other
// kinds of input, we need to convert them to NV12. Our current implementation
Expand Down
18 changes: 18 additions & 0 deletions src/torchcodec/_core/DeviceInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,24 @@ class DeviceInterface {
return "";
}

// Function used for video encoding, only implemented in CudaDeviceInterface.
// It is declared here to isolate CUDA dependencies from CPU builds.
// This default implementation always fails: device interfaces that do not
// override it cannot convert tensors into frames for encoding.
// TODO-VideoEncoder: Reconsider using video encoding functions in device
// interface
virtual UniqueAVFrame convertCUDATensorToAVFrameForEncoding(
    [[maybe_unused]] const torch::Tensor& tensor,
    [[maybe_unused]] int frameIndex,
    [[maybe_unused]] AVCodecContext* codecContext) {
  // Always throws; give the caller an actionable message instead of a bare
  // failed check.
  TORCH_CHECK(
      false,
      "convertCUDATensorToAVFrameForEncoding is not implemented for this "
      "device interface; it is only available for CUDA devices.");
}

// Function used for video encoding, only implemented in CudaDeviceInterface.
// It is declared here to isolate CUDA dependencies from CPU builds.
// This default implementation always fails: device interfaces that do not
// override it cannot set up a hardware frames context for encoding.
virtual void setupHardwareFrameContextForEncoding(
    [[maybe_unused]] AVCodecContext* codecContext) {
  // Always throws; give the caller an actionable message instead of a bare
  // failed check.
  TORCH_CHECK(
      false,
      "setupHardwareFrameContextForEncoding is not implemented for this "
      "device interface; it is only available for CUDA devices.");
}

protected:
torch::Device device_;
SharedAVCodecContext codecContext_;
Expand Down
29 changes: 28 additions & 1 deletion src/torchcodec/_core/Encoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "torch/types.h"

extern "C" {
#include <libavutil/hwcontext.h>
#include <libavutil/opt.h>
#include <libavutil/pixdesc.h>
}
Expand Down Expand Up @@ -724,6 +725,11 @@ VideoEncoder::VideoEncoder(

void VideoEncoder::initializeEncoder(
const VideoStreamOptions& videoStreamOptions) {
// Only create device interface when frames are on a CUDA device.
// Encoding on CPU is implemented in this file.
if (frames_.device().is_cuda()) {
deviceInterface_ = createDeviceInterface(frames_.device());
}
const AVCodec* avCodec = nullptr;
// If codec arg is provided, find codec using logic similar to FFmpeg:
// https://github.com/FFmpeg/FFmpeg/blob/master/fftools/ffmpeg_opt.c#L804-L835
Expand Down Expand Up @@ -820,6 +826,14 @@ void VideoEncoder::initializeEncoder(
videoStreamOptions.preset.value().c_str(),
0);
}

// When frames are on a CUDA device, deviceInterface_ will be defined.
if (frames_.device().is_cuda() && deviceInterface_) {
deviceInterface_->registerHardwareDeviceWithCodec(avCodecContext_.get());
deviceInterface_->setupHardwareFrameContextForEncoding(
avCodecContext_.get());
}

int status = avcodec_open2(avCodecContext_.get(), avCodec, &avCodecOptions);
av_dict_free(&avCodecOptions);

Expand Down Expand Up @@ -860,7 +874,20 @@ void VideoEncoder::encode() {
int numFrames = static_cast<int>(frames_.sizes()[0]);
for (int i = 0; i < numFrames; ++i) {
torch::Tensor currFrame = frames_[i];
UniqueAVFrame avFrame = convertTensorToAVFrame(currFrame, i);
UniqueAVFrame avFrame;
if (frames_.device().is_cuda() && deviceInterface_) {
auto cudaFrame = deviceInterface_->convertCUDATensorToAVFrameForEncoding(
currFrame, i, avCodecContext_.get());
TORCH_CHECK(
cudaFrame != nullptr,
"convertCUDATensorToAVFrameForEncoding failed for frame ",
i,
" on device: ",
frames_.device());
avFrame = std::move(cudaFrame);
} else {
avFrame = convertTensorToAVFrame(currFrame, i);
}
encodeFrame(autoAVPacket, avFrame);
}

Expand Down
2 changes: 2 additions & 0 deletions src/torchcodec/_core/Encoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <map>
#include <string>
#include "AVIOContextHolder.h"
#include "DeviceInterface.h"
#include "FFMPEGCommon.h"
#include "StreamOptions.h"

Expand Down Expand Up @@ -183,6 +184,7 @@ class VideoEncoder {
AVPixelFormat outPixelFormat_ = AV_PIX_FMT_NONE;

std::unique_ptr<AVIOContextHolder> avioContextHolder_;
std::unique_ptr<DeviceInterface> deviceInterface_;

bool encodeWasCalled_ = false;
AVDictionary* avFormatOptions_ = nullptr;
Expand Down
2 changes: 2 additions & 0 deletions src/torchcodec/_core/StreamOptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ struct VideoStreamOptions {
ColorConversionLibrary::FILTERGRAPH;

// By default we use CPU for decoding for both C++ and python users.
// Note: This is not used for video encoding, because device is determined by
// the device of the input frame tensor.
torch::Device device = torch::kCPU;
// Device variant (e.g., "ffmpeg", "beta", etc.)
std::string_view deviceVariant = "ffmpeg";
Expand Down
3 changes: 3 additions & 0 deletions src/torchcodec/_core/custom_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1019,6 +1019,9 @@ TORCH_LIBRARY_IMPL(torchcodec_ns, BackendSelect, m) {
m.impl("_create_from_file_like", &_create_from_file_like);
m.impl(
"_get_json_ffmpeg_library_versions", &_get_json_ffmpeg_library_versions);
m.impl("encode_video_to_file", &encode_video_to_file);
m.impl("encode_video_to_tensor", &encode_video_to_tensor);
m.impl("_encode_video_to_file_like", &_encode_video_to_file_like);
}

TORCH_LIBRARY_IMPL(torchcodec_ns, CPU, m) {
Expand Down
2 changes: 1 addition & 1 deletion src/torchcodec/_core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def encode_video_to_file_like(
"""Encode video frames to a file-like object.

Args:
frames: Video frames tensor
frames: Video frames tensor. The device of the frames tensor will be used for encoding.
frame_rate: Frame rate in frames per second
format: Video format (e.g., "mp4", "mov", "mkv")
file_like: File-like object that supports write() and seek() methods
Expand Down
1 change: 1 addition & 0 deletions src/torchcodec/encoders/_video_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ class VideoEncoder:
tensor of shape ``(N, C, H, W)`` where N is the number of frames,
C is 3 channels (RGB), H is height, and W is width.
Values must be uint8 in the range ``[0, 255]``.
The device of the frames tensor will be used for encoding.
frame_rate (float): The frame rate of the **input** ``frames``. Also defines the encoded **output** frame rate.
"""

Expand Down
Loading
Loading