-
Notifications
You must be signed in to change notification settings - Fork 73
Enable CUDA device for video encoder #1008
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 16 commits
ca1f538
50cdb21
54d1a1f
88e1299
926b7ea
eee8889
43c6221
4af1f53
bdd133f
d5f2637
9c7bae7
bf78468
7e5e6d4
ffbdf4e
15e23f6
2342837
f8b64ea
6d88905
fc05cb8
6b8c1fe
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -523,7 +523,9 @@ void AudioEncoder::flushBuffers() { | |||||||||||
|
|
||||||||||||
| namespace { | ||||||||||||
|
|
||||||||||||
| torch::Tensor validateFrames(const torch::Tensor& frames) { | ||||||||||||
| torch::Tensor validateFrames( | ||||||||||||
| const torch::Tensor& frames, | ||||||||||||
| const torch::Device& device) { | ||||||||||||
| TORCH_CHECK( | ||||||||||||
| frames.dtype() == torch::kUInt8, | ||||||||||||
| "frames must have uint8 dtype, got ", | ||||||||||||
|
|
@@ -536,6 +538,15 @@ torch::Tensor validateFrames(const torch::Tensor& frames) { | |||||||||||
| frames.sizes()[1] == 3, | ||||||||||||
| "frame must have 3 channels (R, G, B), got ", | ||||||||||||
| frames.sizes()[1]); | ||||||||||||
| if (device.type() != torch::kCPU) { | ||||||||||||
| TORCH_CHECK( | ||||||||||||
| frames.is_cuda(), | ||||||||||||
| "When using CUDA encoding (device=", | ||||||||||||
| device.str(), | ||||||||||||
| "), frames must be on a CUDA device. Got frames on ", | ||||||||||||
| frames.device().str(), | ||||||||||||
| ". Please move frames to a CUDA device: frames.to('cuda')"); | ||||||||||||
| } | ||||||||||||
| return frames.contiguous(); | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
|
|
@@ -665,7 +676,8 @@ VideoEncoder::VideoEncoder( | |||||||||||
| double frameRate, | ||||||||||||
| std::string_view fileName, | ||||||||||||
| const VideoStreamOptions& videoStreamOptions) | ||||||||||||
| : frames_(validateFrames(frames)), inFrameRate_(frameRate) { | ||||||||||||
| : frames_(validateFrames(frames, videoStreamOptions.device)), | ||||||||||||
| inFrameRate_(frameRate) { | ||||||||||||
| setFFmpegLogLevel(); | ||||||||||||
|
|
||||||||||||
| // Allocate output format context | ||||||||||||
|
|
@@ -698,7 +710,7 @@ VideoEncoder::VideoEncoder( | |||||||||||
| std::string_view formatName, | ||||||||||||
| std::unique_ptr<AVIOContextHolder> avioContextHolder, | ||||||||||||
| const VideoStreamOptions& videoStreamOptions) | ||||||||||||
| : frames_(validateFrames(frames)), | ||||||||||||
| : frames_(validateFrames(frames, videoStreamOptions.device)), | ||||||||||||
| inFrameRate_(frameRate), | ||||||||||||
| avioContextHolder_(std::move(avioContextHolder)) { | ||||||||||||
| setFFmpegLogLevel(); | ||||||||||||
|
|
@@ -724,6 +736,10 @@ VideoEncoder::VideoEncoder( | |||||||||||
|
|
||||||||||||
| void VideoEncoder::initializeEncoder( | ||||||||||||
| const VideoStreamOptions& videoStreamOptions) { | ||||||||||||
| if (videoStreamOptions.device.is_cuda()) { | ||||||||||||
|
||||||||||||
| gpuEncoder_ = std::make_unique<GpuEncoder>(videoStreamOptions.device); | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| const AVCodec* avCodec = nullptr; | ||||||||||||
| // If codec arg is provided, find codec using logic similar to FFmpeg: | ||||||||||||
| // https://github.com/FFmpeg/FFmpeg/blob/master/fftools/ffmpeg_opt.c#L804-L835 | ||||||||||||
|
|
@@ -748,7 +764,12 @@ void VideoEncoder::initializeEncoder( | |||||||||||
| TORCH_CHECK( | ||||||||||||
| avFormatContext_->oformat != nullptr, | ||||||||||||
| "Output format is null, unable to find default codec."); | ||||||||||||
| // Try to find a hardware-accelerated encoder if not using CPU | ||||||||||||
Dan-Flores marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||||||||
| avCodec = avcodec_find_encoder(avFormatContext_->oformat->video_codec); | ||||||||||||
| if (gpuEncoder_) { | ||||||||||||
| avCodec = gpuEncoder_->findEncoder(avFormatContext_->oformat->video_codec) | ||||||||||||
| .value_or(avCodec); | ||||||||||||
| } | ||||||||||||
|
||||||||||||
| // inspired by https://github.com/FFmpeg/FFmpeg/commit/ad67ea9 | |
| // we have to do this because of an FFmpeg bug where hardware decoding is not | |
| // appropriately set, so we just go off and find the matching codec for the CUDA | |
| // device | |
| std::optional<const AVCodec*> CudaDeviceInterface::findCodec( |
and also indicate that this findCodec function exists for similar reasons as the one I linked to.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's delete it. The intention was to find a hardware enabled encoder if one was not specified, but as far as I can tell, FFmpeg CLI does not support that for encoding, only for decoding via the -hwaccel flag.
Dan-Flores marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,242 @@ | ||
| // Copyright (c) Meta Platforms, Inc. and affiliates. | ||
| // All rights reserved. | ||
| // | ||
| // This source code is licensed under the BSD-style license found in the | ||
| // LICENSE file in the root directory of this source tree. | ||
|
|
||
| #include "GpuEncoder.h" | ||
|
|
||
| #include <ATen/cuda/CUDAEvent.h> | ||
| #include <c10/cuda/CUDAStream.h> | ||
| #include <cuda_runtime.h> | ||
| #include <torch/types.h> | ||
|
|
||
| #include "CUDACommon.h" | ||
| #include "FFMPEGCommon.h" | ||
|
|
||
| extern "C" { | ||
| #include <libavutil/hwcontext_cuda.h> | ||
| #include <libavutil/pixdesc.h> | ||
| } | ||
|
|
||
| namespace facebook::torchcodec { | ||
| namespace { | ||
|
|
||
| // Redefinition from CudaDeviceInterface.cpp anonymous namespace | ||
| int getFlagsAVHardwareDeviceContextCreate() { | ||
| #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(58, 26, 100) | ||
| return AV_CUDA_USE_CURRENT_CONTEXT; | ||
| #else | ||
| return 0; | ||
| #endif | ||
| } | ||
|
|
||
| // Redefinition from CudaDeviceInterface.cpp anonymous namespace | ||
| // TODO-VideoEncoder: unify device context creation, add caching to encoder | ||
|
Comment on lines
+34
to
+35
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed on the TODOs. Seeing this makes me think we will probably want to follow-up quickly and implement |
||
| UniqueAVBufferRef createHardwareDeviceContext(const torch::Device& device) { | ||
| enum AVHWDeviceType type = av_hwdevice_find_type_by_name("cuda"); | ||
| TORCH_CHECK(type != AV_HWDEVICE_TYPE_NONE, "Failed to find cuda device"); | ||
|
|
||
| int deviceIndex = getDeviceIndex(device); | ||
|
|
||
| c10::cuda::CUDAGuard deviceGuard(device); | ||
| // We set the device because we may be called from a different thread than | ||
| // the one that initialized the cuda context. | ||
| TORCH_CHECK( | ||
| cudaSetDevice(deviceIndex) == cudaSuccess, "Failed to set CUDA device"); | ||
|
|
||
| AVBufferRef* hardwareDeviceCtxRaw = nullptr; | ||
| std::string deviceOrdinal = std::to_string(deviceIndex); | ||
|
|
||
| int err = av_hwdevice_ctx_create( | ||
| &hardwareDeviceCtxRaw, | ||
| type, | ||
| deviceOrdinal.c_str(), | ||
| nullptr, | ||
| getFlagsAVHardwareDeviceContextCreate()); | ||
|
|
||
| if (err < 0) { | ||
| /* clang-format off */ | ||
| TORCH_CHECK( | ||
| false, | ||
| "Failed to create specified HW device. This typically happens when ", | ||
| "your installed FFmpeg doesn't support CUDA (see ", | ||
| "https://github.com/pytorch/torchcodec#installing-cuda-enabled-torchcodec", | ||
| "). FFmpeg error: ", getFFMPEGErrorStringFromErrorCode(err)); | ||
| /* clang-format on */ | ||
| } | ||
|
|
||
| return UniqueAVBufferRef(hardwareDeviceCtxRaw); | ||
| } | ||
|
|
||
| // RGB to NV12 color conversion matrices (inverse of YUV to RGB) | ||
| // Note: NPP's ColorTwist function apparently expects "limited range" | ||
| // coefficient format even when producing full range output. All matrices below | ||
| // use the limited range coefficient format (Y with +16 offset) for NPP | ||
| // compatibility. | ||
|
|
||
| // BT.601 limited range (matches FFmpeg default behavior) | ||
| const Npp32f defaultLimitedRangeRgbToNv12[3][4] = { | ||
| // Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B) | ||
| {0.257f, 0.504f, 0.098f, 16.0f}, | ||
| // U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients) | ||
| {-0.148f, -0.291f, 0.439f, 128.0f}, | ||
| // V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients) | ||
| {0.439f, -0.368f, -0.071f, 128.0f}}; | ||
| } // anonymous namespace | ||
|
|
||
| GpuEncoder::GpuEncoder(const torch::Device& device) : device_(device) { | ||
| TORCH_CHECK( | ||
| device_.type() == torch::kCUDA, "Unsupported device: ", device_.str()); | ||
|
|
||
| initializeCudaContextWithPytorch(device_); | ||
| initializeHardwareContext(); | ||
| } | ||
|
|
||
| GpuEncoder::~GpuEncoder() {} | ||
|
|
||
| void GpuEncoder::initializeHardwareContext() { | ||
| hardwareDeviceCtx_ = createHardwareDeviceContext(device_); | ||
| nppCtx_ = getNppStreamContext(device_); | ||
| } | ||
|
|
||
| std::optional<const AVCodec*> GpuEncoder::findEncoder( | ||
| const AVCodecID& codecId) { | ||
| void* i = nullptr; | ||
| const AVCodec* codec = nullptr; | ||
| while ((codec = av_codec_iterate(&i)) != nullptr) { | ||
| if (codec->id != codecId || !av_codec_is_encoder(codec)) { | ||
| continue; | ||
| } | ||
|
|
||
| const AVCodecHWConfig* config = nullptr; | ||
| for (int j = 0; (config = avcodec_get_hw_config(codec, j)) != nullptr; | ||
| ++j) { | ||
| if (config->device_type == AV_HWDEVICE_TYPE_CUDA) { | ||
| return codec; | ||
| } | ||
| } | ||
| } | ||
| return std::nullopt; | ||
| } | ||
|
|
||
| void GpuEncoder::registerHardwareDeviceWithCodec(AVCodecContext* codecContext) { | ||
| TORCH_CHECK( | ||
| hardwareDeviceCtx_, "Hardware device context has not been initialized"); | ||
| TORCH_CHECK(codecContext != nullptr, "codecContext is null"); | ||
| codecContext->hw_device_ctx = av_buffer_ref(hardwareDeviceCtx_.get()); | ||
| } | ||
|
|
||
| void GpuEncoder::setupEncodingContext(AVCodecContext* codecContext) { | ||
|
||
| TORCH_CHECK( | ||
| hardwareDeviceCtx_, "Hardware device context has not been initialized"); | ||
| TORCH_CHECK(codecContext != nullptr, "codecContext is null"); | ||
|
|
||
| codecContext->sw_pix_fmt = AV_PIX_FMT_NV12; | ||
| codecContext->pix_fmt = AV_PIX_FMT_CUDA; | ||
|
|
||
| AVBufferRef* hwFramesCtxRef = av_hwframe_ctx_alloc(hardwareDeviceCtx_.get()); | ||
| TORCH_CHECK( | ||
| hwFramesCtxRef != nullptr, | ||
| "Failed to allocate hardware frames context for codec"); | ||
|
|
||
| AVHWFramesContext* hwFramesCtx = | ||
| reinterpret_cast<AVHWFramesContext*>(hwFramesCtxRef->data); | ||
| hwFramesCtx->format = codecContext->pix_fmt; | ||
| hwFramesCtx->sw_format = codecContext->sw_pix_fmt; | ||
| hwFramesCtx->width = codecContext->width; | ||
| hwFramesCtx->height = codecContext->height; | ||
|
|
||
| int ret = av_hwframe_ctx_init(hwFramesCtxRef); | ||
| if (ret < 0) { | ||
| av_buffer_unref(&hwFramesCtxRef); | ||
| TORCH_CHECK( | ||
| false, | ||
| "Failed to initialize CUDA frames context for codec: ", | ||
| getFFMPEGErrorStringFromErrorCode(ret)); | ||
| } | ||
|
|
||
| codecContext->hw_frames_ctx = hwFramesCtxRef; | ||
| } | ||
|
|
||
| UniqueAVFrame GpuEncoder::convertTensorToAVFrame( | ||
| const torch::Tensor& tensor, | ||
| [[maybe_unused]] AVPixelFormat targetFormat, | ||
| int frameIndex, | ||
| AVCodecContext* codecContext) { | ||
| TORCH_CHECK(tensor.is_cuda(), "GpuEncoder requires CUDA tensors"); | ||
| TORCH_CHECK( | ||
| tensor.dim() == 3 && tensor.size(0) == 3, | ||
| "Expected 3D RGB tensor (CHW format), got shape: ", | ||
| tensor.sizes()); | ||
| UniqueAVFrame avFrame(av_frame_alloc()); | ||
| TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame"); | ||
|
|
||
| avFrame->format = AV_PIX_FMT_CUDA; | ||
| avFrame->width = static_cast<int>(tensor.size(2)); | ||
| avFrame->height = static_cast<int>(tensor.size(1)); | ||
| avFrame->pts = frameIndex; | ||
|
Comment on lines
164
to
167
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the 4 lines above, let's add a TODO to unify this with the logic that's already in the CPU version of |
||
|
|
||
| int ret = av_hwframe_get_buffer( | ||
|
||
| codecContext ? codecContext->hw_frames_ctx : nullptr, avFrame.get(), 0); | ||
| TORCH_CHECK( | ||
| ret >= 0, | ||
| "Failed to allocate hardware frame: ", | ||
| getFFMPEGErrorStringFromErrorCode(ret)); | ||
|
|
||
| // Validate that avFrame was properly allocated with CUDA memory | ||
Dan-Flores marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| TORCH_CHECK( | ||
| avFrame != nullptr && avFrame->data[0] != nullptr, | ||
| "avFrame must be pre-allocated with CUDA memory"); | ||
|
|
||
| // Convert CHW to HWC for NPP processing | ||
| int height = static_cast<int>(tensor.size(1)); | ||
| int width = static_cast<int>(tensor.size(2)); | ||
|
||
| torch::Tensor hwcFrame = tensor.permute({1, 2, 0}).contiguous(); | ||
|
|
||
| // Get current CUDA stream for NPP operations | ||
Dan-Flores marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| at::cuda::CUDAStream currentStream = | ||
| at::cuda::getCurrentCUDAStream(device_.index()); | ||
|
|
||
| // Setup NPP context with current stream | ||
Dan-Flores marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| nppCtx_->hStream = currentStream.stream(); | ||
| cudaError_t cudaErr = | ||
| cudaStreamGetFlags(nppCtx_->hStream, &nppCtx_->nStreamFlags); | ||
| TORCH_CHECK( | ||
| cudaErr == cudaSuccess, | ||
| "cudaStreamGetFlags failed: ", | ||
| cudaGetErrorString(cudaErr)); | ||
|
|
||
| // Always use FFmpeg's default behavior: BT.601 limited range | ||
Dan-Flores marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| NppiSize oSizeROI = {width, height}; | ||
|
|
||
| NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx( | ||
| static_cast<const Npp8u*>(hwcFrame.data_ptr()), | ||
| hwcFrame.stride(0) * hwcFrame.element_size(), | ||
| avFrame->data, | ||
| avFrame->linesize, | ||
| oSizeROI, | ||
| defaultLimitedRangeRgbToNv12, | ||
| *nppCtx_); | ||
|
|
||
| TORCH_CHECK( | ||
| status == NPP_SUCCESS, | ||
| "Failed to convert RGB to NV12: NPP error code ", | ||
| status); | ||
|
|
||
| // Validate CUDA operations completed successfully | ||
| cudaError_t memCheck = cudaGetLastError(); | ||
Dan-Flores marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| TORCH_CHECK( | ||
| memCheck == cudaSuccess, | ||
| "CUDA error detected: ", | ||
| cudaGetErrorString(memCheck)); | ||
|
|
||
| // TODO-VideoEncoder: Enable configuration of color properties, similar to | ||
| // FFmpeg Set color properties to FFmpeg defaults | ||
Dan-Flores marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601 | ||
| avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range | ||
|
|
||
| return avFrame; | ||
| } | ||
|
|
||
| } // namespace facebook::torchcodec | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The error above should be an internal bug: if the frames are on CUDA while the device parameter is not, it means it wasn't set properly in the custom ops. So this should be an "internal bug" error message rather than a user-facing thing, since we assume users aren't using the C++ APIs.
But, I think we should push the logic from https://github.com/meta-pytorch/torchcodec/pull/1008/files#r2565059217 further: we don't need a
deviceparameter at all, anywhere. Having a device parameter duplicates the source of truth of the device and leads to potential bugs (like the one above that theTORCH_CHECKis preventing). So I think we shouldn't setoptions.deviceat all for encoding, and always rely on the frames. We can add a comment insrc/torchcodec/_core/StreamOptions.hto indicate that.