@@ -53,74 +53,6 @@ pfnDisplayPictureCallback(void* pUserData, CUVIDPARSERDISPINFO* dispInfo) {
5353}
5454
5555static UniqueCUvideodecoder createDecoder (CUVIDEOFORMAT* videoFormat) {
56- // Check decoder capabilities - same checks as DALI
57- auto caps = CUVIDDECODECAPS{};
58- caps.eCodecType = videoFormat->codec ;
59- caps.eChromaFormat = videoFormat->chroma_format ;
60- caps.nBitDepthMinus8 = videoFormat->bit_depth_luma_minus8 ;
61- CUresult result = cuvidGetDecoderCaps (&caps);
62- TORCH_CHECK (result == CUDA_SUCCESS, " Failed to get decoder caps: " , result);
63-
64- TORCH_CHECK (
65- caps.bIsSupported ,
66- " Codec configuration not supported on this GPU. "
67- " Codec: " ,
68- static_cast <int >(videoFormat->codec ),
69- " , chroma format: " ,
70- static_cast <int >(videoFormat->chroma_format ),
71- " , bit depth: " ,
72- videoFormat->bit_depth_luma_minus8 + 8 );
73-
74- TORCH_CHECK (
75- videoFormat->coded_width >= caps.nMinWidth &&
76- videoFormat->coded_height >= caps.nMinHeight ,
77- " Video is too small in at least one dimension. Provided: " ,
78- videoFormat->coded_width ,
79- " x" ,
80- videoFormat->coded_height ,
81- " vs supported:" ,
82- caps.nMinWidth ,
83- " x" ,
84- caps.nMinHeight );
85-
86- TORCH_CHECK (
87- videoFormat->coded_width <= caps.nMaxWidth &&
88- videoFormat->coded_height <= caps.nMaxHeight ,
89- " Video is too large in at least one dimension. Provided: " ,
90- videoFormat->coded_width ,
91- " x" ,
92- videoFormat->coded_height ,
93- " vs supported:" ,
94- caps.nMaxWidth ,
95- " x" ,
96- caps.nMaxHeight );
97-
98- // See nMaxMBCount in cuviddec.h
99- constexpr unsigned int macroblockConstant = 256 ;
100- TORCH_CHECK (
101- videoFormat->coded_width * videoFormat->coded_height /
102- macroblockConstant <=
103- caps.nMaxMBCount ,
104- " Video is too large (too many macroblocks). "
105- " Provided (width * height / " ,
106- macroblockConstant,
107- " ): " ,
108- videoFormat->coded_width * videoFormat->coded_height / macroblockConstant,
109- " vs supported:" ,
110- caps.nMaxMBCount );
111-
112- // Below we'll set the decoderParams.OutputFormat to NV12, so we need to make
113- // sure it's actually supported.
114- TORCH_CHECK (
115- (caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1 ,
116- " NV12 output format is not supported for this configuration. " ,
117- " Codec: " ,
118- static_cast <int >(videoFormat->codec ),
119- " , chroma format: " ,
120- static_cast <int >(videoFormat->chroma_format ),
121- " , bit depth: " ,
122- videoFormat->bit_depth_luma_minus8 + 8 );
123-
12456 // Decoder creation parameters, most are taken from DALI
12557 CUVIDDECODECREATEINFO decoderParams = {};
12658 decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8 ;
@@ -157,13 +89,39 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
15789 decoderParams.display_area .bottom = videoFormat->display_area .bottom ;
15890
15991 CUvideodecoder* decoder = new CUvideodecoder ();
160- result = cuvidCreateDecoder (decoder, &decoderParams);
92+ CUresult result = cuvidCreateDecoder (decoder, &decoderParams);
16193 TORCH_CHECK (
16294 result == CUDA_SUCCESS, " Failed to create NVDEC decoder: " , result);
16395 return UniqueCUvideodecoder (decoder, CUvideoDecoderDeleter{});
16496}
16597
166- cudaVideoCodec validateCodecSupport (AVCodecID codecId) {
98+ std::optional<cudaVideoChromaFormat> validateChromaSupport (
99+ const AVPixFmtDescriptor* desc) {
100+ // Return the corresponding cudaVideoChromaFormat if supported, std::nullopt
101+ // otherwise.
102+ TORCH_CHECK (desc != nullptr , " desc can't be null" );
103+
104+ if (desc->nb_components == 1 ) {
105+ return cudaVideoChromaFormat_Monochrome;
106+ } else if (desc->nb_components >= 3 && !(desc->flags & AV_PIX_FMT_FLAG_RGB)) {
107+ // Make sure it's YUV: has chroma planes and isn't RGB
108+ if (desc->log2_chroma_w == 0 && desc->log2_chroma_h == 0 ) {
109+ return cudaVideoChromaFormat_444; // 1x1 subsampling = 4:4:4
110+ } else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 1 ) {
111+ return cudaVideoChromaFormat_420; // 2x2 subsampling = 4:2:0
112+ } else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 0 ) {
113+ return cudaVideoChromaFormat_422; // 2x1 subsampling = 4:2:2
114+ }
115+ }
116+
117+ return std::nullopt ;
118+ }
119+
120+ std::optional<cudaVideoCodec> validateCodecSupport (AVCodecID codecId) {
121+ // Return the corresponding cudaVideoCodec if supported, std::nullopt
122+ // otherwise
123+ // Note that we currently return nullopt (and thus fallback to CPU) for some
124+ // codecs that are technically supported by NVDEC, see comment below.
167125 switch (codecId) {
168126 case AV_CODEC_ID_H264:
169127 return cudaVideoCodec_H264;
@@ -189,10 +147,69 @@ cudaVideoCodec validateCodecSupport(AVCodecID codecId) {
189147 // return cudaVideoCodec_JPEG;
190148 // case AV_CODEC_ID_VC1:
191149 // return cudaVideoCodec_VC1;
192- default : {
193- TORCH_CHECK (false , " Unsupported codec type: " , avcodec_get_name (codecId));
194- }
150+ default :
151+ return std::nullopt ;
152+ }
153+ }
154+
155+ bool nativeNVDECSupport (const SharedAVCodecContext& codecContext) {
156+ // Return true iff the input video stream is supported by our NVDEC
157+ // implementation.
158+ auto codecType = validateCodecSupport (codecContext->codec_id );
159+ if (!codecType.has_value ()) {
160+ return false ;
161+ }
162+
163+ const AVPixFmtDescriptor* desc = av_pix_fmt_desc_get (codecContext->pix_fmt );
164+ if (!desc) {
165+ return false ;
166+ }
167+
168+ auto chromaFormat = validateChromaSupport (desc);
169+ if (!chromaFormat.has_value ()) {
170+ return false ;
171+ }
172+
173+ auto caps = CUVIDDECODECAPS{};
174+ caps.eCodecType = codecType.value ();
175+ caps.eChromaFormat = chromaFormat.value ();
176+ caps.nBitDepthMinus8 = desc->comp [0 ].depth - 8 ;
177+
178+ CUresult result = cuvidGetDecoderCaps (&caps);
179+ if (result != CUDA_SUCCESS) {
180+ return false ;
181+ }
182+
183+ if (!caps.bIsSupported ) {
184+ return false ;
185+ }
186+
187+ auto coded_width = static_cast <unsigned int >(codecContext->coded_width );
188+ auto coded_height = static_cast <unsigned int >(codecContext->coded_height );
189+ if (coded_width < static_cast <unsigned int >(caps.nMinWidth ) ||
190+ coded_height < static_cast <unsigned int >(caps.nMinHeight ) ||
191+ coded_width > caps.nMaxWidth || coded_height > caps.nMaxHeight ) {
192+ return false ;
193+ }
194+
195+ // See nMaxMBCount in cuviddec.h
196+ constexpr unsigned int macroblockConstant = 256 ;
197+ if (coded_width * coded_height / macroblockConstant > caps.nMaxMBCount ) {
198+ return false ;
199+ }
200+
201+ // We'll set the decoderParams.OutputFormat to NV12, so we need to make
202+ // sure it's actually supported.
203+ // TODO: If this fail, we could consider decoding to something else than NV12
204+ // (like cudaVideoSurfaceFormat_P016) instead of falling back to CPU. This is
205+ // what FFmpeg does.
206+ bool supportsNV12Output =
207+ (caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1 ;
208+ if (!supportsNV12Output) {
209+ return false ;
195210 }
211+
212+ return true ;
196213}
197214
198215} // namespace
@@ -232,6 +249,19 @@ void BetaCudaDeviceInterface::initialize(
232249 const AVStream* avStream,
233250 const UniqueDecodingAVFormatContext& avFormatCtx,
234251 [[maybe_unused]] const SharedAVCodecContext& codecContext) {
252+ if (!nativeNVDECSupport (codecContext)) {
253+ cpuFallback_ = createDeviceInterface (torch::kCPU );
254+ TORCH_CHECK (
255+ cpuFallback_ != nullptr , " Failed to create CPU device interface" );
256+ cpuFallback_->initialize (avStream, avFormatCtx, codecContext);
257+ cpuFallback_->initializeVideo (
258+ VideoStreamOptions (),
259+ {},
260+ /* resizedOutputDims=*/ std::nullopt );
261+ // We'll always use the CPU fallback from now on, so we can return early.
262+ return ;
263+ }
264+
235265 TORCH_CHECK (avStream != nullptr , " AVStream cannot be null" );
236266 timeBase_ = avStream->time_base ;
237267 frameRateAvgFromFFmpeg_ = avStream->r_frame_rate ;
@@ -243,7 +273,11 @@ void BetaCudaDeviceInterface::initialize(
243273
244274 // Create parser. Default values that aren't obvious are taken from DALI.
245275 CUVIDPARSERPARAMS parserParams = {};
246- parserParams.CodecType = validateCodecSupport (codecPar->codec_id );
276+ auto codecType = validateCodecSupport (codecPar->codec_id );
277+ TORCH_CHECK (
278+ codecType.has_value (),
279+ " This should never happen, we should be using the CPU fallback by now. Please report a bug." );
280+ parserParams.CodecType = codecType.value ();
247281 parserParams.ulMaxNumDecodeSurfaces = 8 ;
248282 parserParams.ulMaxDisplayDelay = 0 ;
249283 // Callback setup, all are triggered by the parser within a call
@@ -383,6 +417,10 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) {
383417// Moral equivalent of avcodec_send_packet(). Here, we pass the AVPacket down to
384418// the NVCUVID parser.
385419int BetaCudaDeviceInterface::sendPacket (ReferenceAVPacket& packet) {
420+ if (cpuFallback_) {
421+ return cpuFallback_->sendPacket (packet);
422+ }
423+
386424 TORCH_CHECK (
387425 packet.get () && packet->data && packet->size > 0 ,
388426 " sendPacket received an empty packet, this is unexpected, please report." );
@@ -406,6 +444,10 @@ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) {
406444}
407445
408446int BetaCudaDeviceInterface::sendEOFPacket () {
447+ if (cpuFallback_) {
448+ return cpuFallback_->sendEOFPacket ();
449+ }
450+
409451 CUVIDSOURCEDATAPACKET cuvidPacket = {};
410452 cuvidPacket.flags = CUVID_PKT_ENDOFSTREAM;
411453 eofSent_ = true ;
@@ -467,6 +509,10 @@ int BetaCudaDeviceInterface::frameReadyInDisplayOrder(
467509
468510// Moral equivalent of avcodec_receive_frame().
469511int BetaCudaDeviceInterface::receiveFrame (UniqueAVFrame& avFrame) {
512+ if (cpuFallback_) {
513+ return cpuFallback_->receiveFrame (avFrame);
514+ }
515+
470516 if (readyFrames_.empty ()) {
471517 // No frame found, instruct caller to try again later after sending more
472518 // packets, or to stop if EOF was already sent.
@@ -601,6 +647,11 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
601647}
602648
603649void BetaCudaDeviceInterface::flush () {
650+ if (cpuFallback_) {
651+ cpuFallback_->flush ();
652+ return ;
653+ }
654+
604655 // The NVCUVID docs mention that after seeking, i.e. when flush() is called,
605656 // we should send a packet with the CUVID_PKT_DISCONTINUITY flag. The docs
606657 // don't say whether this should be an empty packet, or whether it should be a
@@ -618,6 +669,21 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
618669 UniqueAVFrame& avFrame,
619670 FrameOutput& frameOutput,
620671 std::optional<torch::Tensor> preAllocatedOutputTensor) {
672+ if (cpuFallback_) {
673+ // CPU decoded frame - need to do CPU color conversion then transfer to GPU
674+ FrameOutput cpuFrameOutput;
675+ cpuFallback_->convertAVFrameToFrameOutput (avFrame, cpuFrameOutput);
676+
677+ // Transfer CPU frame to GPU
678+ if (preAllocatedOutputTensor.has_value ()) {
679+ preAllocatedOutputTensor.value ().copy_ (cpuFrameOutput.data );
680+ frameOutput.data = preAllocatedOutputTensor.value ();
681+ } else {
682+ frameOutput.data = cpuFrameOutput.data .to (device_);
683+ }
684+ return ;
685+ }
686+
621687 // TODONVDEC P2: we may need to handle 10bit videos the same way the CUDA
622688 // ffmpeg interface does it with maybeConvertAVFrameToNV12OrRGB24().
623689 TORCH_CHECK (
0 commit comments