From 2828dc5d29a1ff3852caab618597540e8803f199 Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Wed, 10 Sep 2025 08:45:13 -0700 Subject: [PATCH 1/7] Refactor pybind_ops to only deal with file like context holders --- src/torchcodec/_core/AVIOFileLikeContext.h | 3 +- src/torchcodec/_core/custom_ops.cpp | 55 ++++++++++++++++++++ src/torchcodec/_core/ops.py | 59 +++++++++++++--------- src/torchcodec/_core/pybind_ops.cpp | 59 ++-------------------- 4 files changed, 97 insertions(+), 79 deletions(-) diff --git a/src/torchcodec/_core/AVIOFileLikeContext.h b/src/torchcodec/_core/AVIOFileLikeContext.h index fd7f534f3..74dad3d76 100644 --- a/src/torchcodec/_core/AVIOFileLikeContext.h +++ b/src/torchcodec/_core/AVIOFileLikeContext.h @@ -17,7 +17,8 @@ namespace facebook::torchcodec { // Enables uers to pass in a Python file-like object. We then forward all read // and seek calls back up to the methods on the Python object. -class AVIOFileLikeContext : public AVIOContextHolder { +class __attribute__((visibility("hidden"))) AVIOFileLikeContext + : public AVIOContextHolder { public: explicit AVIOFileLikeContext(const py::object& fileLike, bool isForWriting); diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp index c646ed54a..1bf47edec 100644 --- a/src/torchcodec/_core/custom_ops.cpp +++ b/src/torchcodec/_core/custom_ops.cpp @@ -10,6 +10,7 @@ #include #include "c10/core/SymIntArrayRef.h" #include "c10/util/Exception.h" +#include "src/torchcodec/_core/AVIOFileLikeContext.h" #include "src/torchcodec/_core/AVIOTensorContext.h" #include "src/torchcodec/_core/Encoder.h" #include "src/torchcodec/_core/SingleStreamDecoder.h" @@ -33,8 +34,12 @@ TORCH_LIBRARY(torchcodec_ns, m) { "encode_audio_to_file(Tensor samples, int sample_rate, str filename, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()"); m.def( "encode_audio_to_tensor(Tensor samples, int sample_rate, str format, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> Tensor"); + m.def( + "_encode_audio_to_file_like(Tensor samples, int sample_rate, str format, int file_like_context, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()"); m.def( "create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor"); + m.def( + "_create_from_file_like(int file_like_context, str? seek_mode=None) -> Tensor"); m.def("_convert_to_tensor(int decoder_ptr) -> Tensor"); m.def( "_add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str? device=None, (Tensor, Tensor, Tensor)? custom_frame_mappings=None, str? color_conversion_library=None) -> ()"); @@ -210,6 +215,24 @@ at::Tensor create_from_tensor( return wrapDecoderPointerToTensor(std::move(uniqueDecoder)); } +at::Tensor _create_from_file_like( + int64_t file_like_context, + std::optional seek_mode) { + auto fileLikeContext = + reinterpret_cast(file_like_context); + TORCH_CHECK(fileLikeContext != nullptr, "file_like must be a valid pointer"); + std::unique_ptr contextHolder(fileLikeContext); + + SingleStreamDecoder::SeekMode realSeek = SingleStreamDecoder::SeekMode::exact; + if (seek_mode.has_value()) { + realSeek = seekModeFromString(seek_mode.value()); + } + + std::unique_ptr uniqueDecoder = + std::make_unique(std::move(contextHolder), realSeek); + return wrapDecoderPointerToTensor(std::move(uniqueDecoder)); +} + at::Tensor _convert_to_tensor(int64_t decoder_ptr) { auto decoder = reinterpret_cast(decoder_ptr); std::unique_ptr uniqueDecoder(decoder); @@ -441,6 +464,36 @@ at::Tensor encode_audio_to_tensor( .encodeToTensor(); } +void _encode_audio_to_file_like( + const at::Tensor& samples, + int64_t sample_rate, + std::string_view format, + int64_t file_like_context, + std::optional bit_rate = std::nullopt, + std::optional num_channels = std::nullopt, + std::optional desired_sample_rate = std::nullopt) { + auto fileLikeContext = + reinterpret_cast(file_like_context); + TORCH_CHECK( + fileLikeContext != nullptr, "file_like_context must be a valid pointer"); + std::unique_ptr avioContextHolder(fileLikeContext); + + AudioStreamOptions audioStreamOptions; + audioStreamOptions.bitRate = validateOptionalInt64ToInt(bit_rate, "bit_rate"); + audioStreamOptions.numChannels = + validateOptionalInt64ToInt(num_channels, "num_channels"); + audioStreamOptions.sampleRate = + validateOptionalInt64ToInt(desired_sample_rate, "desired_sample_rate"); + + AudioEncoder encoder( + samples, + validateInt64ToInt(sample_rate, "sample_rate"), + format, + std::move(avioContextHolder), + audioStreamOptions); + encoder.encode(); +} + // For testing only. We need to implement this operation as a core library // function because what we're testing is round-tripping pts values as // double-precision floating point numbers from C++ to Python and back to C++. @@ -694,6 +747,7 @@ void scan_all_streams_to_update_metadata(at::Tensor& decoder) { TORCH_LIBRARY_IMPL(torchcodec_ns, BackendSelect, m) { m.impl("create_from_file", &create_from_file); m.impl("create_from_tensor", &create_from_tensor); + m.impl("_create_from_file_like", &_create_from_file_like); m.impl("_convert_to_tensor", &_convert_to_tensor); m.impl( "_get_json_ffmpeg_library_versions", &_get_json_ffmpeg_library_versions); @@ -702,6 +756,7 @@ TORCH_LIBRARY_IMPL(torchcodec_ns, BackendSelect, m) { TORCH_LIBRARY_IMPL(torchcodec_ns, CPU, m) { m.impl("encode_audio_to_file", &encode_audio_to_file); m.impl("encode_audio_to_tensor", &encode_audio_to_tensor); + m.impl("_encode_audio_to_file_like", &_encode_audio_to_file_like); m.impl("seek_to_pts", &seek_to_pts); m.impl("add_video_stream", &add_video_stream); m.impl("_add_video_stream", &_add_video_stream); diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py index 807ed265d..43983539b 100644 --- a/src/torchcodec/_core/ops.py +++ b/src/torchcodec/_core/ops.py @@ -95,9 +95,15 @@ def load_torchcodec_shared_libraries(): encode_audio_to_tensor = torch._dynamo.disallow_in_graph( torch.ops.torchcodec_ns.encode_audio_to_tensor.default ) +_encode_audio_to_file_like = torch._dynamo.disallow_in_graph( + torch.ops.torchcodec_ns._encode_audio_to_file_like.default +) create_from_tensor = torch._dynamo.disallow_in_graph( torch.ops.torchcodec_ns.create_from_tensor.default ) +_create_from_file_like = torch._dynamo.disallow_in_graph( + torch.ops.torchcodec_ns._create_from_file_like.default +) _convert_to_tensor = torch._dynamo.disallow_in_graph( torch.ops.torchcodec_ns._convert_to_tensor.default ) @@ -148,7 +154,12 @@ def create_from_file_like( file_like: Union[io.RawIOBase, io.BufferedReader], seek_mode: Optional[str] = None ) -> torch.Tensor: assert _pybind_ops is not None - return _convert_to_tensor(_pybind_ops.create_from_file_like(file_like, seek_mode)) + return _create_from_file_like( + _pybind_ops.create_file_like_context( + file_like, False # False means not for writing + ), + seek_mode, + ) def encode_audio_to_file_like( @@ -176,36 +187,16 @@ def encode_audio_to_file_like( if samples.dtype != torch.float32: raise ValueError(f"samples must have dtype torch.float32, got {samples.dtype}") - # We're having the same problem as with the decoder's create_from_file_like: - # We should be able to pass a tensor directly, but this leads to a pybind - # error. In order to work around this, we pass the pointer to the tensor's - # data, and its shape, in order to re-construct it in C++. For this to work: - # - the tensor must be float32 - # - the tensor must be contiguous, which is why we call contiguous(). - # In theory we could avoid this restriction by also passing the strides? - # - IMPORTANT: the input samples tensor and its underlying data must be - # alive during the call. - # - # A more elegant solution would be to cast the tensor into a py::object, but - # casting the py::object backk to a tensor in C++ seems to lead to the same - # pybing error. - - samples = samples.contiguous() - _pybind_ops.encode_audio_to_file_like( - samples.data_ptr(), - list(samples.shape), + _encode_audio_to_file_like( + samples, sample_rate, format, - file_like, + _pybind_ops.create_file_like_context(file_like, True), # True means for writing bit_rate, num_channels, desired_sample_rate, ) - # This check is useless but it's critical to keep it to ensures that samples - # is still alive during the call to encode_audio_to_file_like. - assert samples.is_contiguous() - # ============================== # Abstract impl for the operators. Needed by torch.compile. @@ -215,6 +206,13 @@ def create_from_file_abstract(filename: str, seek_mode: Optional[str]) -> torch. return torch.empty([], dtype=torch.long) +@register_fake("torchcodec_ns::_create_from_file_like") +def _create_from_file_like_abstract( + file_like: int, seek_mode: Optional[str] +) -> torch.Tensor: + return torch.empty([], dtype=torch.long) + + @register_fake("torchcodec_ns::encode_audio_to_file") def encode_audio_to_file_abstract( samples: torch.Tensor, @@ -239,6 +237,19 @@ def encode_audio_to_tensor_abstract( return torch.empty([], dtype=torch.long) +@register_fake("torchcodec_ns::_encode_audio_to_file_like") +def _encode_audio_to_file_like_abstract( + samples: torch.Tensor, + sample_rate: int, + format: str, + file_like_context: int, + bit_rate: Optional[int] = None, + num_channels: Optional[int] = None, + desired_sample_rate: Optional[int] = None, +) -> None: + return torch.empty([], dtype=torch.long) + + @register_fake("torchcodec_ns::create_from_tensor") def create_from_tensor_abstract( video_tensor: torch.Tensor, seek_mode: Optional[str] diff --git a/src/torchcodec/_core/pybind_ops.cpp b/src/torchcodec/_core/pybind_ops.cpp index 72969f7a9..04c87de32 100644 --- a/src/torchcodec/_core/pybind_ops.cpp +++ b/src/torchcodec/_core/pybind_ops.cpp @@ -7,13 +7,8 @@ #include #include #include -#include #include "src/torchcodec/_core/AVIOFileLikeContext.h" -#include "src/torchcodec/_core/Encoder.h" -#include "src/torchcodec/_core/SingleStreamDecoder.h" -#include "src/torchcodec/_core/StreamOptions.h" -#include "src/torchcodec/_core/ValidationUtils.h" namespace py = pybind11; @@ -26,53 +21,10 @@ namespace facebook::torchcodec { // // So we instead launder the pointer through an int, and then use a conversion // function on the custom ops side to launder that int into a tensor. -int64_t create_from_file_like( - py::object file_like, - std::optional seek_mode) { - SingleStreamDecoder::SeekMode realSeek = SingleStreamDecoder::SeekMode::exact; - if (seek_mode.has_value()) { - realSeek = seekModeFromString(seek_mode.value()); - } - - auto avioContextHolder = - std::make_unique(file_like, /*isForWriting=*/false); - - SingleStreamDecoder* decoder = - new SingleStreamDecoder(std::move(avioContextHolder), realSeek); - return reinterpret_cast(decoder); -} - -void encode_audio_to_file_like( - int64_t data_ptr, - const std::vector& shape, - int64_t sample_rate, - std::string_view format, - py::object file_like, - std::optional bit_rate = std::nullopt, - std::optional num_channels = std::nullopt, - std::optional desired_sample_rate = std::nullopt) { - // We assume float32 *and* contiguity, this must be enforced by the caller. - auto tensor_options = torch::TensorOptions().dtype(torch::kFloat32); - auto samples = torch::from_blob( - reinterpret_cast(data_ptr), shape, tensor_options); - - AudioStreamOptions audioStreamOptions; - audioStreamOptions.bitRate = validateOptionalInt64ToInt(bit_rate, "bit_rate"); - audioStreamOptions.numChannels = - validateOptionalInt64ToInt(num_channels, "num_channels"); - audioStreamOptions.sampleRate = - validateOptionalInt64ToInt(desired_sample_rate, "desired_sample_rate"); - - auto avioContextHolder = - std::make_unique(file_like, /*isForWriting=*/true); - - AudioEncoder encoder( - samples, - validateInt64ToInt(sample_rate, "sample_rate"), - format, - std::move(avioContextHolder), - audioStreamOptions); - encoder.encode(); +int64_t create_file_like_context(py::object file_like, bool is_for_writing) { + AVIOFileLikeContext* context = + new AVIOFileLikeContext(file_like, is_for_writing); + return reinterpret_cast(context); } #ifndef PYBIND_OPS_MODULE_NAME @@ -80,8 +32,7 @@ void encode_audio_to_file_like( #endif PYBIND11_MODULE(PYBIND_OPS_MODULE_NAME, m) { - m.def("create_from_file_like", &create_from_file_like); - m.def("encode_audio_to_file_like", &encode_audio_to_file_like); + m.def("create_file_like_context", &create_file_like_context); } } // namespace facebook::torchcodec From f57fc00d992f532e32cef86257b666713fd51983 Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Wed, 10 Sep 2025 09:00:35 -0700 Subject: [PATCH 2/7] Remove converting int to decoder --- src/torchcodec/_core/custom_ops.cpp | 8 -------- src/torchcodec/_core/ops.py | 8 -------- 2 files changed, 16 deletions(-) diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp index 1bf47edec..0265a3e3f 100644 --- a/src/torchcodec/_core/custom_ops.cpp +++ b/src/torchcodec/_core/custom_ops.cpp @@ -40,7 +40,6 @@ TORCH_LIBRARY(torchcodec_ns, m) { "create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor"); m.def( "_create_from_file_like(int file_like_context, str? seek_mode=None) -> Tensor"); - m.def("_convert_to_tensor(int decoder_ptr) -> Tensor"); m.def( "_add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str? device=None, (Tensor, Tensor, Tensor)? custom_frame_mappings=None, str? color_conversion_library=None) -> ()"); m.def( @@ -233,12 +232,6 @@ at::Tensor _create_from_file_like( return wrapDecoderPointerToTensor(std::move(uniqueDecoder)); } -at::Tensor _convert_to_tensor(int64_t decoder_ptr) { - auto decoder = reinterpret_cast(decoder_ptr); - std::unique_ptr uniqueDecoder(decoder); - return wrapDecoderPointerToTensor(std::move(uniqueDecoder)); -} - void _add_video_stream( at::Tensor& decoder, std::optional width = std::nullopt, @@ -748,7 +741,6 @@ TORCH_LIBRARY_IMPL(torchcodec_ns, BackendSelect, m) { m.impl("create_from_file", &create_from_file); m.impl("create_from_tensor", &create_from_tensor); m.impl("_create_from_file_like", &_create_from_file_like); - m.impl("_convert_to_tensor", &_convert_to_tensor); m.impl( "_get_json_ffmpeg_library_versions", &_get_json_ffmpeg_library_versions); } diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py index 43983539b..e18527f9a 100644 --- a/src/torchcodec/_core/ops.py +++ b/src/torchcodec/_core/ops.py @@ -104,9 +104,6 @@ def load_torchcodec_shared_libraries(): _create_from_file_like = torch._dynamo.disallow_in_graph( torch.ops.torchcodec_ns._create_from_file_like.default ) -_convert_to_tensor = torch._dynamo.disallow_in_graph( - torch.ops.torchcodec_ns._convert_to_tensor.default -) add_video_stream = torch.ops.torchcodec_ns.add_video_stream.default _add_video_stream = torch.ops.torchcodec_ns._add_video_stream.default add_audio_stream = torch.ops.torchcodec_ns.add_audio_stream.default @@ -257,11 +254,6 @@ def create_from_tensor_abstract( return torch.empty([], dtype=torch.long) -@register_fake("torchcodec_ns::_convert_to_tensor") -def _convert_to_tensor_abstract(decoder_ptr: int) -> torch.Tensor: - return torch.empty([], dtype=torch.long) - - @register_fake("torchcodec_ns::_add_video_stream") def _add_video_stream_abstract( decoder: torch.Tensor, From fa32ece137df9ac2c85e878dcc1e6df87a6a7afd Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Wed, 10 Sep 2025 09:06:12 -0700 Subject: [PATCH 3/7] Lint --- src/torchcodec/_core/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py index e18527f9a..12f70b89f 100644 --- a/src/torchcodec/_core/ops.py +++ b/src/torchcodec/_core/ops.py @@ -244,7 +244,7 @@ def _encode_audio_to_file_like_abstract( num_channels: Optional[int] = None, desired_sample_rate: Optional[int] = None, ) -> None: - return torch.empty([], dtype=torch.long) + return @register_fake("torchcodec_ns::create_from_tensor") From c6e07aefe3d86afd99cb1cafc0f265aa406224ef Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Wed, 10 Sep 2025 12:12:04 -0700 Subject: [PATCH 4/7] Fix for Windows --- src/torchcodec/_core/AVIOFileLikeContext.h | 17 +++++++++++++++-- src/torchcodec/_core/custom_ops.cpp | 14 +++++++++----- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/src/torchcodec/_core/AVIOFileLikeContext.h b/src/torchcodec/_core/AVIOFileLikeContext.h index 74dad3d76..3146c00de 100644 --- a/src/torchcodec/_core/AVIOFileLikeContext.h +++ b/src/torchcodec/_core/AVIOFileLikeContext.h @@ -11,14 +11,27 @@ #include "src/torchcodec/_core/AVIOContextHolder.h" +// pybind11 has specific visibility rules; see: +// https://pybind11.readthedocs.io/en/stable/faq.html#someclass-declared-with-greater-visibility-than-the-type-of-its-field-someclass-member-wattributes +// This file is included in both pybind_ops.cpp and custom_ops.cpp. We compile +// pybind_ops.cpp with the global visibility flag as required. On the +// custom_ops.cpp side we don't need to do that. However, we do need to ensure +// that this class is also seen as the same visibility. +// +// This only matters on Linux and Mac; on Windows we don't need to do anything. +#ifdef _WIN32 +#define VISIBILITY_HIDDEN +#else +#define VISIBILITY_HIDDEN __attribute__((visibility("hidden"))) +#endif + namespace py = pybind11; namespace facebook::torchcodec { // Enables uers to pass in a Python file-like object. We then forward all read // and seek calls back up to the methods on the Python object. -class __attribute__((visibility("hidden"))) AVIOFileLikeContext - : public AVIOContextHolder { +class VISIBILITY_HIDDEN AVIOFileLikeContext : public AVIOContextHolder { public: explicit AVIOFileLikeContext(const py::object& fileLike, bool isForWriting); diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp index 0265a3e3f..d24c4dfa3 100644 --- a/src/torchcodec/_core/custom_ops.cpp +++ b/src/torchcodec/_core/custom_ops.cpp @@ -207,10 +207,12 @@ at::Tensor create_from_tensor( realSeek = seekModeFromString(seek_mode.value()); } - auto contextHolder = std::make_unique(video_tensor); + auto avioContextHolder = + std::make_unique(video_tensor); std::unique_ptr uniqueDecoder = - std::make_unique(std::move(contextHolder), realSeek); + std::make_unique( + std::move(avioContextHolder), realSeek); return wrapDecoderPointerToTensor(std::move(uniqueDecoder)); } @@ -219,8 +221,9 @@ at::Tensor _create_from_file_like( std::optional seek_mode) { auto fileLikeContext = reinterpret_cast(file_like_context); - TORCH_CHECK(fileLikeContext != nullptr, "file_like must be a valid pointer"); - std::unique_ptr contextHolder(fileLikeContext); + TORCH_CHECK( + fileLikeContext != nullptr, "file_like_context must be a valid pointer"); + std::unique_ptr avioContextHolder(fileLikeContext); SingleStreamDecoder::SeekMode realSeek = SingleStreamDecoder::SeekMode::exact; if (seek_mode.has_value()) { @@ -228,7 +231,8 @@ at::Tensor _create_from_file_like( } std::unique_ptr uniqueDecoder = - std::make_unique(std::move(contextHolder), realSeek); + std::make_unique( + std::move(avioContextHolder), realSeek); return wrapDecoderPointerToTensor(std::move(uniqueDecoder)); } From 497f68b4cf4ebca326677696af9618febebfd622 Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Thu, 11 Sep 2025 07:00:36 -0700 Subject: [PATCH 5/7] Better comment; move around seek to string --- src/torchcodec/_core/SingleStreamDecoder.cpp | 12 ----------- src/torchcodec/_core/SingleStreamDecoder.h | 2 -- src/torchcodec/_core/custom_ops.cpp | 12 +++++++++++ src/torchcodec/_core/pybind_ops.cpp | 22 +++++++++++++++----- 4 files changed, 29 insertions(+), 19 deletions(-) diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index 864e82d0a..b52556e93 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -1704,16 +1704,4 @@ FrameDims getHeightAndWidthFromOptionsOrAVFrame( videoStreamOptions.width.value_or(avFrame->width)); } -SingleStreamDecoder::SeekMode seekModeFromString(std::string_view seekMode) { - if (seekMode == "exact") { - return SingleStreamDecoder::SeekMode::exact; - } else if (seekMode == "approximate") { - return SingleStreamDecoder::SeekMode::approximate; - } else if (seekMode == "custom_frame_mappings") { - return SingleStreamDecoder::SeekMode::custom_frame_mappings; - } else { - TORCH_CHECK(false, "Invalid seek mode: " + std::string(seekMode)); - } -} - } // namespace facebook::torchcodec diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h index 027f52fc4..56bb8bb58 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.h +++ b/src/torchcodec/_core/SingleStreamDecoder.h @@ -375,6 +375,4 @@ std::ostream& operator<<( std::ostream& os, const SingleStreamDecoder::DecodeStats& stats); -SingleStreamDecoder::SeekMode seekModeFromString(std::string_view seekMode); - } // namespace facebook::torchcodec diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp index d24c4dfa3..76e89e692 100644 --- a/src/torchcodec/_core/custom_ops.cpp +++ b/src/torchcodec/_core/custom_ops.cpp @@ -169,6 +169,18 @@ std::string mapToJson(const std::map& metadataMap) { return ss.str(); } +SingleStreamDecoder::SeekMode seekModeFromString(std::string_view seekMode) { + if (seekMode == "exact") { + return SingleStreamDecoder::SeekMode::exact; + } else if (seekMode == "approximate") { + return SingleStreamDecoder::SeekMode::approximate; + } else if (seekMode == "custom_frame_mappings") { + return SingleStreamDecoder::SeekMode::custom_frame_mappings; + } else { + TORCH_CHECK(false, "Invalid seek mode: " + std::string(seekMode)); + } +} + } // namespace // ============================== diff --git a/src/torchcodec/_core/pybind_ops.cpp b/src/torchcodec/_core/pybind_ops.cpp index 04c87de32..f9cfd7eb6 100644 --- a/src/torchcodec/_core/pybind_ops.cpp +++ b/src/torchcodec/_core/pybind_ops.cpp @@ -14,13 +14,25 @@ namespace py = pybind11; namespace facebook::torchcodec { -// In principle, this should be able to return a tensor. But when we try that, -// we run into the bug reported here: +// Note: It's not immediately obvous why we need both custom_ops.cpp and +// pybind_ops.cpp. We do all other Python to C++ bridging in +// custom_ops.cpp, and that even depends on pybind11, so why have an +// explicit pybind-only file? // -// https://github.com/pytorch/pytorch/issues/136664 +// The reason is that we want to accept OWNERSHIP of a file-like object +// from the Python side. In order to do that, we need a proper +// py::object. For raw bytes, we can launder that through a tensor on the +// custom_ops.cpp side, but we can't launder a proper Python object +// through a tensor. Custom ops can't accept a proper Python object +// through py::object, so we have to do direct pybind11 here. // -// So we instead launder the pointer through an int, and then use a conversion -// function on the custom ops side to launder that int into a tensor. +// TODO: Investigate if we can do something better here. See: +// https://github.com/pytorch/torchcodec/issues/896 +// Short version is that we're laundering a pointer through an int, the +// Python side forwards that to decoder creation functions in +// custom_ops.cpp and we do another cast on that side to get a pointer +// again. We want to investigate if we can do something cleaner by +// defining proper pybind objects. int64_t create_file_like_context(py::object file_like, bool is_for_writing) { AVIOFileLikeContext* context = new AVIOFileLikeContext(file_like, is_for_writing); From 167212f225c231e8fd707339b754d5751313b1db Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Thu, 11 Sep 2025 07:20:48 -0700 Subject: [PATCH 6/7] Remove visibility stuff from source code, add it to cmake --- src/torchcodec/_core/AVIOFileLikeContext.h | 16 +--------------- src/torchcodec/_core/CMakeLists.txt | 7 +++++++ 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/src/torchcodec/_core/AVIOFileLikeContext.h b/src/torchcodec/_core/AVIOFileLikeContext.h index 3146c00de..fd7f534f3 100644 --- a/src/torchcodec/_core/AVIOFileLikeContext.h +++ b/src/torchcodec/_core/AVIOFileLikeContext.h @@ -11,27 +11,13 @@ #include "src/torchcodec/_core/AVIOContextHolder.h" -// pybind11 has specific visibility rules; see: -// https://pybind11.readthedocs.io/en/stable/faq.html#someclass-declared-with-greater-visibility-than-the-type-of-its-field-someclass-member-wattributes -// This file is included in both pybind_ops.cpp and custom_ops.cpp. We compile -// pybind_ops.cpp with the global visibility flag as required. On the -// custom_ops.cpp side we don't need to do that. However, we do need to ensure -// that this class is also seen as the same visibility. -// -// This only matters on Linux and Mac; on Windows we don't need to do anything. -#ifdef _WIN32 -#define VISIBILITY_HIDDEN -#else -#define VISIBILITY_HIDDEN __attribute__((visibility("hidden"))) -#endif - namespace py = pybind11; namespace facebook::torchcodec { // Enables uers to pass in a Python file-like object. We then forward all read // and seek calls back up to the methods on the Python object. -class VISIBILITY_HIDDEN AVIOFileLikeContext : public AVIOContextHolder { +class AVIOFileLikeContext : public AVIOContextHolder { public: explicit AVIOFileLikeContext(const py::object& fileLike, bool isForWriting); diff --git a/src/torchcodec/_core/CMakeLists.txt b/src/torchcodec/_core/CMakeLists.txt index 03f68f6b8..28d9f512e 100644 --- a/src/torchcodec/_core/CMakeLists.txt +++ b/src/torchcodec/_core/CMakeLists.txt @@ -175,12 +175,19 @@ function(make_torchcodec_libraries # stray initialization of py::objects. The rest of the object code must # match. See: # https://pybind11.readthedocs.io/en/stable/faq.html#someclass-declared-with-greater-visibility-than-the-type-of-its-field-someclass-member-wattributes + # We have to do this for both pybind_ops and custom_ops because they include + # some of the same headers. if(NOT WIN32) target_compile_options( ${pybind_ops_library_name} PUBLIC "-fvisibility=hidden" ) + target_compile_options( + ${custom_ops_library_name} + PUBLIC + "-fvisibility=hidden" + ) endif() # The value we use here must match the value we return from From 97224ba072aa66cc1b56faab0edaf2f838ba466b Mon Sep 17 00:00:00 2001 From: Scott Schneider Date: Fri, 12 Sep 2025 10:37:54 -0700 Subject: [PATCH 7/7] Make visiblity hidden linux only --- src/torchcodec/_core/CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/torchcodec/_core/CMakeLists.txt b/src/torchcodec/_core/CMakeLists.txt index 28d9f512e..e3f9102e2 100644 --- a/src/torchcodec/_core/CMakeLists.txt +++ b/src/torchcodec/_core/CMakeLists.txt @@ -177,7 +177,11 @@ function(make_torchcodec_libraries # https://pybind11.readthedocs.io/en/stable/faq.html#someclass-declared-with-greater-visibility-than-the-type-of-its-field-someclass-member-wattributes # We have to do this for both pybind_ops and custom_ops because they include # some of the same headers. - if(NOT WIN32) + # + # Note that this is Linux only. It's not necessary on Windows, and on Mac + # hiding visibility can actually break dyanmic casts across share libraries + # because the type infos don't get exported. + if(LINUX) target_compile_options( ${pybind_ops_library_name} PUBLIC