From 2828dc5d29a1ff3852caab618597540e8803f199 Mon Sep 17 00:00:00 2001
From: Scott Schneider <scott.a.s@gmail.com>
Date: Wed, 10 Sep 2025 08:45:13 -0700
Subject: [PATCH 1/7] Refactor pybind_ops to only deal with file like context
 holders

---
 src/torchcodec/_core/AVIOFileLikeContext.h |  3 +-
 src/torchcodec/_core/custom_ops.cpp        | 55 ++++++++++++++++++++
 src/torchcodec/_core/ops.py                | 59 +++++++++++++---------
 src/torchcodec/_core/pybind_ops.cpp        | 59 ++--------------------
 4 files changed, 97 insertions(+), 79 deletions(-)
diff --git a/src/torchcodec/_core/AVIOFileLikeContext.h b/src/torchcodec/_core/AVIOFileLikeContext.h
index fd7f534f3..74dad3d76 100644
--- a/src/torchcodec/_core/AVIOFileLikeContext.h
+++ b/src/torchcodec/_core/AVIOFileLikeContext.h
@@ -17,7 +17,8 @@ namespace facebook::torchcodec {
 
 // Enables uers to pass in a Python file-like object. We then forward all read
 // and seek calls back up to the methods on the Python object.
-class AVIOFileLikeContext : public AVIOContextHolder {
+class __attribute__((visibility("hidden"))) AVIOFileLikeContext
+    : public AVIOContextHolder {
  public:
   explicit AVIOFileLikeContext(const py::object& fileLike, bool isForWriting);
 
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
index c646ed54a..1bf47edec 100644
--- a/src/torchcodec/_core/custom_ops.cpp
+++ b/src/torchcodec/_core/custom_ops.cpp
@@ -10,6 +10,7 @@
 #include <string>
 #include "c10/core/SymIntArrayRef.h"
 #include "c10/util/Exception.h"
+#include "src/torchcodec/_core/AVIOFileLikeContext.h"
 #include "src/torchcodec/_core/AVIOTensorContext.h"
 #include "src/torchcodec/_core/Encoder.h"
 #include "src/torchcodec/_core/SingleStreamDecoder.h"
@@ -33,8 +34,12 @@ TORCH_LIBRARY(torchcodec_ns, m) {
       "encode_audio_to_file(Tensor samples, int sample_rate, str filename, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()");
   m.def(
       "encode_audio_to_tensor(Tensor samples, int sample_rate, str format, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> Tensor");
+  m.def(
+      "_encode_audio_to_file_like(Tensor samples, int sample_rate, str format, int file_like_context, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()");
   m.def(
       "create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
+  m.def(
+      "_create_from_file_like(int file_like_context, str? seek_mode=None) -> Tensor");
   m.def("_convert_to_tensor(int decoder_ptr) -> Tensor");
   m.def(
       "_add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str? device=None, (Tensor, Tensor, Tensor)? custom_frame_mappings=None, str? color_conversion_library=None) -> ()");
@@ -210,6 +215,24 @@ at::Tensor create_from_tensor(
   return wrapDecoderPointerToTensor(std::move(uniqueDecoder));
 }
 
+at::Tensor _create_from_file_like(
+    int64_t file_like_context,
+    std::optional<std::string_view> seek_mode) {
+  auto fileLikeContext =
+      reinterpret_cast<AVIOFileLikeContext*>(file_like_context);
+  TORCH_CHECK(fileLikeContext != nullptr, "file_like must be a valid pointer");
+  std::unique_ptr<AVIOFileLikeContext> contextHolder(fileLikeContext);
+
+  SingleStreamDecoder::SeekMode realSeek = SingleStreamDecoder::SeekMode::exact;
+  if (seek_mode.has_value()) {
+    realSeek = seekModeFromString(seek_mode.value());
+  }
+
+  std::unique_ptr<SingleStreamDecoder> uniqueDecoder =
+      std::make_unique<SingleStreamDecoder>(std::move(contextHolder), realSeek);
+  return wrapDecoderPointerToTensor(std::move(uniqueDecoder));
+}
+
 at::Tensor _convert_to_tensor(int64_t decoder_ptr) {
   auto decoder = reinterpret_cast<SingleStreamDecoder*>(decoder_ptr);
   std::unique_ptr<SingleStreamDecoder> uniqueDecoder(decoder);
@@ -441,6 +464,36 @@ at::Tensor encode_audio_to_tensor(
       .encodeToTensor();
 }
 
+void _encode_audio_to_file_like(
+    const at::Tensor& samples,
+    int64_t sample_rate,
+    std::string_view format,
+    int64_t file_like_context,
+    std::optional<int64_t> bit_rate = std::nullopt,
+    std::optional<int64_t> num_channels = std::nullopt,
+    std::optional<int64_t> desired_sample_rate = std::nullopt) {
+  auto fileLikeContext =
+      reinterpret_cast<AVIOFileLikeContext*>(file_like_context);
+  TORCH_CHECK(
+      fileLikeContext != nullptr, "file_like_context must be a valid pointer");
+  std::unique_ptr<AVIOFileLikeContext> avioContextHolder(fileLikeContext);
+
+  AudioStreamOptions audioStreamOptions;
+  audioStreamOptions.bitRate = validateOptionalInt64ToInt(bit_rate, "bit_rate");
+  audioStreamOptions.numChannels =
+      validateOptionalInt64ToInt(num_channels, "num_channels");
+  audioStreamOptions.sampleRate =
+      validateOptionalInt64ToInt(desired_sample_rate, "desired_sample_rate");
+
+  AudioEncoder encoder(
+      samples,
+      validateInt64ToInt(sample_rate, "sample_rate"),
+      format,
+      std::move(avioContextHolder),
+      audioStreamOptions);
+  encoder.encode();
+}
+
 // For testing only. We need to implement this operation as a core library
 // function because what we're testing is round-tripping pts values as
 // double-precision floating point numbers from C++ to Python and back to C++.
@@ -694,6 +747,7 @@ void scan_all_streams_to_update_metadata(at::Tensor& decoder) {
 TORCH_LIBRARY_IMPL(torchcodec_ns, BackendSelect, m) {
   m.impl("create_from_file", &create_from_file);
   m.impl("create_from_tensor", &create_from_tensor);
+  m.impl("_create_from_file_like", &_create_from_file_like);
   m.impl("_convert_to_tensor", &_convert_to_tensor);
   m.impl(
       "_get_json_ffmpeg_library_versions", &_get_json_ffmpeg_library_versions);
@@ -702,6 +756,7 @@ TORCH_LIBRARY_IMPL(torchcodec_ns, BackendSelect, m) {
 TORCH_LIBRARY_IMPL(torchcodec_ns, CPU, m) {
   m.impl("encode_audio_to_file", &encode_audio_to_file);
   m.impl("encode_audio_to_tensor", &encode_audio_to_tensor);
+  m.impl("_encode_audio_to_file_like", &_encode_audio_to_file_like);
   m.impl("seek_to_pts", &seek_to_pts);
   m.impl("add_video_stream", &add_video_stream);
   m.impl("_add_video_stream", &_add_video_stream);
diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py
index 807ed265d..43983539b 100644
--- a/src/torchcodec/_core/ops.py
+++ b/src/torchcodec/_core/ops.py
@@ -95,9 +95,15 @@ def load_torchcodec_shared_libraries():
 encode_audio_to_tensor = torch._dynamo.disallow_in_graph(
     torch.ops.torchcodec_ns.encode_audio_to_tensor.default
 )
+_encode_audio_to_file_like = torch._dynamo.disallow_in_graph(
+    torch.ops.torchcodec_ns._encode_audio_to_file_like.default
+)
 create_from_tensor = torch._dynamo.disallow_in_graph(
     torch.ops.torchcodec_ns.create_from_tensor.default
 )
+_create_from_file_like = torch._dynamo.disallow_in_graph(
+    torch.ops.torchcodec_ns._create_from_file_like.default
+)
 _convert_to_tensor = torch._dynamo.disallow_in_graph(
     torch.ops.torchcodec_ns._convert_to_tensor.default
 )
@@ -148,7 +154,12 @@ def create_from_file_like(
     file_like: Union[io.RawIOBase, io.BufferedReader], seek_mode: Optional[str] = None
 ) -> torch.Tensor:
     assert _pybind_ops is not None
-    return _convert_to_tensor(_pybind_ops.create_from_file_like(file_like, seek_mode))
+    return _create_from_file_like(
+        _pybind_ops.create_file_like_context(
+            file_like, False  # False means not for writing
+        ),
+        seek_mode,
+    )
 
 
 def encode_audio_to_file_like(
@@ -176,36 +187,16 @@ def encode_audio_to_file_like(
     if samples.dtype != torch.float32:
         raise ValueError(f"samples must have dtype torch.float32, got {samples.dtype}")
 
-    # We're having the same problem as with the decoder's create_from_file_like:
-    # We should be able to pass a tensor directly, but this leads to a pybind
-    # error. In order to work around this, we pass the pointer to the tensor's
-    # data, and its shape, in order to re-construct it in C++. For this to work:
-    # - the tensor must be float32
-    # - the tensor  must be contiguous, which is why we call contiguous().
-    #   In theory we could avoid this restriction by also passing the strides?
-    # - IMPORTANT: the input samples tensor and its underlying data must be
-    #   alive during the call.
-    #
-    # A more elegant solution would be to cast the tensor into a py::object, but
-    # casting the py::object backk to a tensor in C++ seems to lead to the same
-    # pybing error.
-
-    samples = samples.contiguous()
-    _pybind_ops.encode_audio_to_file_like(
-        samples.data_ptr(),
-        list(samples.shape),
+    _encode_audio_to_file_like(
+        samples,
         sample_rate,
         format,
-        file_like,
+        _pybind_ops.create_file_like_context(file_like, True),  # True means for writing
         bit_rate,
         num_channels,
         desired_sample_rate,
     )
 
-    # This check is useless but it's critical to keep it to ensures that samples
-    # is still alive during the call to encode_audio_to_file_like.
-    assert samples.is_contiguous()
-
 
 # ==============================
 # Abstract impl for the operators. Needed by torch.compile.
@@ -215,6 +206,13 @@ def create_from_file_abstract(filename: str, seek_mode: Optional[str]) -> torch.
     return torch.empty([], dtype=torch.long)
 
 
+@register_fake("torchcodec_ns::_create_from_file_like")
+def _create_from_file_like_abstract(
+    file_like: int, seek_mode: Optional[str]
+) -> torch.Tensor:
+    return torch.empty([], dtype=torch.long)
+
+
 @register_fake("torchcodec_ns::encode_audio_to_file")
 def encode_audio_to_file_abstract(
     samples: torch.Tensor,
@@ -239,6 +237,19 @@ def encode_audio_to_tensor_abstract(
     return torch.empty([], dtype=torch.long)
 
 
+@register_fake("torchcodec_ns::_encode_audio_to_file_like")
+def _encode_audio_to_file_like_abstract(
+    samples: torch.Tensor,
+    sample_rate: int,
+    format: str,
+    file_like_context: int,
+    bit_rate: Optional[int] = None,
+    num_channels: Optional[int] = None,
+    desired_sample_rate: Optional[int] = None,
+) -> None:
+    return torch.empty([], dtype=torch.long)
+
+
 @register_fake("torchcodec_ns::create_from_tensor")
 def create_from_tensor_abstract(
     video_tensor: torch.Tensor, seek_mode: Optional[str]
diff --git a/src/torchcodec/_core/pybind_ops.cpp b/src/torchcodec/_core/pybind_ops.cpp
index 72969f7a9..04c87de32 100644
--- a/src/torchcodec/_core/pybind_ops.cpp
+++ b/src/torchcodec/_core/pybind_ops.cpp
@@ -7,13 +7,8 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 #include <cstdint>
-#include <string>
 
 #include "src/torchcodec/_core/AVIOFileLikeContext.h"
-#include "src/torchcodec/_core/Encoder.h"
-#include "src/torchcodec/_core/SingleStreamDecoder.h"
-#include "src/torchcodec/_core/StreamOptions.h"
-#include "src/torchcodec/_core/ValidationUtils.h"
 
 namespace py = pybind11;
 
@@ -26,53 +21,10 @@ namespace facebook::torchcodec {
 //
 // So we instead launder the pointer through an int, and then use a conversion
 // function on the custom ops side to launder that int into a tensor.
-int64_t create_from_file_like(
-    py::object file_like,
-    std::optional<std::string_view> seek_mode) {
-  SingleStreamDecoder::SeekMode realSeek = SingleStreamDecoder::SeekMode::exact;
-  if (seek_mode.has_value()) {
-    realSeek = seekModeFromString(seek_mode.value());
-  }
-
-  auto avioContextHolder =
-      std::make_unique<AVIOFileLikeContext>(file_like, /*isForWriting=*/false);
-
-  SingleStreamDecoder* decoder =
-      new SingleStreamDecoder(std::move(avioContextHolder), realSeek);
-  return reinterpret_cast<int64_t>(decoder);
-}
-
-void encode_audio_to_file_like(
-    int64_t data_ptr,
-    const std::vector<int64_t>& shape,
-    int64_t sample_rate,
-    std::string_view format,
-    py::object file_like,
-    std::optional<int64_t> bit_rate = std::nullopt,
-    std::optional<int64_t> num_channels = std::nullopt,
-    std::optional<int64_t> desired_sample_rate = std::nullopt) {
-  // We assume float32 *and* contiguity, this must be enforced by the caller.
-  auto tensor_options = torch::TensorOptions().dtype(torch::kFloat32);
-  auto samples = torch::from_blob(
-      reinterpret_cast<void*>(data_ptr), shape, tensor_options);
-
-  AudioStreamOptions audioStreamOptions;
-  audioStreamOptions.bitRate = validateOptionalInt64ToInt(bit_rate, "bit_rate");
-  audioStreamOptions.numChannels =
-      validateOptionalInt64ToInt(num_channels, "num_channels");
-  audioStreamOptions.sampleRate =
-      validateOptionalInt64ToInt(desired_sample_rate, "desired_sample_rate");
-
-  auto avioContextHolder =
-      std::make_unique<AVIOFileLikeContext>(file_like, /*isForWriting=*/true);
-
-  AudioEncoder encoder(
-      samples,
-      validateInt64ToInt(sample_rate, "sample_rate"),
-      format,
-      std::move(avioContextHolder),
-      audioStreamOptions);
-  encoder.encode();
+int64_t create_file_like_context(py::object file_like, bool is_for_writing) {
+  AVIOFileLikeContext* context =
+      new AVIOFileLikeContext(file_like, is_for_writing);
+  return reinterpret_cast<int64_t>(context);
 }
 
 #ifndef PYBIND_OPS_MODULE_NAME
@@ -80,8 +32,7 @@ void encode_audio_to_file_like(
 #endif
 
 PYBIND11_MODULE(PYBIND_OPS_MODULE_NAME, m) {
-  m.def("create_from_file_like", &create_from_file_like);
-  m.def("encode_audio_to_file_like", &encode_audio_to_file_like);
+  m.def("create_file_like_context", &create_file_like_context);
 }
 
 } // namespace facebook::torchcodec

From f57fc00d992f532e32cef86257b666713fd51983 Mon Sep 17 00:00:00 2001
From: Scott Schneider <scott.a.s@gmail.com>
Date: Wed, 10 Sep 2025 09:00:35 -0700
Subject: [PATCH 2/7] Remove converting int to decoder

---
 src/torchcodec/_core/custom_ops.cpp | 8 --------
 src/torchcodec/_core/ops.py         | 8 --------
 2 files changed, 16 deletions(-)

diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
index 1bf47edec..0265a3e3f 100644
--- a/src/torchcodec/_core/custom_ops.cpp
+++ b/src/torchcodec/_core/custom_ops.cpp
@@ -40,7 +40,6 @@ TORCH_LIBRARY(torchcodec_ns, m) {
       "create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
   m.def(
       "_create_from_file_like(int file_like_context, str? seek_mode=None) -> Tensor");
-  m.def("_convert_to_tensor(int decoder_ptr) -> Tensor");
   m.def(
       "_add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str? device=None, (Tensor, Tensor, Tensor)? custom_frame_mappings=None, str? color_conversion_library=None) -> ()");
   m.def(
@@ -233,12 +232,6 @@ at::Tensor _create_from_file_like(
   return wrapDecoderPointerToTensor(std::move(uniqueDecoder));
 }
 
-at::Tensor _convert_to_tensor(int64_t decoder_ptr) {
-  auto decoder = reinterpret_cast<SingleStreamDecoder*>(decoder_ptr);
-  std::unique_ptr<SingleStreamDecoder> uniqueDecoder(decoder);
-  return wrapDecoderPointerToTensor(std::move(uniqueDecoder));
-}
-
 void _add_video_stream(
     at::Tensor& decoder,
     std::optional<int64_t> width = std::nullopt,
@@ -748,7 +741,6 @@ TORCH_LIBRARY_IMPL(torchcodec_ns, BackendSelect, m) {
   m.impl("create_from_file", &create_from_file);
   m.impl("create_from_tensor", &create_from_tensor);
   m.impl("_create_from_file_like", &_create_from_file_like);
-  m.impl("_convert_to_tensor", &_convert_to_tensor);
   m.impl(
       "_get_json_ffmpeg_library_versions", &_get_json_ffmpeg_library_versions);
 }
diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py
index 43983539b..e18527f9a 100644
--- a/src/torchcodec/_core/ops.py
+++ b/src/torchcodec/_core/ops.py
@@ -104,9 +104,6 @@ def load_torchcodec_shared_libraries():
 _create_from_file_like = torch._dynamo.disallow_in_graph(
     torch.ops.torchcodec_ns._create_from_file_like.default
 )
-_convert_to_tensor = torch._dynamo.disallow_in_graph(
-    torch.ops.torchcodec_ns._convert_to_tensor.default
-)
 add_video_stream = torch.ops.torchcodec_ns.add_video_stream.default
 _add_video_stream = torch.ops.torchcodec_ns._add_video_stream.default
 add_audio_stream = torch.ops.torchcodec_ns.add_audio_stream.default
@@ -257,11 +254,6 @@ def create_from_tensor_abstract(
     return torch.empty([], dtype=torch.long)
 
 
-@register_fake("torchcodec_ns::_convert_to_tensor")
-def _convert_to_tensor_abstract(decoder_ptr: int) -> torch.Tensor:
-    return torch.empty([], dtype=torch.long)
-
-
 @register_fake("torchcodec_ns::_add_video_stream")
 def _add_video_stream_abstract(
     decoder: torch.Tensor,

From fa32ece137df9ac2c85e878dcc1e6df87a6a7afd Mon Sep 17 00:00:00 2001
From: Scott Schneider <scott.a.s@gmail.com>
Date: Wed, 10 Sep 2025 09:06:12 -0700
Subject: [PATCH 3/7] Lint

---
 src/torchcodec/_core/ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py
index e18527f9a..12f70b89f 100644
--- a/src/torchcodec/_core/ops.py
+++ b/src/torchcodec/_core/ops.py
@@ -244,7 +244,7 @@ def _encode_audio_to_file_like_abstract(
     num_channels: Optional[int] = None,
     desired_sample_rate: Optional[int] = None,
 ) -> None:
-    return torch.empty([], dtype=torch.long)
+    return
 
 
 @register_fake("torchcodec_ns::create_from_tensor")

From c6e07aefe3d86afd99cb1cafc0f265aa406224ef Mon Sep 17 00:00:00 2001
From: Scott Schneider <scott.a.s@gmail.com>
Date: Wed, 10 Sep 2025 12:12:04 -0700
Subject: [PATCH 4/7] Fix for Windows

---
 src/torchcodec/_core/AVIOFileLikeContext.h | 17 +++++++++++++++--
 src/torchcodec/_core/custom_ops.cpp        | 14 +++++++++-----
 2 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/src/torchcodec/_core/AVIOFileLikeContext.h b/src/torchcodec/_core/AVIOFileLikeContext.h
index 74dad3d76..3146c00de 100644
--- a/src/torchcodec/_core/AVIOFileLikeContext.h
+++ b/src/torchcodec/_core/AVIOFileLikeContext.h
@@ -11,14 +11,27 @@
 
 #include "src/torchcodec/_core/AVIOContextHolder.h"
 
+// pybind11 has specific visibility rules; see:
+//  https://pybind11.readthedocs.io/en/stable/faq.html#someclass-declared-with-greater-visibility-than-the-type-of-its-field-someclass-member-wattributes
+// This file is included in both pybind_ops.cpp and custom_ops.cpp. We compile
+// pybind_ops.cpp with the global visibility flag as required. On the
+// custom_ops.cpp side we don't need to do that. However, we do need to ensure
+// that this class is also seen as the same visibility.
+//
+// This only matters on Linux and Mac; on Windows we don't need to do anything.
+#ifdef _WIN32
+#define VISIBILITY_HIDDEN
+#else
+#define VISIBILITY_HIDDEN __attribute__((visibility("hidden")))
+#endif
+
 namespace py = pybind11;
 
 namespace facebook::torchcodec {
 
 // Enables uers to pass in a Python file-like object. We then forward all read
 // and seek calls back up to the methods on the Python object.
-class __attribute__((visibility("hidden"))) AVIOFileLikeContext
-    : public AVIOContextHolder {
+class VISIBILITY_HIDDEN AVIOFileLikeContext : public AVIOContextHolder {
  public:
   explicit AVIOFileLikeContext(const py::object& fileLike, bool isForWriting);
 
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
index 0265a3e3f..d24c4dfa3 100644
--- a/src/torchcodec/_core/custom_ops.cpp
+++ b/src/torchcodec/_core/custom_ops.cpp
@@ -207,10 +207,12 @@ at::Tensor create_from_tensor(
     realSeek = seekModeFromString(seek_mode.value());
   }
 
-  auto contextHolder = std::make_unique<AVIOFromTensorContext>(video_tensor);
+  auto avioContextHolder =
+      std::make_unique<AVIOFromTensorContext>(video_tensor);
 
   std::unique_ptr<SingleStreamDecoder> uniqueDecoder =
-      std::make_unique<SingleStreamDecoder>(std::move(contextHolder), realSeek);
+      std::make_unique<SingleStreamDecoder>(
+          std::move(avioContextHolder), realSeek);
   return wrapDecoderPointerToTensor(std::move(uniqueDecoder));
 }
 
@@ -219,8 +221,9 @@ at::Tensor _create_from_file_like(
     std::optional<std::string_view> seek_mode) {
   auto fileLikeContext =
       reinterpret_cast<AVIOFileLikeContext*>(file_like_context);
-  TORCH_CHECK(fileLikeContext != nullptr, "file_like must be a valid pointer");
-  std::unique_ptr<AVIOFileLikeContext> contextHolder(fileLikeContext);
+  TORCH_CHECK(
+      fileLikeContext != nullptr, "file_like_context must be a valid pointer");
+  std::unique_ptr<AVIOFileLikeContext> avioContextHolder(fileLikeContext);
 
   SingleStreamDecoder::SeekMode realSeek = SingleStreamDecoder::SeekMode::exact;
   if (seek_mode.has_value()) {
@@ -228,7 +231,8 @@ at::Tensor _create_from_file_like(
   }
 
   std::unique_ptr<SingleStreamDecoder> uniqueDecoder =
-      std::make_unique<SingleStreamDecoder>(std::move(contextHolder), realSeek);
+      std::make_unique<SingleStreamDecoder>(
+          std::move(avioContextHolder), realSeek);
   return wrapDecoderPointerToTensor(std::move(uniqueDecoder));
 }
 

From 497f68b4cf4ebca326677696af9618febebfd622 Mon Sep 17 00:00:00 2001
From: Scott Schneider <scott.a.s@gmail.com>
Date: Thu, 11 Sep 2025 07:00:36 -0700
Subject: [PATCH 5/7] Better comment; move around seek to string

---
 src/torchcodec/_core/SingleStreamDecoder.cpp | 12 -----------
 src/torchcodec/_core/SingleStreamDecoder.h   |  2 --
 src/torchcodec/_core/custom_ops.cpp          | 12 +++++++++++
 src/torchcodec/_core/pybind_ops.cpp          | 22 +++++++++++++++-----
 4 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
index 864e82d0a..b52556e93 100644
--- a/src/torchcodec/_core/SingleStreamDecoder.cpp
+++ b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -1704,16 +1704,4 @@ FrameDims getHeightAndWidthFromOptionsOrAVFrame(
       videoStreamOptions.width.value_or(avFrame->width));
 }
 
-SingleStreamDecoder::SeekMode seekModeFromString(std::string_view seekMode) {
-  if (seekMode == "exact") {
-    return SingleStreamDecoder::SeekMode::exact;
-  } else if (seekMode == "approximate") {
-    return SingleStreamDecoder::SeekMode::approximate;
-  } else if (seekMode == "custom_frame_mappings") {
-    return SingleStreamDecoder::SeekMode::custom_frame_mappings;
-  } else {
-    TORCH_CHECK(false, "Invalid seek mode: " + std::string(seekMode));
-  }
-}
-
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h
index 027f52fc4..56bb8bb58 100644
--- a/src/torchcodec/_core/SingleStreamDecoder.h
+++ b/src/torchcodec/_core/SingleStreamDecoder.h
@@ -375,6 +375,4 @@ std::ostream& operator<<(
     std::ostream& os,
     const SingleStreamDecoder::DecodeStats& stats);
 
-SingleStreamDecoder::SeekMode seekModeFromString(std::string_view seekMode);
-
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
index d24c4dfa3..76e89e692 100644
--- a/src/torchcodec/_core/custom_ops.cpp
+++ b/src/torchcodec/_core/custom_ops.cpp
@@ -169,6 +169,18 @@ std::string mapToJson(const std::map<std::string, std::string>& metadataMap) {
   return ss.str();
 }
 
+SingleStreamDecoder::SeekMode seekModeFromString(std::string_view seekMode) {
+  if (seekMode == "exact") {
+    return SingleStreamDecoder::SeekMode::exact;
+  } else if (seekMode == "approximate") {
+    return SingleStreamDecoder::SeekMode::approximate;
+  } else if (seekMode == "custom_frame_mappings") {
+    return SingleStreamDecoder::SeekMode::custom_frame_mappings;
+  } else {
+    TORCH_CHECK(false, "Invalid seek mode: " + std::string(seekMode));
+  }
+}
+
 } // namespace
 
 // ==============================
diff --git a/src/torchcodec/_core/pybind_ops.cpp b/src/torchcodec/_core/pybind_ops.cpp
index 04c87de32..f9cfd7eb6 100644
--- a/src/torchcodec/_core/pybind_ops.cpp
+++ b/src/torchcodec/_core/pybind_ops.cpp
@@ -14,13 +14,25 @@ namespace py = pybind11;
 
 namespace facebook::torchcodec {
 
-// In principle, this should be able to return a tensor. But when we try that,
-// we run into the bug reported here:
+// Note: It's not immediately obvous why we need both custom_ops.cpp and
+//       pybind_ops.cpp. We do all other Python to C++ bridging in
+//       custom_ops.cpp, and that even depends on pybind11, so why have an
+//       explicit pybind-only file?
 //
-//   https://github.com/pytorch/pytorch/issues/136664
+//       The reason is that we want to accept OWNERSHIP of a file-like object
+//       from the Python side. In order to do that, we need a proper
+//       py::object. For raw bytes, we can launder that through a tensor on the
+//       custom_ops.cpp side, but we can't launder a proper Python object
+//       through a tensor. Custom ops can't accept a proper Python object
+//       through py::object, so we have to do direct pybind11 here.
 //
-// So we instead launder the pointer through an int, and then use a conversion
-// function on the custom ops side to launder that int into a tensor.
+// TODO: Investigate if we can do something better here. See:
+//         https://github.com/pytorch/torchcodec/issues/896
+//       Short version is that we're laundering a pointer through an int, the
+//       Python side forwards that to decoder creation functions in
+//       custom_ops.cpp and we do another cast on that side to get a pointer
+//       again. We want to investigate if we can do something cleaner by
+//       defining proper pybind objects.
 int64_t create_file_like_context(py::object file_like, bool is_for_writing) {
   AVIOFileLikeContext* context =
       new AVIOFileLikeContext(file_like, is_for_writing);

From 167212f225c231e8fd707339b754d5751313b1db Mon Sep 17 00:00:00 2001
From: Scott Schneider <scott.a.s@gmail.com>
Date: Thu, 11 Sep 2025 07:20:48 -0700
Subject: [PATCH 6/7] Remove visibility stuff from source code, add it to cmake

---
 src/torchcodec/_core/AVIOFileLikeContext.h | 16 +---------------
 src/torchcodec/_core/CMakeLists.txt        |  7 +++++++
 2 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/src/torchcodec/_core/AVIOFileLikeContext.h b/src/torchcodec/_core/AVIOFileLikeContext.h
index 3146c00de..fd7f534f3 100644
--- a/src/torchcodec/_core/AVIOFileLikeContext.h
+++ b/src/torchcodec/_core/AVIOFileLikeContext.h
@@ -11,27 +11,13 @@
 
 #include "src/torchcodec/_core/AVIOContextHolder.h"
 
-// pybind11 has specific visibility rules; see:
-//  https://pybind11.readthedocs.io/en/stable/faq.html#someclass-declared-with-greater-visibility-than-the-type-of-its-field-someclass-member-wattributes
-// This file is included in both pybind_ops.cpp and custom_ops.cpp. We compile
-// pybind_ops.cpp with the global visibility flag as required. On the
-// custom_ops.cpp side we don't need to do that. However, we do need to ensure
-// that this class is also seen as the same visibility.
-//
-// This only matters on Linux and Mac; on Windows we don't need to do anything.
-#ifdef _WIN32
-#define VISIBILITY_HIDDEN
-#else
-#define VISIBILITY_HIDDEN __attribute__((visibility("hidden")))
-#endif
-
 namespace py = pybind11;
 
 namespace facebook::torchcodec {
 
 // Enables uers to pass in a Python file-like object. We then forward all read
 // and seek calls back up to the methods on the Python object.
-class VISIBILITY_HIDDEN AVIOFileLikeContext : public AVIOContextHolder {
+class AVIOFileLikeContext : public AVIOContextHolder {
  public:
   explicit AVIOFileLikeContext(const py::object& fileLike, bool isForWriting);
 
diff --git a/src/torchcodec/_core/CMakeLists.txt b/src/torchcodec/_core/CMakeLists.txt
index 03f68f6b8..28d9f512e 100644
--- a/src/torchcodec/_core/CMakeLists.txt
+++ b/src/torchcodec/_core/CMakeLists.txt
@@ -175,12 +175,19 @@ function(make_torchcodec_libraries
     # stray initialization of py::objects. The rest of the object code must
     # match. See:
     #   https://pybind11.readthedocs.io/en/stable/faq.html#someclass-declared-with-greater-visibility-than-the-type-of-its-field-someclass-member-wattributes
+    # We have to do this for both pybind_ops and custom_ops because they include
+    # some of the same headers.
     if(NOT WIN32)
         target_compile_options(
             ${pybind_ops_library_name}
             PUBLIC
             "-fvisibility=hidden"
         )
+        target_compile_options(
+            ${custom_ops_library_name}
+            PUBLIC
+            "-fvisibility=hidden"
+        )
     endif()
 
     # The value we use here must match the value we return from

From 97224ba072aa66cc1b56faab0edaf2f838ba466b Mon Sep 17 00:00:00 2001
From: Scott Schneider <scott.a.s@gmail.com>
Date: Fri, 12 Sep 2025 10:37:54 -0700
Subject: [PATCH 7/7] Make visiblity hidden linux only

---
 src/torchcodec/_core/CMakeLists.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/torchcodec/_core/CMakeLists.txt b/src/torchcodec/_core/CMakeLists.txt
index 28d9f512e..e3f9102e2 100644
--- a/src/torchcodec/_core/CMakeLists.txt
+++ b/src/torchcodec/_core/CMakeLists.txt
@@ -177,7 +177,11 @@ function(make_torchcodec_libraries
     #   https://pybind11.readthedocs.io/en/stable/faq.html#someclass-declared-with-greater-visibility-than-the-type-of-its-field-someclass-member-wattributes
     # We have to do this for both pybind_ops and custom_ops because they include
     # some of the same headers.
-    if(NOT WIN32)
+    #
+    # Note that this is Linux only. It's not necessary on Windows, and on Mac
+    # hiding visibility can actually break dyanmic casts across share libraries
+    # because the type infos don't get exported.
+    if(LINUX)
         target_compile_options(
             ${pybind_ops_library_name}
             PUBLIC