From e14dd0c28d71d93ec49d674964e9ca45d9d2f934 Mon Sep 17 00:00:00 2001 From: Daniel Flores Date: Fri, 10 Oct 2025 17:19:30 -0400 Subject: [PATCH 1/7] to_tensor, AVIOTensorContext fix --- src/torchcodec/_core/AVIOTensorContext.cpp | 33 +++++++++++----------- src/torchcodec/_core/AVIOTensorContext.h | 4 +-- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/src/torchcodec/_core/AVIOTensorContext.cpp b/src/torchcodec/_core/AVIOTensorContext.cpp index 238475761..be8a8f1ff 100644 --- a/src/torchcodec/_core/AVIOTensorContext.cpp +++ b/src/torchcodec/_core/AVIOTensorContext.cpp @@ -18,15 +18,15 @@ constexpr int64_t MAX_TENSOR_SIZE = 320'000'000; // 320 MB int read(void* opaque, uint8_t* buf, int buf_size) { auto tensorContext = static_cast(opaque); TORCH_CHECK( - tensorContext->current_pos <= tensorContext->data.numel(), - "Tried to read outside of the buffer: current_pos=", - tensorContext->current_pos, + tensorContext->current <= tensorContext->data.numel(), + "Tried to read outside of the buffer: current=", + tensorContext->current, ", size=", tensorContext->data.numel()); int64_t numBytesRead = std::min( static_cast(buf_size), - tensorContext->data.numel() - tensorContext->current_pos); + tensorContext->data.numel() - tensorContext->current); TORCH_CHECK( numBytesRead >= 0, @@ -34,8 +34,8 @@ int read(void* opaque, uint8_t* buf, int buf_size) { numBytesRead, ", size=", tensorContext->data.numel(), - ", current_pos=", - tensorContext->current_pos); + ", current=", + tensorContext->current); if (numBytesRead == 0) { return AVERROR_EOF; @@ -43,9 +43,9 @@ int read(void* opaque, uint8_t* buf, int buf_size) { std::memcpy( buf, - tensorContext->data.data_ptr() + tensorContext->current_pos, + tensorContext->data.data_ptr() + tensorContext->current, numBytesRead); - tensorContext->current_pos += numBytesRead; + tensorContext->current += numBytesRead; return numBytesRead; } @@ -54,7 +54,7 @@ int write(void* opaque, const uint8_t* buf, int buf_size) { auto tensorContext = static_cast(opaque); int64_t bufSize = static_cast(buf_size); - if (tensorContext->current_pos + bufSize > tensorContext->data.numel()) { + if (tensorContext->current + bufSize > tensorContext->data.numel()) { TORCH_CHECK( tensorContext->data.numel() * 2 <= MAX_TENSOR_SIZE, "We tried to allocate an output encoded tensor larger than ", @@ -68,17 +68,18 @@ int write(void* opaque, const uint8_t* buf, int buf_size) { } TORCH_CHECK( - tensorContext->current_pos + bufSize <= tensorContext->data.numel(), + tensorContext->current + bufSize <= tensorContext->data.numel(), "Re-allocation of the output tensor didn't work. ", "This should not happen, please report on TorchCodec bug tracker"); uint8_t* outputTensorData = tensorContext->data.data_ptr(); - std::memcpy(outputTensorData + tensorContext->current_pos, buf, bufSize); - tensorContext->current_pos += bufSize; + std::memcpy(outputTensorData + tensorContext->current, buf, bufSize); + tensorContext->current += bufSize; // Track the maximum position written so getOutputTensor's narrow() does not // truncate the file if final seek was backwards - tensorContext->max_pos = - std::max(tensorContext->current_pos, tensorContext->max_pos); + if (tensorContext->current > tensorContext->max) { + tensorContext->max = tensorContext->current; + } return buf_size; } @@ -92,7 +93,7 @@ int64_t seek(void* opaque, int64_t offset, int whence) { ret = tensorContext->data.numel(); break; case SEEK_SET: - tensorContext->current_pos = offset; + tensorContext->current = offset; ret = offset; break; default: @@ -124,7 +125,7 @@ AVIOToTensorContext::AVIOToTensorContext() torch::Tensor AVIOToTensorContext::getOutputTensor() { return tensorContext_.data.narrow( - /*dim=*/0, /*start=*/0, /*length=*/tensorContext_.max_pos); + /*dim=*/0, /*start=*/0, /*length=*/tensorContext_.max); } } // namespace facebook::torchcodec diff --git a/src/torchcodec/_core/AVIOTensorContext.h b/src/torchcodec/_core/AVIOTensorContext.h index bcd97052b..ad03cfbd5 100644 --- a/src/torchcodec/_core/AVIOTensorContext.h +++ b/src/torchcodec/_core/AVIOTensorContext.h @@ -15,8 +15,8 @@ namespace detail { struct TensorContext { torch::Tensor data; - int64_t current_pos; - int64_t max_pos; + int64_t current; + int64_t max; }; } // namespace detail From 45222dce119f4a06408331b4ab9545e0fd46cd2b Mon Sep 17 00:00:00 2001 From: Daniel Flores Date: Wed, 15 Oct 2025 14:11:13 -0400 Subject: [PATCH 2/7] update tensorContext vars, use std::max --- src/torchcodec/_core/AVIOTensorContext.cpp | 33 +++++++++++----------- src/torchcodec/_core/AVIOTensorContext.h | 4 +-- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/src/torchcodec/_core/AVIOTensorContext.cpp b/src/torchcodec/_core/AVIOTensorContext.cpp index be8a8f1ff..238475761 100644 --- a/src/torchcodec/_core/AVIOTensorContext.cpp +++ b/src/torchcodec/_core/AVIOTensorContext.cpp @@ -18,15 +18,15 @@ constexpr int64_t MAX_TENSOR_SIZE = 320'000'000; // 320 MB int read(void* opaque, uint8_t* buf, int buf_size) { auto tensorContext = static_cast(opaque); TORCH_CHECK( - tensorContext->current <= tensorContext->data.numel(), - "Tried to read outside of the buffer: current=", - tensorContext->current, + tensorContext->current_pos <= tensorContext->data.numel(), + "Tried to read outside of the buffer: current_pos=", + tensorContext->current_pos, ", size=", tensorContext->data.numel()); int64_t numBytesRead = std::min( static_cast(buf_size), - tensorContext->data.numel() - tensorContext->current); + tensorContext->data.numel() - tensorContext->current_pos); TORCH_CHECK( numBytesRead >= 0, @@ -34,8 +34,8 @@ int read(void* opaque, uint8_t* buf, int buf_size) { numBytesRead, ", size=", tensorContext->data.numel(), - ", current=", - tensorContext->current); + ", current_pos=", + tensorContext->current_pos); if (numBytesRead == 0) { return AVERROR_EOF; @@ -43,9 +43,9 @@ int read(void* opaque, uint8_t* buf, int buf_size) { std::memcpy( buf, - tensorContext->data.data_ptr() + tensorContext->current, + tensorContext->data.data_ptr() + tensorContext->current_pos, numBytesRead); - tensorContext->current += numBytesRead; + tensorContext->current_pos += numBytesRead; return numBytesRead; } @@ -54,7 +54,7 @@ int write(void* opaque, const uint8_t* buf, int buf_size) { auto tensorContext = static_cast(opaque); int64_t bufSize = static_cast(buf_size); - if (tensorContext->current + bufSize > tensorContext->data.numel()) { + if (tensorContext->current_pos + bufSize > tensorContext->data.numel()) { TORCH_CHECK( tensorContext->data.numel() * 2 <= MAX_TENSOR_SIZE, "We tried to allocate an output encoded tensor larger than ", @@ -68,18 +68,17 @@ int write(void* opaque, const uint8_t* buf, int buf_size) { } TORCH_CHECK( - tensorContext->current + bufSize <= tensorContext->data.numel(), + tensorContext->current_pos + bufSize <= tensorContext->data.numel(), "Re-allocation of the output tensor didn't work. ", "This should not happen, please report on TorchCodec bug tracker"); uint8_t* outputTensorData = tensorContext->data.data_ptr(); - std::memcpy(outputTensorData + tensorContext->current, buf, bufSize); - tensorContext->current += bufSize; + std::memcpy(outputTensorData + tensorContext->current_pos, buf, bufSize); + tensorContext->current_pos += bufSize; // Track the maximum position written so getOutputTensor's narrow() does not // truncate the file if final seek was backwards - if (tensorContext->current > tensorContext->max) { - tensorContext->max = tensorContext->current; - } + tensorContext->max_pos = + std::max(tensorContext->current_pos, tensorContext->max_pos); return buf_size; } @@ -93,7 +92,7 @@ int64_t seek(void* opaque, int64_t offset, int whence) { ret = tensorContext->data.numel(); break; case SEEK_SET: - tensorContext->current = offset; + tensorContext->current_pos = offset; ret = offset; break; default: @@ -125,7 +124,7 @@ AVIOToTensorContext::AVIOToTensorContext() torch::Tensor AVIOToTensorContext::getOutputTensor() { return tensorContext_.data.narrow( - /*dim=*/0, /*start=*/0, /*length=*/tensorContext_.max); + /*dim=*/0, /*start=*/0, /*length=*/tensorContext_.max_pos); } } // namespace facebook::torchcodec diff --git a/src/torchcodec/_core/AVIOTensorContext.h b/src/torchcodec/_core/AVIOTensorContext.h index ad03cfbd5..bcd97052b 100644 --- a/src/torchcodec/_core/AVIOTensorContext.h +++ b/src/torchcodec/_core/AVIOTensorContext.h @@ -15,8 +15,8 @@ namespace detail { struct TensorContext { torch::Tensor data; - int64_t current; - int64_t max; + int64_t current_pos; + int64_t max_pos; }; } // namespace detail From 050d91a0b3039e7263935f54cbfa50a5094e08e2 Mon Sep 17 00:00:00 2001 From: Daniel Flores Date: Fri, 17 Oct 2025 11:22:17 -0400 Subject: [PATCH 3/7] to_filelike, update test --- src/torchcodec/_core/__init__.py | 1 + src/torchcodec/_core/custom_ops.cpp | 27 +++++++++++++++++++ src/torchcodec/_core/ops.py | 41 +++++++++++++++++++++++++++++ test/test_ops.py | 41 ++++++++++++++++++++++++----- 4 files changed, 104 insertions(+), 6 deletions(-) diff --git a/src/torchcodec/_core/__init__.py b/src/torchcodec/_core/__init__.py index eb8dd9697..e04fcb8bd 100644 --- a/src/torchcodec/_core/__init__.py +++ b/src/torchcodec/_core/__init__.py @@ -26,6 +26,7 @@ encode_audio_to_file_like, encode_audio_to_tensor, encode_video_to_file, + encode_video_to_file_like, encode_video_to_tensor, get_ffmpeg_library_versions, get_frame_at_index, diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp index 94a3fba1b..32c140d9e 100644 --- a/src/torchcodec/_core/custom_ops.cpp +++ b/src/torchcodec/_core/custom_ops.cpp @@ -40,6 +40,8 @@ TORCH_LIBRARY(torchcodec_ns, m) { "encode_video_to_file(Tensor frames, int frame_rate, str filename, int? crf=None) -> ()"); m.def( "encode_video_to_tensor(Tensor frames, int frame_rate, str format, int? crf=None) -> Tensor"); + m.def( + "_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, int? crf=None) -> ()"); m.def( "create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor"); m.def( @@ -606,6 +608,30 @@ at::Tensor encode_video_to_tensor( .encodeToTensor(); } +void _encode_video_to_file_like( + const at::Tensor& frames, + int64_t frame_rate, + std::string_view format, + int64_t file_like_context, + std::optional crf = std::nullopt) { + auto fileLikeContext = + reinterpret_cast(file_like_context); + TORCH_CHECK( + fileLikeContext != nullptr, "file_like_context must be a valid pointer"); + std::unique_ptr avioContextHolder(fileLikeContext); + + VideoStreamOptions videoStreamOptions; + videoStreamOptions.crf = crf; + + VideoEncoder encoder( + frames, + validateInt64ToInt(frame_rate, "frame_rate"), + format, + std::move(avioContextHolder), + videoStreamOptions); + encoder.encode(); +} + // For testing only. We need to implement this operation as a core library // function because what we're testing is round-tripping pts values as // double-precision floating point numbers from C++ to Python and back to C++. @@ -870,6 +896,7 @@ TORCH_LIBRARY_IMPL(torchcodec_ns, CPU, m) { m.impl("_encode_audio_to_file_like", &_encode_audio_to_file_like); m.impl("encode_video_to_file", &encode_video_to_file); m.impl("encode_video_to_tensor", &encode_video_to_tensor); + m.impl("_encode_video_to_file_like", &_encode_video_to_file_like); m.impl("seek_to_pts", &seek_to_pts); m.impl("add_video_stream", &add_video_stream); m.impl("_add_video_stream", &_add_video_stream); diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py index 03cf8cf6d..7123c83da 100644 --- a/src/torchcodec/_core/ops.py +++ b/src/torchcodec/_core/ops.py @@ -104,6 +104,9 @@ def load_torchcodec_shared_libraries(): encode_video_to_tensor = torch._dynamo.disallow_in_graph( torch.ops.torchcodec_ns.encode_video_to_tensor.default ) +_encode_video_to_file_like = torch._dynamo.disallow_in_graph( + torch.ops.torchcodec_ns._encode_video_to_file_like.default +) create_from_tensor = torch._dynamo.disallow_in_graph( torch.ops.torchcodec_ns.create_from_tensor.default ) @@ -203,6 +206,33 @@ def encode_audio_to_file_like( ) +def encode_video_to_file_like( + frames: torch.Tensor, + frame_rate: int, + format: str, + file_like: Union[io.RawIOBase, io.BufferedIOBase], + crf: Optional[int] = None, +) -> None: + """Encode video frames to a file-like object. + + Args: + frames: Video frames tensor + frame_rate: Frame rate in frames per second + format: Video format (e.g., "mp4", "mov", "mkv") + file_like: File-like object that supports write() and seek() methods + crf: Optional constant rate factor for encoding quality + """ + assert _pybind_ops is not None + + _encode_video_to_file_like( + frames, + frame_rate, + format, + _pybind_ops.create_file_like_context(file_like, True), # True means for writing + crf, + ) + + def get_frames_at_indices( decoder: torch.Tensor, *, frame_indices: Union[torch.Tensor, list[int]] ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: @@ -302,6 +332,17 @@ def encode_video_to_tensor_abstract( return torch.empty([], dtype=torch.long) +@register_fake("torchcodec_ns::_encode_video_to_file_like") +def _encode_video_to_file_like_abstract( + frames: torch.Tensor, + frame_rate: int, + format: str, + file_like_context: int, + crf: Optional[int] = None, +) -> None: + return + + @register_fake("torchcodec_ns::create_from_tensor") def create_from_tensor_abstract( video_tensor: torch.Tensor, seek_mode: Optional[str] diff --git a/test/test_ops.py b/test/test_ops.py index 31afbdd14..5522c26bc 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -29,6 +29,7 @@ create_from_tensor, encode_audio_to_file, encode_video_to_file, + encode_video_to_file_like, encode_video_to_tensor, get_ffmpeg_library_versions, get_frame_at_index, @@ -1397,7 +1398,7 @@ def decode(self, source=None) -> torch.Tensor: @pytest.mark.parametrize( "format", ("mov", "mp4", "mkv", pytest.param("webm", marks=pytest.mark.slow)) ) - @pytest.mark.parametrize("method", ("to_file", "to_tensor")) + @pytest.mark.parametrize("method", ("to_file", "to_tensor", "to_file_like")) def test_video_encoder_round_trip(self, tmp_path, format, method): # Test that decode(encode(decode(frames))) == decode(frames) ffmpeg_version = get_ffmpeg_major_version() @@ -1424,11 +1425,23 @@ def test_video_encoder_round_trip(self, tmp_path, format, method): **params, ) round_trip_frames = self.decode(encoded_path).data - else: # to_tensor + elif method == "to_tensor": encoded_tensor = encode_video_to_tensor( source_frames, format=format, **params ) round_trip_frames = self.decode(encoded_tensor).data + elif method == "to_file_like": + file_like = io.BytesIO() + encode_video_to_file_like( + frames=source_frames, + format=format, + file_like=file_like, + **params, + ) + file_like.seek(0) + round_trip_frames = self.decode(file_like).data + else: + raise ValueError(f"Unknown method: {method}") assert source_frames.shape == round_trip_frames.shape assert source_frames.dtype == round_trip_frames.dtype @@ -1445,6 +1458,7 @@ def test_video_encoder_round_trip(self, tmp_path, format, method): assert psnr(s_frame, rt_frame) > 30 assert_close(s_frame, rt_frame, atol=atol, rtol=0) + @pytest.mark.slow @pytest.mark.parametrize( "format", ( @@ -1457,8 +1471,9 @@ def test_video_encoder_round_trip(self, tmp_path, format, method): pytest.param("webm", marks=pytest.mark.slow), ), ) - def test_against_to_file(self, tmp_path, format): - # Test that to_file and to_tensor produce the same results + @pytest.mark.parametrize("method", ("to_tensor", "to_file_like")) + def test_against_to_file(self, tmp_path, format, method): + # Test that to_file, to_tensor, and to_file_like produce the same results ffmpeg_version = get_ffmpeg_major_version() if format == "webm" and ( ffmpeg_version == 4 or (IS_WINDOWS and ffmpeg_version in (6, 7)) @@ -1470,11 +1485,25 @@ def test_against_to_file(self, tmp_path, format): encoded_file = tmp_path / f"output.{format}" encode_video_to_file(frames=source_frames, filename=str(encoded_file), **params) - encoded_tensor = encode_video_to_tensor(source_frames, format=format, **params) + + if method == "to_tensor": + encoded_output = encode_video_to_tensor( + source_frames, format=format, **params + ) + else: # to_file_like + file_like = io.BytesIO() + encode_video_to_file_like( + frames=source_frames, + file_like=file_like, + format=format, + **params, + ) + file_like.seek(0) + encoded_output = file_like torch.testing.assert_close( self.decode(encoded_file).data, - self.decode(encoded_tensor).data, + self.decode(encoded_output).data, atol=0, rtol=0, ) From 4f107ab1e304a1893ca84e1baf842b55be660f63 Mon Sep 17 00:00:00 2001 From: Daniel Flores Date: Fri, 17 Oct 2025 13:13:25 -0400 Subject: [PATCH 4/7] Add file_like tests similar to AudioEncoder --- test/test_ops.py | 79 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 77 insertions(+), 2 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index 5522c26bc..82ced8d30 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -1438,8 +1438,7 @@ def test_video_encoder_round_trip(self, tmp_path, format, method): file_like=file_like, **params, ) - file_like.seek(0) - round_trip_frames = self.decode(file_like).data + round_trip_frames = self.decode(file_like.getvalue()).data else: raise ValueError(f"Unknown method: {method}") @@ -1586,6 +1585,82 @@ def test_video_encoder_against_ffmpeg_cli(self, tmp_path, format): ff_frame, enc_frame, percentage=percentage, atol=2 ) + def test_to_file_like_custom_file_object(self): + """Test with a custom file-like object that implements write and seek.""" + + class CustomFileObject: + def __init__(self): + self._file = io.BytesIO() + + def write(self, data): + return self._file.write(data) + + def seek(self, offset, whence=0): + return self._file.seek(offset, whence) + + def get_encoded_data(self): + return self._file.getvalue() + + source_frames = self.decode(TEST_SRC_2_720P.path).data + file_like = CustomFileObject() + encode_video_to_file_like( + source_frames, frame_rate=30, crf=0, format="mp4", file_like=file_like + ) + decoded_samples = self.decode(file_like.get_encoded_data()) + + torch.testing.assert_close( + decoded_samples.data, + source_frames, + atol=2, + rtol=0, + ) + + def test_to_file_like_real_file(self, tmp_path): + """Test to_file_like with a real file opened in binary write mode.""" + source_frames = self.decode(TEST_SRC_2_720P.path).data + file_path = tmp_path / "test_file_like.mp4" + + with open(file_path, "wb") as file_like: + encode_video_to_file_like( + source_frames, frame_rate=30, crf=0, format="mp4", file_like=file_like + ) + decoded_samples = self.decode(str(file_path)) + + torch.testing.assert_close( + decoded_samples.data, + source_frames, + atol=2, + rtol=0, + ) + + def test_to_file_like_bad_methods(self): + source_frames = self.decode(TEST_SRC_2_720P.path).data + + class NoWriteMethod: + def seek(self, offset, whence=0): + return 0 + + with pytest.raises( + RuntimeError, match="File like object must implement a write method" + ): + encode_video_to_file_like( + source_frames, + frame_rate=30, + format="mp4", + file_like=NoWriteMethod(), + ) + + class NoSeekMethod: + def write(self, data): + return len(data) + + with pytest.raises( + RuntimeError, match="File like object must implement a seek method" + ): + encode_video_to_file_like( + source_frames, frame_rate=30, format="mp4", file_like=NoSeekMethod() + ) + if __name__ == "__main__": pytest.main() From 733022cd1727c361eec46bc17fed092eb6696c53 Mon Sep 17 00:00:00 2001 From: Daniel Flores Date: Fri, 17 Oct 2025 13:40:26 -0400 Subject: [PATCH 5/7] add todo for test_contiguity --- test/test_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_ops.py b/test/test_ops.py index 82ced8d30..a48a12992 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -1330,7 +1330,7 @@ def test_bad_input(self, tmp_path): class TestVideoEncoderOps: - + # TODO-VideoEncoder: Test encoding against different memory layouts (ex. test_contiguity) # TODO-VideoEncoder: Parametrize test after moving to test_encoders def test_bad_input(self, tmp_path): output_file = str(tmp_path / ".mp4") From 5015d6d4f366f3dc66622aa0ff25ec101be43849 Mon Sep 17 00:00:00 2001 From: Daniel Flores Date: Mon, 20 Oct 2025 14:16:51 -0700 Subject: [PATCH 6/7] remove unneeded slow mark --- test/test_ops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_ops.py b/test/test_ops.py index a48a12992..26bbac2a7 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -1457,7 +1457,6 @@ def test_video_encoder_round_trip(self, tmp_path, format, method): assert psnr(s_frame, rt_frame) > 30 assert_close(s_frame, rt_frame, atol=atol, rtol=0) - @pytest.mark.slow @pytest.mark.parametrize( "format", ( From ffe01bee9aefdd2fb199236d24f5c5cdf55ff2a9 Mon Sep 17 00:00:00 2001 From: Daniel Flores Date: Mon, 20 Oct 2025 14:21:03 -0700 Subject: [PATCH 7/7] consistently use getvalue instead of seek --- test/test_ops.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index 26bbac2a7..b2fe45b50 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -1496,8 +1496,7 @@ def test_against_to_file(self, tmp_path, format, method): format=format, **params, ) - file_like.seek(0) - encoded_output = file_like + encoded_output = file_like.getvalue() torch.testing.assert_close( self.decode(encoded_file).data,