Merge branch 'main' of https://github.com/meta-pytorch/torchcodec into python314_on_ci

Dan-Flores · Dan-Flores · commit 86698612040d · 2025-12-02T10:58:31.000-05:00
diff --git a/.github/workflows/build_ffmpeg.yaml b/.github/workflows/build_ffmpeg.yaml
@@ -48,6 +48,33 @@ jobs:
         mkdir -p "${artifact_dir}"
         mv ffmpeg.tar.gz "${artifact_dir}/${FFMPEG_VERSION}.tar.gz"
 
+  LGPL-Linux-aarch64:
+    strategy:
+      fail-fast: false
+      matrix:
+        ffmpeg-version: ["4.4.4", "5.1.4", "6.1.1", "7.0.1", "8.0"]
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      job-name: Build
+      upload-artifact: ffmpeg-lgpl-linux_aarch64-${{ matrix.ffmpeg-version }}
+      repository: meta-pytorch/torchcodec
+      runner: linux.arm64.2xlarge
+      docker-image: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64
+      script: |
+        export FFMPEG_VERSION="${{ matrix.ffmpeg-version }}"
+        export FFMPEG_ROOT="${PWD}/ffmpeg"
+
+        packaging/build_ffmpeg.sh
+
+        tar -cf ffmpeg.tar.gz ffmpeg/include ffmpeg/lib
+
+        artifact_dir="${RUNNER_ARTIFACT_DIR}/$(date +%Y-%m-%d)/linux_aarch64"
+        mkdir -p "${artifact_dir}"
+        mv ffmpeg.tar.gz "${artifact_dir}/${FFMPEG_VERSION}.tar.gz"
+
   LGPL-macOS:
     strategy:
       fail-fast: false
diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -3,6 +3,7 @@ sphinx==5.0.0
 sphinx_design
 sphinx_copybutton
 sphinx-tabs
+sphinx-sitemap
 matplotlib
 torchvision
 ipython
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -55,6 +55,7 @@
     "sphinx_tabs.tabs",
     "sphinx_design",
     "sphinx_copybutton",
+    "sphinx_sitemap",
 ]
 
 
@@ -216,6 +217,15 @@ def __call__(self, filename):
     "matplotlib": ("https://matplotlib.org/stable/", None),
 }
 
+# sitemap config
+html_baseurl = "https://meta-pytorch.org/torchcodec/stable/"
+sitemap_locales = [None]
+sitemap_excludes = [
+    "search.html",
+    "genindex.html",
+]
+sitemap_url_scheme = "{link}"
+
 
 def inject_minigalleries(app, what, name, obj, options, lines):
     """Inject a minigallery into a docstring.
diff --git a/examples/decoding/approximate_mode.py b/examples/decoding/approximate_mode.py
@@ -66,7 +66,7 @@
 # Performance: ``VideoDecoder`` creation
 # --------------------------------------
 #
-# In terms of performance, the ``seek_mode`` parameter ultimately affects the
+# In terms of performance, the ``seek_mode`` parameter mainly affects the
 # **creation** of a :class:`~torchcodec.decoders.VideoDecoder` object. The
 # longer the video, the higher the performance gain.
 
@@ -104,7 +104,7 @@ def bench(f, average_over=50, warmup=2, **f_kwargs):
 # ---------------------------------------------
 #
 # Strictly speaking the ``seek_mode`` parameter only affects the performance of
-# the :class:`~torchcodec.decoders.VideoDecoder` creation. It does not have a
+# the :class:`~torchcodec.decoders.VideoDecoder` creation. It usually does not have a
 # direct effect on the performance of frame decoding or sampling.  **However**,
 # because frame decoding and sampling patterns typically involve the creation of
 # the :class:`~torchcodec.decoders.VideoDecoder` (one per video), ``seek_mode``
@@ -168,20 +168,21 @@ def sample_clips(seek_mode):
 # duration), and also builds an internal index of frames and key-frames. This
 # internal index is potentially more accurate than the one in the file's
 # headers, which leads to more accurate seeking behavior.
-# Without the scan, TorchCodec relies only on the metadata contained in the
-# file, which may not always be as accurate.
+# Without the scan (in approximate mode), TorchCodec relies only on the metadata
+# contained in the file, which may not always be as accurate. In some rare
+# cases, relying on this less accurate data may also lead to slower frame
+# decoding, because it can involve unnecessary seeks.
 #
 # Which mode should I use?
 # ------------------------
 #
 # The general rule of thumb is as follows:
 #
 # - If you really care about exactness of frame seeking, use "exact".
-# - If you can sacrifice exactness of seeking for speed, which is usually the
-#   case when doing clip sampling, use "approximate".
-# - If your videos don't have variable framerate and their metadata is correct,
-#   then "approximate" mode is a net win: it will be just as accurate as the
-#   "exact" mode while still being significantly faster.
+# - If your videos are short (less than a few minutes) then "exact" will usually
+#   be preferable, as the scan's fixed cost will be negligible.
+# - For long videos, if you can sacrifice exactness of seeking for speed, which
+#   is usually the case when doing clip sampling, consider using "approximate".
 
 # %%
 shutil.rmtree(temp_dir)
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -1092,13 +1092,6 @@ bool SingleStreamDecoder::canWeAvoidSeeking() const {
   // Returns true if we can avoid seeking in the AVFormatContext based on
   // heuristics that rely on the target cursor_ and the last decoded frame.
   // Seeking is expensive, so we try to avoid it when possible.
-  // Note that this function itself isn't always that cheap to call: in
-  // particular the calls to getKeyFrameIndexForPts below in approximate mode
-  // are sometimes slow.
-  // TODO we should understand why (is it because it reads the file?) and
-  // potentially optimize it. E.g. we may not want to ever seek, or even *check*
-  // if we need to seek in some cases, like if we're going to decode 80% of the
-  // frames anyway.
   const StreamInfo& streamInfo = streamInfos_.at(activeStreamIndex_);
   if (streamInfo.avMediaType == AVMEDIA_TYPE_AUDIO) {
     // For audio, we only need to seek if a backwards seek was requested
@@ -1145,10 +1138,10 @@ bool SingleStreamDecoder::canWeAvoidSeeking() const {
   // I    P     P    P    I    P    P    P    I    P    P    I    P
   //                           x              j         y
   // (2) is only more efficient than (1) if there is an I frame between x and y.
-  int lastKeyFrameIndex = getKeyFrameIndexForPts(lastDecodedAvFramePts_);
-  int targetKeyFrameIndex = getKeyFrameIndexForPts(cursor_);
-  return lastKeyFrameIndex >= 0 && targetKeyFrameIndex >= 0 &&
-      lastKeyFrameIndex == targetKeyFrameIndex;
+  int lastKeyFrame = getKeyFrameIdentifier(lastDecodedAvFramePts_);
+  int targetKeyFrame = getKeyFrameIdentifier(cursor_);
+  return lastKeyFrame >= 0 && targetKeyFrame >= 0 &&
+      lastKeyFrame == targetKeyFrame;
 }
 
 // This method looks at currentPts and desiredPts and seeks in the
@@ -1365,7 +1358,19 @@ torch::Tensor SingleStreamDecoder::maybePermuteHWC2CHW(
 // PTS <-> INDEX CONVERSIONS
 // --------------------------------------------------------------------------
 
-int SingleStreamDecoder::getKeyFrameIndexForPts(int64_t pts) const {
+int SingleStreamDecoder::getKeyFrameIdentifier(int64_t pts) const {
+  // This function "identifies" a key frame for a given pts value.
+  // We use the term "identifier" rather than "index" because the nature of the
+  // index that is returned depends on various factors:
+  // - If seek_mode is exact, we return the index of the key frame in the
+  //   scanned key-frame vector (streamInfo.keyFrames). So the returned value is
+  //   in [0, num_key_frames).
+  // - If seek_mode is approximate, we use av_index_search_timestamp() which
+  //   may return a value in [0, num_key_frames) like for mkv, but also a value
+  //   in [0, num_frames) like for mp4. It really depends on the container.
+  //
+  // The range of the "identifier" doesn't matter that much: for now we only
+  // use it to uniquely identify a key frame in canWeAvoidSeeking().
   const StreamInfo& streamInfo = streamInfos_.at(activeStreamIndex_);
   if (streamInfo.keyFrames.empty()) {
     return av_index_search_timestamp(
diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h
@@ -282,7 +282,7 @@ class SingleStreamDecoder {
   // PTS <-> INDEX CONVERSIONS
   // --------------------------------------------------------------------------
 
-  int getKeyFrameIndexForPts(int64_t pts) const;
+  int getKeyFrameIdentifier(int64_t pts) const;
 
   // Returns the key frame index of the presentation timestamp using our index.
   // We build this index by scanning the file in