Use our own index to seek more accurately when it is available (#180)

ahmadsharif1 · web-flow · commit 0a06c3da90dc · 2024-08-15T10:23:13.000-04:00
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -732,6 +732,18 @@ void VideoDecoder::maybeSeekToBeforeDesiredPts() {
   int firstActiveStreamIndex = *activeStreamIndices_.begin();
   const auto& firstStreamInfo = streams_[firstActiveStreamIndex];
   int64_t desiredPts = *maybeDesiredPts_ * firstStreamInfo.timeBase.den;
+
+  // For some encodings like H265, FFmpeg sometimes seeks past the timestamp
+  // we pass as max_ts. When our own key frame index is populated, we look up
+  // the key frame for the desired pts there and seek to that exact pts.
+  // See https://github.com/pytorch/torchcodec/issues/179 for more details.
+  // See https://trac.ffmpeg.org/ticket/11137 for the underlying ffmpeg bug.
+  if (!firstStreamInfo.keyFrames.empty()) {
+    int desiredKeyFrameIndex =
+        getKeyFrameIndexForPts(firstStreamInfo, desiredPts);
+    desiredPts = firstStreamInfo.keyFrames[desiredKeyFrameIndex].pts;
+  }
+
   int ffmepgStatus = avformat_seek_file(
       formatContext_.get(),
       firstStreamInfo.streamIndex,
diff --git a/test/decoders/test_simple_video_decoder.py b/test/decoders/test_simple_video_decoder.py
@@ -321,12 +321,9 @@ def test_get_frame_displayed_at(self):
         assert isinstance(decoder.get_frame_displayed_at(6.02).duration_seconds, float)
 
     def test_get_frame_displayed_at_h265(self):
+        # Non-regression test for https://github.com/pytorch/torchcodec/issues/179
         decoder = SimpleVideoDecoder(H265_VIDEO.path)
-        # Note that for H265, FFMPEG's seeking is not precise. Even though we ask to
-        # seek with a max_ts=0.5, FFMPEG will seek beyond that point.
-        # TODO: Revert use frame5 in the test below once it's fixed upstream:
-        # https://trac.ffmpeg.org/ticket/11137
-        ref_frame6 = H265_VIDEO.get_frame_by_name("frame000006")
+        ref_frame6 = H265_VIDEO.get_frame_by_name("frame000005")
         assert_tensor_equal(ref_frame6, decoder.get_frame_displayed_at(0.5).data)
 
     def test_get_frame_displayed_at_fails(self):
diff --git a/test/generate_reference_resources.sh b/test/generate_reference_resources.sh
@@ -47,7 +47,7 @@ ffmpeg -y -i "$VIDEO_PATH" -b:a 192K -vn "$VIDEO_PATH.audio.mp3"
 # ./configure --enable-nonfree --enable-gpl --prefix=$(readlink -f ../bin) --enable-libx265  --enable-rpath --extra-ldflags=-Wl,-rpath=$CONDA_PREFIX/lib --enable-filter=drawtext --enable-libfontconfig --enable-libfreetype --enable-libharfbuzz
 # ffmpeg -f lavfi -i color=size=128x128:duration=1:rate=10:color=blue -vf "drawtext=fontsize=30:fontcolor=white:x=(w-text_w)/2:y=(h-text_h)/2:text='Frame %{frame_num}'" -vcodec libx265 -pix_fmt yuv420p -g 2 -crf 10 h265_video.mp4 -y
 VIDEO_PATH=$RESOURCES_DIR/h265_video.mp4
-FRAMES=(6)
+FRAMES=(5)
 for frame in "${FRAMES[@]}"; do
   frame_name=$(printf "%06d" "$frame")
   ffmpeg -y -i "$VIDEO_PATH" -vf select="eq(n\,$frame)" -vsync vfr -q:v 2 "$VIDEO_PATH.frame$frame_name.bmp"
diff --git a/test/resources/h265_video.mp4.frame000005.pt b/test/resources/h265_video.mp4.frame000005.pt