Bkorbar/pyavapi (#6943)

bjuncek · Bruno Korbar · jdsgomes · web-flow · commit d710f3d1edc0 · 2022-11-17T13:12:43.000Z
* Test: add backend parameter * VideoReader object now works on backend * Frame reading now passes * Keyframe seek now passes * Pyav backend now supports metadata * changes in test to reflect GPU decoder change * Linter? * Test GPU output * Addressing Joao's comments * lint * lint * Revert "Test GPU output" This reverts commit f62e955. * lint? * lint * lint * Address issues in build? * hopefully doc fix * Arrgh * arrgh * fix typos * fix input options * remove read from memory option in pyav * skip read from mem test for gpu and pyav backends * fix test * remove unused import * Hack to get reading from memory to work with pyav * patch audio test * gallery change in the hope that docs won't break * check video decoder inside io * adding missing lib loading code * remove unused input Co-authored-by: Bruno Korbar <bkorbar@quansight.com> Co-authored-by: Joao Gomes <jdsgomes@fb.com>
diff --git a/gallery/plot_video_api.py b/gallery/plot_video_api.py
@@ -32,6 +32,7 @@
 import torch
 import torchvision
 from torchvision.datasets.utils import download_url
+torchvision.set_video_backend("video_reader")
 
 # Download the sample video
 download_url(
diff --git a/test/test_video_gpu_decoder.py b/test/test_video_gpu_decoder.py
@@ -3,6 +3,7 @@
 
 import pytest
 import torch
+import torchvision
 from torchvision.io import _HAS_GPU_VIDEO_DECODER, VideoReader
 
 try:
@@ -29,8 +30,9 @@ class TestVideoGPUDecoder:
         ],
     )
     def test_frame_reading(self, video_file):
+        torchvision.set_video_backend("cuda")
         full_path = os.path.join(VIDEO_DIR, video_file)
-        decoder = VideoReader(full_path, device="cuda")
+        decoder = VideoReader(full_path)
         with av.open(full_path) as container:
             for av_frame in container.decode(container.streams.video[0]):
                 av_frames = torch.tensor(av_frame.to_rgb(src_colorspace="ITU709").to_ndarray())
@@ -54,7 +56,8 @@ def test_frame_reading(self, video_file):
         ],
     )
     def test_seek_reading(self, keyframes, full_path, duration):
-        decoder = VideoReader(full_path, device="cuda")
+        torchvision.set_video_backend("cuda")
+        decoder = VideoReader(full_path)
         time = duration / 2
         decoder.seek(time, keyframes_only=keyframes)
         with av.open(full_path) as container:
@@ -79,8 +82,9 @@ def test_seek_reading(self, keyframes, full_path, duration):
         ],
     )
     def test_metadata(self, video_file):
+        torchvision.set_video_backend("cuda")
         full_path = os.path.join(VIDEO_DIR, video_file)
-        decoder = VideoReader(full_path, device="cuda")
+        decoder = VideoReader(full_path)
         video_metadata = decoder.get_metadata()["video"]
         with av.open(full_path) as container:
             video = container.streams.video[0]
diff --git a/test/test_videoapi.py b/test/test_videoapi.py
@@ -53,7 +53,9 @@ def fate(name, path="."):
 class TestVideoApi:
     @pytest.mark.skipif(av is None, reason="PyAV unavailable")
     @pytest.mark.parametrize("test_video", test_videos.keys())
-    def test_frame_reading(self, test_video):
+    @pytest.mark.parametrize("backend", ["video_reader", "pyav"])
+    def test_frame_reading(self, test_video, backend):
+        torchvision.set_video_backend(backend)
         full_path = os.path.join(VIDEO_DIR, test_video)
         with av.open(full_path) as av_reader:
             if av_reader.streams.video:
@@ -117,58 +119,70 @@ def test_frame_reading(self, test_video):
 
     @pytest.mark.parametrize("stream", ["video", "audio"])
     @pytest.mark.parametrize("test_video", test_videos.keys())
-    def test_frame_reading_mem_vs_file(self, test_video, stream):
+    @pytest.mark.parametrize("backend", ["video_reader", "pyav"])
+    def test_frame_reading_mem_vs_file(self, test_video, stream, backend):
+        torchvision.set_video_backend(backend)
         full_path = os.path.join(VIDEO_DIR, test_video)
 
-        # Test video reading from file vs from memory
-        vr_frames, vr_frames_mem = [], []
-        vr_pts, vr_pts_mem = [], []
-        # get vr frames
-        video_reader = VideoReader(full_path, stream)
-        for vr_frame in video_reader:
-            vr_frames.append(vr_frame["data"])
-            vr_pts.append(vr_frame["pts"])
-
-        # get vr frames = read from memory
-        f = open(full_path, "rb")
-        fbytes = f.read()
-        f.close()
-        video_reader_from_mem = VideoReader(fbytes, stream)
-
-        for vr_frame_from_mem in video_reader_from_mem:
-            vr_frames_mem.append(vr_frame_from_mem["data"])
-            vr_pts_mem.append(vr_frame_from_mem["pts"])
-
-        # same number of frames
-        assert len(vr_frames) == len(vr_frames_mem)
-        assert len(vr_pts) == len(vr_pts_mem)
-
-        # compare the frames and ptss
-        for i in range(len(vr_frames)):
-            assert vr_pts[i] == vr_pts_mem[i]
-            mean_delta = torch.mean(torch.abs(vr_frames[i].float() - vr_frames_mem[i].float()))
-            # on average the difference is very small and caused
-            # by decoding (around 1%)
-            # TODO: asses empirically how to set this? atm it's 1%
-            # averaged over all frames
-            assert mean_delta.item() < 2.55
-
-        del vr_frames, vr_pts, vr_frames_mem, vr_pts_mem
+        reader = VideoReader(full_path)
+        reader_md = reader.get_metadata()
+
+        if stream in reader_md:
+            # Test video reading from file vs from memory
+            vr_frames, vr_frames_mem = [], []
+            vr_pts, vr_pts_mem = [], []
+            # get vr frames
+            video_reader = VideoReader(full_path, stream)
+            for vr_frame in video_reader:
+                vr_frames.append(vr_frame["data"])
+                vr_pts.append(vr_frame["pts"])
+
+            # get vr frames = read from memory
+            f = open(full_path, "rb")
+            fbytes = f.read()
+            f.close()
+            video_reader_from_mem = VideoReader(fbytes, stream)
+
+            for vr_frame_from_mem in video_reader_from_mem:
+                vr_frames_mem.append(vr_frame_from_mem["data"])
+                vr_pts_mem.append(vr_frame_from_mem["pts"])
+
+            # same number of frames
+            assert len(vr_frames) == len(vr_frames_mem)
+            assert len(vr_pts) == len(vr_pts_mem)
+
+            # compare the frames and ptss
+            for i in range(len(vr_frames)):
+                assert vr_pts[i] == vr_pts_mem[i]
+                mean_delta = torch.mean(torch.abs(vr_frames[i].float() - vr_frames_mem[i].float()))
+                # on average the difference is very small and caused
+                # by decoding (around 1%)
+                # TODO: asses empirically how to set this? atm it's 1%
+                # averaged over all frames
+                assert mean_delta.item() < 2.55
+
+            del vr_frames, vr_pts, vr_frames_mem, vr_pts_mem
+        else:
+            del reader, reader_md
 
     @pytest.mark.parametrize("test_video,config", test_videos.items())
-    def test_metadata(self, test_video, config):
+    @pytest.mark.parametrize("backend", ["video_reader", "pyav"])
+    def test_metadata(self, test_video, config, backend):
         """
         Test that the metadata returned via pyav corresponds to the one returned
         by the new video decoder API
         """
+        torchvision.set_video_backend(backend)
         full_path = os.path.join(VIDEO_DIR, test_video)
         reader = VideoReader(full_path, "video")
         reader_md = reader.get_metadata()
         assert config.video_fps == approx(reader_md["video"]["fps"][0], abs=0.0001)
         assert config.duration == approx(reader_md["video"]["duration"][0], abs=0.5)
 
     @pytest.mark.parametrize("test_video", test_videos.keys())
-    def test_seek_start(self, test_video):
+    @pytest.mark.parametrize("backend", ["video_reader", "pyav"])
+    def test_seek_start(self, test_video, backend):
+        torchvision.set_video_backend(backend)
         full_path = os.path.join(VIDEO_DIR, test_video)
         video_reader = VideoReader(full_path, "video")
         num_frames = 0
@@ -194,7 +208,9 @@ def test_seek_start(self, test_video):
         assert start_num_frames == num_frames
 
     @pytest.mark.parametrize("test_video", test_videos.keys())
-    def test_accurateseek_middle(self, test_video):
+    @pytest.mark.parametrize("backend", ["video_reader"])
+    def test_accurateseek_middle(self, test_video, backend):
+        torchvision.set_video_backend(backend)
         full_path = os.path.join(VIDEO_DIR, test_video)
         stream = "video"
         video_reader = VideoReader(full_path, stream)
@@ -233,7 +249,9 @@ def test_fate_suite(self):
 
     @pytest.mark.skipif(av is None, reason="PyAV unavailable")
     @pytest.mark.parametrize("test_video,config", test_videos.items())
-    def test_keyframe_reading(self, test_video, config):
+    @pytest.mark.parametrize("backend", ["pyav", "video_reader"])
+    def test_keyframe_reading(self, test_video, config, backend):
+        torchvision.set_video_backend(backend)
         full_path = os.path.join(VIDEO_DIR, test_video)
 
         av_reader = av.open(full_path)
diff --git a/torchvision/__init__.py b/torchvision/__init__.py
@@ -1,5 +1,6 @@
 import os
 import warnings
+from modulefinder import Module
 
 import torch
 from torchvision import datasets, io, models, ops, transforms, utils
@@ -11,6 +12,7 @@
 except ImportError:
     pass
 
+
 # Check if torchvision is being imported within the root folder
 if not _HAS_OPS and os.path.dirname(os.path.realpath(__file__)) == os.path.join(
     os.path.realpath(os.getcwd()), "torchvision"
@@ -66,11 +68,16 @@ def set_video_backend(backend):
         backend, please compile torchvision from source.
     """
     global _video_backend
-    if backend not in ["pyav", "video_reader"]:
-        raise ValueError("Invalid video backend '%s'. Options are 'pyav' and 'video_reader'" % backend)
+    if backend not in ["pyav", "video_reader", "cuda"]:
+        raise ValueError("Invalid video backend '%s'. Options are 'pyav', 'video_reader' and 'cuda'" % backend)
     if backend == "video_reader" and not io._HAS_VIDEO_OPT:
+        # TODO: better messages
         message = "video_reader video backend is not available. Please compile torchvision from source and try again"
-        warnings.warn(message)
+        raise RuntimeError(message)
+    elif backend == "cuda" and not io._HAS_GPU_VIDEO_DECODER:
+        # TODO: better messages
+        message = "cuda video backend is not available."
+        raise RuntimeError(message)
     else:
         _video_backend = backend
 
diff --git a/torchvision/io/__init__.py b/torchvision/io/__init__.py
@@ -4,10 +4,6 @@
 
 from ..utils import _log_api_usage_once
 
-try:
-    from ._load_gpu_decoder import _HAS_GPU_VIDEO_DECODER
-except ModuleNotFoundError:
-    _HAS_GPU_VIDEO_DECODER = False
 from ._video_opt import (
     _HAS_VIDEO_OPT,
     _probe_video_from_file,
@@ -32,7 +28,7 @@
     write_jpeg,
     write_png,
 )
-from .video import read_video, read_video_timestamps, write_video
+from .video import _HAS_GPU_VIDEO_DECODER, read_video, read_video_timestamps, write_video
 from .video_reader import VideoReader
 
 
diff --git a/torchvision/io/_load_gpu_decoder.py b/torchvision/io/_load_gpu_decoder.py
diff --git a/torchvision/io/video.py b/torchvision/io/video.py
@@ -9,9 +9,16 @@
 import numpy as np
 import torch
 
+from ..extension import _load_library
+
 from ..utils import _log_api_usage_once
 from . import _video_opt
 
+try:
+    _load_library("Decoder")
+    _HAS_GPU_VIDEO_DECODER = True
+except (ImportError, OSError, ModuleNotFoundError):
+    _HAS_GPU_VIDEO_DECODER = False
 
 try:
     import av
diff --git a/torchvision/io/video_reader.py b/torchvision/io/video_reader.py