Revert "[torchcodec] Add CUDA support to SimpleVideoDecoder (#146)"

ahmadsharif1 · ahmadsharif1 · commit 0be7e780fe55 · 2024-08-15T08:48:41.000-07:00
This reverts commit ec5e63a.
diff --git a/benchmarks/decoders/gpu_benchmark.py b/benchmarks/decoders/gpu_benchmark.py
@@ -8,30 +8,40 @@
 from torchvision.transforms import Resize
 
 
-def transfer_and_resize_frame(frame):
-    # This should be a no-op if the frame is already on the GPU.
-    frame = frame.to("cuda:0")
+def transfer_and_resize_frame(frame, device):
+    # This should be a no-op if the frame is already on the device.
+    frame = frame.to(device)
     frame = Resize((256, 256))(frame)
     return frame
 
 
-def decode_full_video(video_path, device_string, do_gpu_preproc):
-    decoder = torchcodec.decoders.SimpleVideoDecoder(
-        video_path, device=torch.device(device_string)
+def decode_full_video(video_path, decode_device):
+    decoder = torchcodec.decoders._core.create_from_file(video_path)
+    num_threads = None
+    if "cuda" in decode_device:
+        num_threads = 1
+    torchcodec.decoders._core.add_video_stream(
+        decoder, stream_index=0, device_string=decode_device, num_threads=num_threads
     )
     start_time = time.time()
     frame_count = 0
-    for frame in decoder:
-        # You can do a resize to simulate extra preproc work that happens
-        # on the GPU by uncommenting the following line:
-        if do_gpu_preproc:
-            frame = transfer_and_resize_frame(frame)
-        frame_count += 1
+    while True:
+        try:
+            frame, *_ = torchcodec.decoders._core.get_next_frame(decoder)
+            # You can do a resize to simulate extra preproc work that happens
+            # on the GPU by uncommenting the following line:
+            # frame = transfer_and_resize_frame(frame, decode_device)
+
+            frame_count += 1
+        except Exception as e:
+            print("EXCEPTION", e)
+            break
+        # print(f"current {frame_count=}", flush=True)
     end_time = time.time()
     elapsed = end_time - start_time
     fps = frame_count / (end_time - start_time)
     print(
-        f"****** DECODED full video {device_string=} {frame_count=} {elapsed=} {fps=}"
+        f"****** DECODED full video {decode_device=} {frame_count=} {elapsed=} {fps=}"
     )
     return frame_count, end_time - start_time
 
@@ -59,15 +69,6 @@ def main():
             "to measure the cold start time."
         ),
     )
-    parser.add_argument(
-        "--do_gpu_preproc",
-        action=argparse.BooleanOptionalAction,
-        default=True,
-        help=(
-            "Do a transfer to GPU and resize operation after the decode to "
-            "simulate a real-world transform."
-        ),
-    )
     args = parser.parse_args()
     video_path = args.video
 
@@ -77,23 +78,17 @@ def main():
             decode_full_video(video_path, device)
         return
 
-    label = "Decode"
-    if args.do_gpu_preproc:
-        label += " + GPU Preproc"
-    label += " Time"
-
     results = []
     for device in args.devices.split(","):
         print("device", device)
         t = benchmark.Timer(
-            stmt="decode_full_video(video_path, device, do_gpu_preproc)",
+            stmt="decode_full_video(video_path, device)",
             globals={
                 "device": device,
                 "video_path": video_path,
                 "decode_full_video": decode_full_video,
-                "do_gpu_preproc": args.do_gpu_preproc,
             },
-            label=label,
+            label="Decode+Resize Time",
             sub_label=f"video={os.path.basename(video_path)}",
             description=f"decode_device={device}",
         ).blocked_autorange()
diff --git a/examples/basic_example.py b/examples/basic_example.py
@@ -172,13 +172,3 @@ def plot(frames: torch.Tensor, title : Optional[str] = None):
 plot(frame_at_2_seconds.data, "Frame displayed at 2 seconds")
 plot(first_two_seconds.data, "Frames displayed during [0, 2) seconds")
 
-# %%
-# Using a CUDA GPU to accelerate decoding
-# ---------------------------------------
-#
-# If you have a CUDA GPU that has NVDEC, you can decode on the GPU.
-if torch.cuda.is_available():
-    cuda_decoder = SimpleVideoDecoder(raw_video_bytes, device="cuda:0")
-    cuda_frame = cuda_decoder.get_frame_displayed_at(seconds=2)
-    print(cuda_frame.data.device)  # should be cuda:0
-    plot(cuda_frame.data.to("cpu"), "Frame displayed at 2 seconds on CUDA")
diff --git a/src/torchcodec/decoders/_simple_video_decoder.py b/src/torchcodec/decoders/_simple_video_decoder.py
@@ -9,7 +9,7 @@
 from pathlib import Path
 from typing import Iterable, Iterator, Literal, Tuple, Union
 
-from torch import device as torch_device, Tensor
+from torch import Tensor
 
 from torchcodec.decoders import _core as core
 
@@ -89,14 +89,6 @@ class SimpleVideoDecoder:
             This can be either "NCHW" (default) or "NHWC", where N is the batch
             size, C is the number of channels, H is the height, and W is the
             width of the frames.
-        device (torch.device, optional): The device to use for decoding.
-            Currently we only support CPU and CUDA devices. If CUDA is used,
-            we use NVDEC and CUDA to do decoding and color-conversion
-            respectively. The resulting frame is left on the GPU for further
-            processing.
-            You can either pass in a string like "cpu" or "cuda:0" or a
-            torch.device like torch.device("cuda:0").
-            Default: ``torch.device("cpu")``.
 
             .. note::
 
@@ -114,7 +106,6 @@ def __init__(
         self,
         source: Union[str, Path, bytes, Tensor],
         dimension_order: Literal["NCHW", "NHWC"] = "NCHW",
-        device: Union[str, torch_device] = torch_device("cpu"),
     ):
         if isinstance(source, str):
             self._decoder = core.create_from_file(source)
@@ -138,20 +129,7 @@ def __init__(
             )
 
         core.scan_all_streams_to_update_metadata(self._decoder)
-        num_threads = None
-        if isinstance(device, str):
-            device = torch_device(device)
-        if device.type == "cuda":
-            # Using multiple CPU threads seems to slow down decoding on CUDA.
-            # CUDA internally uses dedicated hardware to do decoding so we
-            # don't need CPU software threads here.
-            num_threads = 1
-        core.add_video_stream(
-            self._decoder,
-            dimension_order=dimension_order,
-            device_string=str(device),
-            num_threads=num_threads,
-        )
+        core.add_video_stream(self._decoder, dimension_order=dimension_order)
 
         self.metadata, self._stream_index = _get_and_validate_stream_metadata(
             self._decoder
diff --git a/test/decoders/test_simple_video_decoder.py b/test/decoders/test_simple_video_decoder.py
@@ -45,34 +45,6 @@ def test_create_fails(self):
         with pytest.raises(TypeError, match="Unknown source type"):
             decoder = SimpleVideoDecoder(123)  # noqa
 
-    def test_can_accept_devices(self):
-        # You can pass a CPU device as a string...<contd>
-        decoder = SimpleVideoDecoder(NASA_VIDEO.path, device="cpu")
-        assert_tensor_equal(decoder[0], NASA_VIDEO.get_frame_data_by_index(0))
-
-        # ...or as a torch.device.
-        decoder = SimpleVideoDecoder(NASA_VIDEO.path, device=torch.device("cpu"))
-        assert_tensor_equal(decoder[0], NASA_VIDEO.get_frame_data_by_index(0))
-
-        if torch.cuda.is_available():
-            # You can pass a CUDA device as a string...<contd>
-            decoder = SimpleVideoDecoder(NASA_VIDEO.path, device="cuda")
-            frame = decoder[0]
-            assert frame.device.type == "cuda"
-            assert frame.shape == torch.Size(
-                [NASA_VIDEO.num_color_channels, NASA_VIDEO.height, NASA_VIDEO.width]
-            )
-
-            # ...or as a torch.device.
-            decoder = SimpleVideoDecoder(NASA_VIDEO.path, device=torch.device("cuda"))
-            frame = decoder[0]
-            assert frame.device.type == "cuda"
-            assert frame.shape == torch.Size(
-                [NASA_VIDEO.num_color_channels, NASA_VIDEO.height, NASA_VIDEO.width]
-            )
-            # TODO: compare tensor values too. We don't compare values because
-            # the exact values are hardware-dependent.
-
     def test_getitem_int(self):
         decoder = SimpleVideoDecoder(NASA_VIDEO.path)