Skip to content

Commit

Permalink
Support NCNN CPU inference (#1867)
Browse files Browse the repository at this point in the history
This is a useful option on systems with strong CPU and weak GPU, where
PyTorch is nontrivial to install (e.g. ppc64le systems).

Currently, whether CPU inference is used is determined by which NCNN
package is installed (the CPU-only `ncnn` package versus the
`ncnn_vulkan` package); a future PR could allow CPU inference even when
the Vulkan NCNN package is installed.

Co-authored-by: Jeremy Rand <[email protected]>
  • Loading branch information
JeremyRand and Jeremy Rand authored Jun 24, 2023
1 parent c975719 commit 6fd553e
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 33 deletions.
34 changes: 23 additions & 11 deletions backend/src/nodes/impl/ncnn/auto_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,15 @@
import gc

import numpy as np
from ncnn_vulkan import ncnn

try:
from ncnn_vulkan import ncnn

use_gpu = True
except ImportError:
from ncnn import ncnn

use_gpu = False
from sanic.log import logger

from ...utils.utils import get_h_w_c
Expand All @@ -22,9 +30,10 @@ def ncnn_auto_split(
) -> np.ndarray:
def upscale(img: np.ndarray, _):
ex = net.create_extractor()
ex.set_blob_vkallocator(blob_vkallocator)
ex.set_workspace_vkallocator(blob_vkallocator)
ex.set_staging_vkallocator(staging_vkallocator)
if use_gpu:
ex.set_blob_vkallocator(blob_vkallocator)
ex.set_workspace_vkallocator(blob_vkallocator)
ex.set_staging_vkallocator(staging_vkallocator)
# ex.set_light_mode(True)
try:
lr_c = get_h_w_c(img)[2]
Expand All @@ -49,17 +58,19 @@ def upscale(img: np.ndarray, _):
result = np.array(mat_out).transpose(1, 2, 0).astype(np.float32)
del ex, mat_in, mat_out
gc.collect()
# Clear VRAM
blob_vkallocator.clear()
staging_vkallocator.clear()
if use_gpu:
# Clear VRAM
blob_vkallocator.clear()
staging_vkallocator.clear()
return result
except Exception as e:
if "vkQueueSubmit" in str(e):
ex = None
del ex
gc.collect()
blob_vkallocator.clear()
staging_vkallocator.clear()
if use_gpu:
blob_vkallocator.clear()
staging_vkallocator.clear()
# TODO: Have someone running into this issue enable this and see if it fixes anything
# ncnn.destroy_gpu_instance()
raise RuntimeError(
Expand All @@ -72,8 +83,9 @@ def upscale(img: np.ndarray, _):
ex = None
del ex
gc.collect()
blob_vkallocator.clear()
staging_vkallocator.clear()
if use_gpu:
blob_vkallocator.clear()
staging_vkallocator.clear()
return Split()
else:
# Re-raise the exception if not an OOM error
Expand Down
32 changes: 26 additions & 6 deletions backend/src/nodes/impl/ncnn/session.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
from __future__ import annotations

import tempfile
from weakref import WeakKeyDictionary

from ncnn_vulkan import ncnn
try:
from ncnn_vulkan import ncnn

use_gpu = True
except ImportError:
from ncnn import ncnn

use_gpu = False

from ...utils.exec_options import ExecutionOptions
from .model import NcnnModelWrapper
Expand All @@ -22,13 +30,25 @@ def create_ncnn_net(
net.opt.use_fp16_storage = False
net.opt.use_fp16_arithmetic = False

# Use vulkan compute
net.opt.use_vulkan_compute = True
net.set_vulkan_device(exec_options.ncnn_gpu_index)
if use_gpu:
# Use vulkan compute
net.opt.use_vulkan_compute = True
net.set_vulkan_device(exec_options.ncnn_gpu_index)

# Load model param and bin
net.load_param_mem(model.model.write_param())
net.load_model_mem(model.model.bin)
if use_gpu:
net.load_param_mem(model.model.write_param())
net.load_model_mem(model.model.bin)
else:
with tempfile.TemporaryDirectory() as tmp_model_dir:
param_filename = tmp_model_dir + "/ncnn-model.param"
bin_filename = tmp_model_dir + "/ncnn-model.bin"

model.model.write_param(param_filename)
model.model.write_bin(bin_filename)

net.load_param(param_filename)
net.load_model(bin_filename)

return net

Expand Down
54 changes: 40 additions & 14 deletions backend/src/packages/chaiNNer_ncnn/ncnn/processing/upscale_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,15 @@

import cv2
import numpy as np
from ncnn_vulkan import ncnn

try:
from ncnn_vulkan import ncnn

use_gpu = True
except ImportError:
from ncnn import ncnn

use_gpu = False
from sanic.log import logger

from nodes.impl.ncnn.auto_split import ncnn_auto_split
Expand Down Expand Up @@ -61,26 +69,44 @@ def upscale_impl(
net = get_ncnn_net(model, exec_options)
# Try/except block to catch errors
try:
vkdev = ncnn.get_gpu_device(exec_options.ncnn_gpu_index)
if use_gpu:
vkdev = ncnn.get_gpu_device(exec_options.ncnn_gpu_index)

def estimate():
heap_budget = vkdev.get_heap_budget() * 1024 * 1024 * 0.8
return MaxTileSize(
estimate_tile_size(heap_budget, model.model.bin_length, img, 4)
)
def estimate_gpu():
heap_budget = vkdev.get_heap_budget() * 1024 * 1024 * 0.8
return MaxTileSize(
estimate_tile_size(heap_budget, model.model.bin_length, img, 4)
)

with ncnn_allocators(vkdev) as (
blob_vkallocator,
staging_vkallocator,
):
return ncnn_auto_split(
img,
net,
input_name=input_name,
output_name=output_name,
blob_vkallocator=blob_vkallocator,
staging_vkallocator=staging_vkallocator,
tiler=parse_tile_size_input(tile_size, estimate_gpu),
)
else:

def estimate_cpu():
# TODO: Improve tile size estimation in CPU mode.
raise ValueError(
"Tile size estimation not supported with NCNN CPU inference"
)

with ncnn_allocators(vkdev) as (
blob_vkallocator,
staging_vkallocator,
):
return ncnn_auto_split(
img,
net,
input_name=input_name,
output_name=output_name,
blob_vkallocator=blob_vkallocator,
staging_vkallocator=staging_vkallocator,
tiler=parse_tile_size_input(tile_size, estimate),
blob_vkallocator=None,
staging_vkallocator=None,
tiler=parse_tile_size_input(tile_size, estimate_cpu),
)
except (RuntimeError, ValueError):
raise
Expand Down
11 changes: 9 additions & 2 deletions backend/src/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,8 +361,15 @@ async def list_ncnn_gpus(_request: Request):
result.append(ncnn.get_gpu_info(i).device_name())
return json(result)
except Exception as exception:
logger.error(exception, exc_info=True)
return json([])
try:
from ncnn import ncnn

result = ["cpu"]
return json(result)
except Exception as exception2:
logger.error(exception, exc_info=True)
logger.error(exception2, exc_info=True)
return json([])


@app.route("/listgpus/nvidia", methods=["GET"])
Expand Down

0 comments on commit 6fd553e

Please sign in to comment.