Skip to content

Commit bc18834

Browse files
authored
[multimodal] Allow float32 image input (#14359)
This change lets the `Image` class support both `uint8_t` and `float` data types, and updates the `MultimodalPrefiller` class to support text, image, and audio modalities with error checking and modularity. **Image Data Handling and Type Safety:** * Refactored the `Image` class in `image.h` from a simple struct to a class that uses a `std::variant` to support both `uint8_t` and `float` image data, providing type-safe accessors and a `toTensor` method for conversion to tensors. * Updated `load_image` in Llava `main.cpp` to construct `Image` objects using the new class interface and move semantics, ensuring correct data layout and encapsulation. * Added a runtime check in `LlavaImagePrefiller` to ensure only `uint8_t` images are processed, using the new type-checking methods. **Multimodal Prefill Logic and Flexibility:** * Updated the `MultimodalPrefiller` class in `multimodal_prefiller.h` to dynamically check input types, validate tensor types against model expectations, and handle encoder/decoder execution with improved error handling and modularity.
1 parent d25c35a commit bc18834

File tree

6 files changed

+205
-101
lines changed

6 files changed

+205
-101
lines changed

examples/models/llava/main.cpp

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -81,24 +81,20 @@ void load_image(const std::string& image_path, Image& image) {
8181
new_height,
8282
0,
8383
channels);
84-
// transpose to CHW
85-
image.data.resize(channels * new_width * new_height);
84+
std::vector<uint8_t> chw_data(channels * new_width * new_height);
8685
for (int i = 0; i < new_width * new_height; ++i) {
8786
for (int c = 0; c < channels; ++c) {
88-
image.data[c * new_width * new_height + i] =
89-
resized_data[i * channels + c];
87+
chw_data[c * new_width * new_height + i] = resized_data[i * channels + c];
9088
}
9189
}
92-
image.width = new_width;
93-
image.height = new_height;
94-
image.channels = channels;
90+
image = Image(std::move(chw_data), new_width, new_height, channels);
9591
// convert to tensor
9692
ET_LOG(
9793
Info,
9894
"image Channels: %" PRId32 ", Height: %" PRId32 ", Width: %" PRId32,
99-
image.channels,
100-
image.height,
101-
image.width);
95+
image.channels(),
96+
image.height(),
97+
image.width());
10298
stbi_image_free(data);
10399
}
104100

extension/android/jni/jni_layer_llama.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
268268
for (int i = 0; i < image_size; i++) {
269269
image_data[i] = image_data_jint[i];
270270
}
271-
llm::Image image_runner{image_data, width, height, channels};
271+
llm::Image image_runner{std::move(image_data), width, height, channels};
272272
prefill_inputs_.emplace_back(
273273
llm::MultimodalInput{std::move(image_runner)});
274274
}

extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -172,12 +172,12 @@ - (BOOL)generate:(NSArray<ExecuTorchLLMMultimodalInput *> *)inputs
172172
case ExecuTorchLLMMultimodalInputTypeImage: {
173173
ExecuTorchLLMImage *image = input.image;
174174
std::vector<uint8_t> data((uint8_t *)image.data.bytes, (uint8_t *)image.data.bytes + image.data.length);
175-
nativeInputs.emplace_back(llm::MultimodalInput(llm::Image{
176-
.data = std::move(data),
177-
.width = (int32_t)image.width,
178-
.height = (int32_t)image.height,
179-
.channels = (int32_t)image.channels
180-
}));
175+
nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
176+
std::move(data),
177+
(int32_t)image.width,
178+
(int32_t)image.height,
179+
(int32_t)image.channels
180+
)));
181181
break;
182182
}
183183
default: {

extension/llm/runner/image.h

Lines changed: 98 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,19 +10,112 @@
1010

1111
#pragma once
1212
#include <executorch/runtime/platform/compiler.h>
13+
#include <cstddef>
1314
#include <cstdint>
15+
#include <variant>
1416
#include <vector>
1517

18+
#include <executorch/extension/tensor/tensor.h>
19+
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
20+
1621
namespace executorch {
1722
namespace extension {
1823
namespace llm {
1924

20-
struct ET_EXPERIMENTAL Image {
25+
class ET_EXPERIMENTAL Image {
26+
public:
27+
// Default constructor
28+
Image() : width_(0), height_(0), channels_(0) {}
29+
30+
// Constructor for uint8_t data
31+
Image(
32+
std::vector<uint8_t>&& data,
33+
int32_t width,
34+
int32_t height,
35+
int32_t channels)
36+
: data_(std::move(data)),
37+
width_(width),
38+
height_(height),
39+
channels_(channels) {}
40+
41+
// Constructor for float data
42+
Image(
43+
std::vector<float>&& data,
44+
int32_t width,
45+
int32_t height,
46+
int32_t channels)
47+
: data_(std::move(data)),
48+
width_(width),
49+
height_(height),
50+
channels_(channels) {}
51+
52+
// Getters
53+
int32_t width() const {
54+
return width_;
55+
}
56+
int32_t height() const {
57+
return height_;
58+
}
59+
int32_t channels() const {
60+
return channels_;
61+
}
62+
63+
// Data access
64+
bool is_uint8() const {
65+
return std::holds_alternative<std::vector<uint8_t>>(data_);
66+
}
67+
68+
bool is_float() const {
69+
return std::holds_alternative<std::vector<float>>(data_);
70+
}
71+
72+
const std::vector<uint8_t>& get_uint8_data() const& {
73+
return std::get<std::vector<uint8_t>>(data_);
74+
}
75+
76+
std::vector<uint8_t>& get_uint8_data() & {
77+
return std::get<std::vector<uint8_t>>(data_);
78+
}
79+
80+
const std::vector<float>& get_float_data() const& {
81+
return std::get<std::vector<float>>(data_);
82+
}
83+
84+
std::vector<float>& get_float_data() & {
85+
return std::get<std::vector<float>>(data_);
86+
}
87+
88+
executorch::runtime::Result<executorch::extension::TensorPtr> toTensor(
89+
bool with_batch = false) const {
90+
// Note: This creates a 3D tensor (CHW). The model might expect a 4D
91+
// tensor (NCHW). The caller should handle reshaping if needed.
92+
std::vector<executorch::aten::SizesType> sizes = {
93+
channels(), height(), width()};
94+
if (with_batch) {
95+
sizes.insert(sizes.begin(), 1);
96+
}
97+
if (is_float()) {
98+
return executorch::extension::from_blob(
99+
const_cast<float*>(get_float_data().data()),
100+
sizes,
101+
::executorch::aten::ScalarType::Float);
102+
} else if (is_uint8()) {
103+
return executorch::extension::from_blob(
104+
const_cast<uint8_t*>(get_uint8_data().data()),
105+
sizes,
106+
::executorch::aten::ScalarType::Byte);
107+
}
108+
ET_LOG(
109+
Error, "Image data is not initialized with uint8_t or float vector.");
110+
return ::executorch::runtime::Error::NotSupported;
111+
}
112+
113+
private:
21114
// Assuming NCHW format
22-
std::vector<uint8_t> data;
23-
int32_t width;
24-
int32_t height;
25-
int32_t channels;
115+
std::variant<std::vector<uint8_t>, std::vector<float>> data_;
116+
int32_t width_;
117+
int32_t height_;
118+
int32_t channels_;
26119
};
27120

28121
} // namespace llm

extension/llm/runner/multimodal_prefiller.cpp

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,42 @@ Result<uint64_t> MultimodalPrefiller::prefill(
4141
::executorch::runtime::EValue encoder_output;
4242
if (input.is_image()) {
4343
Image image = input.get_image();
44-
auto image_tensor = executorch::extension::from_blob(
45-
image.data.data(),
46-
{3, image.height, image.width},
47-
::executorch::aten::ScalarType::Byte);
44+
45+
auto method_meta = ET_UNWRAP(
46+
module_->method_meta(kImageEncoderMethod),
47+
"Failed to get method_meta for %s",
48+
kImageEncoderMethod);
49+
50+
ET_CHECK_MSG(
51+
method_meta.num_inputs() > 0,
52+
"Image encoder should have at least 1 input");
53+
auto input_meta = ET_UNWRAP(
54+
method_meta.input_tensor_meta(0),
55+
"Cannot get input tensor meta at index 0");
56+
auto expected_dtype = input_meta.scalar_type();
57+
58+
if (expected_dtype == ::executorch::aten::ScalarType::Float) {
59+
ET_CHECK_MSG(
60+
image.is_float(),
61+
"Model expects float image data, but image has uint8_t data.");
62+
} else if (expected_dtype == ::executorch::aten::ScalarType::Byte) {
63+
ET_CHECK_MSG(
64+
image.is_uint8(),
65+
"Model expects uint8_t image data, but image has float data.");
66+
} else {
67+
ET_LOG(
68+
Error,
69+
"Unsupported image encoder input dtype: %s",
70+
::executorch::runtime::toString(expected_dtype));
71+
return ::executorch::runtime::Error::NotSupported;
72+
}
73+
74+
// The model might expect a 4D tensor (NCHW), but toTensor() returns a 3D
75+
// tensor (CHW). Add a batch dimension of 1 if needed.
76+
auto expected_dims = input_meta.sizes();
77+
auto image_tensor = ET_UNWRAP(
78+
image.toTensor(/*with_batch*/ expected_dims.size() == 4),
79+
"Failed to convert image to tensor");
4880

4981
// Run image encoder
5082
auto image_encoder_outputs =

0 commit comments

Comments
 (0)