
Commit fd52664

Update runner for weights sharing
Parent: ea5cd4d

File tree

9 files changed: 2175 additions, 78 deletions


backends/mediatek/runtime/include/api/NeuronAdapter.h

Lines changed: 2143 additions & 0 deletions
Large diffs are not rendered by default.

examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.cpp

Lines changed: 8 additions & 4 deletions
@@ -63,11 +63,16 @@ LlamaModelChunk::LlamaModelChunk(
       enableSWA(enableSWA),
       kCacheTypeSize(llm_helper::getLLMTypeSize(kCacheType)) {}

+LlamaModelChunk::~LlamaModelChunk() {}
+
 std::string LlamaModelChunk::SelectMethod(
     const std::vector<std::string>& methodNames) const {
   const size_t curTokenSize = GetModelId();
   for (const auto& methodName : methodNames) {
     const auto matches = utils::extract_substr(methodName, "([0-9]+)t[0-9]+c");
+    if (matches.empty()) {
+      continue;
+    }
     ET_CHECK_MSG(
         matches.size() == 2, "Invalid method name: %s", methodName.c_str());
     // Extract the first match group as token size
@@ -88,8 +93,6 @@ std::string LlamaModelChunk::SelectMethod(
   return {};
 }

-LlamaModelChunk::~LlamaModelChunk() {}
-
 void LlamaModelChunk::Initialize() {
   LoadModels();
   GetModelIoInfo();
@@ -367,8 +370,9 @@ void LlamaModelChunk::UpdatePosEmbAndMask(const size_t numInputToken) {
     const auto swaMaskSizeBytes = swaMaskBufferInfo.nbytesUsed;
     mMaskBuilder->setMaskBuffer(swaMaskBuffer, swaMaskSizeBytes);
     mMaskBuilder->enableSlidingWindow(kWindowSize);
-    mMaskBuilder->updateMask(
-        mTokenBatchSize, mCurrentTokenIndex, numInputToken);
+    // mMaskBuilder->updateMask(mTokenBatchSize, mCurrentTokenIndex,
+    //     numInputToken);
+    mMaskBuilder->buildMask(mTokenBatchSize, mCurrentTokenIndex);
   }
   // Pass same isMaskUpdatable to both mask
   mMaskBuilder->setIsMaskUpdatable(isMaskUpdatable);
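
Two behavioral changes stand out in this hunk: SelectMethod now skips method names that carry no "<T>t<C>c" token-size tag (a weight-shared package can contain such methods), and the SWA mask is rebuilt in full with buildMask rather than incrementally updated. As a rough illustration of the selection logic, here is a hedged, self-contained C++ sketch that substitutes std::regex for the repo's utils::extract_substr helper, whose exact behavior is assumed; SelectMethodByTokenSize and wantedTokenSize are hypothetical names, not part of the codebase:

#include <regex>
#include <string>
#include <vector>

// Hypothetical standalone version of the selection loop above: return the
// first method whose "<T>t<C>c" tag matches the wanted token batch size,
// skipping names without such a tag (the newly added guard).
std::string SelectMethodByTokenSize(
    const std::vector<std::string>& methodNames,
    size_t wantedTokenSize) {
  const std::regex kPattern("([0-9]+)t[0-9]+c");
  for (const auto& name : methodNames) {
    std::smatch match;
    if (!std::regex_search(name, match, kPattern)) {
      continue;  // e.g. a weights-only method with no token-size tag
    }
    if (std::stoul(match[1].str()) == wantedTokenSize) {
      return name;  // e.g. "forward_128t512c" when wantedTokenSize == 128
    }
  }
  return {};  // no matching method found
}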

examples/mediatek/executor_runner/llama_runner/LlamaModelChunk.h

Lines changed: 0 additions & 12 deletions
@@ -89,8 +89,6 @@ class LlamaModelChunk : public ModelChunk {

   void InitMaskBuilder();

-  void InitSWAMaskBuilder();
-
   void InitCache();

   void PrepareCacheIOs();
@@ -134,10 +132,6 @@ class LlamaModelChunk : public ModelChunk {

   void CheckIoCount();

-  size_t GetExpectedInputCount() const;
-
-  size_t GetExpectedOutputCount() const;
-
  private:
   bool AllowModelsCoexist() const override {
     return kIsSharedWeightsUsed;
@@ -150,12 +144,6 @@ class LlamaModelChunk : public ModelChunk {
   // Whether shared weights is used
   bool kIsSharedWeightsUsed = false;

-  // Input/Output Indexes
-  const size_t kMaskInputIndex;
-  const std::vector<size_t> kRotEmbInputIndexes;
-  const std::vector<size_t> kCacheInputIndexes;
-  const std::vector<size_t> kCacheOutputIndexes;
-
   // Cache
   TensorShape mCacheShape;
   const LLMType kCacheType;

examples/mediatek/executor_runner/llama_runner/llm_helper/mask_builder.cpp

Lines changed: 0 additions & 1 deletion
@@ -5,7 +5,6 @@
  * except in compliance with the License. See the license file in the root
  * directory of this source tree for more details.
  */
-#include <iostream> //TODO: DELETE

 #include "llm_helper/include/mask_builder.h"

examples/mediatek/executor_runner/run_gemma2_sample.sh

Lines changed: 4 additions & 12 deletions
@@ -37,16 +37,9 @@ TOKENIZER_PATH="/data/local/tmp/et_mtk/tokenizer_gemma2.json"
 TOKEN_EMBEDDING_PATH="/data/local/tmp/et_mtk/embedding_gemma2_2b_it_fp32.bin"

 # Comma-Separated Paths
-PROMPT_MODEL_PATHS="\
-/data/local/tmp/et_mtk/gemma2_2b_it_A16W4_2_chunks_128t512c/gemma2_2b_it_A16W4_2_chunks_128t512c_0.pte,\
-/data/local/tmp/et_mtk/gemma2_2b_it_A16W4_2_chunks_128t512c/gemma2_2b_it_A16W4_2_chunks_128t512c_1.pte,"
-
-# # Comma-Separated Paths
-GEN_MODEL_PATHS="\
-/data/local/tmp/et_mtk/gemma2_2b_it_A16W4_2_chunks_1t512c/gemma2_2b_it_A16W4_2_chunks_1t512c_0.pte,\
-/data/local/tmp/et_mtk/gemma2_2b_it_A16W4_2_chunks_1t512c/gemma2_2b_it_A16W4_2_chunks_1t512c_1.pte,"
-
-
+WEIGHT_SHARED_MODEL_PACKAGE_PATHS="\
+/data/local/tmp/et_mtk/gemma2_2b_it_A16W4_2_chunks/gemma2_2b_it_A16W4_2_chunks_0.pte,\
+/data/local/tmp/et_mtk/gemma2_2b_it_A16W4_2_chunks/gemma2_2b_it_A16W4_2_chunks_1.pte,"

 PROMPT_FILE=/data/local/tmp/et_mtk/prompt_gemma.txt

@@ -75,6 +68,5 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD
   --tokenizer_type=$TOKENIZER_TYPE \
   --tokenizer_path=$TOKENIZER_PATH \
   --token_embedding_path=$TOKEN_EMBEDDING_PATH \
-  --prompt_model_paths=$PROMPT_MODEL_PATHS \
-  --gen_model_paths=$GEN_MODEL_PATHS \
+  --model_package_paths=$WEIGHT_SHARED_MODEL_PACKAGE_PATHS \
   --prompt_file=$PROMPT_FILE
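
The same substitution appears in all four sample scripts in this commit: the separate 128-token prompt models and 1-token generation models are replaced by one weight-shared package per chunk, passed through the runner's new --model_package_paths flag. The runner's flag-parsing code is not part of this diff; the C++ sketch below only illustrates, as an assumption, how such a comma-separated value (note the trailing comma in the scripts) might be split into individual .pte paths:

#include <sstream>
#include <string>
#include <vector>

// Split a comma-separated path list such as
// "/data/.../pkg_0.pte,/data/.../pkg_1.pte," into individual paths,
// dropping the empty token produced by the trailing comma.
std::vector<std::string> SplitModelPackagePaths(const std::string& csv) {
  std::vector<std::string> paths;
  std::stringstream stream(csv);
  std::string token;
  while (std::getline(stream, token, ',')) {
    if (!token.empty()) {
      paths.push_back(token);
    }
  }
  return paths;
}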

examples/mediatek/executor_runner/run_phi3_sample.sh

Lines changed: 6 additions & 15 deletions
@@ -36,19 +36,11 @@ TOKENIZER_PATH="/data/local/tmp/et_mtk/tokenizer.bin"
 TOKEN_EMBEDDING_PATH="/data/local/tmp/et_mtk/embedding_phi3.5-mini-instruct_fp32.bin"

 # Comma-Separated Paths
-PROMPT_MODEL_PATHS="\
-/data/local/tmp/et_mtk/phi3.5-mini-instruct_A16W4_4_chunks_128t512c/phi3.5-mini-instruct_A16W4_4_chunks_128t512c_0.pte,\
-/data/local/tmp/et_mtk/phi3.5-mini-instruct_A16W4_4_chunks_128t512c/phi3.5-mini-instruct_A16W4_4_chunks_128t512c_1.pte,\
-/data/local/tmp/et_mtk/phi3.5-mini-instruct_A16W4_4_chunks_128t512c/phi3.5-mini-instruct_A16W4_4_chunks_128t512c_2.pte,\
-/data/local/tmp/et_mtk/phi3.5-mini-instruct_A16W4_4_chunks_128t512c/phi3.5-mini-instruct_A16W4_4_chunks_128t512c_3.pte,"
-
-
-# Comma-Separated Paths
-GEN_MODEL_PATHS="\
-/data/local/tmp/et_mtk/phi3.5-mini-instruct_A16W4_4_chunks_1t512c/phi3.5-mini-instruct_A16W4_4_chunks_1t512c_0.pte,\
-/data/local/tmp/et_mtk/phi3.5-mini-instruct_A16W4_4_chunks_1t512c/phi3.5-mini-instruct_A16W4_4_chunks_1t512c_1.pte,\
-/data/local/tmp/et_mtk/phi3.5-mini-instruct_A16W4_4_chunks_1t512c/phi3.5-mini-instruct_A16W4_4_chunks_1t512c_2.pte,\
-/data/local/tmp/et_mtk/phi3.5-mini-instruct_A16W4_4_chunks_1t512c/phi3.5-mini-instruct_A16W4_4_chunks_1t512c_3.pte,"
+WEIGHT_SHARED_MODEL_PACKAGE_PATHS="\
+/data/local/tmp/et_mtk/phi3.5-mini-instruct_A16W4_4_chunks/phi3.5-mini-instruct_A16W4_4_chunks_0.pte,\
+/data/local/tmp/et_mtk/phi3.5-mini-instruct_A16W4_4_chunks/phi3.5-mini-instruct_A16W4_4_chunks_1.pte,\
+/data/local/tmp/et_mtk/phi3.5-mini-instruct_A16W4_4_chunks/phi3.5-mini-instruct_A16W4_4_chunks_2.pte,\
+/data/local/tmp/et_mtk/phi3.5-mini-instruct_A16W4_4_chunks/phi3.5-mini-instruct_A16W4_4_chunks_3.pte,"

 PROMPT_FILE=/data/local/tmp/et_mtk/prompt_phi3.txt

@@ -76,6 +68,5 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD
   --tokenizer_type=$TOKENIZER_TYPE \
   --tokenizer_path=$TOKENIZER_PATH \
   --token_embedding_path=$TOKEN_EMBEDDING_PATH \
-  --prompt_model_paths=$PROMPT_MODEL_PATHS \
-  --gen_model_paths=$GEN_MODEL_PATHS \
+  --model_package_paths=$WEIGHT_SHARED_MODEL_PACKAGE_PATHS \
   --prompt_file=$PROMPT_FILE

examples/mediatek/executor_runner/run_qwen2_sample.sh

Lines changed: 6 additions & 16 deletions
@@ -36,20 +36,11 @@ TOKENIZER_PATH="/data/local/tmp/et_mtk/tokenizer_qwen3.json"
 TOKEN_EMBEDDING_PATH="/data/local/tmp/et_mtk/embedding_Qwen2-7B-Instruct_fp32.bin"

 # Comma-Separated Paths
-PROMPT_MODEL_PATHS="\
-/data/local/tmp/et_mtk/Qwen2-7B-Instruct_A16W4_4_chunks_128t512c/Qwen2-7B-Instruct_A16W4_4_chunks_128t512c_0.pte,\
-/data/local/tmp/et_mtk/Qwen2-7B-Instruct_A16W4_4_chunks_128t512c/Qwen2-7B-Instruct_A16W4_4_chunks_128t512c_1.pte,\
-/data/local/tmp/et_mtk/Qwen2-7B-Instruct_A16W4_4_chunks_128t512c/Qwen2-7B-Instruct_A16W4_4_chunks_128t512c_2.pte,\
-/data/local/tmp/et_mtk/Qwen2-7B-Instruct_A16W4_4_chunks_128t512c/Qwen2-7B-Instruct_A16W4_4_chunks_128t512c_3.pte,"
-
-# # Comma-Separated Paths
-GEN_MODEL_PATHS="\
-/data/local/tmp/et_mtk/Qwen2-7B-Instruct_A16W4_4_chunks_1t512c/Qwen2-7B-Instruct_A16W4_4_chunks_1t512c_0.pte,\
-/data/local/tmp/et_mtk/Qwen2-7B-Instruct_A16W4_4_chunks_1t512c/Qwen2-7B-Instruct_A16W4_4_chunks_1t512c_1.pte,\
-/data/local/tmp/et_mtk/Qwen2-7B-Instruct_A16W4_4_chunks_1t512c/Qwen2-7B-Instruct_A16W4_4_chunks_1t512c_2.pte,\
-/data/local/tmp/et_mtk/Qwen2-7B-Instruct_A16W4_4_chunks_1t512c/Qwen2-7B-Instruct_A16W4_4_chunks_1t512c_3.pte,"
-
-
+WEIGHT_SHARED_MODEL_PACKAGE_PATHS="\
+/data/local/tmp/et_mtk/Qwen2-7B-Instruct_A16W4_4_chunks/Qwen2-7B-Instruct_A16W4_4_chunks_0.pte,\
+/data/local/tmp/et_mtk/Qwen2-7B-Instruct_A16W4_4_chunks/Qwen2-7B-Instruct_A16W4_4_chunks_1.pte,\
+/data/local/tmp/et_mtk/Qwen2-7B-Instruct_A16W4_4_chunks/Qwen2-7B-Instruct_A16W4_4_chunks_2.pte,\
+/data/local/tmp/et_mtk/Qwen2-7B-Instruct_A16W4_4_chunks/Qwen2-7B-Instruct_A16W4_4_chunks_3.pte,"

 PROMPT_FILE=/data/local/tmp/et_mtk/prompt.txt

@@ -77,6 +68,5 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD
   --tokenizer_type=$TOKENIZER_TYPE \
   --tokenizer_path=$TOKENIZER_PATH \
   --token_embedding_path=$TOKEN_EMBEDDING_PATH \
-  --prompt_model_paths=$PROMPT_MODEL_PATHS \
-  --gen_model_paths=$GEN_MODEL_PATHS \
+  --model_package_paths=$WEIGHT_SHARED_MODEL_PACKAGE_PATHS \
   --prompt_file=$PROMPT_FILE

examples/mediatek/executor_runner/run_qwen3_sample.sh

Lines changed: 7 additions & 17 deletions
@@ -37,22 +37,13 @@ TOKENIZER_PATH="/data/local/tmp/et_mtk/tokenizer_qwen3.json"
 TOKEN_EMBEDDING_PATH="/data/local/tmp/et_mtk/embedding_Qwen3-4B_fp32.bin"

 # Comma-Separated Paths
-PROMPT_MODEL_PATHS="\
-/data/local/tmp/et_mtk/Qwen3-4B_A16W4_4_chunks_128t512c/Qwen3-4B_A16W4_4_chunks_128t512c_0.pte,\
-/data/local/tmp/et_mtk/Qwen3-4B_A16W4_4_chunks_128t512c/Qwen3-4B_A16W4_4_chunks_128t512c_1.pte,\
-/data/local/tmp/et_mtk/Qwen3-4B_A16W4_4_chunks_128t512c/Qwen3-4B_A16W4_4_chunks_128t512c_2.pte,\
-/data/local/tmp/et_mtk/Qwen3-4B_A16W4_4_chunks_128t512c/Qwen3-4B_A16W4_4_chunks_128t512c_3.pte,"
+WEIGHT_SHARED_MODEL_PACKAGE_PATHS="\
+/data/local/tmp/et_mtk/Qwen3-4B_A16W4_4_chunks/Qwen3-4B_A16W4_4_chunks_0.pte,\
+/data/local/tmp/et_mtk/Qwen3-4B_A16W4_4_chunks/Qwen3-4B_A16W4_4_chunks_1.pte,\
+/data/local/tmp/et_mtk/Qwen3-4B_A16W4_4_chunks/Qwen3-4B_A16W4_4_chunks_2.pte,\
+/data/local/tmp/et_mtk/Qwen3-4B_A16W4_4_chunks/Qwen3-4B_A16W4_4_chunks_3.pte,"

-# # Comma-Separated Paths
-GEN_MODEL_PATHS="\
-/data/local/tmp/et_mtk/Qwen3-4B_A16W4_4_chunks_1t512c/Qwen3-4B_A16W4_4_chunks_1t512c_0.pte,\
-/data/local/tmp/et_mtk/Qwen3-4B_A16W4_4_chunks_1t512c/Qwen3-4B_A16W4_4_chunks_1t512c_1.pte,\
-/data/local/tmp/et_mtk/Qwen3-4B_A16W4_4_chunks_1t512c/Qwen3-4B_A16W4_4_chunks_1t512c_2.pte,\
-/data/local/tmp/et_mtk/Qwen3-4B_A16W4_4_chunks_1t512c/Qwen3-4B_A16W4_4_chunks_1t512c_3.pte,"
-
-
-
-PROMPT_FILE=/data/local/tmp/et_mtk/prompt.txt
+PROMPT_FILE=/data/local/tmp/et_mtk/prompt_qwen3.txt

 chmod +x mtk_llama_executor_runner

@@ -79,6 +70,5 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD
   --tokenizer_type=$TOKENIZER_TYPE \
   --tokenizer_path=$TOKENIZER_PATH \
   --token_embedding_path=$TOKEN_EMBEDDING_PATH \
-  --prompt_model_paths=$PROMPT_MODEL_PATHS \
-  --gen_model_paths=$GEN_MODEL_PATHS \
+  --model_package_paths=$WEIGHT_SHARED_MODEL_PACKAGE_PATHS \
   --prompt_file=$PROMPT_FILE

examples/mediatek/model_export_scripts/gemma.py

Lines changed: 1 addition & 1 deletion
@@ -315,7 +315,7 @@ def prepare_model_inputs(
     if window_size is not None:
         local_mask = generate_mask(
             max_cache_size,
-            0,
+            seq_length,
             input_length,
             input_length,
             sliding_window=True,
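
This Python fix passes the current sequence start (seq_length) rather than a hard-coded 0 as the mask's start position. generate_mask's full signature is not shown in this diff, so the C++ sketch below is only a hypothetical illustration of why the start offset matters when filling a sliding-window mask; BuildSlidingWindowMask and its parameters are invented names for this example:

#include <cstddef>
#include <vector>

// Hypothetical sliding-window mask fill: row i describes query position
// (startPos + i). A cache position j is visible only if it is causal
// (j <= query) and within the last `windowSize` positions. With a start
// position of 0, every row would be built as if decoding restarted at the
// beginning of the cache instead of at the current sequence length.
std::vector<std::vector<bool>> BuildSlidingWindowMask(
    std::size_t cacheSize,
    std::size_t startPos,
    std::size_t numRows,
    std::size_t windowSize) {
  std::vector<std::vector<bool>> mask(
      numRows, std::vector<bool>(cacheSize, false));
  for (std::size_t i = 0; i < numRows; ++i) {
    const std::size_t queryPos = startPos + i;
    for (std::size_t j = 0; j < cacheSize; ++j) {
      mask[i][j] = (j <= queryPos) && ((queryPos - j) < windowSize);
    }
  }
  return mask;
}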
