feat(llm): support auto llm review and fix test_api_connection #222
Open
MrJs133 wants to merge 8 commits into apache:main from MrJs133:auto_test_llms (base: main)
Changes from all commits
Commits (8):
- 310f7ff  support auto llm review and fix test_api_connection
- ab7d987  pylint
- 4258523  Merge branch 'main' into auto_test_llms
- imbajin 4020f69  Merge branch 'main' into auto_test_llms
- imbajin 93620ec  output time
- 87c20be  pylint
- addc2d8  Merge branch 'main' into auto_test_llms
- imbajin 34fc33f  try except
hugegraph-llm/src/hugegraph_llm/resources/demo/llm_review.yaml (11 additions, 0 deletions)
@@ -0,0 +1,11 @@
- type: openai
  model_name: ernie-4.5-8k-preview
  api_key:
  api_base:
  max_tokens: 2048

- type: openai
  model_name: gpt-4.1-mini
  api_key:
  api_base:
  max_tokens: 4096
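The demo file above illustrates the expected shape of the YAML config list: one entry per model, each carrying the fields its `type` requires. A minimal sketch of loading it with the `parse_llm_configurations_from_yaml` helper added later in this PR; the path is the demo file's repo-relative location, and the blank `api_key`/`api_base` values must be filled in before a real run:

```python
from hugegraph_llm.utils.other_tool_utils import parse_llm_configurations_from_yaml

# Sketch only: the demo file ships with empty api_key/api_base fields, so fill
# them in (or point this at your own YAML) before running a real test.
configs = parse_llm_configurations_from_yaml(
    "hugegraph-llm/src/hugegraph_llm/resources/demo/llm_review.yaml"
)
print(configs)  # -> [{"type": "openai", "model_name": "ernie-4.5-8k-preview", ...}, ...]
```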
hugegraph-llm/src/hugegraph_llm/utils/other_tool_utils.py (236 additions, 0 deletions)
@@ -0,0 +1,236 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import time
import json
import re

import gradio as gr
import yaml

from hugegraph_llm.config import PromptConfig
from hugegraph_llm.utils.log import log
from hugegraph_llm.models.llms.ollama import OllamaClient
from hugegraph_llm.models.llms.openai import OpenAIClient
from hugegraph_llm.models.llms.qianfan import QianfanClient
from hugegraph_llm.models.llms.litellm import LiteLLMClient


def judge(answers, standard_answer, review_model_name, review_max_tokens, key, base):
    """Ask the review model to grade each candidate answer against the standard answer."""
    try:
        review_client = OpenAIClient(
            api_key=key,
            api_base=base,
            model_name=review_model_name,
            max_tokens=int(review_max_tokens)
        )
        review_prompt = PromptConfig.review_prompt.format(standard_answer=standard_answer)
        for model_name, answer in answers.items():
            review_prompt += f"### {model_name}:\n{answer.strip()}\n\n"
        log.debug("Review_prompt: %s", review_prompt)
        response = review_client.generate(prompt=review_prompt)
        log.debug("orig_review_response: %s", response)
        # The review model may wrap its JSON verdict in a ```json fence; unwrap it before parsing.
        match = re.search(r'```json\n(.*?)\n```', response, re.DOTALL)
        if match:
            response = match.group(1).strip()
        reviews = json.loads(response)
        return reviews
    except Exception as e:  # pylint: disable=W0718
        log.error("Review failed: %s", str(e))
        reviews = {"error": f"Review error: {str(e)}"}
        return reviews
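The exact schema of the review verdict is defined by `PromptConfig.review_prompt`, which is not part of this diff; the code only assumes the model returns JSON that is either a dict keyed by model name or a list of entries that each include a `model` field (see `auto_test_llms` below). A purely illustrative shape, with every field name other than `model` assumed:

```python
# Hypothetical example of a parsed review; only the "model" key is relied on by
# the calling code, every other field name here is an assumption.
reviews = [
    {"model": "gpt-4.1-mini", "score": 9, "comment": "Matches the standard answer."},
    {"model": "ernie-4.5-8k-preview", "score": 7, "comment": "Partially correct."},
]
```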

def parse_llm_configurations(config_text: str):
    """Parse one comma-separated LLM config per line into a list of config dicts."""
    configs = []
    lines = config_text.strip().split("\n")
    for i, line in enumerate(lines, 1):
        fields = [x.strip() for x in line.split(",")]
        # Skip blank lines instead of treating them as an unsupported type.
        if not fields or not fields[0]:
            continue
        llm_type = fields[0]
        try:
            if llm_type == "openai":
                # openai, model_name, api_key, api_base, max_tokens
                model_name, api_key, api_base, max_tokens = fields[1:5]
                configs.append({
                    "type": "openai",
                    "model_name": model_name,
                    "api_key": api_key,
                    "api_base": api_base,
                    "max_tokens": int(max_tokens),
                })
            elif llm_type == "qianfan_wenxin":
                # qianfan_wenxin, model_name, api_key, secret_key
                model_name, api_key, secret_key = fields[1:4]
                configs.append({
                    "type": "qianfan_wenxin",
                    "model_name": model_name,
                    "api_key": api_key,
                    "secret_key": secret_key,
                })
            elif llm_type == "ollama/local":
                # ollama/local, model_name, host, port, max_tokens
                model_name, host, port, max_tokens = fields[1:5]
                configs.append({
                    "type": "ollama/local",
                    "model_name": model_name,
                    "host": host,
                    "port": int(port),
                    "max_tokens": int(max_tokens),
                })
            elif llm_type == "litellm":
                # litellm, model_name, api_key, api_base, max_tokens
                model_name, api_key, api_base, max_tokens = fields[1:5]
                configs.append({
                    "type": "litellm",
                    "model_name": model_name,
                    "api_key": api_key,
                    "api_base": api_base,
                    "max_tokens": int(max_tokens),
                })
            else:
                raise ValueError(f"Unsupported llm type '{llm_type}' in line {i}")
        except Exception as e:
            raise ValueError(f"Error parsing line {i}: {line}\nDetails: {e}") from e
    return configs
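For reference, the plain-text format accepted by `parse_llm_configurations` is one comma-separated config per line, with the per-type field order given in the comments above. A small sketch with placeholder keys and endpoints:

```python
# Placeholder credentials and endpoints; 11434 is Ollama's default port.
config_text = (
    "openai, gpt-4.1-mini, sk-placeholder, https://api.example.com/v1, 4096\n"
    "ollama/local, llama3, 127.0.0.1, 11434, 2048"
)
configs = parse_llm_configurations(config_text)
# -> [{"type": "openai", ...}, {"type": "ollama/local", ...}]
```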

def parse_llm_configurations_from_yaml(yaml_file_path: str):
    """Load a list of LLM config dicts from a YAML file (see resources/demo/llm_review.yaml)."""
    configs = []
    with open(yaml_file_path, "r", encoding="utf-8") as f:
        raw_configs = yaml.safe_load(f)
    if not isinstance(raw_configs, list):
        raise ValueError("The YAML file must contain a list of LLM configurations.")
    for i, config in enumerate(raw_configs, 1):
        try:
            llm_type = config.get("type")
            if llm_type == "openai":
                configs.append({
                    "type": "openai",
                    "model_name": config["model_name"],
                    "api_key": config["api_key"],
                    "api_base": config["api_base"],
                    "max_tokens": int(config["max_tokens"]),
                })
            elif llm_type == "qianfan_wenxin":
                configs.append({
                    "type": "qianfan_wenxin",
                    "model_name": config["model_name"],
                    "api_key": config["api_key"],
                    "secret_key": config["secret_key"],
                })
            elif llm_type == "ollama/local":
                configs.append({
                    "type": "ollama/local",
                    "model_name": config["model_name"],
                    "host": config["host"],
                    "port": int(config["port"]),
                    "max_tokens": int(config["max_tokens"]),
                })
            elif llm_type == "litellm":
                configs.append({
                    "type": "litellm",
                    "model_name": config["model_name"],
                    "api_key": config["api_key"],
                    "api_base": config["api_base"],
                    "max_tokens": int(config["max_tokens"]),
                })
            else:
                raise ValueError(f"Unsupported llm type '{llm_type}' in config entry {i}")
        except Exception as e:
            raise ValueError(f"Failed to parse config entry {i}: {e}") from e

    return configs

def auto_test_llms(
    llm_configs,
    llm_configs_file,
    prompt,
    standard_answer,
    review_model_name,
    review_max_tokens,
    key,
    base,
    fmt=True
):
    """Run the same prompt against every configured LLM, then have a review model grade the answers."""
    configs = None
    if llm_configs_file and llm_configs:
        raise gr.Error("Please provide either a config file or config text, not both.")
    if llm_configs:
        configs = parse_llm_configurations(llm_configs)
    elif llm_configs_file:
        configs = parse_llm_configurations_from_yaml(llm_configs_file)
    # Fail early with a clear message if neither input was provided.
    if not configs:
        raise gr.Error("Please provide LLM configurations as text or as a YAML file.")
    log.debug("LLM_configs: %s", configs)
    answers = {}
    for config in configs:
        output = None
        time_start = time.perf_counter()
        try:
            if config["type"] == "openai":
                client = OpenAIClient(
                    api_key=config["api_key"],
                    api_base=config["api_base"],
                    model_name=config["model_name"],
                    max_tokens=config["max_tokens"],
                )
                output = client.generate(prompt=prompt)
            elif config["type"] == "qianfan_wenxin":
                client = QianfanClient(
                    model_name=config["model_name"],
                    api_key=config["api_key"],
                    secret_key=config["secret_key"]
                )
                output = client.generate(prompt=prompt)
            elif config["type"] == "ollama/local":
                client = OllamaClient(
                    model_name=config["model_name"],
                    host=config["host"],
                    port=config["port"],
                )
                output = client.generate(prompt=prompt)
            elif config["type"] == "litellm":
                client = LiteLLMClient(
                    api_key=config["api_key"],
                    api_base=config["api_base"],
                    model_name=config["model_name"],
                    max_tokens=config["max_tokens"],
                )
                output = client.generate(prompt=prompt)
        except Exception as e:  # pylint: disable=broad-except
            log.error("Generate failed for %s: %s", config["model_name"], e)
            output = f"[ERROR] {e}"
        time_end = time.perf_counter()
        latency = time_end - time_start
        answers[config["model_name"]] = {
            "answer": output,
            "latency": f"{round(latency, 2)}s"
        }
    reviews = judge(
        {k: v["answer"] for k, v in answers.items()},
        standard_answer,
        review_model_name,
        review_max_tokens,
        key,
        base
    )
    log.debug("reviews: %s", reviews)
    result = {}
    # The review model may return either a list of per-model entries or a dict keyed by model name.
    reviews_dict = {item["model"]: item for item in reviews} if isinstance(reviews, list) else reviews
    for model_name, infos in answers.items():
        result[model_name] = {
            "answer": infos["answer"],
            "latency": infos["latency"],
            "review": reviews_dict.get(model_name, {})
        }
    return json.dumps(result, indent=4, ensure_ascii=False) if fmt else reviews
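A minimal usage sketch of `auto_test_llms`, assuming it is wired to a Gradio handler (the function raises `gr.Error` for input problems); the prompt, standard answer, model names, keys, and endpoints below are placeholders:

```python
# Hypothetical invocation; every credential and endpoint here is a placeholder.
result_json = auto_test_llms(
    llm_configs="openai, gpt-4.1-mini, sk-placeholder, https://api.example.com/v1, 4096",
    llm_configs_file=None,
    prompt="What is HugeGraph?",
    standard_answer="HugeGraph is an open-source graph database.",
    review_model_name="gpt-4.1-mini",
    review_max_tokens=4096,
    key="sk-placeholder",
    base="https://api.example.com/v1",
)
print(result_json)  # per-model answer, latency, and review, formatted as JSON
```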