mirror of https://github.com/Mai-with-u/MaiBot.git
feat: unify slow-model warnings across tasks, and set a per-task slow-request threshold in model_config.toml
parent dc84366bb5
commit 6470d27270
@@ -839,8 +839,6 @@ class DefaultReplyer:
                 continue
 
             timing_logs.append(f"{chinese_name}: {duration:.1f}s")
-            if duration > 12:
-                logger.warning(f"回复生成前信息获取耗时过长: {chinese_name} 耗时: {duration:.1f}s,请使用更快的模型")
         logger.info(f"回复准备: {'; '.join(timing_logs)}; {almost_zero_str} <0.1s")
 
         expression_habits_block, selected_expressions = results_dict["expression_habits"]
@@ -760,8 +760,6 @@ class PrivateReplyer:
                 continue
 
             timing_logs.append(f"{chinese_name}: {duration:.1f}s")
-            if duration > 12:
-                logger.warning(f"回复生成前信息获取耗时过长: {chinese_name} 耗时: {duration:.1f}s,请使用更快的模型")
         logger.info(f"回复准备: {'; '.join(timing_logs)}; {almost_zero_str} <0.1s")
 
         expression_habits_block, selected_expressions = results_dict["expression_habits"]
@@ -88,6 +88,9 @@ class TaskConfig(ConfigBase):
     temperature: float = 0.3
     """模型温度"""
 
+    slow_threshold: float = 15.0
+    """慢请求阈值(秒),超过此值会输出警告日志"""
+
 
 @dataclass
 class ModelTaskConfig(ConfigBase):
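For context, a minimal sketch of how a per-task field like slow_threshold can be declared with a default and then overridden from TOML. The TaskConfigSketch dataclass and load_task helper below are illustrative stand-ins, not the project's actual ConfigBase machinery.

# Illustrative only: hypothetical standalone dataclass plus a plain tomllib loader.
import tomllib
from dataclasses import dataclass

@dataclass
class TaskConfigSketch:
    temperature: float = 0.3
    slow_threshold: float = 15.0  # seconds; warn when a request takes longer

def load_task(path: str, task: str) -> TaskConfigSketch:
    with open(path, "rb") as f:
        data = tomllib.load(f)
    section = data.get("model_task_config", {}).get(task, {})
    # Values present in the TOML section override the dataclass defaults.
    return TaskConfigSketch(
        temperature=section.get("temperature", 0.3),
        slow_threshold=section.get("slow_threshold", 15.0),
    )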
@@ -47,6 +47,21 @@ class LLMRequest:
     }
     """模型使用量记录,用于进行负载均衡,对应为(total_tokens, penalty, usage_penalty),惩罚值是为了能在某个模型请求不给力或正在被使用的时候进行调整"""
 
+    def _check_slow_request(self, time_cost: float, model_name: str) -> None:
+        """检查请求是否过慢并输出警告日志
+
+        Args:
+            time_cost: 请求耗时(秒)
+            model_name: 使用的模型名称
+        """
+        threshold = self.model_for_task.slow_threshold
+        if time_cost > threshold:
+            request_type_display = self.request_type or "未知任务"
+            logger.warning(
+                f"LLM请求耗时过长: {request_type_display} 使用模型 {model_name} 耗时 {time_cost:.1f}s(阈值: {threshold}s),请考虑使用更快的模型\n"
+                f"  如果你认为该警告出现得过于频繁,请调整model_config.toml中对应任务的slow_threshold至符合你实际情况的合理值"
+            )
+
     async def generate_response_for_image(
         self,
         prompt: str,
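The hunks below wire this check into both response paths: time the awaited request once, then reuse that time_cost for the slow-request check and the usage record. A self-contained sketch of the same pattern, assuming a hypothetical fetch coroutine rather than the project's real client API:

# Standalone sketch; only the threshold logic mirrors the diff, fetch is hypothetical.
import logging
import time

logger = logging.getLogger("llm_request")

async def timed_request(fetch, slow_threshold: float, task_name: str):
    start_time = time.time()
    response = await fetch()
    time_cost = time.time() - start_time           # measured once...
    if time_cost > slow_threshold:                  # ...checked against the task threshold
        logger.warning(f"{task_name} took {time_cost:.1f}s (threshold {slow_threshold}s)")
    return response, time_cost                      # ...and reused by the caller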
@@ -86,6 +101,8 @@ class LLMRequest:
         if not reasoning_content and content:
             content, extracted_reasoning = self._extract_reasoning(content)
             reasoning_content = extracted_reasoning
+        time_cost = time.time() - start_time
+        self._check_slow_request(time_cost, model_info.name)
         if usage := response.usage:
             llm_usage_recorder.record_usage_to_database(
                 model_info=model_info,
@@ -93,7 +110,7 @@ class LLMRequest:
                 user_id="system",
                 request_type=self.request_type,
                 endpoint="/chat/completions",
-                time_cost=time.time() - start_time,
+                time_cost=time_cost,
             )
         return content, (reasoning_content, model_info.name, tool_calls)
 
@@ -198,7 +215,8 @@ class LLMRequest:
             tool_options=tool_built,
         )
 
-        logger.debug(f"LLM请求总耗时: {time.time() - start_time}")
+        time_cost = time.time() - start_time
+        logger.debug(f"LLM请求总耗时: {time_cost}")
         logger.debug(f"LLM生成内容: {response}")
 
         content = response.content
@@ -207,6 +225,7 @@ class LLMRequest:
         if not reasoning_content and content:
             content, extracted_reasoning = self._extract_reasoning(content)
             reasoning_content = extracted_reasoning
+        self._check_slow_request(time_cost, model_info.name)
         if usage := response.usage:
             llm_usage_recorder.record_usage_to_database(
                 model_info=model_info,
@@ -214,7 +233,7 @@ class LLMRequest:
                 user_id="system",
                 request_type=self.request_type,
                 endpoint="/chat/completions",
-                time_cost=time.time() - start_time,
+                time_cost=time_cost,
             )
         return content or "", (reasoning_content, model_info.name, tool_calls)
 
@@ -1,5 +1,5 @@
 [inner]
-version = "1.8.1"
+version = "1.8.2"
 
 # 配置文件版本号迭代规则同bot_config.toml
 
@@ -135,37 +135,45 @@ price_out = 0
 model_list = ["siliconflow-deepseek-v3.2"] # 使用的模型列表,每个子项对应上面的模型名称(name)
 temperature = 0.2 # 模型温度,新V3建议0.1-0.3
 max_tokens = 2048 # 最大输出token数
+slow_threshold = 15.0 # 慢请求阈值(秒),模型等待回复时间超过此值会输出警告日志
 
 [model_task_config.utils_small] # 在麦麦的一些组件中使用的小模型,消耗量较大,建议使用速度较快的小模型
 model_list = ["qwen3-30b","qwen3-next-80b"]
 temperature = 0.7
 max_tokens = 2048
+slow_threshold = 10.0
 
 [model_task_config.tool_use] #工具调用模型,需要使用支持工具调用的模型
 model_list = ["qwen3-30b","qwen3-next-80b"]
 temperature = 0.7
 max_tokens = 800
+slow_threshold = 10.0
 
 [model_task_config.replyer] # 首要回复模型,还用于表达器和表达方式学习
 model_list = ["siliconflow-deepseek-v3.2","siliconflow-deepseek-v3.2-think","siliconflow-glm-4.6","siliconflow-glm-4.6-think"]
 temperature = 0.3 # 模型温度,新V3建议0.1-0.3
 max_tokens = 2048
+slow_threshold = 25.0
 
 [model_task_config.planner] #决策:负责决定麦麦该什么时候回复的模型
 model_list = ["siliconflow-deepseek-v3.2"]
 temperature = 0.3
 max_tokens = 800
+slow_threshold = 12.0
 
 [model_task_config.vlm] # 图像识别模型
 model_list = ["qwen3-vl-30"]
 max_tokens = 256
+slow_threshold = 15.0
 
 [model_task_config.voice] # 语音识别模型
 model_list = ["sensevoice-small"]
+slow_threshold = 12.0
 
 # 嵌入模型
 [model_task_config.embedding]
 model_list = ["bge-m3"]
+slow_threshold = 5.0
 
 # ------------LPMM知识库模型------------
 
@@ -173,13 +181,16 @@ model_list = ["bge-m3"]
 model_list = ["siliconflow-deepseek-v3.2"]
 temperature = 0.2
 max_tokens = 800
+slow_threshold = 20.0
 
 [model_task_config.lpmm_rdf_build] # RDF构建模型
 model_list = ["siliconflow-deepseek-v3.2"]
 temperature = 0.2
 max_tokens = 800
+slow_threshold = 20.0
 
 [model_task_config.lpmm_qa] # 问答模型
 model_list = ["siliconflow-deepseek-v3.2"]
 temperature = 0.7
 max_tokens = 800
+slow_threshold = 20.0
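If the new warning fires too often, the log message suggests raising slow_threshold for the affected task in model_config.toml. A small hedged helper for checking the effective thresholds at a glance; it assumes the [model_task_config.*] layout shown above and the 15.0 default from TaskConfig.

# Hypothetical helper: print each task's slow_threshold from model_config.toml.
import tomllib

with open("model_config.toml", "rb") as f:
    config = tomllib.load(f)

for task, section in config.get("model_task_config", {}).items():
    # Tasks without an explicit value fall back to the TaskConfig default (15.0).
    print(f"{task}: slow_threshold = {section.get('slow_threshold', 15.0)}s")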