diff --git a/src/chat/replyer/group_generator.py b/src/chat/replyer/group_generator.py
index 2ee403cc..3dd52272 100644
--- a/src/chat/replyer/group_generator.py
+++ b/src/chat/replyer/group_generator.py
@@ -839,8 +839,6 @@ class DefaultReplyer:
                 continue
             timing_logs.append(f"{chinese_name}: {duration:.1f}s")
-            if duration > 12:
-                logger.warning(f"回复生成前信息获取耗时过长: {chinese_name} 耗时: {duration:.1f}s,请使用更快的模型")
 
         logger.info(f"回复准备: {'; '.join(timing_logs)}; {almost_zero_str} <0.1s")
 
         expression_habits_block, selected_expressions = results_dict["expression_habits"]
diff --git a/src/chat/replyer/private_generator.py b/src/chat/replyer/private_generator.py
index 93543cf5..396e806f 100644
--- a/src/chat/replyer/private_generator.py
+++ b/src/chat/replyer/private_generator.py
@@ -760,8 +760,6 @@ class PrivateReplyer:
                 continue
             timing_logs.append(f"{chinese_name}: {duration:.1f}s")
-            if duration > 12:
-                logger.warning(f"回复生成前信息获取耗时过长: {chinese_name} 耗时: {duration:.1f}s,请使用更快的模型")
 
         logger.info(f"回复准备: {'; '.join(timing_logs)}; {almost_zero_str} <0.1s")
 
         expression_habits_block, selected_expressions = results_dict["expression_habits"]
diff --git a/src/config/api_ada_configs.py b/src/config/api_ada_configs.py
index 3fc9c878..897e1f87 100644
--- a/src/config/api_ada_configs.py
+++ b/src/config/api_ada_configs.py
@@ -88,6 +88,9 @@ class TaskConfig(ConfigBase):
     temperature: float = 0.3
     """模型温度"""
 
+    slow_threshold: float = 15.0
+    """慢请求阈值(秒),超过此值会输出警告日志"""
+
 
 @dataclass
 class ModelTaskConfig(ConfigBase):
diff --git a/src/llm_models/utils_model.py b/src/llm_models/utils_model.py
index 060ec922..44ff2de3 100644
--- a/src/llm_models/utils_model.py
+++ b/src/llm_models/utils_model.py
@@ -47,6 +47,21 @@ class LLMRequest:
         }
         """模型使用量记录,用于进行负载均衡,对应为(total_tokens, penalty, usage_penalty),惩罚值是为了能在某个模型请求不给力或正在被使用的时候进行调整"""
 
+    def _check_slow_request(self, time_cost: float, model_name: str) -> None:
+        """检查请求是否过慢并输出警告日志
+
+        Args:
+            time_cost: 请求耗时(秒)
+            model_name: 使用的模型名称
+        """
+        threshold = self.model_for_task.slow_threshold
+        if time_cost > threshold:
+            request_type_display = self.request_type or "未知任务"
+            logger.warning(
+                f"LLM请求耗时过长: {request_type_display} 使用模型 {model_name} 耗时 {time_cost:.1f}s(阈值: {threshold}s),请考虑使用更快的模型\n"
+                f"  如果你认为该警告出现得过于频繁,请调整model_config.toml中对应任务的slow_threshold至符合你实际情况的合理值"
+            )
+
     async def generate_response_for_image(
         self,
         prompt: str,
@@ -86,6 +101,8 @@ class LLMRequest:
             if not reasoning_content and content:
                 content, extracted_reasoning = self._extract_reasoning(content)
                 reasoning_content = extracted_reasoning
+            time_cost = time.time() - start_time
+            self._check_slow_request(time_cost, model_info.name)
             if usage := response.usage:
                 llm_usage_recorder.record_usage_to_database(
                     model_info=model_info,
@@ -93,7 +110,7 @@ class LLMRequest:
                     user_id="system",
                     request_type=self.request_type,
                     endpoint="/chat/completions",
-                    time_cost=time.time() - start_time,
+                    time_cost=time_cost,
                 )
 
         return content, (reasoning_content, model_info.name, tool_calls)
@@ -198,7 +215,8 @@ class LLMRequest:
                 tool_options=tool_built,
             )
 
-            logger.debug(f"LLM请求总耗时: {time.time() - start_time}")
+            time_cost = time.time() - start_time
+            logger.debug(f"LLM请求总耗时: {time_cost}")
             logger.debug(f"LLM生成内容: {response}")
 
             content = response.content
@@ -207,6 +225,7 @@ class LLMRequest:
             if not reasoning_content and content:
                 content, extracted_reasoning = self._extract_reasoning(content)
                 reasoning_content = extracted_reasoning
+            self._check_slow_request(time_cost, model_info.name)
             if usage := response.usage:
                 llm_usage_recorder.record_usage_to_database(
                     model_info=model_info,
@@ -214,7 +233,7 @@ class LLMRequest:
                     user_id="system",
                     request_type=self.request_type,
                     endpoint="/chat/completions",
-                    time_cost=time.time() - start_time,
+                    time_cost=time_cost,
                 )
 
         return content or "", (reasoning_content, model_info.name, tool_calls)
diff --git a/template/model_config_template.toml b/template/model_config_template.toml
index 38f57a79..1e072d13 100644
--- a/template/model_config_template.toml
+++ b/template/model_config_template.toml
@@ -1,5 +1,5 @@
 [inner]
-version = "1.8.1"
+version = "1.8.2"
 
 # 配置文件版本号迭代规则同bot_config.toml
 
@@ -135,37 +135,45 @@ price_out = 0
 model_list = ["siliconflow-deepseek-v3.2"] # 使用的模型列表,每个子项对应上面的模型名称(name)
 temperature = 0.2 # 模型温度,新V3建议0.1-0.3
 max_tokens = 2048 # 最大输出token数
+slow_threshold = 15.0 # 慢请求阈值(秒),模型等待回复时间超过此值会输出警告日志
 
 [model_task_config.utils_small] # 在麦麦的一些组件中使用的小模型,消耗量较大,建议使用速度较快的小模型
 model_list = ["qwen3-30b","qwen3-next-80b"]
 temperature = 0.7
 max_tokens = 2048
+slow_threshold = 10.0
 
 [model_task_config.tool_use] #工具调用模型,需要使用支持工具调用的模型
 model_list = ["qwen3-30b","qwen3-next-80b"]
 temperature = 0.7
 max_tokens = 800
+slow_threshold = 10.0
 
 [model_task_config.replyer] # 首要回复模型,还用于表达器和表达方式学习
 model_list = ["siliconflow-deepseek-v3.2","siliconflow-deepseek-v3.2-think","siliconflow-glm-4.6","siliconflow-glm-4.6-think"]
 temperature = 0.3 # 模型温度,新V3建议0.1-0.3
 max_tokens = 2048
+slow_threshold = 25.0
 
 [model_task_config.planner] #决策:负责决定麦麦该什么时候回复的模型
 model_list = ["siliconflow-deepseek-v3.2"]
 temperature = 0.3
 max_tokens = 800
+slow_threshold = 12.0
 
 [model_task_config.vlm] # 图像识别模型
 model_list = ["qwen3-vl-30"]
 max_tokens = 256
+slow_threshold = 15.0
 
 [model_task_config.voice] # 语音识别模型
 model_list = ["sensevoice-small"]
+slow_threshold = 12.0
 
 # 嵌入模型
 [model_task_config.embedding]
 model_list = ["bge-m3"]
+slow_threshold = 5.0
 
 
 # ------------LPMM知识库模型------------
@@ -173,13 +181,16 @@ model_list = ["bge-m3"]
 model_list = ["siliconflow-deepseek-v3.2"]
 temperature = 0.2
 max_tokens = 800
+slow_threshold = 20.0
 
 [model_task_config.lpmm_rdf_build] # RDF构建模型
 model_list = ["siliconflow-deepseek-v3.2"]
 temperature = 0.2
 max_tokens = 800
+slow_threshold = 20.0
 
 [model_task_config.lpmm_qa] # 问答模型
 model_list = ["siliconflow-deepseek-v3.2"]
 temperature = 0.7
 max_tokens = 800
+slow_threshold = 20.0
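
Reviewer note: the standalone Python sketch below illustrates the behaviour this change introduces — a per-task slow_threshold (the new TaskConfig field in api_ada_configs.py) compared against the measured request latency, with a warning logged only when the threshold is exceeded. The TaskConfig dataclass, check_slow_request function, logger name, and the timed sleep here are simplified stand-ins for the real LLMRequest wiring, not the project's actual API; only the threshold comparison mirrors the diff.

import logging
import time
from dataclasses import dataclass

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("slow_request_demo")  # stand-in for the project's logger


@dataclass
class TaskConfig:
    # Mirrors the field added to src/config/api_ada_configs.py:
    # per-task slow-request threshold in seconds.
    slow_threshold: float = 15.0


def check_slow_request(task: TaskConfig, time_cost: float, model_name: str,
                       request_type: str = "demo_task") -> None:
    # Same shape as LLMRequest._check_slow_request: warn only when the
    # measured latency exceeds the task's configured threshold.
    if time_cost > task.slow_threshold:
        logger.warning(
            "LLM request took %.1fs for %s with model %s (threshold: %ss); "
            "consider a faster model or raise slow_threshold in model_config.toml",
            time_cost, request_type, model_name, task.slow_threshold,
        )


start = time.time()
time.sleep(0.2)  # placeholder for the real chat-completion call
# A deliberately low threshold (0.1s) forces the warning; with the template
# defaults (5-25s per task) a normal request would stay silent.
check_slow_request(TaskConfig(slow_threshold=0.1), time.time() - start, "example-model")

Note also that the diff captures the latency once into time_cost and reuses it for both the warning and record_usage_to_database, so the warned value and the recorded value can no longer drift apart.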