diff --git a/src/llm_models/model_client/openai_client.py b/src/llm_models/model_client/openai_client.py index 148ec8cb..dd92b9e8 100644 --- a/src/llm_models/model_client/openai_client.py +++ b/src/llm_models/model_client/openai_client.py @@ -199,6 +199,7 @@ def _build_stream_api_resp( _fc_delta_buffer: io.StringIO, _rc_delta_buffer: io.StringIO, _tool_calls_buffer: list[tuple[str, str, io.StringIO]], + finish_reason: str | None = None, ) -> APIResponse: resp = APIResponse() @@ -236,6 +237,16 @@ def _build_stream_api_resp( resp.tool_calls.append(ToolCall(call_id, function_name, arguments)) + # 检查 max_tokens 截断 + if finish_reason == "length": + if resp.content and resp.content.strip(): + logger.warning( + "⚠ OpenAI 响应因达到 max_tokens 限制被部分截断,\n" + " 可能会对回复内容造成影响,建议修改模型 max_tokens 配置!" + ) + else: + logger.warning("⚠ OpenAI 响应因达到 max_tokens 限制被截断,\n 请修改模型 max_tokens 配置!") + if not resp.content and not resp.tool_calls: raise EmptyResponseException() @@ -258,6 +269,7 @@ async def _default_stream_response_handler( _fc_delta_buffer = io.StringIO() # 正式内容缓冲区,用于存储接收到的正式内容 _tool_calls_buffer: list[tuple[str, str, io.StringIO]] = [] # 工具调用缓冲区,用于存储接收到的工具调用 _usage_record = None # 使用情况记录 + finish_reason: str | None = None # 记录最后的 finish_reason def _insure_buffer_closed(): # 确保缓冲区被关闭 @@ -285,6 +297,9 @@ async def _default_stream_response_handler( continue # 跳过本帧,避免访问 choices[0] delta = event.choices[0].delta # 获取当前块的delta内容 + if hasattr(event.choices[0], "finish_reason") and event.choices[0].finish_reason: + finish_reason = event.choices[0].finish_reason + if hasattr(delta, "reasoning_content") and delta.reasoning_content: # type: ignore # 标记:有独立的推理内容块 _has_rc_attr_flag = True @@ -311,6 +326,7 @@ async def _default_stream_response_handler( _fc_delta_buffer, _rc_delta_buffer, _tool_calls_buffer, + finish_reason=finish_reason, ), _usage_record except Exception: # 确保缓冲区被关闭 @@ -381,6 +397,23 @@ def _default_normal_response_parser( # 将原始响应存储在原始数据中 api_response.raw_data = resp + # 检查 max_tokens 截断 + try: + choice0 = resp.choices[0] + reason = getattr(choice0, "finish_reason", None) + if reason and reason == "length": + has_real_output = bool(api_response.content and api_response.content.strip()) + if has_real_output: + logger.warning( + "⚠ OpenAI 响应因达到 max_tokens 限制被部分截断,\n" + " 可能会对回复内容造成影响,建议修改模型 max_tokens 配置!" + ) + else: + logger.warning("⚠ OpenAI 响应因达到 max_tokens 限制被截断,\n 请修改模型 max_tokens 配置!") + return api_response, _usage_record + except Exception as e: + logger.debug(f"检查 MAX_TOKENS 截断时异常: {e}") + if not api_response.content and not api_response.tool_calls: raise EmptyResponseException()