From ab8982c341bbdb38e360c3269b6b34167d412130 Mon Sep 17 00:00:00 2001
From: foxplaying <166147707+foxplaying@users.noreply.github.com>
Date: Wed, 24 Sep 2025 21:02:43 +0800
Subject: [PATCH 1/8] =?UTF-8?q?=E4=BC=98=E5=8C=96=E5=A4=84=E7=90=86?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/llm_models/model_client/gemini_client.py | 77 +++++++++++---------
 1 file changed, 44 insertions(+), 33 deletions(-)

diff --git a/src/llm_models/model_client/gemini_client.py b/src/llm_models/model_client/gemini_client.py
index c23ba90a..8f0b552d 100644
--- a/src/llm_models/model_client/gemini_client.py
+++ b/src/llm_models/model_client/gemini_client.py
@@ -362,18 +362,29 @@ class GeminiClient(BaseClient):
                 http_options_kwargs["api_version"] = parts[1]
             else:
                 http_options_kwargs["base_url"] = api_provider.base_url
+                http_options_kwargs["api_version"] = None
         self.client = genai.Client(
             http_options=HttpOptions(**http_options_kwargs),
             api_key=api_provider.api_key,
         )  # 这里和openai不一样,gemini会自己决定自己是否需要retry

     @staticmethod
-    def clamp_thinking_budget(tb: int, model_id: str) -> int:
+    def clamp_thinking_budget(extra_params: dict[str, Any] | None, model_id: str) -> int:
         """
         按模型限制思考预算范围,仅支持指定的模型(支持带数字后缀的新版本)
         """
         limits = None
+        # 参数传入处理
+        tb = THINKING_BUDGET_AUTO
+        if extra_params and "thinking_budget" in extra_params:
+            try:
+                tb = int(extra_params["thinking_budget"])
+            except (ValueError, TypeError):
+                logger.warning(
+                    f"无效的 thinking_budget 值 {extra_params['thinking_budget']},将使用模型自动预算模式 {tb}"
+                )
+
         # 优先尝试精确匹配
         if model_id in THINKING_BUDGET_LIMITS:
             limits = THINKING_BUDGET_LIMITS[model_id]
@@ -416,8 +427,8 @@ class GeminiClient(BaseClient):
         model_info: ModelInfo,
         message_list: list[Message],
         tool_options: list[ToolOption] | None = None,
-        max_tokens: int = 1024,
-        temperature: float = 0.4,
+        max_tokens: Optional[int] = 1024,
+        temperature: Optional[float] = 0.4,
         response_format: RespFormat | None = None,
         stream_response_handler: Optional[
             Callable[
@@ -456,19 +467,9 @@ class GeminiClient(BaseClient):
         messages = _convert_messages(message_list)
         # 将tool_options转换为Gemini API所需的格式
         tools = _convert_tool_options(tool_options) if tool_options else None
-
-        tb = THINKING_BUDGET_AUTO
-        # 空处理
-        if extra_params and "thinking_budget" in extra_params:
-            try:
-                tb = int(extra_params["thinking_budget"])
-            except (ValueError, TypeError):
-                logger.warning(
-                    f"无效的 thinking_budget 值 {extra_params['thinking_budget']},将使用模型自动预算模式 {tb}"
-                )
-        # 裁剪到模型支持的范围
-        tb = self.clamp_thinking_budget(tb, model_info.model_identifier)
-
+        # 解析并裁剪 thinking_budget
+        tb = self.clamp_thinking_budget(extra_params, model_info.model_identifier)
+
         # 将response_format转换为Gemini API所需的格式
         generation_config_dict = {
             "max_output_tokens": max_tokens,
@@ -590,41 +591,51 @@ class GeminiClient(BaseClient):

         return response

-    def get_audio_transcriptions(
-        self, model_info: ModelInfo, audio_base64: str, extra_params: dict[str, Any] | None = None
+    async def get_audio_transcriptions(
+        self,
+        model_info: ModelInfo,
+        audio_base64: str,
+        max_tokens: Optional[int] = 2048,
+        extra_params: dict[str, Any] | None = None,
     ) -> APIResponse:
         """
         获取音频转录
         :param model_info: 模型信息
         :param audio_base64: 音频文件的Base64编码字符串
+        :param max_tokens: 最大输出token数(默认2048)
         :param extra_params: 额外参数(可选)
         :return: 转录响应
         """
+        # 解析并裁剪 thinking_budget
+        tb = self.clamp_thinking_budget(extra_params, model_info.model_identifier)
+
+        # 构造 prompt + 音频输入
+        prompt = "Generate a transcript of the speech. The language of the transcript should **match the language of the speech**."
+        contents = [
+            Content(
+                role="user",
+                parts=[
+                    Part.from_text(text=prompt),
+                    Part.from_bytes(data=base64.b64decode(audio_base64), mime_type="audio/wav"),
+                ],
+            )
+        ]
+
         generation_config_dict = {
-            "max_output_tokens": 2048,
+            "max_output_tokens": max_tokens,
             "response_modalities": ["TEXT"],
             "thinking_config": ThinkingConfig(
                 include_thoughts=True,
-                thinking_budget=(
-                    extra_params["thinking_budget"] if extra_params and "thinking_budget" in extra_params else 1024
-                ),
+                thinking_budget=tb,
             ),
             "safety_settings": gemini_safe_settings,
         }
         generate_content_config = GenerateContentConfig(**generation_config_dict)
-        prompt = "Generate a transcript of the speech. The language of the transcript should **match the language of the speech**."
+
         try:
-            raw_response: GenerateContentResponse = self.client.models.generate_content(
+            raw_response: GenerateContentResponse = await self.client.aio.models.generate_content(
                 model=model_info.model_identifier,
-                contents=[
-                    Content(
-                        role="user",
-                        parts=[
-                            Part.from_text(text=prompt),
-                            Part.from_bytes(data=base64.b64decode(audio_base64), mime_type="audio/wav"),
-                        ],
-                    )
-                ],
+                contents=contents,
                 config=generate_content_config,
             )
             resp, usage_record = _default_normal_response_parser(raw_response)

From b4f6b6e7b7ce9d7181020cf5972d0cbd71d757f0 Mon Sep 17 00:00:00 2001
From: foxplaying <166147707+foxplaying@users.noreply.github.com>
Date: Wed, 24 Sep 2025 21:03:49 +0800
Subject: [PATCH 2/8] fix

---
 src/llm_models/model_client/base_client.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/llm_models/model_client/base_client.py b/src/llm_models/model_client/base_client.py
index eb74b0df..dcb710fe 100644
--- a/src/llm_models/model_client/base_client.py
+++ b/src/llm_models/model_client/base_client.py
@@ -72,8 +72,8 @@ class BaseClient(ABC):
         model_info: ModelInfo,
         message_list: list[Message],
         tool_options: list[ToolOption] | None = None,
-        max_tokens: int = 1024,
-        temperature: float = 0.7,
+        max_tokens: Optional[int] = None,
+        temperature: Optional[float] = None,
         response_format: RespFormat | None = None,
         stream_response_handler: Optional[
             Callable[[Any, asyncio.Event | None], tuple[APIResponse, tuple[int, int, int]]]
@@ -117,6 +117,7 @@ class BaseClient(ABC):
         self,
         model_info: ModelInfo,
         audio_base64: str,
+        max_tokens: Optional[int] = None,
         extra_params: dict[str, Any] | None = None,
     ) -> APIResponse:
         """

From e97af77fc818c4ba980cf343b87eebb21481b806 Mon Sep 17 00:00:00 2001
From: foxplaying <166147707+foxplaying@users.noreply.github.com>
Date: Wed, 24 Sep 2025 21:04:11 +0800
Subject: [PATCH 3/8] fix

---
 src/llm_models/model_client/openai_client.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llm_models/model_client/openai_client.py b/src/llm_models/model_client/openai_client.py
index 34134a15..8a93d4ce 100644
--- a/src/llm_models/model_client/openai_client.py
+++ b/src/llm_models/model_client/openai_client.py
@@ -403,8 +403,8 @@ class OpenaiClient(BaseClient):
         model_info: ModelInfo,
         message_list: list[Message],
         tool_options: list[ToolOption] | None = None,
-        max_tokens: int = 1024,
-        temperature: float = 0.7,
+        max_tokens: Optional[int] = 1024,
+        temperature: Optional[float] = 0.7,
         response_format: RespFormat | None = None,
         stream_response_handler: Optional[
             Callable[

From 8ba4fb60c6ced7d549ff3d9cf25c0b2b56131aa1 Mon Sep 17 00:00:00 2001
From: foxplaying <166147707+foxplaying@users.noreply.github.com>
Date: Wed, 24 Sep 2025 21:23:58 +0800
Subject: [PATCH 4/8] =?UTF-8?q?=E6=9B=B4=E5=85=B7=E4=BD=93=E7=9A=84?=
 =?UTF-8?q?=E6=8A=A5=E9=94=99?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/llm_models/model_client/gemini_client.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/llm_models/model_client/gemini_client.py b/src/llm_models/model_client/gemini_client.py
index 8f0b552d..cc450749 100644
--- a/src/llm_models/model_client/gemini_client.py
+++ b/src/llm_models/model_client/gemini_client.py
@@ -527,15 +527,20 @@ class GeminiClient(BaseClient):
             resp, usage_record = async_response_parser(req_task.result())

         except (ClientError, ServerError) as e:
-            # 重封装ClientError和ServerError为RespNotOkException
+            # 重封装 ClientError 和 ServerError 为 RespNotOkException
             raise RespNotOkException(e.code, e.message) from None
         except (
             UnknownFunctionCallArgumentError,
             UnsupportedFunctionError,
             FunctionInvocationError,
         ) as e:
-            raise ValueError(f"工具类型错误:请检查工具选项和参数:{str(e)}") from None
+            # 工具调用相关错误
+            raise RespParseException(None, f"工具调用参数错误: {str(e)}") from None
+        except EmptyResponseException as e:
+            # 保持原始异常,便于区分“空响应”和网络异常
+            raise e
         except Exception as e:
+            # 其他未预料的错误,才归为网络连接类
             raise NetworkConnectionError() from e

         if usage_record:

From 39f43d315dbb72faba25e1e6fdf23662f14346af Mon Sep 17 00:00:00 2001
From: foxplaying <166147707+foxplaying@users.noreply.github.com>
Date: Wed, 24 Sep 2025 21:46:15 +0800
Subject: [PATCH 5/8] =?UTF-8?q?=E6=9B=B4=E8=AF=A6=E7=BB=86=E7=9A=84?=
 =?UTF-8?q?=E6=8A=A5=E9=94=99=E5=92=8C=E6=8F=90=E7=A4=BA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/llm_models/model_client/gemini_client.py | 27 +++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/llm_models/model_client/gemini_client.py b/src/llm_models/model_client/gemini_client.py
index cc450749..8b3e7908 100644
--- a/src/llm_models/model_client/gemini_client.py
+++ b/src/llm_models/model_client/gemini_client.py
@@ -335,7 +335,32 @@ def _default_normal_response_parser(

     # 最终的、唯一的空响应检查
     if not api_response.content and not api_response.tool_calls:
-        raise EmptyResponseException("响应中既无文本内容也无工具调用")
+        finish_reason = None
+        try:
+            if resp.candidates:
+                c0 = resp.candidates[0]
+                finish_reason = getattr(c0, "finish_reason", None) or getattr(c0, "finishReason", None)
+        except Exception:
+            pass
+
+        um = getattr(resp, "usage_metadata", None)
+
+        if finish_reason and str(finish_reason).upper().endswith("MAX_TOKENS"):
+            # 特殊处理:模型因为 max_tokens 截断
+            logger.warning(
+                f"Gemini 响应因达到 max_tokens 限制被截断,usage={um}"
+            )
+            # 返回一个带警告的响应,而不是抛异常
+            api_response.content = ""
+            api_response.reasoning_content = None
+            return api_response, None
+
+        logger.error(
+            f"Gemini 空响应诊断:finish_reason={finish_reason}, usage={um}"
+        )
+        raise EmptyResponseException(
+            f"响应中既无文本内容也无工具调用(finish_reason={finish_reason})"
+        )

     return api_response, _usage_record

From dcb75d7b9493c701a10b7f49cf29c02d40cc2fd8 Mon Sep 17 00:00:00 2001
From: foxplaying <166147707+foxplaying@users.noreply.github.com>
Date: Wed, 24 Sep 2025 23:06:37 +0800
Subject: [PATCH 6/8] =?UTF-8?q?=E4=BC=98=E5=8C=96=E6=96=B9=E6=B3=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/llm_models/model_client/gemini_client.py | 59 +++++++++++---------
 1 file changed, 33 insertions(+), 26 deletions(-)

diff --git a/src/llm_models/model_client/gemini_client.py b/src/llm_models/model_client/gemini_client.py
index 8b3e7908..4074750b 100644
--- a/src/llm_models/model_client/gemini_client.py
+++ b/src/llm_models/model_client/gemini_client.py
@@ -333,34 +333,41 @@ def _default_normal_response_parser(

     api_response.raw_data = resp

+    # 检查是否因为 max_tokens 截断
+    try:
+        if resp.candidates:
+            c0 = resp.candidates[0]
+            reason = getattr(c0, "finish_reason", None) or getattr(c0, "finishReason", None)
+            if reason and "MAX_TOKENS" in str(reason):
+                # 检查第二个及之后的 parts 是否有内容
+                has_real_output = False
+                if getattr(c0, "content", None) and getattr(c0.content, "parts", None):
+                    for p in c0.content.parts[1:]:  # 跳过第一个 thought
+                        if getattr(p, "text", None) and p.text.strip():
+                            has_real_output = True
+                            break
+
+                if not has_real_output and getattr(resp, "text", None):
+                    has_real_output = True
+
+                if has_real_output:
+                    logger.warning(
+                        "⚠ Gemini 响应因达到 max_tokens 限制被部分截断,\n"
+                        " 可能会对回复内容造成影响,建议修改模型 max_tokens 配置!"
+                    )
+                else:
+                    logger.warning(
+                        "⚠ Gemini 响应因达到 max_tokens 限制被截断,\n"
+                        " 请修改模型 max_tokens 配置!"
+                    )
+
+                return api_response, _usage_record
+    except Exception as e:
+        logger.debug(f"检查 MAX_TOKENS 截断时异常: {e}")
+
     # 最终的、唯一的空响应检查
     if not api_response.content and not api_response.tool_calls:
-        finish_reason = None
-        try:
-            if resp.candidates:
-                c0 = resp.candidates[0]
-                finish_reason = getattr(c0, "finish_reason", None) or getattr(c0, "finishReason", None)
-        except Exception:
-            pass
-
-        um = getattr(resp, "usage_metadata", None)
-
-        if finish_reason and str(finish_reason).upper().endswith("MAX_TOKENS"):
-            # 特殊处理:模型因为 max_tokens 截断
-            logger.warning(
-                f"Gemini 响应因达到 max_tokens 限制被截断,usage={um}"
-            )
-            # 返回一个带警告的响应,而不是抛异常
-            api_response.content = ""
-            api_response.reasoning_content = None
-            return api_response, None
-
-        logger.error(
-            f"Gemini 空响应诊断:finish_reason={finish_reason}, usage={um}"
-        )
-        raise EmptyResponseException(
-            f"响应中既无文本内容也无工具调用(finish_reason={finish_reason})"
-        )
+        raise EmptyResponseException("响应中既无文本内容也无工具调用")

     return api_response, _usage_record

From a54bf78945ccfc6f8a306aa2320ffd4ae2c0068e Mon Sep 17 00:00:00 2001
From: foxplaying <166147707+foxplaying@users.noreply.github.com>
Date: Wed, 24 Sep 2025 23:19:53 +0800
Subject: [PATCH 7/8] =?UTF-8?q?=E6=94=AF=E6=8C=81=E6=B5=81=E5=BC=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/llm_models/model_client/gemini_client.py | 27 +++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/src/llm_models/model_client/gemini_client.py b/src/llm_models/model_client/gemini_client.py
index 4074750b..a44059a9 100644
--- a/src/llm_models/model_client/gemini_client.py
+++ b/src/llm_models/model_client/gemini_client.py
@@ -246,12 +246,14 @@ async def _default_stream_response_handler(
     _fc_delta_buffer = io.StringIO()  # 正式内容缓冲区,用于存储接收到的正式内容
     _tool_calls_buffer: list[tuple[str, str, dict]] = []  # 工具调用缓冲区,用于存储接收到的工具调用
     _usage_record = None  # 使用情况记录
+    last_resp: GenerateContentResponse | None = None  # 保存最后一个 chunk

     def _insure_buffer_closed():
         if _fc_delta_buffer and not _fc_delta_buffer.closed:
             _fc_delta_buffer.close()

     async for chunk in resp_stream:
+        last_resp = chunk  # 保存最后一个响应
         # 检查是否有中断量
         if interrupt_flag and interrupt_flag.is_set():
             # 如果中断量被设置,则抛出ReqAbortException
@@ -270,13 +272,32 @@ async def _default_stream_response_handler(
                 (chunk.usage_metadata.candidates_token_count or 0)
                 + (chunk.usage_metadata.thoughts_token_count or 0),
                 chunk.usage_metadata.total_token_count or 0,
             )
+
     try:
-        return _build_stream_api_resp(
+        api_response = _build_stream_api_resp(
             _fc_delta_buffer,
             _tool_calls_buffer,
-        ), _usage_record
+        )
+
+        # 检查是否因为 max_tokens 截断
+        if last_resp and last_resp.candidates:
+            c0 = last_resp.candidates[0]
+            reason = getattr(c0, "finish_reason", None) or getattr(c0, "finishReason", None)
+            if reason and "MAX_TOKENS" in str(reason):
+                if api_response.content and api_response.content.strip():
+                    logger.warning(
+                        "⚠ Gemini 响应因达到 max_tokens 限制被部分截断,\n"
+                        " 可能会对回复内容造成影响,建议修改模型 max_tokens 配置!"
+                    )
+                else:
+                    logger.warning(
+                        "⚠ Gemini 响应因达到 max_tokens 限制被截断,\n"
+                        " 请修改模型 max_tokens 配置!"
+                    )
+
+        return api_response, _usage_record
+
     except Exception:
-        # 确保缓冲区被关闭
         _insure_buffer_closed()
         raise

From ed9c1a999fc0d3cdc6ce7375c56f093a254e6f74 Mon Sep 17 00:00:00 2001
From: foxplaying <166147707+foxplaying@users.noreply.github.com>
Date: Fri, 26 Sep 2025 01:42:53 +0800
Subject: [PATCH 8/8] =?UTF-8?q?fix=EF=BC=9A=E4=BF=AE=E5=A4=8D=E6=B5=81?=
 =?UTF-8?q?=E5=BC=8F=E6=A3=80=E6=B5=8B=E6=96=B9=E5=BC=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/llm_models/model_client/gemini_client.py | 54 ++++++++++++--------
 1 file changed, 32 insertions(+), 22 deletions(-)

diff --git a/src/llm_models/model_client/gemini_client.py b/src/llm_models/model_client/gemini_client.py
index a44059a9..0713a33a 100644
--- a/src/llm_models/model_client/gemini_client.py
+++ b/src/llm_models/model_client/gemini_client.py
@@ -182,7 +182,15 @@ def _process_delta(

     if delta.text:
         fc_delta_buffer.write(delta.text)
-
+
+    # 处理 thought(Gemini 的特殊字段)
+    for c in getattr(delta, "candidates", []):
+        if c.content and getattr(c.content, "parts", None):
+            for p in c.content.parts:
+                if getattr(p, "thought", False) and getattr(p, "text", None):
+                    # 把 thought 写入 buffer,避免 resp.content 永远为空
+                    fc_delta_buffer.write(p.text)
+
     if delta.function_calls:  # 为什么不用hasattr呢,是因为这个属性一定有,即使是个空的
         for call in delta.function_calls:
             try:
@@ -204,6 +212,7 @@ def _process_delta(
 def _build_stream_api_resp(
     _fc_delta_buffer: io.StringIO,
     _tool_calls_buffer: list[tuple[str, str, dict]],
+    last_resp: GenerateContentResponse | None = None,  # 传入 last_resp
 ) -> APIResponse:
     # sourcery skip: simplify-len-comparison, use-assigned-variable
     resp = APIResponse()
@@ -228,6 +237,24 @@ def _build_stream_api_resp(

         resp.tool_calls.append(ToolCall(call_id, function_name, arguments))

+    # 检查是否因为 max_tokens 截断
+    reason = None
+    if last_resp and getattr(last_resp, "candidates", None):
+        c0 = last_resp.candidates[0]
+        reason = getattr(c0, "finish_reason", None) or getattr(c0, "finishReason", None)
+
+    if str(reason).endswith("MAX_TOKENS"):
+        if resp.content and resp.content.strip():
+            logger.warning(
+                "⚠ Gemini 响应因达到 max_tokens 限制被部分截断,\n"
+                " 可能会对回复内容造成影响,建议修改模型 max_tokens 配置!"
+            )
+        else:
+            logger.warning(
+                "⚠ Gemini 响应因达到 max_tokens 限制被截断,\n"
+                " 请修改模型 max_tokens 配置!"
+            )
+
     if not resp.content and not resp.tool_calls:
         raise EmptyResponseException()

@@ -274,30 +301,13 @@ async def _default_stream_response_handler(
             )

     try:
-        api_response = _build_stream_api_resp(
+        return _build_stream_api_resp(
             _fc_delta_buffer,
             _tool_calls_buffer,
-        )
-
-        # 检查是否因为 max_tokens 截断
-        if last_resp and last_resp.candidates:
-            c0 = last_resp.candidates[0]
-            reason = getattr(c0, "finish_reason", None) or getattr(c0, "finishReason", None)
-            if reason and "MAX_TOKENS" in str(reason):
-                if api_response.content and api_response.content.strip():
-                    logger.warning(
-                        "⚠ Gemini 响应因达到 max_tokens 限制被部分截断,\n"
-                        " 可能会对回复内容造成影响,建议修改模型 max_tokens 配置!"
-                    )
-                else:
-                    logger.warning(
-                        "⚠ Gemini 响应因达到 max_tokens 限制被截断,\n"
-                        " 请修改模型 max_tokens 配置!"
-                    )
-
-        return api_response, _usage_record
-
+            last_resp=last_resp,
+        ), _usage_record
     except Exception:
+        # 确保缓冲区被关闭
         _insure_buffer_closed()
         raise
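
Reviewer note (appended to this patch series, not part of any commit above): PATCH 6-8 converge on one idea — read the finish reason off the first candidate of the last response or stream chunk, and warn instead of raising when generation stopped on MAX_TOKENS but some usable text survived. A minimal standalone sketch of that decision logic follows, assuming nothing from google-genai; the Candidate/LastChunk dataclasses and the check_truncation helper are illustrative stand-ins invented for this note, not identifiers from the PR.

    from dataclasses import dataclass, field
    from typing import Optional

    @dataclass
    class Candidate:
        finish_reason: Optional[str] = None  # e.g. "STOP", "MAX_TOKENS"

    @dataclass
    class LastChunk:
        candidates: list = field(default_factory=list)

    def check_truncation(last_chunk: Optional[LastChunk], content: str) -> Optional[str]:
        """Return a human-readable warning if the response ended on MAX_TOKENS, else None."""
        if not last_chunk or not last_chunk.candidates:
            return None
        reason = last_chunk.candidates[0].finish_reason
        if not reason or "MAX_TOKENS" not in str(reason):
            return None
        if content and content.strip():
            # Partial output survived: keep it, but tell the operator to raise max_tokens.
            return "response partially truncated by max_tokens; consider raising the model's max_tokens"
        # Nothing usable survived: the caller may still warn instead of raising an empty-response error.
        return "response truncated by max_tokens with no usable output; raise the model's max_tokens"

    if __name__ == "__main__":
        chunk = LastChunk(candidates=[Candidate(finish_reason="MAX_TOKENS")])
        print(check_truncation(chunk, "partial text"))  # partially truncated warning
        print(check_truncation(chunk, ""))              # fully truncated warning
        print(check_truncation(LastChunk(candidates=[Candidate("STOP")]), "ok"))  # None

The sketch mirrors the ordering the final patch settles on: the truncation check lives next to where the response object is assembled (here, a pure function over the last chunk), so both streaming and non-streaming paths can reuse it without inspecting the stream handler's local state.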