MaiBot/backends/cosyvoice.py

"""
CosyVoice后端实现
使用 ModelScope 的 Fun-CosyVoice3-0.5B Gradio API 进行语音合成
"""

import asyncio
import os
import shutil
from typing import Optional, Tuple
from .base import TTSBackendBase, TTSResult
from ..utils.file import TTSFileManager
from ..config_keys import ConfigKeys
from src.common.logger import get_logger

logger = get_logger("tts_cosyvoice")

# CosyVoice指令映射表（方言、情感、语速等）
COSYVOICE_INSTRUCT_MAP = {
    # 方言
    "广东话": "You are a helpful assistant. 请用广东话表达。<|endofprompt|>",
    "东北话": "You are a helpful assistant. 请用东北话表达。<|endofprompt|>",
    "甘肃话": "You are a helpful assistant. 请用甘肃话表达。<|endofprompt|>",
    "贵州话": "You are a helpful assistant. 请用贵州话表达。<|endofprompt|>",
    "河南话": "You are a helpful assistant. 请用河南话表达。<|endofprompt|>",
    "湖北话": "You are a helpful assistant. 请用湖北话表达。<|endofprompt|>",
    "湖南话": "You are a helpful assistant. 请用湖南话表达。<|endofprompt|>",
    "江西话": "You are a helpful assistant. 请用江西话表达。<|endofprompt|>",
    "闽南话": "You are a helpful assistant. 请用闽南话表达。<|endofprompt|>",
    "宁夏话": "You are a helpful assistant. 请用宁夏话表达。<|endofprompt|>",
    "山西话": "You are a helpful assistant. 请用山西话表达。<|endofprompt|>",
    "陕西话": "You are a helpful assistant. 请用陕西话表达。<|endofprompt|>",
    "山东话": "You are a helpful assistant. 请用山东话表达。<|endofprompt|>",
    "上海话": "You are a helpful assistant. 请用上海话表达。<|endofprompt|>",
    "四川话": "You are a helpful assistant. 请用四川话表达。<|endofprompt|>",
    "天津话": "You are a helpful assistant. 请用天津话表达。<|endofprompt|>",
    "云南话": "You are a helpful assistant. 请用云南话表达。<|endofprompt|>",

    # 音量
    "大声": "You are a helpful assistant. Please say a sentence as loudly as possible.<|endofprompt|>",
    "小声": "You are a helpful assistant. Please say a sentence in a very soft voice.<|endofprompt|>",

    # 语速
    "慢速": "You are a helpful assistant. 请用尽可能慢地语速说一句话。<|endofprompt|>",
    "快速": "You are a helpful assistant. 请用尽可能快地语速说一句话。<|endofprompt|>",

    # 情感
    "开心": "You are a helpful assistant. 请非常开心地说一句话。<|endofprompt|>",
    "伤心": "You are a helpful assistant. 请非常伤心地说一句话。<|endofprompt|>",
    "生气": "You are a helpful assistant. 请非常生气地说一句话。<|endofprompt|>",

    # 特殊风格
    "小猪佩奇": "You are a helpful assistant. 我想体验一下小猪佩奇风格，可以吗？<|endofprompt|>",
    "机器人": "You are a helpful assistant. 你可以尝试用机器人的方式解答吗？<|endofprompt|>",
}


class CosyVoiceBackend(TTSBackendBase):
    """
    CosyVoice语音后端

    使用 ModelScope 的 Fun-CosyVoice3-0.5B Gradio API 进行语音合成
    支持3秒极速复刻、自然语言控制（方言、情感、语速等）
    """

    backend_name = "cosyvoice"
    backend_description = "阿里云 CosyVoice3 API (ModelScope Gradio)"
    support_private_chat = True
    default_audio_format = "wav"

    def get_default_voice(self) -> str:
        """获取默认音色（CosyVoice 不需要预设音色）"""
        return ""

    def validate_config(self) -> Tuple[bool, str]:
        """验证配置"""
        gradio_url = self.get_config(ConfigKeys.COSYVOICE_GRADIO_URL, "")

        if not gradio_url:
            return False, "CosyVoice后端缺少必需的 gradio_url 配置"

        return True, ""

    def _resolve_instruct(self, emotion: Optional[str]) -> str:
        """
        解析情感参数为指令文本

        Args:
            emotion: 情感/方言关键词

        Returns:
            指令文本
        """
        if emotion and emotion in COSYVOICE_INSTRUCT_MAP:
            return COSYVOICE_INSTRUCT_MAP[emotion]

        # 返回默认指令（确保不为空）
        default_instruct = self.get_config(
            ConfigKeys.COSYVOICE_DEFAULT_INSTRUCT,
            "You are a helpful assistant. 请用广东话表达。<|endofprompt|>"
        )

        # 如果配置为空，强制使用广东话
        if not default_instruct or not default_instruct.strip():
            default_instruct = "You are a helpful assistant. 请用广东话表达。<|endofprompt|>"

        return default_instruct

    async def execute(
        self,
        text: str,
        voice: Optional[str] = None,
        emotion: Optional[str] = None,
        **kwargs
    ) -> TTSResult:
        """
        执行 CosyVoice 语音合成

        Args:
            text: 待转换的文本
            voice: 音色（对于CosyVoice，这个参数用于指定参考音频路径）
            emotion: 情感/方言/语速参数

        Returns:
            TTSResult
        """
        # 验证配置
        is_valid, error_msg = self.validate_config()
        if not is_valid:
            return TTSResult(False, error_msg, backend_name=self.backend_name)

        # 验证文本
        if not text or not text.strip():
            return TTSResult(False, "待合成的文本为空", backend_name=self.backend_name)

        # 获取配置
        gradio_url = self.get_config(ConfigKeys.COSYVOICE_GRADIO_URL, "")
        mode_config = self.get_config(ConfigKeys.COSYVOICE_DEFAULT_MODE, "3s极速复刻")

        # mode_checkbox_group 实际上是 Radio 组件，期望字符串而不是列表
        # 处理配置可能返回字符串或列表的情况
        if isinstance(mode_config, list):
            mode_str = mode_config[0] if mode_config else "3s极速复刻"
        else:
            mode_str = mode_config if mode_config else "3s极速复刻"

        timeout = self.get_config(ConfigKeys.COSYVOICE_TIMEOUT, 60)
        reference_audio = self.get_config(ConfigKeys.COSYVOICE_REFERENCE_AUDIO, "")
        prompt_text = self.get_config(ConfigKeys.COSYVOICE_PROMPT_TEXT, "")

        # CosyVoice 的"自然语言控制"模式实际上需要参考音频和 prompt_text
        # 如果没有配置，使用默认的参考音频
        if not reference_audio or not os.path.exists(reference_audio):
            plugin_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            default_audio = os.path.join(plugin_dir, "test.wav")
            if os.path.exists(default_audio):
                reference_audio = default_audio
                logger.debug(f"{self.log_prefix} 使用默认参考音频: {reference_audio}")

        # 如果没有 prompt_text，使用默认文本
        if not prompt_text:
            prompt_text = "大家好，我是嘉然，今天我来为大家朗读。"
            logger.debug(f"{self.log_prefix} 使用默认 prompt_text")

        # voice 参数可以覆盖配置文件中的参考音频
        if voice and os.path.exists(voice):
            reference_audio = voice

        # 解析指令文本
        instruct_text = self._resolve_instruct(emotion)

        logger.info(
            f"{self.log_prefix} CosyVoice请求: text='{text[:50]}...' "
            f"(共{len(text)}字符), mode={mode_str}, instruct={emotion or '默认'}"
        )

        try:
            # 动态导入 gradio_client（避免全局依赖）
            try:
                from gradio_client import Client, handle_file
            except ImportError:
                logger.error(f"{self.log_prefix} gradio_client 未安装，请运行: pip install gradio_client")
                return TTSResult(
                    False,
                    "gradio_client 未安装，请运行: pip install gradio_client",
                    backend_name=self.backend_name
                )

            # 创建 Gradio 客户端（设置超时）
            try:
                import httpx
                httpx_kwargs = {"timeout": httpx.Timeout(timeout, read=timeout, write=timeout, connect=30.0)}
                client = Client(gradio_url, httpx_kwargs=httpx_kwargs)
            except Exception as e:
                logger.warning(f"{self.log_prefix} 无法设置 httpx 超时，使用默认配置: {e}")
                client = Client(gradio_url)

            # 准备参数
            logger.debug(f"{self.log_prefix} 准备参考音频: {reference_audio}")
            prompt_wav_upload = handle_file(reference_audio) if reference_audio and os.path.exists(reference_audio) else None
            logger.debug(f"{self.log_prefix} 参考音频准备完成")

            # 调用 API
            logger.info(f"{self.log_prefix} 调用 Gradio API: {gradio_url} (超时: {timeout}秒)")
            logger.debug(f"{self.log_prefix} mode参数: {mode_str} (type: {type(mode_str).__name__})")
            logger.debug(f"{self.log_prefix} prompt_text: {prompt_text[:50]}...")
            logger.debug(f"{self.log_prefix} instruct_text: {instruct_text[:50]}...")

            result = await asyncio.wait_for(
                asyncio.to_thread(
                    client.predict,
                    tts_text=text,
                    mode_checkbox_group=mode_str,
                    prompt_text=prompt_text,
                    prompt_wav_upload=prompt_wav_upload,
                    prompt_wav_record=None,
                    instruct_text=instruct_text,
                    seed=0,
                    stream=False,  # API 实际期望布尔值 False，虽然文档显示为 Literal['False']
                    api_name="/generate_audio"
                ),
                timeout=timeout
            )

            logger.info(f"{self.log_prefix} CosyVoice API 响应成功")

            # result 是生成的音频文件路径
            if not result or not os.path.exists(result):
                return TTSResult(
                    False,
                    f"CosyVoice 生成失败，未返回有效文件: {result}",
                    backend_name=self.backend_name
                )

            # 读取音频数据
            try:
                with open(result, 'rb') as f:
                    audio_data = f.read()
            except Exception as e:
                logger.error(f"{self.log_prefix} 读取音频文件失败: {e}")
                return TTSResult(
                    False,
                    f"读取音频文件失败: {e}",
                    backend_name=self.backend_name
                )

            # 验证音频数据
            is_valid, error_msg = TTSFileManager.validate_audio_data(audio_data)
            if not is_valid:
                logger.warning(f"{self.log_prefix} CosyVoice音频数据验证失败: {error_msg}")
                return TTSResult(
                    False,
                    f"CosyVoice语音{error_msg}",
                    backend_name=self.backend_name
                )

            logger.debug(
                f"{self.log_prefix} CosyVoice音频数据验证通过 "
                f"(大小: {len(audio_data)}字节)"
            )

            # 使用统一的发送方法
            audio_format = self.get_config(ConfigKeys.COSYVOICE_AUDIO_FORMAT, "wav")
            voice_info = f"模式: {mode_str}, 指令: {emotion or '默认'}"

            return await self.send_audio(
                audio_data=audio_data,
                audio_format=audio_format,
                prefix="tts_cosyvoice",
                voice_info=voice_info
            )

        except asyncio.TimeoutError:
            logger.error(f"{self.log_prefix} CosyVoice API 请求超时 (配置超时: {timeout}秒)")
            return TTSResult(
                False,
                "CosyVoice API 调用超时",
                backend_name=self.backend_name
            )
        except Exception as e:
            logger.error(f"{self.log_prefix} CosyVoice 执行异常: {e}")
            return TTSResult(
                False,
                f"CosyVoice 执行错误: {e}",
                backend_name=self.backend_name
            )