mirror of https://github.com/Mai-with-u/MaiBot.git
286 lines
12 KiB
Python
286 lines
12 KiB
Python
"""
|
||
CosyVoice后端实现
|
||
使用 ModelScope 的 Fun-CosyVoice3-0.5B Gradio API 进行语音合成
|
||
"""
|
||
|
||
import asyncio
|
||
import os
|
||
import shutil
|
||
from typing import Optional, Tuple
|
||
from .base import TTSBackendBase, TTSResult
|
||
from ..utils.file import TTSFileManager
|
||
from ..config_keys import ConfigKeys
|
||
from src.common.logger import get_logger
|
||
|
||
# Module-level logger for this backend, registered under the name "tts_cosyvoice".
logger = get_logger("tts_cosyvoice")
|
||
|
||
# CosyVoice instruction map: keyword (dialect, volume, speed, emotion, style)
# -> instruct prompt text. Every prompt ends with the "<|endofprompt|>" marker.
# The keys are the keywords looked up at runtime, so they must stay as-is.
COSYVOICE_INSTRUCT_MAP = {
    # Dialects
    "广东话": "You are a helpful assistant. 请用广东话表达。<|endofprompt|>",
    "东北话": "You are a helpful assistant. 请用东北话表达。<|endofprompt|>",
    "甘肃话": "You are a helpful assistant. 请用甘肃话表达。<|endofprompt|>",
    "贵州话": "You are a helpful assistant. 请用贵州话表达。<|endofprompt|>",
    "河南话": "You are a helpful assistant. 请用河南话表达。<|endofprompt|>",
    "湖北话": "You are a helpful assistant. 请用湖北话表达。<|endofprompt|>",
    "湖南话": "You are a helpful assistant. 请用湖南话表达。<|endofprompt|>",
    "江西话": "You are a helpful assistant. 请用江西话表达。<|endofprompt|>",
    "闽南话": "You are a helpful assistant. 请用闽南话表达。<|endofprompt|>",
    "宁夏话": "You are a helpful assistant. 请用宁夏话表达。<|endofprompt|>",
    "山西话": "You are a helpful assistant. 请用山西话表达。<|endofprompt|>",
    "陕西话": "You are a helpful assistant. 请用陕西话表达。<|endofprompt|>",
    "山东话": "You are a helpful assistant. 请用山东话表达。<|endofprompt|>",
    "上海话": "You are a helpful assistant. 请用上海话表达。<|endofprompt|>",
    "四川话": "You are a helpful assistant. 请用四川话表达。<|endofprompt|>",
    "天津话": "You are a helpful assistant. 请用天津话表达。<|endofprompt|>",
    "云南话": "You are a helpful assistant. 请用云南话表达。<|endofprompt|>",

    # Volume
    "大声": "You are a helpful assistant. Please say a sentence as loudly as possible.<|endofprompt|>",
    "小声": "You are a helpful assistant. Please say a sentence in a very soft voice.<|endofprompt|>",

    # Speaking rate
    "慢速": "You are a helpful assistant. 请用尽可能慢地语速说一句话。<|endofprompt|>",
    "快速": "You are a helpful assistant. 请用尽可能快地语速说一句话。<|endofprompt|>",

    # Emotion
    "开心": "You are a helpful assistant. 请非常开心地说一句话。<|endofprompt|>",
    "伤心": "You are a helpful assistant. 请非常伤心地说一句话。<|endofprompt|>",
    "生气": "You are a helpful assistant. 请非常生气地说一句话。<|endofprompt|>",

    # Special styles
    "小猪佩奇": "You are a helpful assistant. 我想体验一下小猪佩奇风格,可以吗?<|endofprompt|>",
    "机器人": "You are a helpful assistant. 你可以尝试用机器人的方式解答吗?<|endofprompt|>",
}
|
||
|
||
|
||
class CosyVoiceBackend(TTSBackendBase):
    """
    CosyVoice speech backend.

    Synthesizes speech through the ModelScope Fun-CosyVoice3-0.5B Gradio API.
    Supports 3-second rapid voice cloning and natural-language control
    (dialect, emotion, speaking rate, ...).
    """

    backend_name = "cosyvoice"
    backend_description = "阿里云 CosyVoice3 API (ModelScope Gradio)"
    support_private_chat = True
    default_audio_format = "wav"

    # Fallback values used when the corresponding configuration entry is
    # missing or empty.
    _FALLBACK_MODE = "3s极速复刻"
    _FALLBACK_INSTRUCT = "You are a helpful assistant. 请用广东话表达。<|endofprompt|>"
    _FALLBACK_PROMPT_TEXT = "大家好,我是嘉然,今天我来为大家朗读。"

    def get_default_voice(self) -> str:
        """Return the default voice; always "" (CosyVoice needs no preset voice)."""
        return ""

    def validate_config(self) -> Tuple[bool, str]:
        """
        Validate the backend configuration.

        Returns:
            ``(True, "")`` when a Gradio URL is configured, otherwise
            ``(False, <error message>)``.
        """
        gradio_url = self.get_config(ConfigKeys.COSYVOICE_GRADIO_URL, "")

        if not gradio_url:
            return False, "CosyVoice后端缺少必需的 gradio_url 配置"

        return True, ""

    def _resolve_instruct(self, emotion: Optional[str]) -> str:
        """
        Translate an emotion/dialect keyword into an instruct prompt.

        Args:
            emotion: Keyword looked up in ``COSYVOICE_INSTRUCT_MAP``.

        Returns:
            The mapped prompt when the keyword is known; otherwise the
            configured default instruct, or the built-in fallback when that
            configuration value is empty or blank.
        """
        if emotion and emotion in COSYVOICE_INSTRUCT_MAP:
            return COSYVOICE_INSTRUCT_MAP[emotion]

        default_instruct = self.get_config(
            ConfigKeys.COSYVOICE_DEFAULT_INSTRUCT,
            self._FALLBACK_INSTRUCT,
        )

        # Never hand an empty instruct to the API: fall back to the built-in one.
        if not default_instruct or not default_instruct.strip():
            default_instruct = self._FALLBACK_INSTRUCT

        return default_instruct

    def _resolve_mode(self, mode_config):
        """
        Normalize the configured synthesis mode to a single value.

        The ``mode_checkbox_group`` API parameter is actually a Radio component
        that expects a string, while the configuration may yield either a
        string or a list.

        Args:
            mode_config: Raw configuration value (string or list).

        Returns:
            The first list element, or the value itself; the fallback mode
            when the value (or the list) is empty.
        """
        if isinstance(mode_config, list):
            return mode_config[0] if mode_config else self._FALLBACK_MODE
        return mode_config if mode_config else self._FALLBACK_MODE

    def _resolve_reference_audio(self, voice: Optional[str]) -> str:
        """
        Pick the reference audio path used for voice cloning.

        Resolution order: the configured path if it exists, else ``test.wav``
        in the directory one level above this file's directory if it exists;
        finally an existing ``voice`` path overrides either of them.

        Args:
            voice: Optional path to a reference audio file supplied by the caller.

        Returns:
            The chosen path. It may be empty or point to a missing file when no
            candidate exists; the caller checks existence before uploading.
        """
        reference_audio = self.get_config(ConfigKeys.COSYVOICE_REFERENCE_AUDIO, "")

        if not reference_audio or not os.path.exists(reference_audio):
            plugin_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            default_audio = os.path.join(plugin_dir, "test.wav")
            if os.path.exists(default_audio):
                reference_audio = default_audio
                logger.debug(f"{self.log_prefix} 使用默认参考音频: {reference_audio}")

        # The voice argument may override the configured reference audio.
        if voice and os.path.exists(voice):
            reference_audio = voice

        return reference_audio

    def _create_client(self, client_cls, gradio_url, timeout):
        """
        Build the Gradio client (synchronous; meant to run in a worker thread).

        First tries to pass explicit httpx timeouts; if that fails for any
        reason, retries with the client's default configuration.

        Args:
            client_cls: The ``gradio_client.Client`` class.
            gradio_url: URL of the Gradio app.
            timeout: Timeout in seconds used for the httpx read/write limits.

        Returns:
            A client instance.

        Raises:
            Whatever the fallback ``client_cls(gradio_url)`` call raises.
        """
        try:
            import httpx

            httpx_kwargs = {"timeout": httpx.Timeout(timeout, read=timeout, write=timeout, connect=30.0)}
            return client_cls(gradio_url, httpx_kwargs=httpx_kwargs)
        except Exception as e:
            logger.warning(f"{self.log_prefix} 无法设置 httpx 超时,使用默认配置: {e}")
            return client_cls(gradio_url)

    def _failure(self, message: str) -> TTSResult:
        """Build a failed ``TTSResult`` tagged with this backend's name."""
        return TTSResult(False, message, backend_name=self.backend_name)

    async def execute(
        self,
        text: str,
        voice: Optional[str] = None,
        emotion: Optional[str] = None,
        **kwargs
    ) -> TTSResult:
        """
        Run CosyVoice speech synthesis and send the resulting audio.

        Args:
            text: Text to synthesize.
            voice: For CosyVoice this is a path to a reference audio file; when
                it exists it overrides the configured reference audio.
            emotion: Emotion/dialect/speed keyword (see ``COSYVOICE_INSTRUCT_MAP``).
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The result of ``send_audio`` on success, or a failed ``TTSResult``
            describing the problem. Exceptions raised while talking to the API
            are caught and converted into a failed result.
        """
        # Validate configuration and input before doing any work.
        is_valid, error_msg = self.validate_config()
        if not is_valid:
            return self._failure(error_msg)

        if not text or not text.strip():
            return self._failure("待合成的文本为空")

        # Gather configuration.
        gradio_url = self.get_config(ConfigKeys.COSYVOICE_GRADIO_URL, "")
        mode_str = self._resolve_mode(
            self.get_config(ConfigKeys.COSYVOICE_DEFAULT_MODE, self._FALLBACK_MODE)
        )
        timeout = self.get_config(ConfigKeys.COSYVOICE_TIMEOUT, 60)

        # Even the "natural language control" mode needs a reference audio and
        # a prompt text, so fall back to defaults when they are not configured.
        reference_audio = self._resolve_reference_audio(voice)
        prompt_text = self.get_config(ConfigKeys.COSYVOICE_PROMPT_TEXT, "")
        if not prompt_text:
            prompt_text = self._FALLBACK_PROMPT_TEXT
            logger.debug(f"{self.log_prefix} 使用默认 prompt_text")

        instruct_text = self._resolve_instruct(emotion)

        logger.info(
            f"{self.log_prefix} CosyVoice请求: text='{text[:50]}...' "
            f"(共{len(text)}字符), mode={mode_str}, instruct={emotion or '默认'}"
        )

        try:
            # Import gradio_client lazily so it is not a global dependency.
            try:
                from gradio_client import Client, handle_file
            except ImportError:
                logger.error(f"{self.log_prefix} gradio_client 未安装,请运行: pip install gradio_client")
                return self._failure("gradio_client 未安装,请运行: pip install gradio_client")

            # Constructing the client performs blocking network I/O, so do it
            # in a worker thread instead of stalling the event loop.
            client = await asyncio.to_thread(self._create_client, Client, gradio_url, timeout)

            # Prepare the reference audio upload (None when no file is available).
            logger.debug(f"{self.log_prefix} 准备参考音频: {reference_audio}")
            if reference_audio and os.path.exists(reference_audio):
                prompt_wav_upload = handle_file(reference_audio)
            else:
                prompt_wav_upload = None
            logger.debug(f"{self.log_prefix} 参考音频准备完成")

            logger.info(f"{self.log_prefix} 调用 Gradio API: {gradio_url} (超时: {timeout}秒)")
            logger.debug(f"{self.log_prefix} mode参数: {mode_str} (type: {type(mode_str).__name__})")
            logger.debug(f"{self.log_prefix} prompt_text: {prompt_text[:50]}...")
            logger.debug(f"{self.log_prefix} instruct_text: {instruct_text[:50]}...")

            # client.predict is blocking; run it in a thread under an overall timeout.
            result = await asyncio.wait_for(
                asyncio.to_thread(
                    client.predict,
                    tts_text=text,
                    mode_checkbox_group=mode_str,
                    prompt_text=prompt_text,
                    prompt_wav_upload=prompt_wav_upload,
                    prompt_wav_record=None,
                    instruct_text=instruct_text,
                    seed=0,
                    # The API really expects the boolean False, although its
                    # documentation shows Literal['False'].
                    stream=False,
                    api_name="/generate_audio"
                ),
                timeout=timeout
            )

            logger.info(f"{self.log_prefix} CosyVoice API 响应成功")

            # The result is the path of the generated audio file.
            # NOTE(review): this file is never removed after being read —
            # presumably gradio_client stores it in a temp directory; confirm
            # whether explicit cleanup is needed.
            if not result or not os.path.exists(result):
                return self._failure(f"CosyVoice 生成失败,未返回有效文件: {result}")

            try:
                with open(result, 'rb') as f:
                    audio_data = f.read()
            except Exception as e:
                logger.error(f"{self.log_prefix} 读取音频文件失败: {e}")
                return self._failure(f"读取音频文件失败: {e}")

            is_valid, error_msg = TTSFileManager.validate_audio_data(audio_data)
            if not is_valid:
                logger.warning(f"{self.log_prefix} CosyVoice音频数据验证失败: {error_msg}")
                return self._failure(f"CosyVoice语音{error_msg}")

            logger.debug(
                f"{self.log_prefix} CosyVoice音频数据验证通过 "
                f"(大小: {len(audio_data)}字节)"
            )

            # Hand off to the shared send routine.
            audio_format = self.get_config(ConfigKeys.COSYVOICE_AUDIO_FORMAT, "wav")
            voice_info = f"模式: {mode_str}, 指令: {emotion or '默认'}"

            return await self.send_audio(
                audio_data=audio_data,
                audio_format=audio_format,
                prefix="tts_cosyvoice",
                voice_info=voice_info
            )

        except asyncio.TimeoutError:
            logger.error(f"{self.log_prefix} CosyVoice API 请求超时 (配置超时: {timeout}秒)")
            return self._failure("CosyVoice API 调用超时")
        except Exception as e:
            logger.error(f"{self.log_prefix} CosyVoice 执行异常: {e}")
            return self._failure(f"CosyVoice 执行错误: {e}")
|