""" 统一TTS语音合成插件 支持五种后端:AI Voice (MaiCore内置) / GSV2P (云API) / GPT-SoVITS (本地服务) / 豆包语音 (云API) / CosyVoice (ModelScope Gradio) Version: 3.2.3 Author: 靓仔 """ import sys sys.dont_write_bytecode = True import asyncio import random from typing import List, Tuple, Type, Optional from src.common.logger import get_logger from src.plugin_system.base.base_plugin import BasePlugin from src.plugin_system.apis.plugin_register_api import register_plugin from src.plugin_system.base.base_action import BaseAction, ActionActivationType from src.plugin_system.base.base_command import BaseCommand from src.plugin_system.base.component_types import ComponentInfo, ChatMode from src.plugin_system.base.config_types import ConfigField from src.plugin_system.apis import generator_api # 导入模块化的后端和工具 from .backends import TTSBackendRegistry, TTSResult from .backends.ai_voice import AI_VOICE_ALIAS_MAP from .backends.doubao import DOUBAO_EMOTION_MAP from .utils.text import TTSTextUtils from .config_keys import ConfigKeys logger = get_logger("tts_voice_plugin") # 有效后端列表 VALID_BACKENDS = [ "ai_voice", "gsv2p", "gpt_sovits", "doubao", "cosyvoice", "comfyui", "comfyui_voiceclone", "comfyui_customvoice", ] class TTSExecutorMixin: """ TTS执行器混入类 提供 Action 和 Command 共享的后端执行逻辑 """ def _create_backend(self, backend_name: str): """ 创建后端实例 Args: backend_name: 后端名称 Returns: 后端实例 """ backend = TTSBackendRegistry.create( backend_name, self.get_config, self.log_prefix ) if backend: # 注入必要的回调函数 if hasattr(backend, 'set_send_custom'): backend.set_send_custom(self.send_custom) if hasattr(backend, 'set_send_command'): backend.set_send_command(self.send_command) return backend async def _execute_backend( self, backend_name: str, text: str, voice: str = "", emotion: str = "" ) -> TTSResult: """ 执行指定后端 Args: backend_name: 后端名称 text: 待转换文本 voice: 音色 emotion: 情感(豆包后端) Returns: TTSResult """ backend = self._create_backend(backend_name) if not backend: return TTSResult( success=False, message=f"未知的TTS后端: {backend_name}" ) # AI Voice 私聊限制检查 if backend_name == "ai_voice": is_private = self._check_is_private_chat() if is_private: logger.info(f"{self.log_prefix} AI语音仅支持群聊,自动切换到GSV2P后端") return await self._execute_backend("gsv2p", text, voice, emotion) # Pass chat context through for backends that need MaiBot LLM APIs (e.g., comfyui auto_instruct). chat_stream = None if hasattr(self, "chat_stream"): chat_stream = getattr(self, "chat_stream", None) elif hasattr(self, "message"): chat_stream = getattr(getattr(self, "message", None), "chat_stream", None) return await backend.execute(text, voice, emotion=emotion, chat_stream=chat_stream) def _check_is_private_chat(self) -> bool: """检查是否是私聊""" # Action 中使用 chat_stream if hasattr(self, 'chat_stream'): return not getattr(self.chat_stream, 'group_info', None) # Command 中使用 message if hasattr(self, 'message'): msg_info = getattr(self.message, 'message_info', None) if msg_info: return not getattr(msg_info, 'group_info', None) return False def _get_default_backend(self) -> str: """获取配置的默认后端""" backend = self.get_config(ConfigKeys.GENERAL_DEFAULT_BACKEND, "gsv2p") if backend not in VALID_BACKENDS: logger.warning(f"{self.log_prefix} 配置的默认后端 '{backend}' 无效,使用 gsv2p") return "gsv2p" return backend async def _send_error(self, message: str) -> None: """ 发送错误提示信息(受全局配置控制) Args: message: 错误消息 """ if self.get_config(ConfigKeys.GENERAL_SEND_ERROR_MESSAGES, True): await self.send_text(message) class UnifiedTTSAction(BaseAction, TTSExecutorMixin): """统一TTS Action - LLM自动触发""" action_name = "unified_tts_action" action_description = "用语音回复(支持AI Voice/GSV2P/GPT-SoVITS/豆包语音多后端)" activation_type = ActionActivationType.KEYWORD mode_enable = ChatMode.ALL parallel_action = False activation_keywords = [ "语音", "说话", "朗读", "念一下", "读出来", "voice", "speak", "tts", "语音回复", "用语音说", "播报" ] keyword_case_sensitive = False action_parameters = { "text": "要转换为语音的文本内容(必填)", "backend": "TTS后端引擎 (ai_voice/gsv2p/gpt_sovits/doubao/cosyvoice/comfyui/comfyui_voiceclone/comfyui_customvoice,可选,建议省略让系统自动使用配置的默认后端)", "voice": "音色/风格参数(可选)", "emotion": "情感/语气参数(可选,仅豆包后端有效)。支持:开心/兴奋/温柔/骄傲/生气/愤怒/伤心/失望/委屈/平静/严肃/疑惑/慢速/快速/小声/大声等" } action_require = [ "当用户要求用语音回复时使用", "当回复简短问候语时使用(如早上好、晚安、你好等)", "当想让回复更活泼生动时可以使用", "注意:回复内容过长或者过短不适合用语音", "注意:backend参数建议省略,系统会自动使用配置的默认后端" ] associated_types = ["text", "command"] def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.timeout = self.get_config(ConfigKeys.GENERAL_TIMEOUT, 60) self.max_text_length = self.get_config(ConfigKeys.GENERAL_MAX_TEXT_LENGTH, 500) def _check_force_trigger(self, text: str) -> bool: """检查是否强制触发""" if not self.get_config(ConfigKeys.PROBABILITY_KEYWORD_FORCE_TRIGGER, True): return False force_keywords = self.get_config( ConfigKeys.PROBABILITY_FORCE_KEYWORDS, ["一定要用语音", "必须语音", "语音回复我", "务必用语音"] ) return any(kw in text for kw in force_keywords) def _probability_check(self, text: str) -> bool: """概率控制检查""" if not self.get_config(ConfigKeys.PROBABILITY_ENABLED, True): return True base_prob = self.get_config(ConfigKeys.PROBABILITY_BASE_PROBABILITY, 1.0) base_prob = max(0.0, min(1.0, base_prob)) result = random.random() < base_prob logger.info(f"{self.log_prefix} 概率检查: {base_prob:.2f}, 结果={'通过' if result else '未通过'}") return result async def _get_final_text(self, raw_text: str, reason: str, use_replyer: bool) -> Tuple[bool, str]: """获取最终要转语音的文本(使用与正常回复一致的prompt参数)""" max_text_length = self.get_config(ConfigKeys.GENERAL_MAX_TEXT_LENGTH, 200) if not use_replyer: if not raw_text: return False, "" return True, raw_text try: # 统一使用 generate_reply 以确保触发 POST_LLM 事件(日程注入) # rewrite_reply 不会触发 POST_LLM 事件,因此不适用 # 注意:长度约束放在末尾,利用 LLM 的"近因效应"提高遵守率 extra_info_parts = [] if raw_text: extra_info_parts.append(f"期望的回复内容:{raw_text}") # 长度约束放在最后,使用更强的表述 extra_info_parts.append( f"【重要】你的回复必须控制在{max_text_length}字以内,这是硬性要求。" f"超过此长度将无法转换为语音。请直接回复核心内容,不要啰嗦。" ) success, llm_response = await generator_api.generate_reply( chat_stream=self.chat_stream, reply_message=self.action_message, reply_reason=reason, extra_info="\n".join(extra_info_parts), request_type="tts_voice_plugin", from_plugin=False # 允许触发POST_LLM事件,使日程注入生效 ) if success and llm_response and llm_response.content: logger.info(f"{self.log_prefix} 语音内容生成成功") return True, llm_response.content.strip() # 如果生成失败但有原始文本,则使用原始文本 if raw_text: logger.warning(f"{self.log_prefix} 内容生成失败,使用原始文本") return True, raw_text return False, "" except Exception as e: logger.error(f"{self.log_prefix} 调用 replyer 出错: {e}") return bool(raw_text), raw_text async def execute(self) -> Tuple[bool, str]: def _chunk_sentences( parts: List[str], target_chars: int, max_chunks: int ) -> List[str]: # Greedy packing: reduces tiny fragments into fewer, longer segments. if not parts: return [] if target_chars <= 0: target_chars = 120 def pack(tgt: int) -> List[str]: out: List[str] = [] cur = "" for s in parts: s = (s or "").strip() if not s: continue if not cur: cur = s continue if len(cur) + len(s) <= tgt: cur += s else: out.append(cur) cur = s if cur: out.append(cur) return out packed = pack(target_chars) if max_chunks and max_chunks > 0 and len(packed) > max_chunks: total = len("".join(parts)) new_target = max(target_chars, int(total / max_chunks) + 1) packed = pack(new_target) return packed async def send_message_single_sentences() -> Tuple[bool, str]: result = await self._execute_backend(backend, clean_text, voice, emotion) if result.success: # 生成更详细的动作记录,帮助 planner 避免重复执行 text_preview = clean_text[:80] + "..." if len(clean_text) > 80 else clean_text await self.store_action_info( action_build_into_prompt=True, action_prompt_display=f"已用语音回复:{text_preview}", action_done=True ) else: await self._send_error(f"语音合成失败: {result.message}") return result.success, result.message async def send_message_with_splited_sentences() -> Tuple[bool, str]: # 分段发送模式:将文本分割成句子,逐句发送语音 if len(sentences) > 1: logger.info(f"{self.log_prefix} 分段发送模式:共 {len(sentences)} 句") success_count = 0 all_sentences_text = [] for i, sentence in enumerate(sentences): if not sentence.strip(): continue logger.debug(f"{self.log_prefix} 发送第 {i + 1}/{len(sentences)} 句: {sentence[:30]}...") result = await self._execute_backend(backend, sentence, voice, emotion) if result.success: success_count += 1 all_sentences_text.append(sentence) else: logger.warning(f"{self.log_prefix} 第 {i + 1} 句发送失败: {result.message}") # 句子之间添加延迟 if i < len(sentences) - 1 and split_delay > 0: await asyncio.sleep(split_delay) # 记录动作信息 if success_count > 0: # 生成更详细的动作记录,帮助 planner 避免重复执行 display_text = "".join(all_sentences_text) text_preview = display_text[:80] + "..." if len(display_text) > 80 else display_text await self.store_action_info( action_build_into_prompt=True, action_prompt_display=f"已用语音回复({success_count}段):{text_preview}", action_done=True ) return True, f"成功发送 {success_count}/{len(sentences)} 条语音" else: await self._send_error("语音合成失败") return False, "所有语音发送失败" else: # 只有一句,正常发送 return await send_message_single_sentences() """执行TTS语音合成""" try: raw_text = self.action_data.get("text", "").strip() voice = self.action_data.get("voice", "") reason = self.action_data.get("reason", "") emotion = self.action_data.get("emotion", "") use_replyer = self.get_config(ConfigKeys.GENERAL_USE_REPLYER_REWRITE, True) # 获取最终文本 success, final_text = await self._get_final_text(raw_text, reason, use_replyer) if not success or not final_text: await self._send_error("无法生成语音内容") return False, "文本为空" # 概率检查 force_trigger = self._check_force_trigger(final_text) if not force_trigger and not self._probability_check(final_text): logger.info(f"{self.log_prefix} 概率检查未通过,使用文字回复") await self.send_text(final_text) text_preview = final_text[:80] + "..." if len(final_text) > 80 else final_text await self.store_action_info( action_build_into_prompt=True, action_prompt_display=f"已用文字回复(语音概率未触发):{text_preview}", action_done=True ) return True, "概率检查未通过,已发送文字回复" # 清理文本(移除特殊字符,替换网络用语) # 注意:长度应该由LLM在生成时就遵守,这里只做字符清理 clean_text = TTSTextUtils.clean_text(final_text, self.max_text_length) if not clean_text: await self._send_error("文本处理后为空") return False, "文本处理后为空" # 如果清理后的文本仍然超过限制,说明LLM未遵守约束 if len(clean_text) > self.max_text_length: logger.warning( f"{self.log_prefix} LLM生成的文本超过长度限制 " f"({len(clean_text)} > {self.max_text_length}字符),降级为文字回复" ) await self.send_text(clean_text) text_preview = clean_text[:80] + "..." if len(clean_text) > 80 else clean_text await self.store_action_info( action_build_into_prompt=True, action_prompt_display=f"已用文字回复(内容过长):{text_preview}", action_done=True ) return True, "内容超过语音长度限制,已改为文字回复" # 获取后端并执行 backend = self._get_default_backend() logger.info(f"{self.log_prefix} 使用配置的默认后端: {backend}") # 检查是否启用分段发送 split_sentences = self.get_config(ConfigKeys.GENERAL_SPLIT_SENTENCES, True) split_delay = self.get_config(ConfigKeys.GENERAL_SPLIT_DELAY, 0.3) sentences = None # 优先使用智能分割插件的分隔符 if '|||SPLIT|||' in clean_text: logger.info("found split marker from smart segmentation plugin") sentences = [s.strip() for s in clean_text.split("|||SPLIT|||") if s.strip()] # If the upstream splitter is too aggressive, pack back into fewer segments. max_segments = int(self.get_config(ConfigKeys.GENERAL_SPLIT_MAX_SEGMENTS, 3) or 3) chunk_chars = int(self.get_config(ConfigKeys.GENERAL_SPLIT_CHUNK_CHARS, 110) or 110) if max_segments and max_segments > 0 and len(sentences) > max_segments: sentences = _chunk_sentences(sentences, target_chars=chunk_chars, max_chunks=max_segments) return await send_message_with_splited_sentences() elif split_sentences: # 自动分段:短文本不分段;长文本最多分成 N 段,避免刷屏式多段语音。 min_total = int(self.get_config(ConfigKeys.GENERAL_SPLIT_MIN_TOTAL_CHARS, 120) or 120) min_sentence = int(self.get_config(ConfigKeys.GENERAL_SPLIT_MIN_SENTENCE_CHARS, 6) or 6) max_segments = int(self.get_config(ConfigKeys.GENERAL_SPLIT_MAX_SEGMENTS, 3) or 3) chunk_chars = int(self.get_config(ConfigKeys.GENERAL_SPLIT_CHUNK_CHARS, 110) or 110) if len(clean_text) < min_total: sentences = [clean_text] else: sentences = TTSTextUtils.split_sentences(clean_text, min_length=min_sentence) if max_segments and max_segments > 0: sentences = _chunk_sentences(sentences, target_chars=chunk_chars, max_chunks=max_segments) return await send_message_with_splited_sentences() else: # 单句发送 return await send_message_single_sentences() except Exception as e: error_msg = str(e) logger.error(f"{self.log_prefix} TTS语音合成出错: {error_msg}") await self._send_error(f"语音合成出错: {error_msg}") return False, error_msg class UnifiedTTSCommand(BaseCommand, TTSExecutorMixin): """统一TTS Command - 用户手动触发""" command_name = "unified_tts_command" command_description = "将文本转换为语音,支持多种后端和音色" command_pattern = r"^/(?:tts|voice|gsv2p|gptsovits|doubao|cosyvoice|comfyui|comfyui_voiceclone|comfyui_customvoice)\s+(?P.+?)(?:\s+-v\s+(?P\S+))?(?:\s+(?Pai_voice|gsv2p|gpt_sovits|doubao|cosyvoice|comfyui|comfyui_voiceclone|comfyui_customvoice))?$" command_help = "将文本转换为语音。用法:/tts 你好世界 [-v 音色] [后端]" command_examples = [ "/tts 你好,世界!", "/tts 今天天气不错 -v 小新", "/gptsovits 你好世界 -v default", "/cosyvoice 你好世界 -v 四川话", "/tts 试试 -v 温柔妹妹 ai_voice", "/gsv2p 你好世界", "/doubao 你好世界 -v 开心" ] intercept_message = True async def _send_help(self): """发送帮助信息""" default_backend = self._get_default_backend() help_text = """【TTS语音合成插件帮助】 📝 基本语法: /tts <文本> [-v <音色>] [后端] 🎯 快捷命令: /tts <文本> 使用默认后端 /voice <文本> 使用 AI Voice /gsv2p <文本> 使用 GSV2P /gptsovits <文本> 使用 GPT-SoVITS /doubao <文本> 使用 豆包语音 /cosyvoice <文本> 使用 CosyVoice /comfyui <文本> 使用 ComfyUI(本地工作流) /comfyui_voiceclone <文本> 使用 ComfyUI VoiceClone /comfyui_customvoice <文本> 使用 ComfyUI CustomVoice 🔊 可用后端: • ai_voice - MaiCore内置(仅群聊) • gsv2p - 云端API,高质量 • gpt_sovits - 本地服务,可定制 • doubao - 火山引擎,支持情感 • cosyvoice - 阿里云,支持方言 • comfyui - 本地ComfyUI工作流(自动按 style.mode 选择) • comfyui_voiceclone - 本地ComfyUI工作流(仅 VoiceClone) • comfyui_customvoice - 本地ComfyUI工作流(仅 CustomVoice) 🎭 音色/情感参数(-v): • AI Voice: 小新、温柔妹妹、霸道总裁、妲己 等22种 • GSV2P: 原神-中文-派蒙_ZH 等(见API文档) • 豆包: 开心、生气、伤心、撒娇、严肃 等 • CosyVoice: 广东话、四川话、东北话、开心、慢速 等 📌 示例: /tts 你好世界 /tts 今天真开心 -v 开心 /gptsovits 这是本地语音合成 /doubao 我生气了 -v 生气 /cosyvoice 你好 -v 广东话 /voice 测试一下 -v 温柔妹妹 ⚙️ 当前默认后端:""" + default_backend await self.send_text(help_text) def _determine_backend(self, user_backend: str) -> Tuple[str, str]: """ 确定使用的后端 Returns: (backend_name, source_description) """ # 1. 检查命令前缀 raw_text = self.message.raw_message if self.message.raw_message else self.message.processed_plain_text if raw_text: # 命令前缀到后端的映射 prefix_backend_map = { "/gsv2p": "gsv2p", "/gptsovits": "gpt_sovits", "/doubao": "doubao", "/cosyvoice": "cosyvoice", "/voice": "ai_voice", "/comfyui": "comfyui", "/comfyui_voiceclone": "comfyui_voiceclone", "/comfyui_customvoice": "comfyui_customvoice", } for prefix, backend in prefix_backend_map.items(): if raw_text.startswith(prefix): return backend, f"命令前缀 {prefix}" # 2. 检查命令参数 if user_backend and user_backend in VALID_BACKENDS: return user_backend, f"命令参数 {user_backend}" # 3. 使用配置文件默认值 return self._get_default_backend(), "配置文件" async def execute(self) -> Tuple[bool, str, bool]: """执行TTS命令""" try: text = self.matched_groups.get("text", "").strip() voice = self.matched_groups.get("voice", "") user_backend = self.matched_groups.get("backend", "") # 处理帮助命令 if text.lower() == "help": await self._send_help() return True, "显示帮助信息", True if not text: await self._send_error("请输入要转换为语音的文本内容") return False, "缺少文本内容", True # 确定后端 backend, backend_source = self._determine_backend(user_backend) # 清理文本 max_length = self.get_config(ConfigKeys.GENERAL_MAX_TEXT_LENGTH, 500) clean_text = TTSTextUtils.clean_text(text, max_length) if not clean_text: await self._send_error("文本处理后为空") return False, "文本处理后为空", True # 检查长度限制 if len(clean_text) > max_length: await self.send_text( f"文本过长({len(clean_text)}字符)," f"超过语音合成限制({max_length}字符)," f"已改为文字发送。\n\n{clean_text}" ) return True, "文本过长,已改为文字发送", True logger.info(f"{self.log_prefix} 执行TTS命令 (后端: {backend} [来源: {backend_source}], 音色: {voice})") # 执行后端 # 对于 CosyVoice 和豆包,voice 参数实际上是情感/方言 if backend in ["cosyvoice", "doubao"]: result = await self._execute_backend(backend, clean_text, voice="", emotion=voice) else: result = await self._execute_backend(backend, clean_text, voice) if not result.success: await self._send_error(f"语音合成失败: {result.message}") return result.success, result.message, True except Exception as e: logger.error(f"{self.log_prefix} TTS命令执行出错: {e}") await self._send_error(f"语音合成出错: {e}") return False, f"执行出错: {e}", True class TTSInstructCommand(BaseCommand): """生成 CustomVoice instruct(调试/预览用)""" command_name = "tts_instruct_command" command_description = "根据待朗读文本生成 CustomVoice 的 instruct(情绪/语速/停顿)" command_pattern = r"^/tts_instruct\\s+(?P.+?)$" command_help = "用法:/tts_instruct <文本>" command_examples = [ "/tts_instruct 早上好,今天也要加油。", "/tts_instruct えっ?本当にそうなの?", ] intercept_message = True async def execute(self) -> Tuple[bool, str, int]: try: text = (self.matched_groups.get("text") or "").strip() if not text: await self.send_text("请输入要生成 instruct 的文本") return False, "缺少文本", 2 # Use the same logic as ComfyUI backend auto_instruct. from .backends.comfyui import ComfyUIBackend from .utils.text import TTSTextUtils detected = TTSTextUtils.detect_language(text) chat_stream = getattr(self.message, "chat_stream", None) chat_id = getattr(chat_stream, "stream_id", None) if chat_stream else None backend = ComfyUIBackend(self.get_config, log_prefix=self.log_prefix) instruct = await backend._infer_instruct( text=text, detected_lang=detected, chat_stream=chat_stream, chat_id=chat_id, style_name="__command__", ) if not instruct: await self.send_text("instruct 生成失败(可能未启用 comfyui.auto_instruct_enabled 或 LLM 不可用)") return False, "instruct 生成失败", 2 await self.send_text(instruct) return True, "instruct 已生成", 2 except Exception as e: await self.send_text(f"instruct 生成异常: {e}") return False, str(e), 2 @register_plugin class UnifiedTTSPlugin(BasePlugin): """统一TTS语音合成插件 - 支持多后端的文本转语音插件""" plugin_name = "tts_voice_plugin" plugin_description = "统一TTS语音合成插件,支持AI Voice、GSV2P、GPT-SoVITS、豆包语音多种后端" plugin_version = "3.2.3" plugin_author = "靓仔" enable_plugin = True config_file_name = "config.toml" dependencies = [] python_dependencies = ["aiohttp"] config_section_descriptions = { "plugin": "插件基本配置", "general": "通用设置", "components": "组件启用控制", "probability": "概率控制配置", "ai_voice": "AI Voice后端配置", "gsv2p": "GSV2P后端配置", "gpt_sovits": "GPT-SoVITS后端配置", "doubao": "豆包语音后端配置", "cosyvoice": "CosyVoice后端配置", "comfyui": "ComfyUI工作流API后端配置" } config_schema = { "plugin": { "enabled": ConfigField(type=bool, default=True, description="是否启用插件"), "config_version": ConfigField(type=str, default="3.2.3", description="配置文件版本") }, "general": { "default_backend": ConfigField( type=str, default="cosyvoice", description="默认TTS后端 (ai_voice/gsv2p/gpt_sovits/doubao/cosyvoice/comfyui/comfyui_voiceclone/comfyui_customvoice)" ), "timeout": ConfigField(type=int, default=60, description="请求超时时间(秒)"), "max_text_length": ConfigField( type=int, default=200, description="最大文本长度(该限制会在调用LLM时注入到prompt中,让LLM直接生成符合长度的回复,而不是被动截断)" ), "use_replyer_rewrite": ConfigField( type=bool, default=True, description="是否使用replyer润色语音内容" ), "audio_output_dir": ConfigField( type=str, default="", description="音频文件输出目录(支持相对路径和绝对路径,留空使用项目根目录)" ), "use_base64_audio": ConfigField( type=bool, default=True, description="是否使用base64编码发送音频(备选方案)" ), "split_sentences": ConfigField( type=bool, default=True, description="是否分段发送语音(每句话单独发送一条语音,避免长语音播放问题)" ), "split_delay": ConfigField( type=float, default=0.3, description="分段发送时每条语音之间的延迟(秒)" ), "split_min_total_chars": ConfigField( type=int, default=120, description="自动分段启用阈值:文本长度小于该值时不分段(避免短句被切成多段)", ), "split_min_sentence_chars": ConfigField( type=int, default=6, description="句子最小长度:过短片段会合并到前一句(用于减少碎片段)", ), "split_max_segments": ConfigField( type=int, default=3, description="自动分段最大段数(避免刷屏式多段语音)。0 表示不限制。", ), "split_chunk_chars": ConfigField( type=int, default=110, description="自动分段打包目标长度(字符)。用于把多句合并成更少段。", ), "send_error_messages": ConfigField( type=bool, default=True, description="是否发送错误提示消息(关闭后语音合成失败时不会发送错误信息给用户)" ) }, "components": { "action_enabled": ConfigField(type=bool, default=True, description="是否启用Action组件"), "command_enabled": ConfigField(type=bool, default=True, description="是否启用Command组件"), "instruct_command_enabled": ConfigField(type=bool, default=True, description="是否启用instruct调试命令组件(/tts_instruct)") }, "probability": { "enabled": ConfigField(type=bool, default=False, description="是否启用概率控制"), "base_probability": ConfigField(type=float, default=1.0, description="基础触发概率"), "keyword_force_trigger": ConfigField(type=bool, default=True, description="关键词强制触发"), "force_keywords": ConfigField( type=list, default=["一定要用语音", "必须语音", "语音回复我", "务必用语音"], description="强制触发关键词" ) }, "ai_voice": { "default_character": ConfigField( type=str, default="邻家小妹", description="默认音色(可选:小新、猴哥、四郎、东北老妹儿、广西大表哥、妲己、霸道总裁、酥心御姐、说书先生、憨憨小弟、憨厚老哥、吕布、元气少女、文艺少女、磁性大叔、邻家小妹、低沉男声、傲娇少女、爹系男友、暖心姐姐、温柔妹妹、书香少女)" ) }, "gsv2p": { "api_url": ConfigField( type=str, default="https://gsv2p.acgnai.top/v1/audio/speech", description="GSV2P API地址" ), "api_token": ConfigField(type=str, default="", description="API认证Token"), "default_voice": ConfigField(type=str, default="原神-中文-派蒙_ZH", description="默认音色"), "timeout": ConfigField(type=int, default=120, description="API请求超时(秒)"), "model": ConfigField(type=str, default="tts-v4", description="TTS模型"), "response_format": ConfigField(type=str, default="wav", description="音频格式"), "speed": ConfigField(type=float, default=1.0, description="语音速度") }, "gpt_sovits": { "server": ConfigField( type=str, default="http://127.0.0.1:9880", description="GPT-SoVITS服务地址" ), "styles": ConfigField( type=list, default=[ { "name": "default", "refer_wav": "", "prompt_text": "", "prompt_language": "zh", "gpt_weights": "", "sovits_weights": "" } ], description="语音风格配置", item_type="object", item_fields={ "name": {"type": "string", "label": "风格名称", "required": True}, "refer_wav": {"type": "string", "label": "参考音频路径", "required": True}, "prompt_text": {"type": "string", "label": "参考文本", "required": True}, "prompt_language": {"type": "string", "label": "参考语言", "default": "zh"}, "gpt_weights": {"type": "string", "label": "GPT模型权重路径(可选)", "required": False}, "sovits_weights": {"type": "string", "label": "SoVITS模型权重路径(可选)", "required": False} } ) }, "doubao": { "api_url": ConfigField( type=str, default="https://openspeech.bytedance.com/api/v3/tts/unidirectional", description="豆包语音API地址" ), "app_id": ConfigField(type=str, default="", description="豆包APP ID"), "access_key": ConfigField(type=str, default="", description="豆包Access Key"), "resource_id": ConfigField(type=str, default="seed-tts-2.0", description="豆包Resource ID"), "default_voice": ConfigField( type=str, default="zh_female_vv_uranus_bigtts", description="默认音色" ), "timeout": ConfigField(type=int, default=60, description="API请求超时(秒)"), "audio_format": ConfigField(type=str, default="wav", description="音频格式"), "sample_rate": ConfigField(type=int, default=24000, description="采样率"), "bitrate": ConfigField(type=int, default=128000, description="比特率"), "speed": ConfigField(type=float, default=None, description="语音速度(可选)"), "volume": ConfigField(type=float, default=None, description="音量(可选)"), "context_texts": ConfigField( type=list, default=None, description="上下文辅助文本(可选,仅豆包2.0模型)" ) }, "cosyvoice": { "gradio_url": ConfigField( type=str, default="https://funaudiollm-fun-cosyvoice3-0-5b.ms.show/", description="Gradio API地址" ), "default_mode": ConfigField( type=str, default="3s极速复刻", description="推理模式(3s极速复刻/自然语言控制)" ), "default_instruct": ConfigField( type=str, default="You are a helpful assistant. 请用广东话表达。<|endofprompt|>", description="默认指令(用于自然语言控制模式)" ), "reference_audio": ConfigField( type=str, default="", description="参考音频路径(用于3s极速复刻模式)" ), "prompt_text": ConfigField( type=str, default="", description="提示文本(用于3s极速复刻模式)" ), "timeout": ConfigField(type=int, default=300, description="API请求超时(秒)"), "audio_format": ConfigField(type=str, default="wav", description="音频格式") }, "comfyui": { "server": ConfigField( type=str, default="http://127.0.0.1:8188", description="ComfyUI 服务地址(示例: http://127.0.0.1:8188)", ), "input_dir": ConfigField( type=str, default="/Users/xenon/Downloads/seiun_tts/ComfyUI/ComfyUI/input", description="ComfyUI input 目录(用于放参考音频,LoadAudio 会从这里读)", ), "timeout": ConfigField(type=int, default=120, description="ComfyUI 请求超时(秒)"), "audio_quality": ConfigField( type=str, default="128k", description="输出 MP3 质量(SaveAudioMP3 quality: V0/128k/320k)", ), "mlx_python": ConfigField( type=str, default="/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/.venv/bin/python", description="MLX Qwen3-TTS venv python 路径(用于 ComfyUI-MLX 节点子进程)", ), "mlx_cli": ConfigField( type=str, default="/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/mlx_voice_clone_cli.py", description="mlx_voice_clone_cli.py 路径", ), "default_style": ConfigField(type=str, default="default", description="默认风格名称"), "voiceclone_default_style": ConfigField( type=str, default="", description="VoiceClone 专用默认风格名称(用于 comfyui_voiceclone 后端;留空则回退到 default_style)", ), "customvoice_default_style": ConfigField( type=str, default="", description="CustomVoice 专用默认风格名称(用于 comfyui_customvoice 后端;留空则回退到 default_style)", ), "auto_instruct_enabled": ConfigField( type=bool, default=False, description="是否启用 CustomVoice instruct 自动推断(使用 MaiBot 的 LLM 接口)", ), "auto_instruct_max_chars": ConfigField( type=int, default=120, description="自动推断 instruct 的最大长度(字符)。建议 80-160,太短会导致情绪/表演提示被截断。", ), "auto_instruct_prompt": ConfigField( type=str, default="", description="自定义 instruct 推断 prompt(留空使用内置模板)", ), "auto_instruct_base_tone": ConfigField( type=str, default="", description="自动推断 instruct 时固定附加的基调描述(会作为 `基调=...;` 前缀插入;会自动清洗为单行,且不会包含 `;`/`=`)", ), "pause_linebreak": ConfigField(type=float, default=0.0, description="换行停顿(秒)"), "period_pause": ConfigField(type=float, default=0.0, description="句号停顿(秒)"), "comma_pause": ConfigField(type=float, default=0.0, description="逗号停顿(秒)"), "question_pause": ConfigField(type=float, default=0.0, description="问号停顿(秒)"), "hyphen_pause": ConfigField(type=float, default=0.0, description="连字符停顿(秒)"), "styles": ConfigField( type=list, default=[ { "name": "default", "refer_wav": "", "prompt_text": "", "language": "", "model_choice": "1.7B", "precision": "bf16", "seed": 0, "max_new_tokens": 2048, "top_p": 0.8, "top_k": 20, "temperature": 1.0, "repetition_penalty": 1.05, } ], description="ComfyUI VoiceClone 风格配置(参考音频+逐字稿)", item_type="object", item_fields={ "name": {"type": "string", "label": "风格名称", "required": True}, "mode": {"type": "string", "label": "模式(voice_clone/custom_voice)", "required": False}, "refer_wav": {"type": "string", "label": "参考音频路径", "required": True}, "prompt_text": {"type": "string", "label": "参考文本(逐字稿)", "required": True}, "language": {"type": "string", "label": "语言(可选: Auto/Chinese/English/...) ", "required": False}, "model_choice": {"type": "string", "label": "模型(0.6B/1.7B)", "required": False}, "precision": {"type": "string", "label": "精度(bf16/fp32)", "required": False}, "model_path": {"type": "string", "label": "CustomVoice模型路径", "required": False}, "speaker": {"type": "string", "label": "CustomVoice说话人", "required": False}, "instruct": {"type": "string", "label": "CustomVoice指令(或__AUTO__)", "required": False}, "auto_instruct": {"type": "boolean", "label": "按style启用auto_instruct", "required": False}, "speed": {"type": "number", "label": "speed", "required": False}, "seed": {"type": "number", "label": "seed", "required": False}, "max_new_tokens": {"type": "number", "label": "max_new_tokens", "required": False}, "top_p": {"type": "number", "label": "top_p", "required": False}, "top_k": {"type": "number", "label": "top_k", "required": False}, "temperature": {"type": "number", "label": "temperature", "required": False}, "repetition_penalty": {"type": "number", "label": "repetition_penalty", "required": False}, }, ), } } def get_plugin_components(self) -> List[Tuple[ComponentInfo, Type]]: """返回插件组件列表""" components = [] try: action_enabled = self.get_config(ConfigKeys.COMPONENTS_ACTION_ENABLED, True) command_enabled = self.get_config(ConfigKeys.COMPONENTS_COMMAND_ENABLED, True) instruct_enabled = self.get_config(ConfigKeys.COMPONENTS_INSTRUCT_COMMAND_ENABLED, True) except AttributeError: action_enabled = True command_enabled = True instruct_enabled = True if action_enabled: components.append((UnifiedTTSAction.get_action_info(), UnifiedTTSAction)) if command_enabled: components.append((UnifiedTTSCommand.get_command_info(), UnifiedTTSCommand)) if instruct_enabled: components.append((TTSInstructCommand.get_command_info(), TTSInstructCommand)) return components