MaiBot/plugin.py

973 lines
44 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

"""
统一TTS语音合成插件
支持五种后端AI Voice (MaiCore内置) / GSV2P (云API) / GPT-SoVITS (本地服务) / 豆包语音 (云API) / CosyVoice (ModelScope Gradio)
Version: 3.2.3
Author: 靓仔
"""
import sys
sys.dont_write_bytecode = True
import asyncio
import random
from typing import List, Tuple, Type, Optional
from src.common.logger import get_logger
from src.plugin_system.base.base_plugin import BasePlugin
from src.plugin_system.apis.plugin_register_api import register_plugin
from src.plugin_system.base.base_action import BaseAction, ActionActivationType
from src.plugin_system.base.base_command import BaseCommand
from src.plugin_system.base.component_types import ComponentInfo, ChatMode
from src.plugin_system.base.config_types import ConfigField
from src.plugin_system.apis import generator_api
# 导入模块化的后端和工具
from .backends import TTSBackendRegistry, TTSResult
from .backends.ai_voice import AI_VOICE_ALIAS_MAP
from .backends.doubao import DOUBAO_EMOTION_MAP
from .utils.text import TTSTextUtils
from .config_keys import ConfigKeys
# Module-level logger shared by all components of this plugin.
logger = get_logger("tts_voice_plugin")
# Backend identifiers accepted by the Action/Command components and by the
# `general.default_backend` config option. Unknown names fall back to gsv2p.
VALID_BACKENDS = [
    "ai_voice",
    "gsv2p",
    "gpt_sovits",
    "doubao",
    "cosyvoice",
    "comfyui",
    "comfyui_voiceclone",
    "comfyui_customvoice",
]
class TTSExecutorMixin:
    """
    Mixin with the backend-execution logic shared by the TTS Action and
    Command components (backend creation, dispatch, private-chat checks,
    default-backend resolution and error reporting).
    """

    def _create_backend(self, backend_name: str):
        """
        Instantiate the backend registered under *backend_name*.

        Args:
            backend_name: registry key of the backend.
        Returns:
            The backend instance, or ``None`` for unknown names.
        """
        instance = TTSBackendRegistry.create(
            backend_name,
            self.get_config,
            self.log_prefix
        )
        if instance:
            # Wire up the messaging callbacks a backend may rely on.
            for setter_name, callback in (
                ('set_send_custom', self.send_custom),
                ('set_send_command', self.send_command),
            ):
                if hasattr(instance, setter_name):
                    getattr(instance, setter_name)(callback)
        return instance

    async def _execute_backend(
        self,
        backend_name: str,
        text: str,
        voice: str = "",
        emotion: str = ""
    ) -> TTSResult:
        """
        Synthesize *text* through the backend named *backend_name*.

        Args:
            backend_name: backend registry key.
            text: text to vocalize.
            voice: voice/style parameter.
            emotion: emotion parameter (Doubao backend).
        Returns:
            TTSResult describing success/failure.
        """
        engine = self._create_backend(backend_name)
        if not engine:
            return TTSResult(
                success=False,
                message=f"未知的TTS后端: {backend_name}"
            )
        # ai_voice only works in group chats; private chats fall back to gsv2p.
        if backend_name == "ai_voice" and self._check_is_private_chat():
            logger.info(f"{self.log_prefix} AI语音仅支持群聊自动切换到GSV2P后端")
            return await self._execute_backend("gsv2p", text, voice, emotion)
        # Pass chat context through for backends that need MaiBot LLM APIs (e.g. comfyui auto_instruct).
        stream = None
        if hasattr(self, "chat_stream"):
            stream = getattr(self, "chat_stream", None)
        elif hasattr(self, "message"):
            stream = getattr(getattr(self, "message", None), "chat_stream", None)
        return await engine.execute(text, voice, emotion=emotion, chat_stream=stream)

    def _check_is_private_chat(self) -> bool:
        """Return True when the current context is a private (non-group) chat."""
        # Actions expose chat_stream directly.
        if hasattr(self, 'chat_stream'):
            return not getattr(self.chat_stream, 'group_info', None)
        # Commands expose the raw message instead.
        if hasattr(self, 'message'):
            info = getattr(self.message, 'message_info', None)
            if info:
                return not getattr(info, 'group_info', None)
        return False

    def _get_default_backend(self) -> str:
        """Return the configured default backend, falling back to gsv2p."""
        configured = self.get_config(ConfigKeys.GENERAL_DEFAULT_BACKEND, "gsv2p")
        if configured in VALID_BACKENDS:
            return configured
        logger.warning(f"{self.log_prefix} 配置的默认后端 '{configured}' 无效,使用 gsv2p")
        return "gsv2p"

    async def _send_error(self, message: str) -> None:
        """
        Send an error notice to the chat, gated by the global config switch.

        Args:
            message: error text to send.
        """
        if not self.get_config(ConfigKeys.GENERAL_SEND_ERROR_MESSAGES, True):
            return
        await self.send_text(message)
class UnifiedTTSAction(BaseAction, TTSExecutorMixin):
    """Unified TTS Action - triggered automatically by the LLM planner."""
    action_name = "unified_tts_action"
    action_description = "用语音回复支持AI Voice/GSV2P/GPT-SoVITS/豆包语音多后端)"
    # Keyword activation: the planner considers this action whenever one of
    # activation_keywords appears in the conversation.
    activation_type = ActionActivationType.KEYWORD
    mode_enable = ChatMode.ALL
    parallel_action = False
    activation_keywords = [
        "语音", "说话", "朗读", "念一下", "读出来",
        "voice", "speak", "tts", "语音回复", "用语音说", "播报"
    ]
    keyword_case_sensitive = False
    # Parameter schema the LLM fills when invoking this action.
    action_parameters = {
        "text": "要转换为语音的文本内容(必填)",
        "backend": "TTS后端引擎 (ai_voice/gsv2p/gpt_sovits/doubao/cosyvoice/comfyui/comfyui_voiceclone/comfyui_customvoice可选建议省略让系统自动使用配置的默认后端)",
        "voice": "音色/风格参数(可选)",
        "emotion": "情感/语气参数(可选,仅豆包后端有效)。支持:开心/兴奋/温柔/骄傲/生气/愤怒/伤心/失望/委屈/平静/严肃/疑惑/慢速/快速/小声/大声等"
    }
    # Usage guidance surfaced to the planner.
    action_require = [
        "当用户要求用语音回复时使用",
        "当回复简短问候语时使用(如早上好、晚安、你好等)",
        "当想让回复更活泼生动时可以使用",
        "注意:回复内容过长或者过短不适合用语音",
        "注意backend参数建议省略系统会自动使用配置的默认后端"
    ]
    associated_types = ["text", "command"]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Request timeout (seconds) and the hard cap on spoken-text length.
        self.timeout = self.get_config(ConfigKeys.GENERAL_TIMEOUT, 60)
        # NOTE(review): fallback default 500 differs from the schema default
        # (200) and from _get_final_text's fallback — confirm intended.
        self.max_text_length = self.get_config(ConfigKeys.GENERAL_MAX_TEXT_LENGTH, 500)

    def _check_force_trigger(self, text: str) -> bool:
        """Return True when *text* contains a force-trigger keyword."""
        if not self.get_config(ConfigKeys.PROBABILITY_KEYWORD_FORCE_TRIGGER, True):
            return False
        force_keywords = self.get_config(
            ConfigKeys.PROBABILITY_FORCE_KEYWORDS,
            ["一定要用语音", "必须语音", "语音回复我", "务必用语音"]
        )
        return any(kw in text for kw in force_keywords)

    def _probability_check(self, text: str) -> bool:
        """Probability gate: return True when a voice reply should be sent."""
        if not self.get_config(ConfigKeys.PROBABILITY_ENABLED, True):
            return True
        base_prob = self.get_config(ConfigKeys.PROBABILITY_BASE_PROBABILITY, 1.0)
        # Clamp to [0, 1] so a misconfigured value cannot break the check.
        base_prob = max(0.0, min(1.0, base_prob))
        result = random.random() < base_prob
        logger.info(f"{self.log_prefix} 概率检查: {base_prob:.2f}, 结果={'通过' if result else '未通过'}")
        return result

    async def _get_final_text(self, raw_text: str, reason: str, use_replyer: bool) -> Tuple[bool, str]:
        """Produce the final text to vocalize, using the same prompt parameters as a normal reply.

        Returns (success, text); falls back to *raw_text* when generation fails.
        """
        max_text_length = self.get_config(ConfigKeys.GENERAL_MAX_TEXT_LENGTH, 200)
        if not use_replyer:
            if not raw_text:
                return False, ""
            return True, raw_text
        try:
            # Use generate_reply so the POST_LLM event fires (schedule injection);
            # rewrite_reply does not trigger POST_LLM and therefore cannot be used.
            # The length constraint is placed last to exploit the LLM's recency
            # bias and improve compliance.
            extra_info_parts = []
            if raw_text:
                extra_info_parts.append(f"期望的回复内容:{raw_text}")
            # Strong, final-position length constraint.
            extra_info_parts.append(
                f"【重要】你的回复必须控制在{max_text_length}字以内,这是硬性要求。"
                f"超过此长度将无法转换为语音。请直接回复核心内容,不要啰嗦。"
            )
            success, llm_response = await generator_api.generate_reply(
                chat_stream=self.chat_stream,
                reply_message=self.action_message,
                reply_reason=reason,
                extra_info="\n".join(extra_info_parts),
                request_type="tts_voice_plugin",
                from_plugin=False  # allow POST_LLM events so schedule injection applies
            )
            if success and llm_response and llm_response.content:
                logger.info(f"{self.log_prefix} 语音内容生成成功")
                return True, llm_response.content.strip()
            # Generation failed: fall back to the raw text when available.
            if raw_text:
                logger.warning(f"{self.log_prefix} 内容生成失败,使用原始文本")
                return True, raw_text
            return False, ""
        except Exception as e:
            logger.error(f"{self.log_prefix} 调用 replyer 出错: {e}")
            return bool(raw_text), raw_text

    async def execute(self) -> Tuple[bool, str]:
        # Greedy sentence packer: merges tiny fragments into fewer, longer
        # segments; retries with a larger target when max_chunks is exceeded.
        def _chunk_sentences(
            parts: List[str], target_chars: int, max_chunks: int
        ) -> List[str]:
            # Greedy packing: reduces tiny fragments into fewer, longer segments.
            if not parts:
                return []
            if target_chars <= 0:
                target_chars = 120
            def pack(tgt: int) -> List[str]:
                out: List[str] = []
                cur = ""
                for s in parts:
                    s = (s or "").strip()
                    if not s:
                        continue
                    if not cur:
                        cur = s
                        continue
                    if len(cur) + len(s) <= tgt:
                        cur += s
                    else:
                        out.append(cur)
                        cur = s
                if cur:
                    out.append(cur)
                return out
            packed = pack(target_chars)
            if max_chunks and max_chunks > 0 and len(packed) > max_chunks:
                # Too many chunks: grow the target so the result fits max_chunks.
                total = len("".join(parts))
                new_target = max(target_chars, int(total / max_chunks) + 1)
                packed = pack(new_target)
            return packed

        # Single-message path; closes over backend/clean_text/voice/emotion
        # assigned later in the try block below.
        async def send_message_single_sentences() -> Tuple[bool, str]:
            result = await self._execute_backend(backend, clean_text, voice, emotion)
            if result.success:
                # Detailed action record so the planner avoids re-running this action.
                text_preview = clean_text[:80] + "..." if len(clean_text) > 80 else clean_text
                await self.store_action_info(
                    action_build_into_prompt=True,
                    action_prompt_display=f"已用语音回复:{text_preview}",
                    action_done=True
                )
            else:
                await self._send_error(f"语音合成失败: {result.message}")
            return result.success, result.message

        # Segmented path; closes over sentences/split_delay assigned below.
        async def send_message_with_splited_sentences() -> Tuple[bool, str]:
            # Segmented mode: send each sentence as its own voice message.
            if len(sentences) > 1:
                logger.info(f"{self.log_prefix} 分段发送模式:共 {len(sentences)}")
                success_count = 0
                all_sentences_text = []
                for i, sentence in enumerate(sentences):
                    if not sentence.strip():
                        continue
                    logger.debug(f"{self.log_prefix} 发送第 {i + 1}/{len(sentences)} 句: {sentence[:30]}...")
                    result = await self._execute_backend(backend, sentence, voice, emotion)
                    if result.success:
                        success_count += 1
                        all_sentences_text.append(sentence)
                    else:
                        logger.warning(f"{self.log_prefix}{i + 1} 句发送失败: {result.message}")
                    # Pause between consecutive segments.
                    if i < len(sentences) - 1 and split_delay > 0:
                        await asyncio.sleep(split_delay)
                # Record the action outcome.
                if success_count > 0:
                    # Detailed action record so the planner avoids re-running this action.
                    display_text = "".join(all_sentences_text)
                    text_preview = display_text[:80] + "..." if len(display_text) > 80 else display_text
                    await self.store_action_info(
                        action_build_into_prompt=True,
                        action_prompt_display=f"已用语音回复({success_count}段):{text_preview}",
                        action_done=True
                    )
                    return True, f"成功发送 {success_count}/{len(sentences)} 条语音"
                else:
                    await self._send_error("语音合成失败")
                    return False, "所有语音发送失败"
            else:
                # Only one sentence: fall back to single-message mode.
                return await send_message_single_sentences()

        # NOTE(review): bare string below is NOT this method's docstring (it is
        # not the first statement); it is a harmless no-op expression.
        """执行TTS语音合成"""
        try:
            raw_text = self.action_data.get("text", "").strip()
            voice = self.action_data.get("voice", "")
            reason = self.action_data.get("reason", "")
            emotion = self.action_data.get("emotion", "")
            use_replyer = self.get_config(ConfigKeys.GENERAL_USE_REPLYER_REWRITE, True)
            # Obtain the final text to vocalize.
            success, final_text = await self._get_final_text(raw_text, reason, use_replyer)
            if not success or not final_text:
                await self._send_error("无法生成语音内容")
                return False, "文本为空"
            # Probability gate (skipped when a force keyword is present).
            force_trigger = self._check_force_trigger(final_text)
            if not force_trigger and not self._probability_check(final_text):
                logger.info(f"{self.log_prefix} 概率检查未通过,使用文字回复")
                await self.send_text(final_text)
                text_preview = final_text[:80] + "..." if len(final_text) > 80 else final_text
                await self.store_action_info(
                    action_build_into_prompt=True,
                    action_prompt_display=f"已用文字回复(语音概率未触发):{text_preview}",
                    action_done=True
                )
                return True, "概率检查未通过,已发送文字回复"
            # Character cleanup (strip special characters, replace slang).
            # The length limit should already be respected by the LLM; only
            # character cleaning happens here.
            clean_text = TTSTextUtils.clean_text(final_text, self.max_text_length)
            if not clean_text:
                await self._send_error("文本处理后为空")
                return False, "文本处理后为空"
            # Still over the limit: the LLM ignored the constraint — degrade to text.
            if len(clean_text) > self.max_text_length:
                logger.warning(
                    f"{self.log_prefix} LLM生成的文本超过长度限制 "
                    f"({len(clean_text)} > {self.max_text_length}字符),降级为文字回复"
                )
                await self.send_text(clean_text)
                text_preview = clean_text[:80] + "..." if len(clean_text) > 80 else clean_text
                await self.store_action_info(
                    action_build_into_prompt=True,
                    action_prompt_display=f"已用文字回复(内容过长):{text_preview}",
                    action_done=True
                )
                return True, "内容超过语音长度限制,已改为文字回复"
            # Resolve backend and run it.
            backend = self._get_default_backend()
            logger.info(f"{self.log_prefix} 使用配置的默认后端: {backend}")
            # Segmented-sending configuration.
            split_sentences = self.get_config(ConfigKeys.GENERAL_SPLIT_SENTENCES, True)
            split_delay = self.get_config(ConfigKeys.GENERAL_SPLIT_DELAY, 0.3)
            sentences = None
            # Prefer split markers inserted by the smart-segmentation plugin.
            if '|||SPLIT|||' in clean_text:
                logger.info("found split marker from smart segmentation plugin")
                sentences = [s.strip() for s in clean_text.split("|||SPLIT|||") if s.strip()]
                # If the upstream splitter is too aggressive, pack back into fewer segments.
                max_segments = int(self.get_config(ConfigKeys.GENERAL_SPLIT_MAX_SEGMENTS, 3) or 3)
                chunk_chars = int(self.get_config(ConfigKeys.GENERAL_SPLIT_CHUNK_CHARS, 110) or 110)
                if max_segments and max_segments > 0 and len(sentences) > max_segments:
                    sentences = _chunk_sentences(sentences, target_chars=chunk_chars, max_chunks=max_segments)
                return await send_message_with_splited_sentences()
            elif split_sentences:
                # Auto-split: short texts stay whole; long texts are split into
                # at most N segments to avoid spamming many short voice clips.
                min_total = int(self.get_config(ConfigKeys.GENERAL_SPLIT_MIN_TOTAL_CHARS, 120) or 120)
                min_sentence = int(self.get_config(ConfigKeys.GENERAL_SPLIT_MIN_SENTENCE_CHARS, 6) or 6)
                max_segments = int(self.get_config(ConfigKeys.GENERAL_SPLIT_MAX_SEGMENTS, 3) or 3)
                chunk_chars = int(self.get_config(ConfigKeys.GENERAL_SPLIT_CHUNK_CHARS, 110) or 110)
                if len(clean_text) < min_total:
                    sentences = [clean_text]
                else:
                    sentences = TTSTextUtils.split_sentences(clean_text, min_length=min_sentence)
                    if max_segments and max_segments > 0:
                        sentences = _chunk_sentences(sentences, target_chars=chunk_chars, max_chunks=max_segments)
                return await send_message_with_splited_sentences()
            else:
                # Single-message mode.
                return await send_message_single_sentences()
        except Exception as e:
            error_msg = str(e)
            logger.error(f"{self.log_prefix} TTS语音合成出错: {error_msg}")
            await self._send_error(f"语音合成出错: {error_msg}")
            return False, error_msg
class UnifiedTTSCommand(BaseCommand, TTSExecutorMixin):
    """Unified TTS Command - triggered manually by users.

    Syntax: /tts <text> [-v <voice>] [backend]; prefix aliases such as
    /gsv2p, /doubao, /comfyui_voiceclone preselect a backend.
    """
    command_name = "unified_tts_command"
    command_description = "将文本转换为语音,支持多种后端和音色"
    command_pattern = r"^/(?:tts|voice|gsv2p|gptsovits|doubao|cosyvoice|comfyui|comfyui_voiceclone|comfyui_customvoice)\s+(?P<text>.+?)(?:\s+-v\s+(?P<voice>\S+))?(?:\s+(?P<backend>ai_voice|gsv2p|gpt_sovits|doubao|cosyvoice|comfyui|comfyui_voiceclone|comfyui_customvoice))?$"
    command_help = "将文本转换为语音。用法:/tts 你好世界 [-v 音色] [后端]"
    command_examples = [
        "/tts 你好,世界!",
        "/tts 今天天气不错 -v 小新",
        "/gptsovits 你好世界 -v default",
        "/cosyvoice 你好世界 -v 四川话",
        "/tts 试试 -v 温柔妹妹 ai_voice",
        "/gsv2p 你好世界",
        "/doubao 你好世界 -v 开心"
    ]
    intercept_message = True

    async def _send_help(self):
        """Send the built-in help text (with the current default backend) to the chat."""
        default_backend = self._get_default_backend()
        help_text = """【TTS语音合成插件帮助】
📝 基本语法:
/tts <文本> [-v <音色>] [后端]
🎯 快捷命令:
/tts <文本> 使用默认后端
/voice <文本> 使用 AI Voice
/gsv2p <文本> 使用 GSV2P
/gptsovits <文本> 使用 GPT-SoVITS
/doubao <文本> 使用 豆包语音
/cosyvoice <文本> 使用 CosyVoice
/comfyui <文本> 使用 ComfyUI(本地工作流)
/comfyui_voiceclone <文本> 使用 ComfyUI VoiceClone
/comfyui_customvoice <文本> 使用 ComfyUI CustomVoice
🔊 可用后端:
• ai_voice - MaiCore内置仅群聊
• gsv2p - 云端API高质量
• gpt_sovits - 本地服务,可定制
• doubao - 火山引擎,支持情感
• cosyvoice - 阿里云,支持方言
• comfyui - 本地ComfyUI工作流(自动按 style.mode 选择)
• comfyui_voiceclone - 本地ComfyUI工作流(仅 VoiceClone)
• comfyui_customvoice - 本地ComfyUI工作流(仅 CustomVoice)
🎭 音色/情感参数(-v
• AI Voice: 小新、温柔妹妹、霸道总裁、妲己 等22种
• GSV2P: 原神-中文-派蒙_ZH 等见API文档
• 豆包: 开心、生气、伤心、撒娇、严肃 等
• CosyVoice: 广东话、四川话、东北话、开心、慢速 等
📌 示例:
/tts 你好世界
/tts 今天真开心 -v 开心
/gptsovits 这是本地语音合成
/doubao 我生气了 -v 生气
/cosyvoice 你好 -v 广东话
/voice 测试一下 -v 温柔妹妹
⚙️ 当前默认后端:""" + default_backend
        await self.send_text(help_text)

    def _determine_backend(self, user_backend: str) -> Tuple[str, str]:
        """
        Resolve which backend to use.

        Priority: command prefix > explicit backend argument > configured default.

        Returns:
            (backend_name, source_description)
        """
        # 1. Command prefix.
        raw_text = self.message.raw_message if self.message.raw_message else self.message.processed_plain_text
        if raw_text:
            # Command prefix -> backend mapping.
            prefix_backend_map = {
                "/gsv2p": "gsv2p",
                "/gptsovits": "gpt_sovits",
                "/doubao": "doubao",
                "/cosyvoice": "cosyvoice",
                "/voice": "ai_voice",
                "/comfyui": "comfyui",
                "/comfyui_voiceclone": "comfyui_voiceclone",
                "/comfyui_customvoice": "comfyui_customvoice",
            }
            # Fix: match the longest prefix first. With plain insertion order,
            # "/comfyui" matched via startswith() before "/comfyui_voiceclone"
            # and "/comfyui_customvoice" could be tried, misrouting those
            # commands to the generic comfyui backend.
            for prefix, backend in sorted(
                prefix_backend_map.items(), key=lambda kv: len(kv[0]), reverse=True
            ):
                if raw_text.startswith(prefix):
                    return backend, f"命令前缀 {prefix}"
        # 2. Explicit backend argument.
        if user_backend and user_backend in VALID_BACKENDS:
            return user_backend, f"命令参数 {user_backend}"
        # 3. Configured default.
        return self._get_default_backend(), "配置文件"

    async def execute(self) -> Tuple[bool, str, bool]:
        """Execute the TTS command. Returns (success, message, intercept)."""
        try:
            text = self.matched_groups.get("text", "").strip()
            voice = self.matched_groups.get("voice", "")
            user_backend = self.matched_groups.get("backend", "")
            # Help request.
            if text.lower() == "help":
                await self._send_help()
                return True, "显示帮助信息", True
            if not text:
                await self._send_error("请输入要转换为语音的文本内容")
                return False, "缺少文本内容", True
            # Resolve the backend.
            backend, backend_source = self._determine_backend(user_backend)
            # Clean the text. Fallback default aligned to 200 to match the
            # config schema default and UnifiedTTSAction (was 500, inconsistent).
            max_length = self.get_config(ConfigKeys.GENERAL_MAX_TEXT_LENGTH, 200)
            clean_text = TTSTextUtils.clean_text(text, max_length)
            if not clean_text:
                await self._send_error("文本处理后为空")
                return False, "文本处理后为空", True
            # Length cap: degrade to a plain-text message when exceeded.
            if len(clean_text) > max_length:
                await self.send_text(
                    f"文本过长({len(clean_text)}字符),"
                    f"超过语音合成限制({max_length}字符),"
                    f"已改为文字发送。\n\n{clean_text}"
                )
                return True, "文本过长,已改为文字发送", True
            logger.info(f"{self.log_prefix} 执行TTS命令 (后端: {backend} [来源: {backend_source}], 音色: {voice})")
            # For CosyVoice and Doubao the -v argument actually carries an
            # emotion/dialect, not a voice id.
            if backend in ["cosyvoice", "doubao"]:
                result = await self._execute_backend(backend, clean_text, voice="", emotion=voice)
            else:
                result = await self._execute_backend(backend, clean_text, voice)
            if not result.success:
                await self._send_error(f"语音合成失败: {result.message}")
            return result.success, result.message, True
        except Exception as e:
            logger.error(f"{self.log_prefix} TTS命令执行出错: {e}")
            await self._send_error(f"语音合成出错: {e}")
            return False, f"执行出错: {e}", True
class TTSInstructCommand(BaseCommand):
    """Generate a CustomVoice instruct (debug/preview helper)."""
    command_name = "tts_instruct_command"
    command_description = "根据待朗读文本生成 CustomVoice 的 instruct情绪/语速/停顿)"
    # Fix: the pattern was r"^/tts_instruct\\s+..." — inside a raw string,
    # "\\s" is a literal backslash followed by "s", so the command could
    # never match "/tts_instruct <text>". It must be the regex class \s.
    command_pattern = r"^/tts_instruct\s+(?P<text>.+?)$"
    command_help = "用法:/tts_instruct <文本>"
    command_examples = [
        "/tts_instruct 早上好,今天也要加油。",
        "/tts_instruct えっ?本当にそうなの?",
    ]
    intercept_message = True

    async def execute(self) -> Tuple[bool, str, int]:
        """Generate and send an instruct string for the matched text.

        Returns (success, message, code).
        """
        try:
            text = (self.matched_groups.get("text") or "").strip()
            if not text:
                await self.send_text("请输入要生成 instruct 的文本")
                return False, "缺少文本", 2
            # Use the same logic as ComfyUI backend auto_instruct.
            # Local import keeps backend loading lazy; TTSTextUtils is already
            # imported at module level, so the redundant local import was removed.
            from .backends.comfyui import ComfyUIBackend
            detected = TTSTextUtils.detect_language(text)
            chat_stream = getattr(self.message, "chat_stream", None)
            chat_id = getattr(chat_stream, "stream_id", None) if chat_stream else None
            backend = ComfyUIBackend(self.get_config, log_prefix=self.log_prefix)
            instruct = await backend._infer_instruct(
                text=text,
                detected_lang=detected,
                chat_stream=chat_stream,
                chat_id=chat_id,
                style_name="__command__",
            )
            if not instruct:
                await self.send_text("instruct 生成失败(可能未启用 comfyui.auto_instruct_enabled 或 LLM 不可用)")
                return False, "instruct 生成失败", 2
            await self.send_text(instruct)
            return True, "instruct 已生成", 2
        except Exception as e:
            await self.send_text(f"instruct 生成异常: {e}")
            return False, str(e), 2
@register_plugin
class UnifiedTTSPlugin(BasePlugin):
    """Unified TTS plugin - multi-backend text-to-speech (AI Voice, GSV2P, GPT-SoVITS, Doubao, CosyVoice, ComfyUI)."""
    plugin_name = "tts_voice_plugin"
    plugin_description = "统一TTS语音合成插件支持AI Voice、GSV2P、GPT-SoVITS、豆包语音多种后端"
    plugin_version = "3.2.3"
    plugin_author = "靓仔"
    enable_plugin = True
    config_file_name = "config.toml"
    dependencies = []
    python_dependencies = ["aiohttp"]
    # Section headers shown in the generated config file.
    config_section_descriptions = {
        "plugin": "插件基本配置",
        "general": "通用设置",
        "components": "组件启用控制",
        "probability": "概率控制配置",
        "ai_voice": "AI Voice后端配置",
        "gsv2p": "GSV2P后端配置",
        "gpt_sovits": "GPT-SoVITS后端配置",
        "doubao": "豆包语音后端配置",
        "cosyvoice": "CosyVoice后端配置",
        "comfyui": "ComfyUI工作流API后端配置"
    }
    # Declarative schema: one sub-dict per config section.
    config_schema = {
        "plugin": {
            "enabled": ConfigField(type=bool, default=True, description="是否启用插件"),
            "config_version": ConfigField(type=str, default="3.2.3", description="配置文件版本")
        },
        "general": {
            "default_backend": ConfigField(
                type=str, default="cosyvoice",
                description="默认TTS后端 (ai_voice/gsv2p/gpt_sovits/doubao/cosyvoice/comfyui/comfyui_voiceclone/comfyui_customvoice)"
            ),
            "timeout": ConfigField(type=int, default=60, description="请求超时时间(秒)"),
            "max_text_length": ConfigField(
                type=int, default=200,
                description="最大文本长度该限制会在调用LLM时注入到prompt中让LLM直接生成符合长度的回复而不是被动截断"
            ),
            "use_replyer_rewrite": ConfigField(
                type=bool, default=True,
                description="是否使用replyer润色语音内容"
            ),
            "audio_output_dir": ConfigField(
                type=str, default="",
                description="音频文件输出目录(支持相对路径和绝对路径,留空使用项目根目录)"
            ),
            "use_base64_audio": ConfigField(
                type=bool, default=True,
                description="是否使用base64编码发送音频备选方案"
            ),
            "split_sentences": ConfigField(
                type=bool, default=True,
                description="是否分段发送语音(每句话单独发送一条语音,避免长语音播放问题)"
            ),
            "split_delay": ConfigField(
                type=float, default=0.3,
                description="分段发送时每条语音之间的延迟(秒)"
            ),
            "split_min_total_chars": ConfigField(
                type=int, default=120,
                description="自动分段启用阈值:文本长度小于该值时不分段(避免短句被切成多段)",
            ),
            "split_min_sentence_chars": ConfigField(
                type=int, default=6,
                description="句子最小长度:过短片段会合并到前一句(用于减少碎片段)",
            ),
            "split_max_segments": ConfigField(
                type=int, default=3,
                description="自动分段最大段数避免刷屏式多段语音。0 表示不限制。",
            ),
            "split_chunk_chars": ConfigField(
                type=int, default=110,
                description="自动分段打包目标长度(字符)。用于把多句合并成更少段。",
            ),
            "send_error_messages": ConfigField(
                type=bool, default=True,
                description="是否发送错误提示消息(关闭后语音合成失败时不会发送错误信息给用户)"
            )
        },
        "components": {
            "action_enabled": ConfigField(type=bool, default=True, description="是否启用Action组件"),
            "command_enabled": ConfigField(type=bool, default=True, description="是否启用Command组件"),
            "instruct_command_enabled": ConfigField(type=bool, default=True, description="是否启用instruct调试命令组件(/tts_instruct)")
        },
        "probability": {
            "enabled": ConfigField(type=bool, default=False, description="是否启用概率控制"),
            "base_probability": ConfigField(type=float, default=1.0, description="基础触发概率"),
            "keyword_force_trigger": ConfigField(type=bool, default=True, description="关键词强制触发"),
            "force_keywords": ConfigField(
                type=list,
                default=["一定要用语音", "必须语音", "语音回复我", "务必用语音"],
                description="强制触发关键词"
            )
        },
        "ai_voice": {
            "default_character": ConfigField(
                type=str,
                default="邻家小妹",
                description="默认音色(可选:小新、猴哥、四郎、东北老妹儿、广西大表哥、妲己、霸道总裁、酥心御姐、说书先生、憨憨小弟、憨厚老哥、吕布、元气少女、文艺少女、磁性大叔、邻家小妹、低沉男声、傲娇少女、爹系男友、暖心姐姐、温柔妹妹、书香少女)"
            )
        },
        "gsv2p": {
            "api_url": ConfigField(
                type=str, default="https://gsv2p.acgnai.top/v1/audio/speech",
                description="GSV2P API地址"
            ),
            "api_token": ConfigField(type=str, default="", description="API认证Token"),
            "default_voice": ConfigField(type=str, default="原神-中文-派蒙_ZH", description="默认音色"),
            "timeout": ConfigField(type=int, default=120, description="API请求超时"),
            "model": ConfigField(type=str, default="tts-v4", description="TTS模型"),
            "response_format": ConfigField(type=str, default="wav", description="音频格式"),
            "speed": ConfigField(type=float, default=1.0, description="语音速度")
        },
        "gpt_sovits": {
            "server": ConfigField(
                type=str, default="http://127.0.0.1:9880",
                description="GPT-SoVITS服务地址"
            ),
            "styles": ConfigField(
                type=list,
                default=[
                    {
                        "name": "default",
                        "refer_wav": "",
                        "prompt_text": "",
                        "prompt_language": "zh",
                        "gpt_weights": "",
                        "sovits_weights": ""
                    }
                ],
                description="语音风格配置",
                item_type="object",
                item_fields={
                    "name": {"type": "string", "label": "风格名称", "required": True},
                    "refer_wav": {"type": "string", "label": "参考音频路径", "required": True},
                    "prompt_text": {"type": "string", "label": "参考文本", "required": True},
                    "prompt_language": {"type": "string", "label": "参考语言", "default": "zh"},
                    "gpt_weights": {"type": "string", "label": "GPT模型权重路径可选", "required": False},
                    "sovits_weights": {"type": "string", "label": "SoVITS模型权重路径可选", "required": False}
                }
            )
        },
        "doubao": {
            "api_url": ConfigField(
                type=str,
                default="https://openspeech.bytedance.com/api/v3/tts/unidirectional",
                description="豆包语音API地址"
            ),
            "app_id": ConfigField(type=str, default="", description="豆包APP ID"),
            "access_key": ConfigField(type=str, default="", description="豆包Access Key"),
            "resource_id": ConfigField(type=str, default="seed-tts-2.0", description="豆包Resource ID"),
            "default_voice": ConfigField(
                type=str, default="zh_female_vv_uranus_bigtts",
                description="默认音色"
            ),
            "timeout": ConfigField(type=int, default=60, description="API请求超时"),
            "audio_format": ConfigField(type=str, default="wav", description="音频格式"),
            "sample_rate": ConfigField(type=int, default=24000, description="采样率"),
            "bitrate": ConfigField(type=int, default=128000, description="比特率"),
            "speed": ConfigField(type=float, default=None, description="语音速度(可选)"),
            "volume": ConfigField(type=float, default=None, description="音量(可选)"),
            "context_texts": ConfigField(
                type=list, default=None,
                description="上下文辅助文本可选仅豆包2.0模型)"
            )
        },
        "cosyvoice": {
            "gradio_url": ConfigField(
                type=str,
                default="https://funaudiollm-fun-cosyvoice3-0-5b.ms.show/",
                description="Gradio API地址"
            ),
            "default_mode": ConfigField(
                type=str,
                default="3s极速复刻",
                description="推理模式3s极速复刻/自然语言控制)"
            ),
            "default_instruct": ConfigField(
                type=str,
                default="You are a helpful assistant. 请用广东话表达。<|endofprompt|>",
                description="默认指令(用于自然语言控制模式)"
            ),
            "reference_audio": ConfigField(
                type=str,
                default="",
                description="参考音频路径用于3s极速复刻模式"
            ),
            "prompt_text": ConfigField(
                type=str,
                default="",
                description="提示文本用于3s极速复刻模式"
            ),
            "timeout": ConfigField(type=int, default=300, description="API请求超时"),
            "audio_format": ConfigField(type=str, default="wav", description="音频格式")
        },
        "comfyui": {
            "server": ConfigField(
                type=str,
                default="http://127.0.0.1:8188",
                description="ComfyUI 服务地址(示例: http://127.0.0.1:8188",
            ),
            # NOTE(review): machine-specific absolute default paths below
            # (input_dir, mlx_python, mlx_cli) — confirm these should ship as
            # defaults rather than empty strings.
            "input_dir": ConfigField(
                type=str,
                default="/Users/xenon/Downloads/seiun_tts/ComfyUI/ComfyUI/input",
                description="ComfyUI input 目录用于放参考音频LoadAudio 会从这里读)",
            ),
            "timeout": ConfigField(type=int, default=120, description="ComfyUI 请求超时(秒)"),
            "audio_quality": ConfigField(
                type=str,
                default="128k",
                description="输出 MP3 质量SaveAudioMP3 quality: V0/128k/320k",
            ),
            "mlx_python": ConfigField(
                type=str,
                default="/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/.venv/bin/python",
                description="MLX Qwen3-TTS venv python 路径(用于 ComfyUI-MLX 节点子进程)",
            ),
            "mlx_cli": ConfigField(
                type=str,
                default="/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/mlx_voice_clone_cli.py",
                description="mlx_voice_clone_cli.py 路径",
            ),
            "default_style": ConfigField(type=str, default="default", description="默认风格名称"),
            "voiceclone_default_style": ConfigField(
                type=str,
                default="",
                description="VoiceClone 专用默认风格名称(用于 comfyui_voiceclone 后端;留空则回退到 default_style",
            ),
            "customvoice_default_style": ConfigField(
                type=str,
                default="",
                description="CustomVoice 专用默认风格名称(用于 comfyui_customvoice 后端;留空则回退到 default_style",
            ),
            "auto_instruct_enabled": ConfigField(
                type=bool,
                default=False,
                description="是否启用 CustomVoice instruct 自动推断(使用 MaiBot 的 LLM 接口)",
            ),
            "auto_instruct_max_chars": ConfigField(
                type=int,
                default=120,
                description="自动推断 instruct 的最大长度(字符)。建议 80-160太短会导致情绪/表演提示被截断。",
            ),
            "auto_instruct_prompt": ConfigField(
                type=str,
                default="",
                description="自定义 instruct 推断 prompt留空使用内置模板",
            ),
            "auto_instruct_base_tone": ConfigField(
                type=str,
                default="",
                description="自动推断 instruct 时固定附加的基调描述(会作为 `基调=...;` 前缀插入;会自动清洗为单行,且不会包含 `;`/`=`",
            ),
            "pause_linebreak": ConfigField(type=float, default=0.0, description="换行停顿(秒)"),
            "period_pause": ConfigField(type=float, default=0.0, description="句号停顿(秒)"),
            "comma_pause": ConfigField(type=float, default=0.0, description="逗号停顿(秒)"),
            "question_pause": ConfigField(type=float, default=0.0, description="问号停顿(秒)"),
            "hyphen_pause": ConfigField(type=float, default=0.0, description="连字符停顿(秒)"),
            "styles": ConfigField(
                type=list,
                default=[
                    {
                        "name": "default",
                        "refer_wav": "",
                        "prompt_text": "",
                        "language": "",
                        "model_choice": "1.7B",
                        "precision": "bf16",
                        "seed": 0,
                        "max_new_tokens": 2048,
                        "top_p": 0.8,
                        "top_k": 20,
                        "temperature": 1.0,
                        "repetition_penalty": 1.05,
                    }
                ],
                description="ComfyUI VoiceClone 风格配置(参考音频+逐字稿)",
                item_type="object",
                item_fields={
                    "name": {"type": "string", "label": "风格名称", "required": True},
                    "mode": {"type": "string", "label": "模式(voice_clone/custom_voice)", "required": False},
                    "refer_wav": {"type": "string", "label": "参考音频路径", "required": True},
                    "prompt_text": {"type": "string", "label": "参考文本(逐字稿)", "required": True},
                    "language": {"type": "string", "label": "语言(可选: Auto/Chinese/English/...) ", "required": False},
                    "model_choice": {"type": "string", "label": "模型(0.6B/1.7B)", "required": False},
                    "precision": {"type": "string", "label": "精度(bf16/fp32)", "required": False},
                    "model_path": {"type": "string", "label": "CustomVoice模型路径", "required": False},
                    "speaker": {"type": "string", "label": "CustomVoice说话人", "required": False},
                    "instruct": {"type": "string", "label": "CustomVoice指令(或__AUTO__)", "required": False},
                    "auto_instruct": {"type": "boolean", "label": "按style启用auto_instruct", "required": False},
                    "speed": {"type": "number", "label": "speed", "required": False},
                    "seed": {"type": "number", "label": "seed", "required": False},
                    "max_new_tokens": {"type": "number", "label": "max_new_tokens", "required": False},
                    "top_p": {"type": "number", "label": "top_p", "required": False},
                    "top_k": {"type": "number", "label": "top_k", "required": False},
                    "temperature": {"type": "number", "label": "temperature", "required": False},
                    "repetition_penalty": {"type": "number", "label": "repetition_penalty", "required": False},
                },
            ),
        }
    }

    def get_plugin_components(self) -> List[Tuple[ComponentInfo, Type]]:
        """Return the enabled plugin components (Action/Command/Instruct command)."""
        components = []
        try:
            action_enabled = self.get_config(ConfigKeys.COMPONENTS_ACTION_ENABLED, True)
            command_enabled = self.get_config(ConfigKeys.COMPONENTS_COMMAND_ENABLED, True)
            instruct_enabled = self.get_config(ConfigKeys.COMPONENTS_INSTRUCT_COMMAND_ENABLED, True)
        except AttributeError:
            # Config not ready yet: enable everything by default.
            action_enabled = True
            command_enabled = True
            instruct_enabled = True
        if action_enabled:
            components.append((UnifiedTTSAction.get_action_info(), UnifiedTTSAction))
        if command_enabled:
            components.append((UnifiedTTSCommand.get_command_info(), UnifiedTTSCommand))
        if instruct_enabled:
            components.append((TTSInstructCommand.get_command_info(), TTSInstructCommand))
        return components