MaiBot/src/jargon/jargon_miner.py

import time
import json
import asyncio
from collections import OrderedDict
from typing import List, Dict, Optional, Any
from json_repair import repair_json
from peewee import fn

from src.common.logger import get_logger
from src.common.database.database_model import Jargon
from src.llm_models.utils_model import LLMRequest
from src.config.config import model_config, global_config
from src.chat.message_receive.chat_stream import get_chat_manager
from src.plugin_system.apis import llm_api
from src.chat.utils.chat_message_builder import (
    build_readable_messages_with_id,
    get_raw_msg_by_timestamp_with_chat_inclusive,
    get_raw_msg_before_timestamp_with_chat,
    build_readable_messages_with_list,
)
from src.chat.utils.prompt_builder import Prompt, global_prompt_manager
from src.chat.utils.utils import parse_platform_accounts


logger = get_logger("jargon")


def _contains_bot_self_name(content: str) -> bool:
    """
    判断词条是否包含机器人的昵称或别名
    """
    if not content:
        return False

    bot_config = getattr(global_config, "bot", None)
    if not bot_config:
        return False

    target = content.strip().lower()
    nickname = str(getattr(bot_config, "nickname", "") or "").strip().lower()
    alias_names = [str(alias or "").strip().lower() for alias in getattr(bot_config, "alias_names", []) or []]

    candidates = [name for name in [nickname, *alias_names] if name]

    return any(name in target for name in candidates if target)


def _format_context_message(msg: Any, seq_index: int) -> str:
    """
    将单条消息格式化为带序号的上下文行
    """
    if msg is None:
        return ""

    text = (getattr(msg, "display_message", None) or getattr(msg, "processed_plain_text", None) or "").strip()
    if not text:
        return ""

    user_info = getattr(msg, "user_info", None)
    nickname = ""
    if user_info:
        nickname = getattr(user_info, "user_nickname", "") or getattr(user_info, "user_id", "")

    if not nickname:
        nickname = getattr(msg, "user_nickname", "") or getattr(msg, "user_id", "") or "某人"

    return f"{nickname}: {text}"


def _build_context_paragraph(messages: List[Any], center_index: int) -> Optional[str]:
    """
    构建包含中心消息上下文的段落（前3条+后3条）
    """
    if not messages or center_index < 0 or center_index >= len(messages):
        return None

    context_start = max(0, center_index - 3)
    context_end = min(len(messages), center_index + 1 + 3)

    context_lines: List[str] = []
    for idx in range(context_start, context_end):
        formatted_line = _format_context_message(messages[idx], idx + 1)
        if formatted_line:
            context_lines.append(formatted_line)

    if not context_lines:
        return None

    paragraph = "\n".join(context_lines).strip()
    return paragraph or None


def _is_bot_message(msg: Any) -> bool:
    """判断消息是否来自机器人自身"""
    if msg is None:
        return False

    bot_config = getattr(global_config, "bot", None)
    if not bot_config:
        return False

    platform = (
        str(getattr(msg, "user_platform", "") or getattr(getattr(msg, "user_info", None), "platform", "") or "")
        .strip()
        .lower()
    )
    user_id = (
        str(getattr(msg, "user_id", "") or getattr(getattr(msg, "user_info", None), "user_id", "") or "")
        .strip()
    )

    if not platform or not user_id:
        return False

    platform_accounts = {}
    try:
        platform_accounts = parse_platform_accounts(getattr(bot_config, "platforms", []) or [])
    except Exception:
        platform_accounts = {}

    bot_accounts: Dict[str, str] = {}
    qq_account = str(getattr(bot_config, "qq_account", "") or "").strip()
    if qq_account:
        bot_accounts["qq"] = qq_account

    telegram_account = str(getattr(bot_config, "telegram_account", "") or "").strip()
    if telegram_account:
        bot_accounts["telegram"] = telegram_account

    for plat, account in platform_accounts.items():
        if account and plat not in bot_accounts:
            bot_accounts[plat] = account

    bot_account = bot_accounts.get(platform)
    return bool(bot_account and user_id == bot_account)


def _has_adjacent_bot_message(messages: List[Any], center_index: int) -> bool:
    """检查目标消息的上一条或下一条是否为机器人发言"""
    for neighbor in (center_index - 1, center_index + 1):
        if 0 <= neighbor < len(messages) and _is_bot_message(messages[neighbor]):
            return True
    return False


def _init_prompt() -> None:
    prompt_str = """
**聊天内容，其中的{bot_name}的发言内容是你自己的发言，[msg_id] 是消息ID**
{chat_str}

请从上面这段聊天内容中提取"可能是黑话"的候选项（黑话/俚语/网络缩写/口头禅）。
- 必须为对话中真实出现过的短词或短语
- 必须是你无法理解含义的词语，没有明确含义的词语
- 请不要选择有明确含义，或者含义清晰的词语
- 排除：人名、@、表情包/图片中的内容、纯标点、常规功能词（如的、了、呢、啊等）
- 每个词条长度建议 2-8 个字符（不强制），尽量短小
- 合并重复项，去重

黑话必须为以下几种类型：
- 由字母构成的，汉语拼音首字母的简写词，例如：nb、yyds、xswl
- 英文词语的缩写，用英文字母概括一个词汇或含义，例如：CPU、GPU、API
- 中文词语的缩写，用几个汉字概括一个词汇或含义，例如：社死、内卷

以 JSON 数组输出，元素为对象（严格按以下结构）：
请你提取出可能的黑话，最多10
[
  {{"content": "词条", "msg_id": "m12"}},  // msg_id 必须与上方聊天中展示的ID完全一致
  {{"content": "词条2", "msg_id": "m15"}}
]

现在请输出：
"""
    Prompt(prompt_str, "extract_jargon_prompt")


def _init_inference_prompts() -> None:
    """初始化含义推断相关的prompt"""
    # Prompt 1: 基于raw_content和content推断
    prompt1_str = """
**词条内容**
{content}
**词条出现的上下文。其中的{bot_name}的发言内容是你自己的发言**
{raw_content_list}

请根据上下文，推断"{content}"这个词条的含义。
- 如果这是一个黑话、俚语或网络用语，请推断其含义
- 如果含义明确（常规词汇），也请说明
- 如果上下文信息不足，无法推断含义，请设置 no_info 为 true

以 JSON 格式输出：
{{
  "meaning": "详细含义说明（包含使用场景、来源、具体解释等）",
  "no_info": false
}}
注意：如果信息不足无法推断，请设置 "no_info": true，此时 meaning 可以为空字符串
"""
    Prompt(prompt1_str, "jargon_inference_with_context_prompt")

    # Prompt 2: 仅基于content推断
    prompt2_str = """
**词条内容**
{content}

请仅根据这个词条本身，推断其含义。
- 如果这是一个黑话、俚语或网络用语，请推断其含义
- 如果含义明确（常规词汇），也请说明

以 JSON 格式输出：
{{
  "meaning": "详细含义说明（包含使用场景、来源、具体解释等）"
}}
"""
    Prompt(prompt2_str, "jargon_inference_content_only_prompt")

    # Prompt 3: 比较两个推断结果
    prompt3_str = """
**推断结果1（基于上下文）**
{inference1}

**推断结果2（仅基于词条）**
{inference2}

请比较这两个推断结果，判断它们是否相同或类似。
- 如果两个推断结果的"含义"相同或类似，说明这个词条不是黑话（含义明确）
- 如果两个推断结果有差异，说明这个词条可能是黑话（需要上下文才能理解）

以 JSON 格式输出：
{{
  "is_similar": true/false,
  "reason": "判断理由"
}}
"""
    Prompt(prompt3_str, "jargon_compare_inference_prompt")


_init_prompt()
_init_inference_prompts()


async def _enrich_raw_content_if_needed(
    content: str,
    raw_content_list: List[str],
    chat_id: str,
    messages: List[Any],
    extraction_start_time: float,
    extraction_end_time: float,
) -> List[str]:
    """
    检查raw_content是否只包含黑话本身，如果是，则获取该消息的前三条消息作为原始内容

    Args:
        content: 黑话内容
        raw_content_list: 原始raw_content列表
        chat_id: 聊天ID
        messages: 当前时间窗口内的消息列表
        extraction_start_time: 提取开始时间
        extraction_end_time: 提取结束时间

    Returns:
        处理后的raw_content列表
    """
    enriched_list = []

    for raw_content in raw_content_list:
        # 检查raw_content是否只包含黑话本身（去除空白字符后比较）
        raw_content_clean = raw_content.strip()
        content_clean = content.strip()

        # 如果raw_content只包含黑话本身（可能有一些标点或空白），则尝试获取上下文
        # 去除所有空白字符后比较，确保只包含黑话本身
        raw_content_normalized = raw_content_clean.replace(" ", "").replace("\n", "").replace("\t", "")
        content_normalized = content_clean.replace(" ", "").replace("\n", "").replace("\t", "")

        if raw_content_normalized == content_normalized:
            # 在消息列表中查找只包含该黑话的消息（去除空白后比较）
            target_message = None
            for msg in messages:
                msg_content = (msg.processed_plain_text or msg.display_message or "").strip()
                msg_content_normalized = msg_content.replace(" ", "").replace("\n", "").replace("\t", "")
                # 检查消息内容是否只包含黑话本身（去除空白后完全匹配）
                if msg_content_normalized == content_normalized:
                    target_message = msg
                    break

            if target_message and target_message.time:
                # 获取该消息的前三条消息
                try:
                    previous_messages = get_raw_msg_before_timestamp_with_chat(
                        chat_id=chat_id, timestamp=target_message.time, limit=3
                    )

                    if previous_messages:
                        # 将前三条消息和当前消息一起格式化
                        context_messages = previous_messages + [target_message]
                        # 按时间排序
                        context_messages.sort(key=lambda x: x.time or 0)

                        # 格式化为可读消息
                        formatted_context, _ = await build_readable_messages_with_list(
                            context_messages,
                            replace_bot_name=True,
                            timestamp_mode="relative",
                            truncate=False,
                        )

                        if formatted_context.strip():
                            enriched_list.append(formatted_context.strip())
                            logger.warning(f"为黑话 {content} 补充了上下文消息")
                        else:
                            # 如果格式化失败，使用原始raw_content
                            enriched_list.append(raw_content)
                    else:
                        # 没有找到前三条消息，使用原始raw_content
                        enriched_list.append(raw_content)
                except Exception as e:
                    logger.warning(f"获取黑话 {content} 的上下文消息失败: {e}")
                    # 出错时使用原始raw_content
                    enriched_list.append(raw_content)
            else:
                # 没有找到包含黑话的消息，使用原始raw_content
                enriched_list.append(raw_content)
        else:
            # raw_content包含更多内容，直接使用
            enriched_list.append(raw_content)

    return enriched_list


def _should_infer_meaning(jargon_obj: Jargon) -> bool:
    """
    判断是否需要进行含义推断
    在 count 达到 3,6, 10, 20, 40, 60, 100 时进行推断
    并且count必须大于last_inference_count，避免重启后重复判定
    如果is_complete为True，不再进行推断
    """
    # 如果已完成所有推断，不再推断
    if jargon_obj.is_complete:
        return False

    count = jargon_obj.count or 0
    last_inference = jargon_obj.last_inference_count or 0

    # 阈值列表：3,6, 10, 20, 40, 60, 100
    thresholds = [2, 4, 8, 12, 24, 60, 100]

    if count < thresholds[0]:
        return False

    # 如果count没有超过上次判定值，不需要判定
    if count <= last_inference:
        return False

    # 找到第一个大于last_inference的阈值
    next_threshold = None
    for threshold in thresholds:
        if threshold > last_inference:
            next_threshold = threshold
            break

    # 如果没有找到下一个阈值，说明已经超过100，不应该再推断
    if next_threshold is None:
        return False

    # 检查count是否达到或超过这个阈值
    return count >= next_threshold


class JargonMiner:
    def __init__(self, chat_id: str) -> None:
        self.chat_id = chat_id
        self.last_learning_time: float = time.time()
        # 频率控制，可按需调整
        self.min_messages_for_learning: int = 10
        self.min_learning_interval: float = 20

        self.llm = LLMRequest(
            model_set=model_config.model_task_config.utils,
            request_type="jargon.extract",
        )

        # 初始化stream_name作为类属性，避免重复提取
        chat_manager = get_chat_manager()
        stream_name = chat_manager.get_stream_name(self.chat_id)
        self.stream_name = stream_name if stream_name else self.chat_id
        self.cache_limit = 100
        self.cache: OrderedDict[str, None] = OrderedDict()

    def _add_to_cache(self, content: str) -> None:
        """将提取到的黑话加入缓存，保持LRU语义"""
        if not content:
            return

        key = content.strip()
        if not key:
            return

        if key in self.cache:
            self.cache.move_to_end(key)
        else:
            self.cache[key] = None
            if len(self.cache) > self.cache_limit:
                self.cache.popitem(last=False)

    def _collect_cached_entries(self, messages: List[Any]) -> List[Dict[str, List[str]]]:
        """检查缓存中的黑话是否出现在当前消息窗口，生成对应上下文"""
        if not self.cache or not messages:
            return []

        cached_entries: List[Dict[str, List[str]]] = []
        processed_pairs = set()

        for idx, msg in enumerate(messages):
            msg_text = (getattr(msg, "display_message", None) or getattr(msg, "processed_plain_text", None) or "").strip()
            if not msg_text or _is_bot_message(msg):
                continue

            for content in self.cache.keys():
                if not content:
                    continue
                if (content, idx) in processed_pairs:
                    continue
                if content in msg_text:
                    if _has_adjacent_bot_message(messages, idx):
                        continue
                    paragraph = _build_context_paragraph(messages, idx)
                    if not paragraph:
                        continue
                    cached_entries.append({"content": content, "raw_content": [paragraph]})
                    processed_pairs.add((content, idx))

        return cached_entries

    async def _infer_meaning_by_id(self, jargon_id: int) -> None:
        """通过ID加载对象并推断"""
        try:
            jargon_obj = Jargon.get_by_id(jargon_id)
            # 再次检查is_complete，因为可能在异步任务执行时已被标记为完成
            if jargon_obj.is_complete:
                logger.debug(f"jargon {jargon_obj.content} 已完成所有推断，跳过")
                return
            await self.infer_meaning(jargon_obj)
        except Exception as e:
            logger.error(f"通过ID推断jargon失败: {e}")

    async def infer_meaning(self, jargon_obj: Jargon) -> None:
        """
        对jargon进行含义推断
        """
        try:
            content = jargon_obj.content
            raw_content_str = jargon_obj.raw_content or ""

            # 解析raw_content列表
            raw_content_list = []
            if raw_content_str:
                try:
                    raw_content_list = (
                        json.loads(raw_content_str) if isinstance(raw_content_str, str) else raw_content_str
                    )
                    if not isinstance(raw_content_list, list):
                        raw_content_list = [raw_content_list] if raw_content_list else []
                except (json.JSONDecodeError, TypeError):
                    raw_content_list = [raw_content_str] if raw_content_str else []

            if not raw_content_list:
                logger.warning(f"jargon {content} 没有raw_content，跳过推断")
                return

            # 步骤1: 基于raw_content和content推断
            raw_content_text = "\n".join(raw_content_list)
            prompt1 = await global_prompt_manager.format_prompt(
                "jargon_inference_with_context_prompt",
                content=content,
                bot_name = global_config.bot.nickname,
                raw_content_list=raw_content_text,
            )

            response1, _ = await self.llm.generate_response_async(prompt1, temperature=0.3)
            if not response1:
                logger.warning(f"jargon {content} 推断1失败：无响应")
                return

            # 解析推断1结果
            inference1 = None
            try:
                resp1 = response1.strip()
                if resp1.startswith("{") and resp1.endswith("}"):
                    inference1 = json.loads(resp1)
                else:
                    repaired = repair_json(resp1)
                    inference1 = json.loads(repaired) if isinstance(repaired, str) else repaired
                if not isinstance(inference1, dict):
                    logger.warning(f"jargon {content} 推断1结果格式错误")
                    return
            except Exception as e:
                logger.error(f"jargon {content} 推断1解析失败: {e}")
                return

            # 检查推断1是否表示信息不足无法推断
            no_info = inference1.get("no_info", False)
            meaning1 = inference1.get("meaning", "").strip()
            if no_info or not meaning1:
                logger.info(f"jargon {content} 推断1表示信息不足无法推断，放弃本次推断，待下次更新")
                # 更新最后一次判定的count值，避免在同一阈值重复尝试
                jargon_obj.last_inference_count = jargon_obj.count or 0
                jargon_obj.save()
                return

            # 步骤2: 仅基于content推断
            prompt2 = await global_prompt_manager.format_prompt(
                "jargon_inference_content_only_prompt",
                content=content,
            )

            response2, _ = await self.llm.generate_response_async(prompt2, temperature=0.3)
            if not response2:
                logger.warning(f"jargon {content} 推断2失败：无响应")
                return

            # 解析推断2结果
            inference2 = None
            try:
                resp2 = response2.strip()
                if resp2.startswith("{") and resp2.endswith("}"):
                    inference2 = json.loads(resp2)
                else:
                    repaired = repair_json(resp2)
                    inference2 = json.loads(repaired) if isinstance(repaired, str) else repaired
                if not isinstance(inference2, dict):
                    logger.warning(f"jargon {content} 推断2结果格式错误")
                    return
            except Exception as e:
                logger.error(f"jargon {content} 推断2解析失败: {e}")
                return

            # logger.info(f"jargon {content} 推断2提示词: {prompt2}")
            # logger.info(f"jargon {content} 推断2结果: {response2}")
            # logger.info(f"jargon {content} 推断1提示词: {prompt1}")
            # logger.info(f"jargon {content} 推断1结果: {response1}")

            if global_config.debug.show_jargon_prompt:
                logger.info(f"jargon {content} 推断2提示词: {prompt2}")
                logger.info(f"jargon {content} 推断2结果: {response2}")
                logger.info(f"jargon {content} 推断1提示词: {prompt1}")
                logger.info(f"jargon {content} 推断1结果: {response1}")
            else:
                logger.debug(f"jargon {content} 推断2提示词: {prompt2}")
                logger.debug(f"jargon {content} 推断2结果: {response2}")
                logger.debug(f"jargon {content} 推断1提示词: {prompt1}")
                logger.debug(f"jargon {content} 推断1结果: {response1}")

            # 步骤3: 比较两个推断结果
            prompt3 = await global_prompt_manager.format_prompt(
                "jargon_compare_inference_prompt",
                inference1=json.dumps(inference1, ensure_ascii=False),
                inference2=json.dumps(inference2, ensure_ascii=False),
            )

            if global_config.debug.show_jargon_prompt:
                logger.info(f"jargon {content} 比较提示词: {prompt3}")

            response3, _ = await self.llm.generate_response_async(prompt3, temperature=0.3)
            if not response3:
                logger.warning(f"jargon {content} 比较失败：无响应")
                return

            # 解析比较结果
            comparison = None
            try:
                resp3 = response3.strip()
                if resp3.startswith("{") and resp3.endswith("}"):
                    comparison = json.loads(resp3)
                else:
                    repaired = repair_json(resp3)
                    comparison = json.loads(repaired) if isinstance(repaired, str) else repaired
                if not isinstance(comparison, dict):
                    logger.warning(f"jargon {content} 比较结果格式错误")
                    return
            except Exception as e:
                logger.error(f"jargon {content} 比较解析失败: {e}")
                return

            # 判断是否为黑话
            is_similar = comparison.get("is_similar", False)
            is_jargon = not is_similar  # 如果相似，说明不是黑话；如果有差异，说明是黑话

            # 更新数据库记录
            jargon_obj.is_jargon = is_jargon
            if is_jargon:
                # 是黑话，使用推断1的结果（基于上下文，更准确）
                jargon_obj.meaning = inference1.get("meaning", "")
            else:
                # 不是黑话，清空含义，不再存储任何内容
                jargon_obj.meaning = ""

            # 更新最后一次判定的count值，避免重启后重复判定
            jargon_obj.last_inference_count = jargon_obj.count or 0

            # 如果count>=100，标记为完成，不再进行推断
            if (jargon_obj.count or 0) >= 100:
                jargon_obj.is_complete = True

            jargon_obj.save()
            logger.debug(
                f"jargon {content} 推断完成: is_jargon={is_jargon}, meaning={jargon_obj.meaning}, last_inference_count={jargon_obj.last_inference_count}, is_complete={jargon_obj.is_complete}"
            )

            # 固定输出推断结果，格式化为可读形式
            if is_jargon:
                # 是黑话，输出格式：[聊天名]xxx的含义是 xxxxxxxxxxx
                meaning = jargon_obj.meaning or "无详细说明"
                is_global = jargon_obj.is_global
                if is_global:
                    logger.info(f"[黑话]{content}的含义是 {meaning}")
                else:
                    logger.info(f"[{self.stream_name}]{content}的含义是 {meaning}")
            else:
                # 不是黑话，输出格式：[聊天名]xxx 不是黑话
                logger.info(f"[{self.stream_name}]{content} 不是黑话")

        except Exception as e:
            logger.error(f"jargon推断失败: {e}")
            import traceback

            traceback.print_exc()

    def should_trigger(self) -> bool:
        # 冷却时间检查
        if time.time() - self.last_learning_time < self.min_learning_interval:
            return False

        # 拉取最近消息数量是否足够
        recent_messages = get_raw_msg_by_timestamp_with_chat_inclusive(
            chat_id=self.chat_id,
            timestamp_start=self.last_learning_time,
            timestamp_end=time.time(),
        )
        return bool(recent_messages and len(recent_messages) >= self.min_messages_for_learning)

    async def run_once(self) -> None:
        try:
            if not self.should_trigger():
                return

            chat_stream = get_chat_manager().get_stream(self.chat_id)
            if not chat_stream:
                return

            # 记录本次提取的时间窗口，避免重复提取
            extraction_start_time = self.last_learning_time
            extraction_end_time = time.time()

            # 拉取学习窗口内的消息
            messages = get_raw_msg_by_timestamp_with_chat_inclusive(
                chat_id=self.chat_id,
                timestamp_start=extraction_start_time,
                timestamp_end=extraction_end_time,
                limit=20,
            )
            if not messages:
                return

            # 按时间排序，确保编号与上下文一致
            messages = sorted(messages, key=lambda msg: msg.time or 0)

            chat_str, message_id_list = build_readable_messages_with_id(
                messages=messages,
                replace_bot_name=True,
                timestamp_mode="relative",
                truncate=False,
                show_actions=False,
                show_pic=True,
                pic_single=True,
            )
            if not chat_str.strip():
                return

            msg_id_to_index: Dict[str, int] = {}
            for idx, (msg_id, _msg) in enumerate(message_id_list or []):
                if not msg_id:
                    continue
                msg_id_to_index[msg_id] = idx
            if not msg_id_to_index:
                logger.warning("未能生成消息ID映射，跳过本次提取")
                return

            prompt: str = await global_prompt_manager.format_prompt(
                "extract_jargon_prompt",
                bot_name=global_config.bot.nickname,
                chat_str=chat_str,
            )

            response, _ = await self.llm.generate_response_async(prompt, temperature=0.2)
            if not response:
                return

            if global_config.debug.show_jargon_prompt:
                logger.info(f"jargon提取提示词: {prompt}")
                logger.info(f"jargon提取结果: {response}")

            # 解析为JSON
            entries: List[dict] = []
            try:
                resp = response.strip()
                parsed = None
                if resp.startswith("[") and resp.endswith("]"):
                    parsed = json.loads(resp)
                else:
                    repaired = repair_json(resp)
                    if isinstance(repaired, str):
                        parsed = json.loads(repaired)
                    else:
                        parsed = repaired

                if isinstance(parsed, dict):
                    parsed = [parsed]

                if not isinstance(parsed, list):
                    return

                for item in parsed:
                    if not isinstance(item, dict):
                        continue

                    content = str(item.get("content", "")).strip()
                    msg_id_value = item.get("msg_id")

                    if not content:
                        continue

                    if _contains_bot_self_name(content):
                        logger.debug(f"解析阶段跳过包含机器人昵称/别名的词条: {content}")
                        continue

                    msg_id_str = str(msg_id_value or "").strip()
                    if not msg_id_str:
                        logger.warning(f"解析jargon失败：msg_id缺失，content={content}")
                        continue

                    msg_index = msg_id_to_index.get(msg_id_str)
                    if msg_index is None:
                        logger.warning(f"解析jargon失败：msg_id未找到，content={content}, msg_id={msg_id_str}")
                        continue

                    target_msg = messages[msg_index]
                    if _is_bot_message(target_msg):
                        logger.debug(f"解析阶段跳过引用机器人自身消息的词条: content={content}, msg_id={msg_id_str}")
                        continue
                    if _has_adjacent_bot_message(messages, msg_index):
                        logger.debug(
                            f"解析阶段跳过因邻近机器人发言的词条: content={content}, msg_id={msg_id_str}"
                        )
                        continue

                    context_paragraph = _build_context_paragraph(messages, msg_index)
                    if not context_paragraph:
                        logger.warning(f"解析jargon失败：上下文为空，content={content}, msg_id={msg_id_str}")
                        continue

                    entries.append({"content": content, "raw_content": [context_paragraph]})
                cached_entries = self._collect_cached_entries(messages)
                if cached_entries:
                    entries.extend(cached_entries)
            except Exception as e:
                logger.error(f"解析jargon JSON失败: {e}; 原始: {response}")
                return

            if not entries:
                return

            # 去重并合并raw_content（按 content 聚合）
            merged_entries: OrderedDict[str, Dict[str, List[str]]] = OrderedDict()
            for entry in entries:
                content_key = entry["content"]
                raw_list = entry.get("raw_content", []) or []
                if content_key in merged_entries:
                    merged_entries[content_key]["raw_content"].extend(raw_list)
                else:
                    merged_entries[content_key] = {
                        "content": content_key,
                        "raw_content": list(raw_list),
                    }

            uniq_entries = []
            for merged_entry in merged_entries.values():
                raw_content_list = merged_entry["raw_content"]
                if raw_content_list:
                    merged_entry["raw_content"] = list(dict.fromkeys(raw_content_list))
                uniq_entries.append(merged_entry)

            saved = 0
            updated = 0
            for entry in uniq_entries:
                content = entry["content"]
                raw_content_list = entry["raw_content"]  # 已经是列表

                # 检查并补充raw_content：如果只包含黑话本身，则获取前三条消息作为上下文
                raw_content_list = await _enrich_raw_content_if_needed(
                    content=content,
                    raw_content_list=raw_content_list,
                    chat_id=self.chat_id,
                    messages=messages,
                    extraction_start_time=extraction_start_time,
                    extraction_end_time=extraction_end_time,
                )

                try:
                    # 根据all_global配置决定查询逻辑
                    if global_config.jargon.all_global:
                        # 开启all_global：无视chat_id，查询所有content匹配的记录（所有记录都是全局的）
                        query = Jargon.select().where(Jargon.content == content)
                    else:
                        # 关闭all_global：只查询chat_id匹配的记录（不考虑is_global）
                        query = Jargon.select().where((Jargon.chat_id == self.chat_id) & (Jargon.content == content))

                    if query.exists():
                        obj = query.get()
                        try:
                            obj.count = (obj.count or 0) + 1
                        except Exception:
                            obj.count = 1

                        # 合并raw_content列表：读取现有列表，追加新值，去重
                        existing_raw_content = []
                        if obj.raw_content:
                            try:
                                existing_raw_content = (
                                    json.loads(obj.raw_content) if isinstance(obj.raw_content, str) else obj.raw_content
                                )
                                if not isinstance(existing_raw_content, list):
                                    existing_raw_content = [existing_raw_content] if existing_raw_content else []
                            except (json.JSONDecodeError, TypeError):
                                existing_raw_content = [obj.raw_content] if obj.raw_content else []

                        # 合并并去重
                        merged_list = list(dict.fromkeys(existing_raw_content + raw_content_list))
                        obj.raw_content = json.dumps(merged_list, ensure_ascii=False)

                        # 开启all_global时，确保记录标记为is_global=True
                        if global_config.jargon.all_global:
                            obj.is_global = True
                        # 关闭all_global时，保持原有is_global不变（不修改）

                        obj.save()

                        # 检查是否需要推断（达到阈值且超过上次判定值）
                        if _should_infer_meaning(obj):
                            # 异步触发推断，不阻塞主流程
                            # 重新加载对象以确保数据最新
                            jargon_id = obj.id
                            asyncio.create_task(self._infer_meaning_by_id(jargon_id))

                        updated += 1
                    else:
                        # 没找到匹配记录，创建新记录
                        if global_config.jargon.all_global:
                            # 开启all_global：新记录默认为is_global=True
                            is_global_new = True
                        else:
                            # 关闭all_global：新记录is_global=False
                            is_global_new = False

                        Jargon.create(
                            content=content,
                            raw_content=json.dumps(raw_content_list, ensure_ascii=False),
                            chat_id=self.chat_id,
                            is_global=is_global_new,
                            count=1,
                        )
                        saved += 1
                except Exception as e:
                    logger.error(f"保存jargon失败: chat_id={self.chat_id}, content={content}, err={e}")
                    continue
                finally:
                    self._add_to_cache(content)

            # 固定输出提取的jargon结果，格式化为可读形式（只要有提取结果就输出）
            if uniq_entries:
                # 收集所有提取的jargon内容
                jargon_list = [entry["content"] for entry in uniq_entries]
                jargon_str = ",".join(jargon_list)

                # 输出格式化的结果（使用logger.info会自动应用jargon模块的颜色）
                logger.info(f"[{self.stream_name}]疑似黑话: {jargon_str}")

                # 更新为本次提取的结束时间，确保不会重复提取相同的消息窗口
                self.last_learning_time = extraction_end_time

            if saved or updated:
                logger.info(f"jargon写入: 新增 {saved} 条，更新 {updated} 条，chat_id={self.chat_id}")
        except Exception as e:
            logger.error(f"JargonMiner 运行失败: {e}")


class JargonMinerManager:
    def __init__(self) -> None:
        self._miners: dict[str, JargonMiner] = {}

    def get_miner(self, chat_id: str) -> JargonMiner:
        if chat_id not in self._miners:
            self._miners[chat_id] = JargonMiner(chat_id)
        return self._miners[chat_id]


miner_manager = JargonMinerManager()


async def extract_and_store_jargon(chat_id: str) -> None:
    miner = miner_manager.get_miner(chat_id)
    await miner.run_once()


def search_jargon(
    keyword: str, chat_id: Optional[str] = None, limit: int = 10, case_sensitive: bool = False, fuzzy: bool = True
) -> List[Dict[str, str]]:
    """
    搜索jargon，支持大小写不敏感和模糊搜索

    Args:
        keyword: 搜索关键词
        chat_id: 可选的聊天ID
            - 如果开启了all_global：此参数被忽略，查询所有is_global=True的记录
            - 如果关闭了all_global：如果提供则优先搜索该聊天或global的jargon
        limit: 返回结果数量限制，默认10
        case_sensitive: 是否大小写敏感，默认False（不敏感）
        fuzzy: 是否模糊搜索，默认True（使用LIKE匹配）

    Returns:
        List[Dict[str, str]]: 包含content, meaning的字典列表
    """
    if not keyword or not keyword.strip():
        return []

    keyword = keyword.strip()

    # 构建查询
    query = Jargon.select(Jargon.content, Jargon.meaning)

    # 构建搜索条件
    if case_sensitive:
        # 大小写敏感
        if fuzzy:
            # 模糊搜索
            search_condition = Jargon.content.contains(keyword)
        else:
            # 精确匹配
            search_condition = Jargon.content == keyword
    else:
        # 大小写不敏感
        if fuzzy:
            # 模糊搜索（使用LOWER函数）
            search_condition = fn.LOWER(Jargon.content).contains(keyword.lower())
        else:
            # 精确匹配（使用LOWER函数）
            search_condition = fn.LOWER(Jargon.content) == keyword.lower()

    query = query.where(search_condition)

    # 根据all_global配置决定查询逻辑
    if global_config.jargon.all_global:
        # 开启all_global：所有记录都是全局的，查询所有is_global=True的记录（无视chat_id）
        query = query.where(Jargon.is_global)
    else:
        # 关闭all_global：如果提供了chat_id，优先搜索该聊天或global的jargon
        if chat_id:
            query = query.where((Jargon.chat_id == chat_id) | Jargon.is_global)

    # 只返回有meaning的记录
    query = query.where((Jargon.meaning.is_null(False)) & (Jargon.meaning != ""))

    # 按count降序排序，优先返回出现频率高的
    query = query.order_by(Jargon.count.desc())

    # 限制结果数量
    query = query.limit(limit)

    # 执行查询并返回结果
    results = []
    for jargon in query:
        results.append({"content": jargon.content or "", "meaning": jargon.meaning or ""})

    return results


async def store_jargon_from_answer(jargon_keyword: str, answer: str, chat_id: str) -> None:
    """将黑话存入jargon系统

    Args:
        jargon_keyword: 黑话关键词
        answer: 答案内容（将概括为raw_content）
        chat_id: 聊天ID
    """
    try:
        # 概括答案为简短的raw_content
        summary_prompt = f"""请将以下答案概括为一句简短的话（不超过50字），作为黑话"{jargon_keyword}"的使用示例：

答案：{answer}

只输出概括后的内容，不要输出其他内容："""

        success, summary, _, _ = await llm_api.generate_with_model(
            summary_prompt,
            model_config=model_config.model_task_config.utils_small,
            request_type="memory.summarize_jargon",
        )

        logger.info(f"概括答案提示: {summary_prompt}")
        logger.info(f"概括答案: {summary}")

        if not success:
            logger.warning(f"概括答案失败，使用原始答案: {summary}")
            summary = answer[:100]  # 截取前100字符作为备用

        raw_content = summary.strip()[:200]  # 限制长度

        # 检查是否已存在
        if global_config.jargon.all_global:
            query = Jargon.select().where(Jargon.content == jargon_keyword)
        else:
            query = Jargon.select().where((Jargon.chat_id == chat_id) & (Jargon.content == jargon_keyword))

        if query.exists():
            # 更新现有记录
            obj = query.get()
            obj.count = (obj.count or 0) + 1

            # 合并raw_content列表
            existing_raw_content = []
            if obj.raw_content:
                try:
                    existing_raw_content = (
                        json.loads(obj.raw_content) if isinstance(obj.raw_content, str) else obj.raw_content
                    )
                    if not isinstance(existing_raw_content, list):
                        existing_raw_content = [existing_raw_content] if existing_raw_content else []
                except (json.JSONDecodeError, TypeError):
                    existing_raw_content = [obj.raw_content] if obj.raw_content else []

            # 合并并去重
            merged_list = list(dict.fromkeys(existing_raw_content + [raw_content]))
            obj.raw_content = json.dumps(merged_list, ensure_ascii=False)

            if global_config.jargon.all_global:
                obj.is_global = True

            obj.save()
            logger.info(f"更新jargon记录: {jargon_keyword}")
        else:
            # 创建新记录
            is_global_new = True if global_config.jargon.all_global else False
            Jargon.create(
                content=jargon_keyword,
                raw_content=json.dumps([raw_content], ensure_ascii=False),
                chat_id=chat_id,
                is_global=is_global_new,
                count=1,
            )
            logger.info(f"创建新jargon记录: {jargon_keyword}")

    except Exception as e:
        logger.error(f"存储jargon失败: {e}")