feat：升级jargon，提取更快速，更精准

2025-11-25 01:58:31 +08:00 · 2025-11-25 01:58:31 +08:00 · b684a95fbc
parent a1dd26d578
commit b684a95fbc
5 changed files with 297 additions and 52 deletions
--- a/.gitignore
+++ b/.gitignore
@ -22,6 +22,7 @@ MaiMBot-LPMM
 *.zip
 run_bot.bat
 run_na.bat
+run_all_in_wt.bat
 run.bat
 log_debug/
 run_amds.bat
--- a/src/chat/utils/chat_message_builder.py
+++ b/src/chat/utils/chat_message_builder.py
@ -352,6 +352,7 @@ def _build_readable_messages_internal(
    pic_counter: int = 1,
    show_pic: bool = True,
    message_id_list: Optional[List[Tuple[str, DatabaseMessages]]] = None,
+    pic_single: bool = False,
 ) -> Tuple[str, List[Tuple[float, str, str]], Dict[str, str], int]:
    # sourcery skip: use-getitem-for-re-match-groups
    """
@ -378,6 +379,7 @@ def _build_readable_messages_internal(
    if pic_id_mapping is None:
        pic_id_mapping = {}
    current_pic_counter = pic_counter
+    pic_description_cache: Dict[str, str] = {}

    # 创建时间戳到消息ID的映射，用于在消息前添加[id]标识符
    timestamp_to_id_mapping: Dict[float, str] = {}
@ -400,6 +402,17 @@ def _build_readable_messages_internal(
            nonlocal current_pic_counter
            nonlocal pic_counter
            pic_id = match.group(1)
+            if pic_single:
+                if pic_id not in pic_description_cache:
+                    description = "内容正在阅读，请稍等"
+                    try:
+                        image = Images.get_or_none(Images.image_id == pic_id)
+                        if image and image.description:
+                            description = image.description
+                    except Exception:
+                        pass
+                    pic_description_cache[pic_id] = description
+                return f"[图片：{pic_description_cache[pic_id]}]"
            if pic_id not in pic_id_mapping:
                pic_id_mapping[pic_id] = f"图片{current_pic_counter}"
                current_pic_counter += 1
@ -603,6 +616,7 @@ async def build_readable_messages_with_list(
    replace_bot_name: bool = True,
    timestamp_mode: str = "relative",
    truncate: bool = False,
+    pic_single: bool = False,
 ) -> Tuple[str, List[Tuple[float, str, str]]]:
    """
    将消息列表转换为可读的文本格式，并返回原始(时间戳, 昵称, 内容)列表。
@ -613,10 +627,16 @@ async def build_readable_messages_with_list(
        replace_bot_name,
        timestamp_mode,
        truncate,
+        pic_id_mapping=None,
+        pic_counter=1,
+        show_pic=True,
+        message_id_list=None,
+        pic_single=pic_single,
    )

-    if pic_mapping_info := build_pic_mapping_info(pic_id_mapping):
-        formatted_string = f"{pic_mapping_info}\n\n{formatted_string}"
+    if not pic_single:
+        if pic_mapping_info := build_pic_mapping_info(pic_id_mapping):
+            formatted_string = f"{pic_mapping_info}\n\n{formatted_string}"

    return formatted_string, details_list

@ -630,6 +650,7 @@ def build_readable_messages_with_id(
    show_actions: bool = False,
    show_pic: bool = True,
    remove_emoji_stickers: bool = False,
+    pic_single: bool = False,
 ) -> Tuple[str, List[Tuple[str, DatabaseMessages]]]:
    """
    将消息列表转换为可读的文本格式，并返回原始(时间戳, 昵称, 内容)列表。
@ -647,6 +668,7 @@ def build_readable_messages_with_id(
        read_mark=read_mark,
        message_id_list=message_id_list,
        remove_emoji_stickers=remove_emoji_stickers,
+        pic_single=pic_single,
    )

    return formatted_string, message_id_list
@ -662,6 +684,7 @@ def build_readable_messages(
    show_pic: bool = True,
    message_id_list: Optional[List[Tuple[str, DatabaseMessages]]] = None,
    remove_emoji_stickers: bool = False,
+    pic_single: bool = False,
 ) -> str:  # sourcery skip: extract-method
    """
    将消息列表转换为可读的文本格式。
@ -769,14 +792,14 @@ def build_readable_messages(
            truncate,
            show_pic=show_pic,
            message_id_list=message_id_list,
+            pic_single=pic_single,
        )

-        # 生成图片映射信息并添加到最前面
-        pic_mapping_info = build_pic_mapping_info(pic_id_mapping)
-        if pic_mapping_info:
-            return f"{pic_mapping_info}\n\n{formatted_string}"
-        else:
-            return formatted_string
+        if not pic_single:
+            pic_mapping_info = build_pic_mapping_info(pic_id_mapping)
+            if pic_mapping_info:
+                return f"{pic_mapping_info}\n\n{formatted_string}"
+        return formatted_string
    else:
        # 按 read_mark 分割消息
        messages_before_mark = [msg for msg in copy_messages if (msg.time or 0) <= read_mark]
@ -796,6 +819,7 @@ def build_readable_messages(
            pic_counter,
            show_pic=show_pic,
            message_id_list=message_id_list,
+            pic_single=pic_single,
        )
        formatted_after, _, pic_id_mapping, _ = _build_readable_messages_internal(
            messages_after_mark,
@ -806,15 +830,19 @@ def build_readable_messages(
            pic_counter,
            show_pic=show_pic,
            message_id_list=message_id_list,
+            pic_single=pic_single,
        )

        read_mark_line = "\n--- 以上消息是你已经看过，请关注以下未读的新消息---\n"

        # 生成图片映射信息
-        if pic_id_mapping:
-            pic_mapping_info = f"图片信息：\n{build_pic_mapping_info(pic_id_mapping)}\n聊天记录信息：\n"
+        if not pic_single:
+            if pic_id_mapping:
+                pic_mapping_info = f"图片信息：\n{build_pic_mapping_info(pic_id_mapping)}\n聊天记录信息：\n"
+            else:
+                pic_mapping_info = "聊天记录信息：\n"
        else:
-            pic_mapping_info = "聊天记录信息：\n"
+            pic_mapping_info = ""

        # 组合结果
        result_parts = []
@ -832,7 +860,7 @@ def build_readable_messages(
        return "".join(result_parts)


-async def build_anonymous_messages(messages: List[DatabaseMessages]) -> str:
+async def build_anonymous_messages(messages: List[DatabaseMessages], show_ids: bool = False) -> str:
    """
    构建匿名可读消息，将不同人的名称转为唯一占位符（A、B、C...），bot自己用SELF。
    处理 回复<aaa:bbb> 和 @<aaa:bbb> 字段，将bbb映射为匿名占位符。
@ -889,7 +917,7 @@ async def build_anonymous_messages(messages: List[DatabaseMessages]) -> str:
            current_char += 1
        return person_map[person_id]

-    for msg in messages:
+    for i, msg in enumerate(messages):
        try:
            platform = msg.chat_info.platform
            user_id = msg.user_info.user_id
@ -910,7 +938,12 @@ async def build_anonymous_messages(messages: List[DatabaseMessages]) -> str:

            content = replace_user_references(content, platform, anon_name_resolver, replace_bot_name=False)

-            header = f"{anon_name}说 "
+            # 构建消息头，如果启用show_ids则添加序号
+            if show_ids:
+                header = f"[{i + 1}] {anon_name}说 "
+            else:
+                header = f"{anon_name}说 "
+                
            output_lines.append(header)
            stripped_line = content.strip()
            if stripped_line:
--- a/src/common/database/database_model.py
+++ b/src/common/database/database_model.py
@ -330,8 +330,6 @@ class Jargon(BaseModel):

    content = TextField()
    raw_content = TextField(null=True)
-    type = TextField(null=True)
-    translation = TextField(null=True)
    meaning = TextField(null=True)
    chat_id = TextField(index=True)
    is_global = BooleanField(default=False)
--- a/src/express/expression_reflector.py
+++ b/src/express/expression_reflector.py
@ -39,9 +39,19 @@ class ExpressionReflector:

            # 检查是否在允许列表中
            allow_reflect = global_config.expression.allow_reflect
-            if allow_reflect and self.chat_id not in allow_reflect:
-                logger.info(f"[Expression Reflection] 当前聊天流 {self.chat_id} 不在允许列表中，跳过")
-                return False
+            if allow_reflect:
+                # 将 allow_reflect 中的 platform:id:type 格式转换为 chat_id 列表
+                allow_reflect_chat_ids = []
+                for stream_config in allow_reflect:
+                    parsed_chat_id = global_config.expression._parse_stream_config_to_chat_id(stream_config)
+                    if parsed_chat_id:
+                        allow_reflect_chat_ids.append(parsed_chat_id)
+                    else:
+                        logger.warning(f"[Expression Reflection] 无法解析 allow_reflect 配置项: {stream_config}")
+                
+                if self.chat_id not in allow_reflect_chat_ids:
+                    logger.info(f"[Expression Reflection] 当前聊天流 {self.chat_id} 不在允许列表中，跳过")
+                    return False

            # 检查上一次提问时间
            current_time = time.time()
--- a/src/jargon/jargon_miner.py
+++ b/src/jargon/jargon_miner.py
@ -1,6 +1,7 @@
 import time
 import json
 import asyncio
+from collections import OrderedDict
 from typing import List, Dict, Optional, Any
 from json_repair import repair_json
 from peewee import fn
@ -12,12 +13,13 @@ from src.config.config import model_config, global_config
 from src.chat.message_receive.chat_stream import get_chat_manager
 from src.plugin_system.apis import llm_api
 from src.chat.utils.chat_message_builder import (
-    build_anonymous_messages,
+    build_readable_messages_with_id,
    get_raw_msg_by_timestamp_with_chat_inclusive,
    get_raw_msg_before_timestamp_with_chat,
    build_readable_messages_with_list,
 )
 from src.chat.utils.prompt_builder import Prompt, global_prompt_manager
+from src.chat.utils.utils import parse_platform_accounts


 logger = get_logger("jargon")
@ -43,9 +45,107 @@ def _contains_bot_self_name(content: str) -> bool:
    return any(name in target for name in candidates if target)


+def _format_context_message(msg: Any, seq_index: int) -> str:
+    """
+    Format a single message as one ``"nickname: text"`` context line.
+
+    Args:
+        msg: Message-like object; every attribute is read via ``getattr`` with a
+            fallback, so missing attributes are tolerated.
+        seq_index: Sequence number supplied by the caller.
+            NOTE(review): not used in the body - the returned line carries no
+            index; confirm whether that is intended.
+
+    Returns:
+        ``"<nickname>: <text>"``, or ``""`` when ``msg`` is ``None`` or has no
+        non-blank text.
+    """
+    if msg is None:
+        return ""
+
+    # Prefer display_message; fall back to processed_plain_text, then to "".
+    text = (getattr(msg, "display_message", None) or getattr(msg, "processed_plain_text", None) or "").strip()
+    if not text:
+        return ""
+
+    # First choice for the speaker label: user_info.user_nickname, then user_info.user_id.
+    user_info = getattr(msg, "user_info", None)
+    nickname = ""
+    if user_info:
+        nickname = getattr(user_info, "user_nickname", "") or getattr(user_info, "user_id", "")
+
+    # Otherwise look on the message itself, ending with a generic placeholder label.
+    if not nickname:
+        nickname = getattr(msg, "user_nickname", "") or getattr(msg, "user_id", "") or "某人"
+
+    return f"{nickname}: {text}"
+
+
+def _build_context_paragraph(messages: List[Any], center_index: int) -> Optional[str]:
+    """
+    Build a context paragraph around one message (up to 3 before and 3 after it).
+
+    Args:
+        messages: The message window to take context from.
+        center_index: 0-based index in ``messages`` of the message of interest.
+
+    Returns:
+        The formatted lines joined with newlines, or ``None`` when ``messages`` is
+        empty, ``center_index`` is out of range, or no message in the window
+        yields a non-empty line.
+    """
+    if not messages or center_index < 0 or center_index >= len(messages):
+        return None
+
+    # Clamp the [center - 3, center + 3] window to the list bounds (end is exclusive).
+    context_start = max(0, center_index - 3)
+    context_end = min(len(messages), center_index + 1 + 3)
+
+    context_lines: List[str] = []
+    for idx in range(context_start, context_end):
+        # idx + 1 is passed as a 1-based sequence number.
+        formatted_line = _format_context_message(messages[idx], idx + 1)
+        if formatted_line:
+            context_lines.append(formatted_line)
+
+    if not context_lines:
+        return None
+
+    paragraph = "\n".join(context_lines).strip()
+    return paragraph or None
+
+
+def _is_bot_message(msg: Any) -> bool:
+    """
+    Return whether a message was sent by the bot itself.
+
+    The message's platform and user id are compared with the bot account
+    configured for that platform in ``global_config.bot``.
+
+    Args:
+        msg: Message-like object; platform and user id are read from the message
+            itself first, then from its ``user_info``.
+
+    Returns:
+        ``True`` only when an account is known for the message's platform and it
+        equals the message's user id. ``False`` when ``msg`` is ``None``, the bot
+        config is missing, or the platform or user id cannot be determined.
+    """
+    if msg is None:
+        return False
+
+    bot_config = getattr(global_config, "bot", None)
+    if not bot_config:
+        return False
+
+    # Platform is normalised to lower case; user id is only stripped.
+    platform = (
+        str(getattr(msg, "user_platform", "") or getattr(getattr(msg, "user_info", None), "platform", "") or "")
+        .strip()
+        .lower()
+    )
+    user_id = (
+        str(getattr(msg, "user_id", "") or getattr(getattr(msg, "user_info", None), "user_id", "") or "")
+        .strip()
+    )
+
+    if not platform or not user_id:
+        return False
+
+    # Best effort: any failure while parsing the platforms list yields no extra accounts.
+    # presumably parse_platform_accounts returns a platform -> account mapping - TODO confirm
+    platform_accounts = {}
+    try:
+        platform_accounts = parse_platform_accounts(getattr(bot_config, "platforms", []) or [])
+    except Exception:
+        platform_accounts = {}
+
+    # The dedicated qq/telegram settings are registered first.
+    bot_accounts: Dict[str, str] = {}
+    qq_account = str(getattr(bot_config, "qq_account", "") or "").strip()
+    if qq_account:
+        bot_accounts["qq"] = qq_account
+
+    telegram_account = str(getattr(bot_config, "telegram_account", "") or "").strip()
+    if telegram_account:
+        bot_accounts["telegram"] = telegram_account
+
+    # Parsed platform accounts only fill in platforms that are not set yet.
+    # NOTE(review): these keys are not lower-cased here while `platform` is - confirm the helper normalises them.
+    for plat, account in platform_accounts.items():
+        if account and plat not in bot_accounts:
+            bot_accounts[plat] = account
+
+    bot_account = bot_accounts.get(platform)
+    return bool(bot_account and user_id == bot_account)
+
+
+def _has_adjacent_bot_message(messages: List[Any], center_index: int) -> bool:
+    """
+    Return whether the message directly before or after ``center_index`` is from the bot.
+
+    Neighbour indices that fall outside ``messages`` are ignored; the message at
+    ``center_index`` itself is not checked.
+    """
+    for neighbor in (center_index - 1, center_index + 1):
+        if 0 <= neighbor < len(messages) and _is_bot_message(messages[neighbor]):
+            return True
+    return False
+
+
 def _init_prompt() -> None:
    prompt_str = """
-**聊天内容，其中的SELF是你自己的发言**
+**聊天内容，其中的{bot_name}的发言内容是你自己的发言，[msg_id] 是消息ID**
 {chat_str}

 请从上面这段聊天内容中提取"可能是黑话"的候选项（黑话/俚语/网络缩写/口头禅）。
@ -62,9 +162,10 @@ def _init_prompt() -> None:
 - 中文词语的缩写，用几个汉字概括一个词汇或含义，例如：社死、内卷

 以 JSON 数组输出，元素为对象（严格按以下结构）：
+请你提取出可能的黑话，最多10
 [
-  {{"content": "词条", "raw_content": "包含该词条的完整对话上下文原文"}},
-  {{"content": "词条2", "raw_content": "包含该词条的完整对话上下文原文"}}
+  {{"content": "词条", "msg_id": "m12"}},  // msg_id 必须与上方聊天中展示的ID完全一致
+  {{"content": "词条2", "msg_id": "m15"}}
 ]

 现在请输出：
@ -78,10 +179,10 @@ def _init_inference_prompts() -> None:
    prompt1_str = """
 **词条内容**
 {content}
-**词条出现的上下文（raw_content）其中的SELF是你自己的发言**
+**词条出现的上下文。其中的{bot_name}的发言内容是你自己的发言**
 {raw_content_list}

-请根据以上词条内容和上下文，推断这个词条的含义。
+请根据上下文，推断"{content}"这个词条的含义。
 - 如果这是一个黑话、俚语或网络用语，请推断其含义
 - 如果含义明确（常规词汇），也请说明
 - 如果上下文信息不足，无法推断含义，请设置 no_info 为 true
@ -240,7 +341,7 @@ def _should_infer_meaning(jargon_obj: Jargon) -> bool:
    last_inference = jargon_obj.last_inference_count or 0

-    # 阈值列表：3,6, 10, 20, 40, 60, 100
+    # Threshold list: 2, 4, 8, 12, 24, 60, 100
-    thresholds = [3, 6, 10, 20, 40, 60, 100]
+    thresholds = [2, 4, 8, 12, 24, 60, 100]

    if count < thresholds[0]:
        return False
@ -281,6 +382,53 @@ class JargonMiner:
        chat_manager = get_chat_manager()
        stream_name = chat_manager.get_stream_name(self.chat_id)
        self.stream_name = stream_name if stream_name else self.chat_id
+        self.cache_limit = 100
+        self.cache: OrderedDict[str, None] = OrderedDict()
+
+    def _add_to_cache(self, content: str) -> None:
+        """
+        Record an extracted jargon term in ``self.cache`` with LRU ordering.
+
+        Args:
+            content: The term to remember; it is stripped before use, and empty
+                or whitespace-only values are ignored.
+        """
+        if not content:
+            return
+
+        key = content.strip()
+        if not key:
+            return
+
+        if key in self.cache:
+            # Already cached: mark it as most recently used.
+            self.cache.move_to_end(key)
+        else:
+            self.cache[key] = None
+            # Over the limit: drop the entry at the front (least recently used).
+            if len(self.cache) > self.cache_limit:
+                self.cache.popitem(last=False)
+
+    def _collect_cached_entries(self, messages: List[Any]) -> List[Dict[str, List[str]]]:
+        """
+        Find cached jargon terms in the current message window and build their context.
+
+        A term matches a message when it is a substring of the message text.
+        Bot messages, blank messages, hits whose neighbouring message is from the
+        bot, and hits for which no context paragraph can be built are skipped.
+
+        Args:
+            messages: The message window to scan.
+
+        Returns:
+            One ``{"content": term, "raw_content": [paragraph]}`` dict per hit, in
+            scan order; empty when the cache or ``messages`` is empty.
+        """
+        if not self.cache or not messages:
+            return []
+
+        cached_entries: List[Dict[str, List[str]]] = []
+        processed_pairs = set()
+
+        for idx, msg in enumerate(messages):
+            # Same text source as the context formatter: display_message, then processed_plain_text.
+            msg_text = (getattr(msg, "display_message", None) or getattr(msg, "processed_plain_text", None) or "").strip()
+            if not msg_text or _is_bot_message(msg):
+                continue
+
+            for content in self.cache.keys():
+                if not content:
+                    continue
+                # NOTE(review): each (content, idx) pair is visited once by these loops,
+                # so this check looks like it can never be true - confirm and simplify.
+                if (content, idx) in processed_pairs:
+                    continue
+                if content in msg_text:
+                    if _has_adjacent_bot_message(messages, idx):
+                        continue
+                    paragraph = _build_context_paragraph(messages, idx)
+                    if not paragraph:
+                        continue
+                    cached_entries.append({"content": content, "raw_content": [paragraph]})
+                    processed_pairs.add((content, idx))
+
+        return cached_entries

    async def _infer_meaning_by_id(self, jargon_id: int) -> None:
        """通过ID加载对象并推断"""
@ -323,6 +471,7 @@ class JargonMiner:
            prompt1 = await global_prompt_manager.format_prompt(
                "jargon_inference_with_context_prompt",
                content=content,
+                bot_name = global_config.bot.nickname,
                raw_content_list=raw_content_text,
            )

@ -441,8 +590,8 @@ class JargonMiner:
                # 是黑话，使用推断1的结果（基于上下文，更准确）
                jargon_obj.meaning = inference1.get("meaning", "")
            else:
-                # 不是黑话，也记录含义（使用推断2的结果，因为含义明确）
-                jargon_obj.meaning = inference2.get("meaning", "")
+                # 不是黑话，清空含义，不再存储任何内容
+                jargon_obj.meaning = ""

            # 更新最后一次判定的count值，避免重启后重复判定
            jargon_obj.last_inference_count = jargon_obj.count or 0
@ -511,12 +660,33 @@ class JargonMiner:
            if not messages:
                return

-            chat_str: str = await build_anonymous_messages(messages)
+            # 按时间排序，确保编号与上下文一致
+            messages = sorted(messages, key=lambda msg: msg.time or 0)
+
+            chat_str, message_id_list = build_readable_messages_with_id(
+                messages=messages,
+                replace_bot_name=True,
+                timestamp_mode="relative",
+                truncate=False,
+                show_actions=False,
+                show_pic=True,
+                pic_single=True,
+            )
            if not chat_str.strip():
                return

+            msg_id_to_index: Dict[str, int] = {}
+            for idx, (msg_id, _msg) in enumerate(message_id_list or []):
+                if not msg_id:
+                    continue
+                msg_id_to_index[msg_id] = idx
+            if not msg_id_to_index:
+                logger.warning("未能生成消息ID映射，跳过本次提取")
+                return
+
            prompt: str = await global_prompt_manager.format_prompt(
                "extract_jargon_prompt",
+                bot_name=global_config.bot.nickname,
                chat_str=chat_str,
            )

@ -551,25 +721,46 @@ class JargonMiner:
                for item in parsed:
                    if not isinstance(item, dict):
                        continue
+
                    content = str(item.get("content", "")).strip()
-                    raw_content_value = item.get("raw_content", "")
+                    msg_id_value = item.get("msg_id")

-                    # 处理raw_content：可能是字符串或列表
-                    raw_content_list = []
-                    if isinstance(raw_content_value, list):
-                        raw_content_list = [str(rc).strip() for rc in raw_content_value if str(rc).strip()]
-                        # 去重
-                        raw_content_list = list(dict.fromkeys(raw_content_list))
-                    elif isinstance(raw_content_value, str):
-                        raw_content_str = raw_content_value.strip()
-                        if raw_content_str:
-                            raw_content_list = [raw_content_str]
+                    if not content:
+                        continue

-                    if content and raw_content_list:
-                        if _contains_bot_self_name(content):
-                            logger.debug(f"解析阶段跳过包含机器人昵称/别名的词条: {content}")
-                            continue
-                        entries.append({"content": content, "raw_content": raw_content_list})
+                    if _contains_bot_self_name(content):
+                        logger.debug(f"解析阶段跳过包含机器人昵称/别名的词条: {content}")
+                        continue
+
+                    msg_id_str = str(msg_id_value or "").strip()
+                    if not msg_id_str:
+                        logger.warning(f"解析jargon失败：msg_id缺失，content={content}")
+                        continue
+
+                    msg_index = msg_id_to_index.get(msg_id_str)
+                    if msg_index is None:
+                        logger.warning(f"解析jargon失败：msg_id未找到，content={content}, msg_id={msg_id_str}")
+                        continue
+
+                    target_msg = messages[msg_index]
+                    if _is_bot_message(target_msg):
+                        logger.debug(f"解析阶段跳过引用机器人自身消息的词条: content={content}, msg_id={msg_id_str}")
+                        continue
+                    if _has_adjacent_bot_message(messages, msg_index):
+                        logger.debug(
+                            f"解析阶段跳过因邻近机器人发言的词条: content={content}, msg_id={msg_id_str}"
+                        )
+                        continue
+
+                    context_paragraph = _build_context_paragraph(messages, msg_index)
+                    if not context_paragraph:
+                        logger.warning(f"解析jargon失败：上下文为空，content={content}, msg_id={msg_id_str}")
+                        continue
+
+                    entries.append({"content": content, "raw_content": [context_paragraph]})
+                cached_entries = self._collect_cached_entries(messages)
+                if cached_entries:
+                    entries.extend(cached_entries)
            except Exception as e:
                logger.error(f"解析jargon JSON失败: {e}; 原始: {response}")
                return
@ -577,15 +768,25 @@ class JargonMiner:
            if not entries:
                return

-            # 去重并写入DB（按 chat_id + content 去重）
-            # 使用content作为去重键
-            seen = set()
-            uniq_entries = []
+            # 去重并合并raw_content（按 content 聚合）
+            merged_entries: OrderedDict[str, Dict[str, List[str]]] = OrderedDict()
            for entry in entries:
                content_key = entry["content"]
-                if content_key not in seen:
-                    seen.add(content_key)
-                    uniq_entries.append(entry)
+                raw_list = entry.get("raw_content", []) or []
+                if content_key in merged_entries:
+                    merged_entries[content_key]["raw_content"].extend(raw_list)
+                else:
+                    merged_entries[content_key] = {
+                        "content": content_key,
+                        "raw_content": list(raw_list),
+                    }
+
+            uniq_entries = []
+            for merged_entry in merged_entries.values():
+                raw_content_list = merged_entry["raw_content"]
+                if raw_content_list:
+                    merged_entry["raw_content"] = list(dict.fromkeys(raw_content_list))
+                uniq_entries.append(merged_entry)

            saved = 0
            updated = 0
@ -670,6 +871,8 @@ class JargonMiner:
                except Exception as e:
                    logger.error(f"保存jargon失败: chat_id={self.chat_id}, content={content}, err={e}")
                    continue
+                finally:
+                    self._add_to_cache(content)

            # 固定输出提取的jargon结果，格式化为可读形式（只要有提取结果就输出）
            if uniq_entries: