diff --git a/.gitignore b/.gitignore index 9db98c1d..19de4e40 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,7 @@ MaiMBot-LPMM *.zip run_bot.bat run_na.bat +run_all_in_wt.bat run.bat log_debug/ run_amds.bat diff --git a/src/chat/utils/chat_message_builder.py b/src/chat/utils/chat_message_builder.py index 4bd7850f..30db7584 100644 --- a/src/chat/utils/chat_message_builder.py +++ b/src/chat/utils/chat_message_builder.py @@ -352,6 +352,7 @@ def _build_readable_messages_internal( pic_counter: int = 1, show_pic: bool = True, message_id_list: Optional[List[Tuple[str, DatabaseMessages]]] = None, + pic_single: bool = False, ) -> Tuple[str, List[Tuple[float, str, str]], Dict[str, str], int]: # sourcery skip: use-getitem-for-re-match-groups """ @@ -378,6 +379,7 @@ def _build_readable_messages_internal( if pic_id_mapping is None: pic_id_mapping = {} current_pic_counter = pic_counter + pic_description_cache: Dict[str, str] = {} # 创建时间戳到消息ID的映射,用于在消息前添加[id]标识符 timestamp_to_id_mapping: Dict[float, str] = {} @@ -400,6 +402,17 @@ def _build_readable_messages_internal( nonlocal current_pic_counter nonlocal pic_counter pic_id = match.group(1) + if pic_single: + if pic_id not in pic_description_cache: + description = "内容正在阅读,请稍等" + try: + image = Images.get_or_none(Images.image_id == pic_id) + if image and image.description: + description = image.description + except Exception: + pass + pic_description_cache[pic_id] = description + return f"[图片:{pic_description_cache[pic_id]}]" if pic_id not in pic_id_mapping: pic_id_mapping[pic_id] = f"图片{current_pic_counter}" current_pic_counter += 1 @@ -603,6 +616,7 @@ async def build_readable_messages_with_list( replace_bot_name: bool = True, timestamp_mode: str = "relative", truncate: bool = False, + pic_single: bool = False, ) -> Tuple[str, List[Tuple[float, str, str]]]: """ 将消息列表转换为可读的文本格式,并返回原始(时间戳, 昵称, 内容)列表。 @@ -613,10 +627,16 @@ async def build_readable_messages_with_list( replace_bot_name, timestamp_mode, truncate, + pic_id_mapping=None, + pic_counter=1, + show_pic=True, + message_id_list=None, + pic_single=pic_single, ) - if pic_mapping_info := build_pic_mapping_info(pic_id_mapping): - formatted_string = f"{pic_mapping_info}\n\n{formatted_string}" + if not pic_single: + if pic_mapping_info := build_pic_mapping_info(pic_id_mapping): + formatted_string = f"{pic_mapping_info}\n\n{formatted_string}" return formatted_string, details_list @@ -630,6 +650,7 @@ def build_readable_messages_with_id( show_actions: bool = False, show_pic: bool = True, remove_emoji_stickers: bool = False, + pic_single: bool = False, ) -> Tuple[str, List[Tuple[str, DatabaseMessages]]]: """ 将消息列表转换为可读的文本格式,并返回原始(时间戳, 昵称, 内容)列表。 @@ -647,6 +668,7 @@ def build_readable_messages_with_id( read_mark=read_mark, message_id_list=message_id_list, remove_emoji_stickers=remove_emoji_stickers, + pic_single=pic_single, ) return formatted_string, message_id_list @@ -662,6 +684,7 @@ def build_readable_messages( show_pic: bool = True, message_id_list: Optional[List[Tuple[str, DatabaseMessages]]] = None, remove_emoji_stickers: bool = False, + pic_single: bool = False, ) -> str: # sourcery skip: extract-method """ 将消息列表转换为可读的文本格式。 @@ -769,14 +792,14 @@ def build_readable_messages( truncate, show_pic=show_pic, message_id_list=message_id_list, + pic_single=pic_single, ) - # 生成图片映射信息并添加到最前面 - pic_mapping_info = build_pic_mapping_info(pic_id_mapping) - if pic_mapping_info: - return f"{pic_mapping_info}\n\n{formatted_string}" - else: - return formatted_string + if not pic_single: + pic_mapping_info = build_pic_mapping_info(pic_id_mapping) + if pic_mapping_info: + return f"{pic_mapping_info}\n\n{formatted_string}" + return formatted_string else: # 按 read_mark 分割消息 messages_before_mark = [msg for msg in copy_messages if (msg.time or 0) <= read_mark] @@ -796,6 +819,7 @@ def build_readable_messages( pic_counter, show_pic=show_pic, message_id_list=message_id_list, + pic_single=pic_single, ) formatted_after, _, pic_id_mapping, _ = _build_readable_messages_internal( messages_after_mark, @@ -806,15 +830,19 @@ def build_readable_messages( pic_counter, show_pic=show_pic, message_id_list=message_id_list, + pic_single=pic_single, ) read_mark_line = "\n--- 以上消息是你已经看过,请关注以下未读的新消息---\n" # 生成图片映射信息 - if pic_id_mapping: - pic_mapping_info = f"图片信息:\n{build_pic_mapping_info(pic_id_mapping)}\n聊天记录信息:\n" + if not pic_single: + if pic_id_mapping: + pic_mapping_info = f"图片信息:\n{build_pic_mapping_info(pic_id_mapping)}\n聊天记录信息:\n" + else: + pic_mapping_info = "聊天记录信息:\n" else: - pic_mapping_info = "聊天记录信息:\n" + pic_mapping_info = "" # 组合结果 result_parts = [] @@ -832,7 +860,7 @@ def build_readable_messages( return "".join(result_parts) -async def build_anonymous_messages(messages: List[DatabaseMessages]) -> str: +async def build_anonymous_messages(messages: List[DatabaseMessages], show_ids: bool = False) -> str: """ 构建匿名可读消息,将不同人的名称转为唯一占位符(A、B、C...),bot自己用SELF。 处理 回复 和 @ 字段,将bbb映射为匿名占位符。 @@ -889,7 +917,7 @@ async def build_anonymous_messages(messages: List[DatabaseMessages]) -> str: current_char += 1 return person_map[person_id] - for msg in messages: + for i, msg in enumerate(messages): try: platform = msg.chat_info.platform user_id = msg.user_info.user_id @@ -910,7 +938,12 @@ async def build_anonymous_messages(messages: List[DatabaseMessages]) -> str: content = replace_user_references(content, platform, anon_name_resolver, replace_bot_name=False) - header = f"{anon_name}说 " + # 构建消息头,如果启用show_ids则添加序号 + if show_ids: + header = f"[{i + 1}] {anon_name}说 " + else: + header = f"{anon_name}说 " + output_lines.append(header) stripped_line = content.strip() if stripped_line: diff --git a/src/common/database/database_model.py b/src/common/database/database_model.py index 440004db..ba2f774c 100644 --- a/src/common/database/database_model.py +++ b/src/common/database/database_model.py @@ -330,8 +330,6 @@ class Jargon(BaseModel): content = TextField() raw_content = TextField(null=True) - type = TextField(null=True) - translation = TextField(null=True) meaning = TextField(null=True) chat_id = TextField(index=True) is_global = BooleanField(default=False) diff --git a/src/express/expression_reflector.py b/src/express/expression_reflector.py index 9256b3dd..0cf9812b 100644 --- a/src/express/expression_reflector.py +++ b/src/express/expression_reflector.py @@ -39,9 +39,19 @@ class ExpressionReflector: # 检查是否在允许列表中 allow_reflect = global_config.expression.allow_reflect - if allow_reflect and self.chat_id not in allow_reflect: - logger.info(f"[Expression Reflection] 当前聊天流 {self.chat_id} 不在允许列表中,跳过") - return False + if allow_reflect: + # 将 allow_reflect 中的 platform:id:type 格式转换为 chat_id 列表 + allow_reflect_chat_ids = [] + for stream_config in allow_reflect: + parsed_chat_id = global_config.expression._parse_stream_config_to_chat_id(stream_config) + if parsed_chat_id: + allow_reflect_chat_ids.append(parsed_chat_id) + else: + logger.warning(f"[Expression Reflection] 无法解析 allow_reflect 配置项: {stream_config}") + + if self.chat_id not in allow_reflect_chat_ids: + logger.info(f"[Expression Reflection] 当前聊天流 {self.chat_id} 不在允许列表中,跳过") + return False # 检查上一次提问时间 current_time = time.time() diff --git a/src/jargon/jargon_miner.py b/src/jargon/jargon_miner.py index 4319ad3d..2a7020cc 100644 --- a/src/jargon/jargon_miner.py +++ b/src/jargon/jargon_miner.py @@ -1,6 +1,7 @@ import time import json import asyncio +from collections import OrderedDict from typing import List, Dict, Optional, Any from json_repair import repair_json from peewee import fn @@ -12,12 +13,13 @@ from src.config.config import model_config, global_config from src.chat.message_receive.chat_stream import get_chat_manager from src.plugin_system.apis import llm_api from src.chat.utils.chat_message_builder import ( - build_anonymous_messages, + build_readable_messages_with_id, get_raw_msg_by_timestamp_with_chat_inclusive, get_raw_msg_before_timestamp_with_chat, build_readable_messages_with_list, ) from src.chat.utils.prompt_builder import Prompt, global_prompt_manager +from src.chat.utils.utils import parse_platform_accounts logger = get_logger("jargon") @@ -43,9 +45,107 @@ def _contains_bot_self_name(content: str) -> bool: return any(name in target for name in candidates if target) +def _format_context_message(msg: Any, seq_index: int) -> str: + """ + 将单条消息格式化为带序号的上下文行 + """ + if msg is None: + return "" + + text = (getattr(msg, "display_message", None) or getattr(msg, "processed_plain_text", None) or "").strip() + if not text: + return "" + + user_info = getattr(msg, "user_info", None) + nickname = "" + if user_info: + nickname = getattr(user_info, "user_nickname", "") or getattr(user_info, "user_id", "") + + if not nickname: + nickname = getattr(msg, "user_nickname", "") or getattr(msg, "user_id", "") or "某人" + + return f"{nickname}: {text}" + + +def _build_context_paragraph(messages: List[Any], center_index: int) -> Optional[str]: + """ + 构建包含中心消息上下文的段落(前3条+后3条) + """ + if not messages or center_index < 0 or center_index >= len(messages): + return None + + context_start = max(0, center_index - 3) + context_end = min(len(messages), center_index + 1 + 3) + + context_lines: List[str] = [] + for idx in range(context_start, context_end): + formatted_line = _format_context_message(messages[idx], idx + 1) + if formatted_line: + context_lines.append(formatted_line) + + if not context_lines: + return None + + paragraph = "\n".join(context_lines).strip() + return paragraph or None + + +def _is_bot_message(msg: Any) -> bool: + """判断消息是否来自机器人自身""" + if msg is None: + return False + + bot_config = getattr(global_config, "bot", None) + if not bot_config: + return False + + platform = ( + str(getattr(msg, "user_platform", "") or getattr(getattr(msg, "user_info", None), "platform", "") or "") + .strip() + .lower() + ) + user_id = ( + str(getattr(msg, "user_id", "") or getattr(getattr(msg, "user_info", None), "user_id", "") or "") + .strip() + ) + + if not platform or not user_id: + return False + + platform_accounts = {} + try: + platform_accounts = parse_platform_accounts(getattr(bot_config, "platforms", []) or []) + except Exception: + platform_accounts = {} + + bot_accounts: Dict[str, str] = {} + qq_account = str(getattr(bot_config, "qq_account", "") or "").strip() + if qq_account: + bot_accounts["qq"] = qq_account + + telegram_account = str(getattr(bot_config, "telegram_account", "") or "").strip() + if telegram_account: + bot_accounts["telegram"] = telegram_account + + for plat, account in platform_accounts.items(): + if account and plat not in bot_accounts: + bot_accounts[plat] = account + + bot_account = bot_accounts.get(platform) + return bool(bot_account and user_id == bot_account) + + +def _has_adjacent_bot_message(messages: List[Any], center_index: int) -> bool: + """检查目标消息的上一条或下一条是否为机器人发言""" + for neighbor in (center_index - 1, center_index + 1): + if 0 <= neighbor < len(messages) and _is_bot_message(messages[neighbor]): + return True + return False + + def _init_prompt() -> None: prompt_str = """ -**聊天内容,其中的SELF是你自己的发言** +**聊天内容,其中的{bot_name}的发言内容是你自己的发言,[msg_id] 是消息ID** {chat_str} 请从上面这段聊天内容中提取"可能是黑话"的候选项(黑话/俚语/网络缩写/口头禅)。 @@ -62,9 +162,10 @@ def _init_prompt() -> None: - 中文词语的缩写,用几个汉字概括一个词汇或含义,例如:社死、内卷 以 JSON 数组输出,元素为对象(严格按以下结构): +请你提取出可能的黑话,最多10 [ - {{"content": "词条", "raw_content": "包含该词条的完整对话上下文原文"}}, - {{"content": "词条2", "raw_content": "包含该词条的完整对话上下文原文"}} + {{"content": "词条", "msg_id": "m12"}}, // msg_id 必须与上方聊天中展示的ID完全一致 + {{"content": "词条2", "msg_id": "m15"}} ] 现在请输出: @@ -78,10 +179,10 @@ def _init_inference_prompts() -> None: prompt1_str = """ **词条内容** {content} -**词条出现的上下文(raw_content)其中的SELF是你自己的发言** +**词条出现的上下文。其中的{bot_name}的发言内容是你自己的发言** {raw_content_list} -请根据以上词条内容和上下文,推断这个词条的含义。 +请根据上下文,推断"{content}"这个词条的含义。 - 如果这是一个黑话、俚语或网络用语,请推断其含义 - 如果含义明确(常规词汇),也请说明 - 如果上下文信息不足,无法推断含义,请设置 no_info 为 true @@ -240,7 +341,7 @@ def _should_infer_meaning(jargon_obj: Jargon) -> bool: last_inference = jargon_obj.last_inference_count or 0 # 阈值列表:3,6, 10, 20, 40, 60, 100 - thresholds = [3, 6, 10, 20, 40, 60, 100] + thresholds = [2, 4, 8, 12, 24, 60, 100] if count < thresholds[0]: return False @@ -281,6 +382,53 @@ class JargonMiner: chat_manager = get_chat_manager() stream_name = chat_manager.get_stream_name(self.chat_id) self.stream_name = stream_name if stream_name else self.chat_id + self.cache_limit = 100 + self.cache: OrderedDict[str, None] = OrderedDict() + + def _add_to_cache(self, content: str) -> None: + """将提取到的黑话加入缓存,保持LRU语义""" + if not content: + return + + key = content.strip() + if not key: + return + + if key in self.cache: + self.cache.move_to_end(key) + else: + self.cache[key] = None + if len(self.cache) > self.cache_limit: + self.cache.popitem(last=False) + + def _collect_cached_entries(self, messages: List[Any]) -> List[Dict[str, List[str]]]: + """检查缓存中的黑话是否出现在当前消息窗口,生成对应上下文""" + if not self.cache or not messages: + return [] + + cached_entries: List[Dict[str, List[str]]] = [] + processed_pairs = set() + + for idx, msg in enumerate(messages): + msg_text = (getattr(msg, "display_message", None) or getattr(msg, "processed_plain_text", None) or "").strip() + if not msg_text or _is_bot_message(msg): + continue + + for content in self.cache.keys(): + if not content: + continue + if (content, idx) in processed_pairs: + continue + if content in msg_text: + if _has_adjacent_bot_message(messages, idx): + continue + paragraph = _build_context_paragraph(messages, idx) + if not paragraph: + continue + cached_entries.append({"content": content, "raw_content": [paragraph]}) + processed_pairs.add((content, idx)) + + return cached_entries async def _infer_meaning_by_id(self, jargon_id: int) -> None: """通过ID加载对象并推断""" @@ -323,6 +471,7 @@ class JargonMiner: prompt1 = await global_prompt_manager.format_prompt( "jargon_inference_with_context_prompt", content=content, + bot_name = global_config.bot.nickname, raw_content_list=raw_content_text, ) @@ -441,8 +590,8 @@ class JargonMiner: # 是黑话,使用推断1的结果(基于上下文,更准确) jargon_obj.meaning = inference1.get("meaning", "") else: - # 不是黑话,也记录含义(使用推断2的结果,因为含义明确) - jargon_obj.meaning = inference2.get("meaning", "") + # 不是黑话,清空含义,不再存储任何内容 + jargon_obj.meaning = "" # 更新最后一次判定的count值,避免重启后重复判定 jargon_obj.last_inference_count = jargon_obj.count or 0 @@ -511,12 +660,33 @@ class JargonMiner: if not messages: return - chat_str: str = await build_anonymous_messages(messages) + # 按时间排序,确保编号与上下文一致 + messages = sorted(messages, key=lambda msg: msg.time or 0) + + chat_str, message_id_list = build_readable_messages_with_id( + messages=messages, + replace_bot_name=True, + timestamp_mode="relative", + truncate=False, + show_actions=False, + show_pic=True, + pic_single=True, + ) if not chat_str.strip(): return + msg_id_to_index: Dict[str, int] = {} + for idx, (msg_id, _msg) in enumerate(message_id_list or []): + if not msg_id: + continue + msg_id_to_index[msg_id] = idx + if not msg_id_to_index: + logger.warning("未能生成消息ID映射,跳过本次提取") + return + prompt: str = await global_prompt_manager.format_prompt( "extract_jargon_prompt", + bot_name=global_config.bot.nickname, chat_str=chat_str, ) @@ -551,25 +721,46 @@ class JargonMiner: for item in parsed: if not isinstance(item, dict): continue + content = str(item.get("content", "")).strip() - raw_content_value = item.get("raw_content", "") + msg_id_value = item.get("msg_id") - # 处理raw_content:可能是字符串或列表 - raw_content_list = [] - if isinstance(raw_content_value, list): - raw_content_list = [str(rc).strip() for rc in raw_content_value if str(rc).strip()] - # 去重 - raw_content_list = list(dict.fromkeys(raw_content_list)) - elif isinstance(raw_content_value, str): - raw_content_str = raw_content_value.strip() - if raw_content_str: - raw_content_list = [raw_content_str] + if not content: + continue - if content and raw_content_list: - if _contains_bot_self_name(content): - logger.debug(f"解析阶段跳过包含机器人昵称/别名的词条: {content}") - continue - entries.append({"content": content, "raw_content": raw_content_list}) + if _contains_bot_self_name(content): + logger.debug(f"解析阶段跳过包含机器人昵称/别名的词条: {content}") + continue + + msg_id_str = str(msg_id_value or "").strip() + if not msg_id_str: + logger.warning(f"解析jargon失败:msg_id缺失,content={content}") + continue + + msg_index = msg_id_to_index.get(msg_id_str) + if msg_index is None: + logger.warning(f"解析jargon失败:msg_id未找到,content={content}, msg_id={msg_id_str}") + continue + + target_msg = messages[msg_index] + if _is_bot_message(target_msg): + logger.debug(f"解析阶段跳过引用机器人自身消息的词条: content={content}, msg_id={msg_id_str}") + continue + if _has_adjacent_bot_message(messages, msg_index): + logger.debug( + f"解析阶段跳过因邻近机器人发言的词条: content={content}, msg_id={msg_id_str}" + ) + continue + + context_paragraph = _build_context_paragraph(messages, msg_index) + if not context_paragraph: + logger.warning(f"解析jargon失败:上下文为空,content={content}, msg_id={msg_id_str}") + continue + + entries.append({"content": content, "raw_content": [context_paragraph]}) + cached_entries = self._collect_cached_entries(messages) + if cached_entries: + entries.extend(cached_entries) except Exception as e: logger.error(f"解析jargon JSON失败: {e}; 原始: {response}") return @@ -577,15 +768,25 @@ class JargonMiner: if not entries: return - # 去重并写入DB(按 chat_id + content 去重) - # 使用content作为去重键 - seen = set() - uniq_entries = [] + # 去重并合并raw_content(按 content 聚合) + merged_entries: OrderedDict[str, Dict[str, List[str]]] = OrderedDict() for entry in entries: content_key = entry["content"] - if content_key not in seen: - seen.add(content_key) - uniq_entries.append(entry) + raw_list = entry.get("raw_content", []) or [] + if content_key in merged_entries: + merged_entries[content_key]["raw_content"].extend(raw_list) + else: + merged_entries[content_key] = { + "content": content_key, + "raw_content": list(raw_list), + } + + uniq_entries = [] + for merged_entry in merged_entries.values(): + raw_content_list = merged_entry["raw_content"] + if raw_content_list: + merged_entry["raw_content"] = list(dict.fromkeys(raw_content_list)) + uniq_entries.append(merged_entry) saved = 0 updated = 0 @@ -670,6 +871,8 @@ class JargonMiner: except Exception as e: logger.error(f"保存jargon失败: chat_id={self.chat_id}, content={content}, err={e}") continue + finally: + self._add_to_cache(content) # 固定输出提取的jargon结果,格式化为可读形式(只要有提取结果就输出) if uniq_entries: