From 644d470558a835f69cfa0523c8bf5d19bd02fae8 Mon Sep 17 00:00:00 2001 From: SengokuCola <1026294844@qq.com> Date: Tue, 25 Nov 2025 19:19:52 +0800 Subject: [PATCH] =?UTF-8?q?feat=EF=BC=9A=E9=BB=91=E8=AF=9D=E6=9B=B4?= =?UTF-8?q?=E9=AB=98=E7=9A=84=E6=8F=90=E5=8F=96=E7=8E=87;=E5=A2=9E?= =?UTF-8?q?=E5=8A=A0=E6=8F=90=E5=8F=96=E5=87=86=E7=A1=AE=E6=80=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 黑话解释现在独立运行,拥有更高的提取率 增加提取准确性 --- src/chat/replyer/group_generator.py | 10 +- src/chat/replyer/private_generator.py | 11 +- src/chat/replyer/prompt/replyer_prompt.py | 6 +- src/jargon/jargon_explainer.py | 261 +++++++++++++++ src/jargon/jargon_miner.py | 386 ++++------------------ src/jargon/jargon_utils.py | 199 +++++++++++ src/memory_system/memory_retrieval.py | 74 ++++- template/bot_config_template.toml | 2 +- 8 files changed, 609 insertions(+), 340 deletions(-) create mode 100644 src/jargon/jargon_explainer.py create mode 100644 src/jargon/jargon_utils.py diff --git a/src/chat/replyer/group_generator.py b/src/chat/replyer/group_generator.py index cc948db1..7d741be9 100644 --- a/src/chat/replyer/group_generator.py +++ b/src/chat/replyer/group_generator.py @@ -36,6 +36,7 @@ from src.chat.replyer.prompt.lpmm_prompt import init_lpmm_prompt from src.chat.replyer.prompt.replyer_prompt import init_replyer_prompt from src.chat.replyer.prompt.rewrite_prompt import init_rewrite_prompt from src.memory_system.memory_retrieval import init_memory_retrieval_prompt, build_memory_retrieval_prompt +from src.jargon.jargon_explainer import explain_jargon_in_context init_lpmm_prompt() init_replyer_prompt() @@ -786,7 +787,7 @@ class DefaultReplyer: show_actions=True, ) - # 并行执行七个构建任务 + # 并行执行八个构建任务(包括黑话解释) task_results = await asyncio.gather( self._time_and_run_task( self.build_expression_habits(chat_talking_prompt_short, target, reply_reason), "expression_habits" @@ -804,6 +805,10 @@ class DefaultReplyer: ), "memory_retrieval", ), + 
self._time_and_run_task( + explain_jargon_in_context(chat_id, message_list_before_short, chat_talking_prompt_short), + "jargon_explanation", + ), ) # 任务名称中英文映射 @@ -816,6 +821,7 @@ class DefaultReplyer: "personality_prompt": "人格信息", "mood_state_prompt": "情绪状态", "memory_retrieval": "记忆检索", + "jargon_explanation": "黑话解释", } # 处理结果 @@ -846,6 +852,7 @@ class DefaultReplyer: memory_retrieval: str = results_dict["memory_retrieval"] keywords_reaction_prompt = await self.build_keywords_reaction_prompt(target) mood_state_prompt: str = results_dict["mood_state_prompt"] + jargon_explanation: Optional[str] = results_dict.get("jargon_explanation") # 从 chosen_actions 中提取 planner 的整体思考理由 planner_reasoning = "" @@ -896,6 +903,7 @@ class DefaultReplyer: mood_state=mood_state_prompt, # relation_info_block=relation_info, extra_info_block=extra_info_block, + jargon_explanation=jargon_explanation, identity=personality_prompt, action_descriptions=actions_info, sender_name=sender, diff --git a/src/chat/replyer/private_generator.py b/src/chat/replyer/private_generator.py index 74b04f6e..25c0a36a 100644 --- a/src/chat/replyer/private_generator.py +++ b/src/chat/replyer/private_generator.py @@ -37,6 +37,7 @@ from src.chat.replyer.prompt.lpmm_prompt import init_lpmm_prompt from src.chat.replyer.prompt.replyer_prompt import init_replyer_prompt from src.chat.replyer.prompt.rewrite_prompt import init_rewrite_prompt from src.memory_system.memory_retrieval import init_memory_retrieval_prompt, build_memory_retrieval_prompt +from src.jargon.jargon_explainer import explain_jargon_in_context init_lpmm_prompt() init_replyer_prompt() @@ -706,7 +707,7 @@ class PrivateReplyer: show_actions=True, ) - # 并行执行八个构建任务 + # 并行执行九个构建任务(包括黑话解释) task_results = await asyncio.gather( self._time_and_run_task( self.build_expression_habits(chat_talking_prompt_short, target, reply_reason), "expression_habits" @@ -725,6 +726,10 @@ class PrivateReplyer: ), "memory_retrieval", ), + self._time_and_run_task( + 
explain_jargon_in_context(chat_id, message_list_before_short, chat_talking_prompt_short), + "jargon_explanation", + ), ) # 任务名称中英文映射 @@ -737,6 +742,7 @@ class PrivateReplyer: "personality_prompt": "人格信息", "mood_state_prompt": "情绪状态", "memory_retrieval": "记忆检索", + "jargon_explanation": "黑话解释", } # 处理结果 @@ -767,6 +773,7 @@ class PrivateReplyer: mood_state_prompt: str = results_dict["mood_state_prompt"] memory_retrieval: str = results_dict["memory_retrieval"] keywords_reaction_prompt = await self.build_keywords_reaction_prompt(target) + jargon_explanation: Optional[str] = results_dict.get("jargon_explanation") # 从 chosen_actions 中提取 planner 的整体思考理由 planner_reasoning = "" @@ -813,6 +820,7 @@ class PrivateReplyer: identity=personality_prompt, action_descriptions=actions_info, dialogue_prompt=dialogue_prompt, + jargon_explanation=jargon_explanation, time_block=time_block, target=target, reason=reply_reason, @@ -835,6 +843,7 @@ class PrivateReplyer: identity=personality_prompt, action_descriptions=actions_info, dialogue_prompt=dialogue_prompt, + jargon_explanation=jargon_explanation, time_block=time_block, reply_target_block=reply_target_block, reply_style=global_config.personality.reply_style, diff --git a/src/chat/replyer/prompt/replyer_prompt.py b/src/chat/replyer/prompt/replyer_prompt.py index 7c7a91e3..f9ee25a4 100644 --- a/src/chat/replyer/prompt/replyer_prompt.py +++ b/src/chat/replyer/prompt/replyer_prompt.py @@ -8,7 +8,7 @@ def init_replyer_prompt(): Prompt( """{knowledge_prompt}{tool_info_block}{extra_info_block} -{expression_habits_block}{memory_retrieval} +{expression_habits_block}{memory_retrieval}{jargon_explanation} 你正在qq群里聊天,下面是群里正在聊的内容,其中包含聊天记录和聊天中的图片 其中标注 {bot_name}(你) 的发言是你自己的发言,请注意区分: @@ -29,7 +29,7 @@ def init_replyer_prompt(): Prompt( """{knowledge_prompt}{tool_info_block}{extra_info_block} -{expression_habits_block}{memory_retrieval} +{expression_habits_block}{memory_retrieval}{jargon_explanation} 你正在和{sender_name}聊天,这是你们之前聊的内容: {time_block} @@ 
-48,7 +48,7 @@ def init_replyer_prompt(): Prompt( """{knowledge_prompt}{tool_info_block}{extra_info_block} -{expression_habits_block}{memory_retrieval} +{expression_habits_block}{memory_retrieval}{jargon_explanation} 你正在和{sender_name}聊天,这是你们之前聊的内容: {time_block} diff --git a/src/jargon/jargon_explainer.py b/src/jargon/jargon_explainer.py new file mode 100644 index 00000000..1c1fd7f6 --- /dev/null +++ b/src/jargon/jargon_explainer.py @@ -0,0 +1,261 @@ +import re +import time +from typing import List, Dict, Optional, Any + +from src.common.logger import get_logger +from src.common.database.database_model import Jargon +from src.llm_models.utils_model import LLMRequest +from src.config.config import model_config, global_config +from src.chat.utils.prompt_builder import Prompt, global_prompt_manager +from src.jargon.jargon_miner import search_jargon +from src.jargon.jargon_utils import is_bot_message, contains_bot_self_name, parse_chat_id_list, chat_id_list_contains + +logger = get_logger("jargon") + + +def _init_explainer_prompts() -> None: + """初始化黑话解释器相关的prompt""" + # Prompt:概括黑话解释结果 + summarize_prompt_str = """ +**上下文聊天内容** +{chat_context} + +**提取到的黑话及其含义** +{jargon_explanations} + +请根据上述信息,对黑话解释进行概括和整理。 +- 如果上下文中有黑话出现,请简要说明这些黑话在上下文中的使用情况 +- 将黑话解释整理成简洁、易读的格式 +- 如果某个黑话在上下文中没有出现,可以省略 +- 输出格式要自然,适合作为回复参考信息 + +请输出概括后的黑话解释(直接输出文本,不要使用JSON格式): +""" + Prompt(summarize_prompt_str, "jargon_explainer_summarize_prompt") + + +_init_explainer_prompts() + + +class JargonExplainer: + """黑话解释器,用于在回复前识别和解释上下文中的黑话""" + + def __init__(self, chat_id: str) -> None: + self.chat_id = chat_id + self.llm = LLMRequest( + model_set=model_config.model_task_config.utils, + request_type="jargon.explain", + ) + + def match_jargon_from_messages( + self, messages: List[Any] + ) -> List[Dict[str, str]]: + """ + 通过直接匹配数据库中的jargon字符串来提取黑话 + + Args: + messages: 消息列表 + + Returns: + List[Dict[str, str]]: 提取到的黑话列表,每个元素包含content + """ + start_time = time.time() + + if not messages: + return [] + + # 
收集所有消息的文本内容 + message_texts: List[str] = [] + for msg in messages: + # 跳过机器人自己的消息 + if is_bot_message(msg): + continue + + msg_text = (getattr(msg, "display_message", None) or getattr(msg, "processed_plain_text", None) or "").strip() + if msg_text: + message_texts.append(msg_text) + + if not message_texts: + return [] + + # 合并所有消息文本 + combined_text = " ".join(message_texts) + + # 查询所有有meaning的jargon记录 + query = Jargon.select().where( + (Jargon.meaning.is_null(False)) & (Jargon.meaning != "") + ) + + # 根据all_global配置决定查询逻辑 + if global_config.jargon.all_global: + # 开启all_global:只查询is_global=True的记录 + query = query.where(Jargon.is_global) + else: + # 关闭all_global:查询is_global=True或chat_id列表包含当前chat_id的记录 + # 这里先查询所有,然后在Python层面过滤 + pass + + # 按count降序排序,优先匹配出现频率高的 + query = query.order_by(Jargon.count.desc()) + + # 执行查询并匹配 + matched_jargon: Dict[str, Dict[str, str]] = {} + query_time = time.time() + + for jargon in query: + content = jargon.content or "" + if not content or not content.strip(): + continue + + # 跳过包含机器人昵称的词条 + if contains_bot_self_name(content): + continue + + # 检查chat_id(如果all_global=False) + if not global_config.jargon.all_global: + if jargon.is_global: + # 全局黑话,包含 + pass + else: + # 检查chat_id列表是否包含当前chat_id + chat_id_list = parse_chat_id_list(jargon.chat_id) + if not chat_id_list_contains(chat_id_list, self.chat_id): + continue + + # 在文本中查找匹配(大小写不敏感) + pattern = re.escape(content) + # 使用单词边界或中文字符边界来匹配,避免部分匹配 + # 对于中文,使用Unicode字符类;对于英文,使用单词边界 + if re.search(r'[\u4e00-\u9fff]', content): + # 包含中文,使用更宽松的匹配 + search_pattern = pattern + else: + # 纯英文/数字,使用单词边界 + search_pattern = r'\b' + pattern + r'\b' + + if re.search(search_pattern, combined_text, re.IGNORECASE): + # 找到匹配,记录(去重) + if content not in matched_jargon: + matched_jargon[content] = {"content": content} + + match_time = time.time() + total_time = match_time - start_time + query_duration = query_time - start_time + match_duration = match_time - query_time + + logger.info( + f"黑话匹配完成: 查询耗时 
{query_duration:.3f}s, 匹配耗时 {match_duration:.3f}s, " + f"总耗时 {total_time:.3f}s, 匹配到 {len(matched_jargon)} 个黑话" + ) + + return list(matched_jargon.values()) + + async def explain_jargon( + self, messages: List[Any], chat_context: str + ) -> Optional[str]: + """ + 解释上下文中的黑话 + + Args: + messages: 消息列表 + chat_context: 聊天上下文的文本表示 + + Returns: + Optional[str]: 黑话解释的概括文本,如果没有黑话则返回None + """ + if not messages: + return None + + # 直接匹配方式:从数据库中查询jargon并在消息中匹配 + jargon_entries = self.match_jargon_from_messages(messages) + + if not jargon_entries: + return None + + # 去重(按content) + unique_jargon: Dict[str, Dict[str, str]] = {} + for entry in jargon_entries: + content = entry["content"] + if content not in unique_jargon: + unique_jargon[content] = entry + + jargon_list = list(unique_jargon.values()) + logger.info(f"从上下文中提取到 {len(jargon_list)} 个黑话: {[j['content'] for j in jargon_list]}") + + # 查询每个黑话的含义 + jargon_explanations: List[str] = [] + for entry in jargon_list: + content = entry["content"] + + # 根据是否开启全局黑话,决定查询方式 + if global_config.jargon.all_global: + # 开启全局黑话:查询所有is_global=True的记录 + results = search_jargon( + keyword=content, + chat_id=None, # 不指定chat_id,查询全局黑话 + limit=1, + case_sensitive=False, + fuzzy=False, # 精确匹配 + ) + else: + # 关闭全局黑话:优先查询当前聊天或全局的黑话 + results = search_jargon( + keyword=content, + chat_id=self.chat_id, + limit=1, + case_sensitive=False, + fuzzy=False, # 精确匹配 + ) + + if results and len(results) > 0: + meaning = results[0].get("meaning", "").strip() + if meaning: + jargon_explanations.append(f"- {content}: {meaning}") + else: + logger.info(f"黑话 {content} 没有找到含义") + else: + logger.info(f"黑话 {content} 未在数据库中找到") + + if not jargon_explanations: + logger.info("没有找到任何黑话的含义,跳过解释") + return None + + # 拼接所有黑话解释 + explanations_text = "\n".join(jargon_explanations) + + # 使用LLM概括黑话解释 + summarize_prompt = await global_prompt_manager.format_prompt( + "jargon_explainer_summarize_prompt", + chat_context=chat_context, + jargon_explanations=explanations_text, + ) + + 
summary, _ = await self.llm.generate_response_async(summarize_prompt, temperature=0.3) + if not summary: + # 如果LLM概括失败,直接返回原始解释 + return f"上下文中的黑话解释:\n{explanations_text}" + + summary = summary.strip() + if not summary: + return f"上下文中的黑话解释:\n{explanations_text}" + + return summary + + +async def explain_jargon_in_context( + chat_id: str, messages: List[Any], chat_context: str +) -> Optional[str]: + """ + 解释上下文中的黑话(便捷函数) + + Args: + chat_id: 聊天ID + messages: 消息列表 + chat_context: 聊天上下文的文本表示 + + Returns: + Optional[str]: 黑话解释的概括文本,如果没有黑话则返回None + """ + explainer = JargonExplainer(chat_id) + return await explainer.explain_jargon(messages, chat_context) + diff --git a/src/jargon/jargon_miner.py b/src/jargon/jargon_miner.py index 1bbe49f7..0e25af57 100644 --- a/src/jargon/jargon_miner.py +++ b/src/jargon/jargon_miner.py @@ -11,127 +11,24 @@ from src.common.database.database_model import Jargon from src.llm_models.utils_model import LLMRequest from src.config.config import model_config, global_config from src.chat.message_receive.chat_stream import get_chat_manager -from src.plugin_system.apis import llm_api from src.chat.utils.chat_message_builder import ( - build_readable_messages, build_readable_messages_with_id, get_raw_msg_by_timestamp_with_chat_inclusive, - get_raw_msg_before_timestamp_with_chat, - build_readable_messages_with_list, ) from src.chat.utils.prompt_builder import Prompt, global_prompt_manager -from src.chat.utils.utils import parse_platform_accounts +from src.jargon.jargon_utils import ( + is_bot_message, + build_context_paragraph, + contains_bot_self_name, + parse_chat_id_list, + chat_id_list_contains, + update_chat_id_list +) logger = get_logger("jargon") -def _contains_bot_self_name(content: str) -> bool: - """ - 判断词条是否包含机器人的昵称或别名 - """ - if not content: - return False - - bot_config = getattr(global_config, "bot", None) - if not bot_config: - return False - - target = content.strip().lower() - nickname = str(getattr(bot_config, "nickname", "") or 
"").strip().lower() - alias_names = [str(alias or "").strip().lower() for alias in getattr(bot_config, "alias_names", []) or []] - - candidates = [name for name in [nickname, *alias_names] if name] - - return any(name in target for name in candidates if target) - - -def _build_context_paragraph(messages: List[Any], center_index: int) -> Optional[str]: - """ - 构建包含中心消息上下文的段落(前3条+后3条),使用标准的 readable builder 输出 - """ - if not messages or center_index < 0 or center_index >= len(messages): - return None - - context_start = max(0, center_index - 3) - context_end = min(len(messages), center_index + 1 + 3) - context_messages = messages[context_start:context_end] - - if not context_messages: - return None - - try: - paragraph = build_readable_messages( - messages=context_messages, - replace_bot_name=True, - timestamp_mode="relative", - read_mark=0.0, - truncate=False, - show_actions=False, - show_pic=True, - message_id_list=None, - remove_emoji_stickers=False, - pic_single=True, - ) - except Exception as e: - logger.warning(f"构建上下文段落失败: {e}") - return None - - paragraph = paragraph.strip() - return paragraph or None - - -def _is_bot_message(msg: Any) -> bool: - """判断消息是否来自机器人自身""" - if msg is None: - return False - - bot_config = getattr(global_config, "bot", None) - if not bot_config: - return False - - platform = ( - str(getattr(msg, "user_platform", "") or getattr(getattr(msg, "user_info", None), "platform", "") or "") - .strip() - .lower() - ) - user_id = ( - str(getattr(msg, "user_id", "") or getattr(getattr(msg, "user_info", None), "user_id", "") or "") - .strip() - ) - - if not platform or not user_id: - return False - - platform_accounts = {} - try: - platform_accounts = parse_platform_accounts(getattr(bot_config, "platforms", []) or []) - except Exception: - platform_accounts = {} - - bot_accounts: Dict[str, str] = {} - qq_account = str(getattr(bot_config, "qq_account", "") or "").strip() - if qq_account: - bot_accounts["qq"] = qq_account - - telegram_account = 
str(getattr(bot_config, "telegram_account", "") or "").strip() - if telegram_account: - bot_accounts["telegram"] = telegram_account - - for plat, account in platform_accounts.items(): - if account and plat not in bot_accounts: - bot_accounts[plat] = account - - bot_account = bot_accounts.get(platform) - return bool(bot_account and user_id == bot_account) - - -def _has_adjacent_bot_message(messages: List[Any], center_index: int) -> bool: - """检查目标消息的上一条或下一条是否为机器人发言""" - for neighbor in (center_index - 1, center_index + 1): - if 0 <= neighbor < len(messages) and _is_bot_message(messages[neighbor]): - return True - return False def _init_prompt() -> None: @@ -176,6 +73,7 @@ def _init_inference_prompts() -> None: 请根据上下文,推断"{content}"这个词条的含义。 - 如果这是一个黑话、俚语或网络用语,请推断其含义 - 如果含义明确(常规词汇),也请说明 +- {bot_name} 的发言内容可能包含错误,请不要参考其发言内容 - 如果上下文信息不足,无法推断含义,请设置 no_info 为 true 以 JSON 格式输出: @@ -228,94 +126,6 @@ _init_prompt() _init_inference_prompts() -async def _enrich_raw_content_if_needed( - content: str, - raw_content_list: List[str], - chat_id: str, - messages: List[Any], - extraction_start_time: float, - extraction_end_time: float, -) -> List[str]: - """ - 检查raw_content是否只包含黑话本身,如果是,则获取该消息的前三条消息作为原始内容 - - Args: - content: 黑话内容 - raw_content_list: 原始raw_content列表 - chat_id: 聊天ID - messages: 当前时间窗口内的消息列表 - extraction_start_time: 提取开始时间 - extraction_end_time: 提取结束时间 - - Returns: - 处理后的raw_content列表 - """ - enriched_list = [] - - for raw_content in raw_content_list: - # 检查raw_content是否只包含黑话本身(去除空白字符后比较) - raw_content_clean = raw_content.strip() - content_clean = content.strip() - - # 如果raw_content只包含黑话本身(可能有一些标点或空白),则尝试获取上下文 - # 去除所有空白字符后比较,确保只包含黑话本身 - raw_content_normalized = raw_content_clean.replace(" ", "").replace("\n", "").replace("\t", "") - content_normalized = content_clean.replace(" ", "").replace("\n", "").replace("\t", "") - - if raw_content_normalized == content_normalized: - # 在消息列表中查找只包含该黑话的消息(去除空白后比较) - target_message = None - for msg in messages: - msg_content = 
(msg.processed_plain_text or msg.display_message or "").strip() - msg_content_normalized = msg_content.replace(" ", "").replace("\n", "").replace("\t", "") - # 检查消息内容是否只包含黑话本身(去除空白后完全匹配) - if msg_content_normalized == content_normalized: - target_message = msg - break - - if target_message and target_message.time: - # 获取该消息的前三条消息 - try: - previous_messages = get_raw_msg_before_timestamp_with_chat( - chat_id=chat_id, timestamp=target_message.time, limit=3 - ) - - if previous_messages: - # 将前三条消息和当前消息一起格式化 - context_messages = previous_messages + [target_message] - # 按时间排序 - context_messages.sort(key=lambda x: x.time or 0) - - # 格式化为可读消息 - formatted_context, _ = await build_readable_messages_with_list( - context_messages, - replace_bot_name=True, - timestamp_mode="relative", - truncate=False, - ) - - if formatted_context.strip(): - enriched_list.append(formatted_context.strip()) - logger.warning(f"为黑话 {content} 补充了上下文消息") - else: - # 如果格式化失败,使用原始raw_content - enriched_list.append(raw_content) - else: - # 没有找到前三条消息,使用原始raw_content - enriched_list.append(raw_content) - except Exception as e: - logger.warning(f"获取黑话 {content} 的上下文消息失败: {e}") - # 出错时使用原始raw_content - enriched_list.append(raw_content) - else: - # 没有找到包含黑话的消息,使用原始raw_content - enriched_list.append(raw_content) - else: - # raw_content包含更多内容,直接使用 - enriched_list.append(raw_content) - - return enriched_list - def _should_infer_meaning(jargon_obj: Jargon) -> bool: """ @@ -402,7 +212,7 @@ class JargonMiner: for idx, msg in enumerate(messages): msg_text = (getattr(msg, "display_message", None) or getattr(msg, "processed_plain_text", None) or "").strip() - if not msg_text or _is_bot_message(msg): + if not msg_text or is_bot_message(msg): continue for content in self.cache.keys(): @@ -411,9 +221,7 @@ class JargonMiner: if (content, idx) in processed_pairs: continue if content in msg_text: - if _has_adjacent_bot_message(messages, idx): - continue - paragraph = _build_context_paragraph(messages, idx) + paragraph = 
build_context_paragraph(messages, idx) if not paragraph: continue cached_entries.append({"content": content, "raw_content": [paragraph]}) @@ -719,7 +527,7 @@ class JargonMiner: if not content: continue - if _contains_bot_self_name(content): + if contains_bot_self_name(content): logger.info(f"解析阶段跳过包含机器人昵称/别名的词条: {content}") continue @@ -734,16 +542,11 @@ class JargonMiner: continue target_msg = messages[msg_index] - if _is_bot_message(target_msg): + if is_bot_message(target_msg): logger.info(f"解析阶段跳过引用机器人自身消息的词条: content={content}, msg_id={msg_id_str}") continue - if _has_adjacent_bot_message(messages, msg_index): - logger.info( - f"解析阶段跳过因邻近机器人发言的词条: content={content}, msg_id={msg_id_str}" - ) - continue - context_paragraph = _build_context_paragraph(messages, msg_index) + context_paragraph = build_context_paragraph(messages, msg_index) if not context_paragraph: logger.warning(f"解析jargon失败:上下文为空,content={content}, msg_id={msg_id_str}") continue @@ -785,27 +588,27 @@ class JargonMiner: content = entry["content"] raw_content_list = entry["raw_content"] # 已经是列表 - # 检查并补充raw_content:如果只包含黑话本身,则获取前三条消息作为上下文 - # raw_content_list = await _enrich_raw_content_if_needed( - # content=content, - # raw_content_list=raw_content_list, - # chat_id=self.chat_id, - # messages=messages, - # extraction_start_time=extraction_start_time, - # extraction_end_time=extraction_end_time, - # ) try: - # 根据all_global配置决定查询逻辑 - if global_config.jargon.all_global: - # 开启all_global:无视chat_id,查询所有content匹配的记录(所有记录都是全局的) - query = Jargon.select().where(Jargon.content == content) - else: - # 关闭all_global:只查询chat_id匹配的记录(不考虑is_global) - query = Jargon.select().where((Jargon.chat_id == self.chat_id) & (Jargon.content == content)) + # 查询所有content匹配的记录 + query = Jargon.select().where(Jargon.content == content) - if query.exists(): - obj = query.get() + # 查找匹配的记录 + matched_obj = None + for obj in query: + if global_config.jargon.all_global: + # 开启all_global:所有content匹配的记录都可以 + matched_obj = obj + break + 
else: + # 关闭all_global:需要检查chat_id列表是否包含目标chat_id + chat_id_list = parse_chat_id_list(obj.chat_id) + if chat_id_list_contains(chat_id_list, self.chat_id): + matched_obj = obj + break + + if matched_obj: + obj = matched_obj try: obj.count = (obj.count or 0) + 1 except Exception: @@ -827,6 +630,11 @@ class JargonMiner: merged_list = list(dict.fromkeys(existing_raw_content + raw_content_list)) obj.raw_content = json.dumps(merged_list, ensure_ascii=False) + # 更新chat_id列表:增加当前chat_id的计数 + chat_id_list = parse_chat_id_list(obj.chat_id) + updated_chat_id_list = update_chat_id_list(chat_id_list, self.chat_id, increment=1) + obj.chat_id = json.dumps(updated_chat_id_list, ensure_ascii=False) + # 开启all_global时,确保记录标记为is_global=True if global_config.jargon.all_global: obj.is_global = True @@ -851,10 +659,14 @@ class JargonMiner: # 关闭all_global:新记录is_global=False is_global_new = False + # 使用新格式创建chat_id列表:[[chat_id, count]] + chat_id_list = [[self.chat_id, 1]] + chat_id_json = json.dumps(chat_id_list, ensure_ascii=False) + Jargon.create( content=content, raw_content=json.dumps(raw_content_list, ensure_ascii=False), - chat_id=self.chat_id, + chat_id=chat_id_json, is_global=is_global_new, count=1, ) @@ -924,8 +736,8 @@ def search_jargon( keyword = keyword.strip() - # 构建查询 - query = Jargon.select(Jargon.content, Jargon.meaning) + # 构建查询(选择所有需要的字段,以便后续过滤) + query = Jargon.select() # 构建搜索条件 if case_sensitive: @@ -951,102 +763,34 @@ def search_jargon( if global_config.jargon.all_global: # 开启all_global:所有记录都是全局的,查询所有is_global=True的记录(无视chat_id) query = query.where(Jargon.is_global) - else: - # 关闭all_global:如果提供了chat_id,优先搜索该聊天或global的jargon - if chat_id: - query = query.where((Jargon.chat_id == chat_id) | Jargon.is_global) + # 注意:对于all_global=False的情况,chat_id过滤在Python层面进行,以便兼容新旧格式 - # 只返回有meaning的记录 - query = query.where((Jargon.meaning.is_null(False)) & (Jargon.meaning != "")) + # 注意:meaning的过滤移到Python层面,因为我们需要先过滤chat_id # 按count降序排序,优先返回出现频率高的 query = 
query.order_by(Jargon.count.desc()) - # 限制结果数量 - query = query.limit(limit) + # 限制结果数量(先多取一些,因为后面可能过滤) + query = query.limit(limit * 2) - # 执行查询并返回结果 + # 执行查询并返回结果,过滤chat_id results = [] for jargon in query: + # 如果提供了chat_id且all_global=False,需要检查chat_id列表是否包含目标chat_id + if chat_id and not global_config.jargon.all_global: + chat_id_list = parse_chat_id_list(jargon.chat_id) + # 如果记录是is_global=True,或者chat_id列表包含目标chat_id,则包含 + if not jargon.is_global and not chat_id_list_contains(chat_id_list, chat_id): + continue + + # 只返回有meaning的记录 + if not jargon.meaning or jargon.meaning.strip() == "": + continue + results.append({"content": jargon.content or "", "meaning": jargon.meaning or ""}) + + # 达到限制数量后停止 + if len(results) >= limit: + break - return results - - -async def store_jargon_from_answer(jargon_keyword: str, answer: str, chat_id: str) -> None: - """将黑话存入jargon系统 - - Args: - jargon_keyword: 黑话关键词 - answer: 答案内容(将概括为raw_content) - chat_id: 聊天ID - """ - try: - # 概括答案为简短的raw_content - summary_prompt = f"""请将以下答案概括为一句简短的话(不超过50字),作为黑话"{jargon_keyword}"的使用示例: - -答案:{answer} - -只输出概括后的内容,不要输出其他内容:""" - - success, summary, _, _ = await llm_api.generate_with_model( - summary_prompt, - model_config=model_config.model_task_config.utils_small, - request_type="memory.summarize_jargon", - ) - - logger.info(f"概括答案提示: {summary_prompt}") - logger.info(f"概括答案: {summary}") - - if not success: - logger.warning(f"概括答案失败,使用原始答案: {summary}") - summary = answer[:100] # 截取前100字符作为备用 - - raw_content = summary.strip()[:200] # 限制长度 - - # 检查是否已存在 - if global_config.jargon.all_global: - query = Jargon.select().where(Jargon.content == jargon_keyword) - else: - query = Jargon.select().where((Jargon.chat_id == chat_id) & (Jargon.content == jargon_keyword)) - - if query.exists(): - # 更新现有记录 - obj = query.get() - obj.count = (obj.count or 0) + 1 - - # 合并raw_content列表 - existing_raw_content = [] - if obj.raw_content: - try: - existing_raw_content = ( - json.loads(obj.raw_content) if 
isinstance(obj.raw_content, str) else obj.raw_content - ) - if not isinstance(existing_raw_content, list): - existing_raw_content = [existing_raw_content] if existing_raw_content else [] - except (json.JSONDecodeError, TypeError): - existing_raw_content = [obj.raw_content] if obj.raw_content else [] - - # 合并并去重 - merged_list = list(dict.fromkeys(existing_raw_content + [raw_content])) - obj.raw_content = json.dumps(merged_list, ensure_ascii=False) - - if global_config.jargon.all_global: - obj.is_global = True - - obj.save() - logger.info(f"更新jargon记录: {jargon_keyword}") - else: - # 创建新记录 - is_global_new = True if global_config.jargon.all_global else False - Jargon.create( - content=jargon_keyword, - raw_content=json.dumps([raw_content], ensure_ascii=False), - chat_id=chat_id, - is_global=is_global_new, - count=1, - ) - logger.info(f"创建新jargon记录: {jargon_keyword}") - - except Exception as e: - logger.error(f"存储jargon失败: {e}") + return results \ No newline at end of file diff --git a/src/jargon/jargon_utils.py b/src/jargon/jargon_utils.py new file mode 100644 index 00000000..f17889f4 --- /dev/null +++ b/src/jargon/jargon_utils.py @@ -0,0 +1,199 @@ +import json +from typing import List, Dict, Optional, Any + +from src.common.logger import get_logger +from src.common.database.database_model import Jargon +from src.config.config import global_config +from src.chat.utils.chat_message_builder import ( + build_readable_messages, + build_readable_messages_with_id, +) +from src.chat.utils.utils import parse_platform_accounts + + +logger = get_logger("jargon") + +def parse_chat_id_list(chat_id_value: Any) -> List[List[Any]]: + """ + 解析chat_id字段,兼容旧格式(字符串)和新格式(JSON列表) + + Args: + chat_id_value: 可能是字符串(旧格式)或JSON字符串(新格式) + + Returns: + List[List[Any]]: 格式为 [[chat_id, count], ...] 
的列表 + """ + if not chat_id_value: + return [] + + # 如果是字符串,尝试解析为JSON + if isinstance(chat_id_value, str): + # 尝试解析JSON + try: + parsed = json.loads(chat_id_value) + if isinstance(parsed, list): + # 新格式:已经是列表 + return parsed + elif isinstance(parsed, str): + # 解析后还是字符串,说明是旧格式 + return [[parsed, 1]] + else: + # 其他类型,当作旧格式处理 + return [[str(chat_id_value), 1]] + except (json.JSONDecodeError, TypeError): + # 解析失败,当作旧格式(纯字符串) + return [[str(chat_id_value), 1]] + elif isinstance(chat_id_value, list): + # 已经是列表格式 + return chat_id_value + else: + # 其他类型,转换为旧格式 + return [[str(chat_id_value), 1]] + + +def update_chat_id_list(chat_id_list: List[List[Any]], target_chat_id: str, increment: int = 1) -> List[List[Any]]: + """ + 更新chat_id列表,如果target_chat_id已存在则增加计数,否则添加新条目 + + Args: + chat_id_list: 当前的chat_id列表,格式为 [[chat_id, count], ...] + target_chat_id: 要更新或添加的chat_id + increment: 增加的计数,默认为1 + + Returns: + List[List[Any]]: 更新后的chat_id列表 + """ + # 查找是否已存在该chat_id + found = False + for item in chat_id_list: + if isinstance(item, list) and len(item) >= 1 and str(item[0]) == str(target_chat_id): + # 找到匹配的chat_id,增加计数 + if len(item) >= 2: + item[1] = (item[1] if isinstance(item[1], (int, float)) else 0) + increment + else: + item.append(increment) + found = True + break + + if not found: + # 未找到,添加新条目 + chat_id_list.append([target_chat_id, increment]) + + return chat_id_list + + +def chat_id_list_contains(chat_id_list: List[List[Any]], target_chat_id: str) -> bool: + """ + 检查chat_id列表中是否包含指定的chat_id + + Args: + chat_id_list: chat_id列表,格式为 [[chat_id, count], ...] 
+ target_chat_id: 要查找的chat_id + + Returns: + bool: 如果包含则返回True + """ + for item in chat_id_list: + if isinstance(item, list) and len(item) >= 1 and str(item[0]) == str(target_chat_id): + return True + return False + + +def contains_bot_self_name(content: str) -> bool: + """ + 判断词条是否包含机器人的昵称或别名 + """ + if not content: + return False + + bot_config = getattr(global_config, "bot", None) + if not bot_config: + return False + + target = content.strip().lower() + nickname = str(getattr(bot_config, "nickname", "") or "").strip().lower() + alias_names = [str(alias or "").strip().lower() for alias in getattr(bot_config, "alias_names", []) or []] + + candidates = [name for name in [nickname, *alias_names] if name] + + return any(name in target for name in candidates if target) + + +def build_context_paragraph(messages: List[Any], center_index: int) -> Optional[str]: + """ + 构建包含中心消息上下文的段落(前3条+后3条),使用标准的 readable builder 输出 + """ + if not messages or center_index < 0 or center_index >= len(messages): + return None + + context_start = max(0, center_index - 3) + context_end = min(len(messages), center_index + 1 + 3) + context_messages = messages[context_start:context_end] + + if not context_messages: + return None + + try: + paragraph = build_readable_messages( + messages=context_messages, + replace_bot_name=True, + timestamp_mode="relative", + read_mark=0.0, + truncate=False, + show_actions=False, + show_pic=True, + message_id_list=None, + remove_emoji_stickers=False, + pic_single=True, + ) + except Exception as e: + logger.warning(f"构建上下文段落失败: {e}") + return None + + paragraph = paragraph.strip() + return paragraph or None + + +def is_bot_message(msg: Any) -> bool: + """判断消息是否来自机器人自身""" + if msg is None: + return False + + bot_config = getattr(global_config, "bot", None) + if not bot_config: + return False + + platform = ( + str(getattr(msg, "user_platform", "") or getattr(getattr(msg, "user_info", None), "platform", "") or "") + .strip() + .lower() + ) + user_id = ( + 
str(getattr(msg, "user_id", "") or getattr(getattr(msg, "user_info", None), "user_id", "") or "") + .strip() + ) + + if not platform or not user_id: + return False + + platform_accounts = {} + try: + platform_accounts = parse_platform_accounts(getattr(bot_config, "platforms", []) or []) + except Exception: + platform_accounts = {} + + bot_accounts: Dict[str, str] = {} + qq_account = str(getattr(bot_config, "qq_account", "") or "").strip() + if qq_account: + bot_accounts["qq"] = qq_account + + telegram_account = str(getattr(bot_config, "telegram_account", "") or "").strip() + if telegram_account: + bot_accounts["telegram"] = telegram_account + + for plat, account in platform_accounts.items(): + if account and plat not in bot_accounts: + bot_accounts[plat] = account + + bot_account = bot_accounts.get(platform) + return bool(bot_account and user_id == bot_account) \ No newline at end of file diff --git a/src/memory_system/memory_retrieval.py b/src/memory_system/memory_retrieval.py index f2625c83..ca02e9a5 100644 --- a/src/memory_system/memory_retrieval.py +++ b/src/memory_system/memory_retrieval.py @@ -8,11 +8,12 @@ from src.common.logger import get_logger from src.config.config import global_config, model_config from src.chat.utils.prompt_builder import Prompt, global_prompt_manager from src.plugin_system.apis import llm_api -from src.common.database.database_model import ThinkingBack +from src.common.database.database_model import ThinkingBack, Jargon from json_repair import repair_json from src.memory_system.retrieval_tools import get_tool_registry, init_all_tools from src.memory_system.retrieval_tools.query_lpmm_knowledge import query_lpmm_knowledge from src.llm_models.payload_content.message import MessageBuilder, RoleType, Message +from src.jargon.jargon_utils import parse_chat_id_list, chat_id_list_contains, contains_bot_self_name logger = get_logger("memory_retrieval") @@ -63,27 +64,23 @@ def init_memory_retrieval_prompt(): 2. 
def _match_jargon_from_text(chat_text: str, chat_id: str) -> List[str]:
    """
    Find known jargon terms that literally occur in a piece of chat text.

    Every ``Jargon`` row with a non-empty ``meaning`` is considered, ordered by
    ``count`` descending. When ``global_config.jargon.all_global`` is set, only
    rows flagged ``is_global`` are queried; otherwise a non-global row is kept
    only if its ``chat_id`` list contains ``chat_id``. Terms that contain the
    bot's own name are skipped. Matching is case-insensitive.

    Terms containing a CJK ideograph (U+4E00-U+9FFF) are matched as plain
    substrings. Other terms must stand alone: the characters next to the match
    may not be word characters, except that CJK ideographs count as separators,
    so a Latin term written directly next to Chinese text is still found.

    Args:
        chat_text: The chat text to scan.
        chat_id: The current chat id, used to filter non-global jargon.

    Returns:
        List[str]: The distinct matched jargon contents in query order; an empty
        list when ``chat_text`` is empty or blank.
    """
    if not chat_text or not chat_text.strip():
        return []

    start_time = time.time()

    all_global = global_config.jargon.all_global

    query = Jargon.select().where((Jargon.meaning.is_null(False)) & (Jargon.meaning != ""))
    if all_global:
        query = query.where(Jargon.is_global)
    query = query.order_by(Jargon.count.desc())

    # The query is only executed when iterated; fetch the rows here so that the
    # "query" timing below covers the database round trip rather than just the
    # construction of the query object.
    jargon_rows = list(query)

    query_time = time.time()

    cjk_pattern = re.compile(r"[\u4e00-\u9fff]")
    # "[^\W\u4e00-\u9fff]" is "a word character that is not a CJK ideograph".
    # A plain \b is Unicode-aware and treats ideographs as word characters, so
    # it would never match "yyds" inside "这个yyds真的"; it also fails for terms
    # that begin or end with a non-word character such as "c++".
    left_guard = r"(?<![^\W\u4e00-\u9fff])"
    right_guard = r"(?![^\W\u4e00-\u9fff])"

    # A dict is used as an insertion-ordered set.
    matched: Dict[str, None] = {}

    for jargon in jargon_rows:
        content = (jargon.content or "").strip()
        if not content or content in matched:
            continue

        if contains_bot_self_name(content):
            continue

        if not all_global and not jargon.is_global:
            chat_id_list = parse_chat_id_list(jargon.chat_id)
            if not chat_id_list_contains(chat_id_list, chat_id):
                continue

        escaped = re.escape(content)
        if cjk_pattern.search(content):
            search_pattern = escaped
        else:
            search_pattern = left_guard + escaped + right_guard

        if re.search(search_pattern, chat_text, re.IGNORECASE):
            matched[content] = None

    end_time = time.time()
    logger.info(
        f"记忆检索黑话匹配: 查询耗时 {(query_time - start_time):.3f}s, "
        f"匹配耗时 {(end_time - query_time):.3f}s, 总耗时 {(end_time - start_time):.3f}s, "
        f"匹配到 {len(matched)} 个黑话"
    )

    return list(matched.keys())