feat: higher jargon extraction rate; improved extraction accuracy

Jargon explanation now runs as an independent step, with a higher extraction rate.
Extraction accuracy has been improved.
pull/1385/head
SengokuCola 2025-11-25 19:19:52 +08:00
parent 39ab2b5fab
commit 644d470558
8 changed files with 609 additions and 340 deletions
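For orientation before the diffs, here is a minimal, self-contained sketch (stub names and bodies, not the project's actual modules) of how the change is wired: explain_jargon_in_context now runs as one more coroutine in the replyer's asyncio.gather batch, and its optional result is injected into the reply prompt as jargon_explanation.

```python
import asyncio
from typing import Any, Dict, List, Optional

# Stub builders standing in for the real prompt-building coroutines.
async def build_memory_retrieval(chat_context: str) -> str:
    return "memory block"

async def explain_jargon_in_context(chat_id: str, messages: List[Any], chat_context: str) -> Optional[str]:
    # The real implementation matches known jargon terms in the recent messages,
    # looks up their stored meanings and summarizes them; it returns None when nothing matches.
    return None

async def build_reply_prompt(chat_id: str, messages: List[Any], chat_context: str) -> Dict[str, str]:
    memory, jargon = await asyncio.gather(
        build_memory_retrieval(chat_context),
        explain_jargon_in_context(chat_id, messages, chat_context),
    )
    # The jargon block is optional: it only appears in the prompt when jargon was found.
    return {"memory_retrieval": memory, "jargon_explanation": jargon or ""}

if __name__ == "__main__":
    print(asyncio.run(build_reply_prompt("chat-1", [], "...recent chat...")))
```

Because the result is Optional, the prompt placeholder simply stays empty when no known jargon appears in the recent messages.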

View File

@ -36,6 +36,7 @@ from src.chat.replyer.prompt.lpmm_prompt import init_lpmm_prompt
from src.chat.replyer.prompt.replyer_prompt import init_replyer_prompt
from src.chat.replyer.prompt.rewrite_prompt import init_rewrite_prompt
from src.memory_system.memory_retrieval import init_memory_retrieval_prompt, build_memory_retrieval_prompt
from src.jargon.jargon_explainer import explain_jargon_in_context
init_lpmm_prompt()
init_replyer_prompt()
@ -786,7 +787,7 @@ class DefaultReplyer:
show_actions=True,
)
# 并行执行七个构建任务
# 并行执行八个构建任务(包括黑话解释)
task_results = await asyncio.gather(
self._time_and_run_task(
self.build_expression_habits(chat_talking_prompt_short, target, reply_reason), "expression_habits"
@ -804,6 +805,10 @@ class DefaultReplyer:
),
"memory_retrieval",
),
self._time_and_run_task(
explain_jargon_in_context(chat_id, message_list_before_short, chat_talking_prompt_short),
"jargon_explanation",
),
)
# 任务名称中英文映射
@ -816,6 +821,7 @@ class DefaultReplyer:
"personality_prompt": "人格信息", "personality_prompt": "人格信息",
"mood_state_prompt": "情绪状态", "mood_state_prompt": "情绪状态",
"memory_retrieval": "记忆检索", "memory_retrieval": "记忆检索",
"jargon_explanation": "黑话解释",
}
# 处理结果
@ -846,6 +852,7 @@ class DefaultReplyer:
memory_retrieval: str = results_dict["memory_retrieval"]
keywords_reaction_prompt = await self.build_keywords_reaction_prompt(target)
mood_state_prompt: str = results_dict["mood_state_prompt"]
jargon_explanation: Optional[str] = results_dict.get("jargon_explanation")
# 从 chosen_actions 中提取 planner 的整体思考理由
planner_reasoning = ""
@ -896,6 +903,7 @@ class DefaultReplyer:
mood_state=mood_state_prompt,
# relation_info_block=relation_info,
extra_info_block=extra_info_block,
jargon_explanation=jargon_explanation,
identity=personality_prompt,
action_descriptions=actions_info,
sender_name=sender,

View File

@ -37,6 +37,7 @@ from src.chat.replyer.prompt.lpmm_prompt import init_lpmm_prompt
from src.chat.replyer.prompt.replyer_prompt import init_replyer_prompt
from src.chat.replyer.prompt.rewrite_prompt import init_rewrite_prompt
from src.memory_system.memory_retrieval import init_memory_retrieval_prompt, build_memory_retrieval_prompt
from src.jargon.jargon_explainer import explain_jargon_in_context
init_lpmm_prompt()
init_replyer_prompt()
@ -706,7 +707,7 @@ class PrivateReplyer:
show_actions=True,
)
# 并行执行八个构建任务
# 并行执行九个构建任务(包括黑话解释)
task_results = await asyncio.gather(
self._time_and_run_task(
self.build_expression_habits(chat_talking_prompt_short, target, reply_reason), "expression_habits"
@ -725,6 +726,10 @@ class PrivateReplyer:
),
"memory_retrieval",
),
self._time_and_run_task(
explain_jargon_in_context(chat_id, message_list_before_short, chat_talking_prompt_short),
"jargon_explanation",
),
)
# 任务名称中英文映射
@ -737,6 +742,7 @@ class PrivateReplyer:
"personality_prompt": "人格信息", "personality_prompt": "人格信息",
"mood_state_prompt": "情绪状态", "mood_state_prompt": "情绪状态",
"memory_retrieval": "记忆检索", "memory_retrieval": "记忆检索",
"jargon_explanation": "黑话解释",
}
# 处理结果
@ -767,6 +773,7 @@ class PrivateReplyer:
mood_state_prompt: str = results_dict["mood_state_prompt"]
memory_retrieval: str = results_dict["memory_retrieval"]
keywords_reaction_prompt = await self.build_keywords_reaction_prompt(target)
jargon_explanation: Optional[str] = results_dict.get("jargon_explanation")
# 从 chosen_actions 中提取 planner 的整体思考理由
planner_reasoning = ""
@ -813,6 +820,7 @@ class PrivateReplyer:
identity=personality_prompt,
action_descriptions=actions_info,
dialogue_prompt=dialogue_prompt,
jargon_explanation=jargon_explanation,
time_block=time_block,
target=target,
reason=reply_reason,
@ -835,6 +843,7 @@ class PrivateReplyer:
identity=personality_prompt,
action_descriptions=actions_info,
dialogue_prompt=dialogue_prompt,
jargon_explanation=jargon_explanation,
time_block=time_block,
reply_target_block=reply_target_block,
reply_style=global_config.personality.reply_style,

View File

@ -8,7 +8,7 @@ def init_replyer_prompt():
Prompt(
"""{knowledge_prompt}{tool_info_block}{extra_info_block}
{expression_habits_block}{memory_retrieval}
{expression_habits_block}{memory_retrieval}{jargon_explanation}
你正在qq群里聊天下面是群里正在聊的内容其中包含聊天记录和聊天中的图片
其中标注 {bot_name}() 的发言是你自己的发言请注意区分:
@ -29,7 +29,7 @@ def init_replyer_prompt():
Prompt(
"""{knowledge_prompt}{tool_info_block}{extra_info_block}
{expression_habits_block}{memory_retrieval}
{expression_habits_block}{memory_retrieval}{jargon_explanation}
你正在和{sender_name}聊天这是你们之前聊的内容:
{time_block}
@ -48,7 +48,7 @@ def init_replyer_prompt():
Prompt(
"""{knowledge_prompt}{tool_info_block}{extra_info_block}
{expression_habits_block}{memory_retrieval}
{expression_habits_block}{memory_retrieval}{jargon_explanation}
你正在和{sender_name}聊天这是你们之前聊的内容:
{time_block}

View File

@ -0,0 +1,261 @@
import re
import time
from typing import List, Dict, Optional, Any
from src.common.logger import get_logger
from src.common.database.database_model import Jargon
from src.llm_models.utils_model import LLMRequest
from src.config.config import model_config, global_config
from src.chat.utils.prompt_builder import Prompt, global_prompt_manager
from src.jargon.jargon_miner import search_jargon
from src.jargon.jargon_utils import is_bot_message, contains_bot_self_name, parse_chat_id_list, chat_id_list_contains
logger = get_logger("jargon")
def _init_explainer_prompts() -> None:
"""初始化黑话解释器相关的prompt"""
# Prompt概括黑话解释结果
summarize_prompt_str = """
**上下文聊天内容**
{chat_context}
**提取到的黑话及其含义**
{jargon_explanations}
请根据上述信息对黑话解释进行概括和整理
- 如果上下文中有黑话出现请简要说明这些黑话在上下文中的使用情况
- 将黑话解释整理成简洁易读的格式
- 如果某个黑话在上下文中没有出现可以省略
- 输出格式要自然适合作为回复参考信息
请输出概括后的黑话解释直接输出文本不要使用JSON格式
"""
Prompt(summarize_prompt_str, "jargon_explainer_summarize_prompt")
_init_explainer_prompts()
class JargonExplainer:
"""黑话解释器,用于在回复前识别和解释上下文中的黑话"""
def __init__(self, chat_id: str) -> None:
self.chat_id = chat_id
self.llm = LLMRequest(
model_set=model_config.model_task_config.utils,
request_type="jargon.explain",
)
def match_jargon_from_messages(
self, messages: List[Any]
) -> List[Dict[str, str]]:
"""
通过直接匹配数据库中的jargon字符串来提取黑话
Args:
messages: 消息列表
Returns:
List[Dict[str, str]]: 提取到的黑话列表每个元素包含content
"""
start_time = time.time()
if not messages:
return []
# 收集所有消息的文本内容
message_texts: List[str] = []
for msg in messages:
# 跳过机器人自己的消息
if is_bot_message(msg):
continue
msg_text = (getattr(msg, "display_message", None) or getattr(msg, "processed_plain_text", None) or "").strip()
if msg_text:
message_texts.append(msg_text)
if not message_texts:
return []
# 合并所有消息文本
combined_text = " ".join(message_texts)
# 查询所有有meaning的jargon记录
query = Jargon.select().where(
(Jargon.meaning.is_null(False)) & (Jargon.meaning != "")
)
# 根据all_global配置决定查询逻辑
if global_config.jargon.all_global:
# 开启all_global只查询is_global=True的记录
query = query.where(Jargon.is_global)
else:
# 关闭all_global查询is_global=True或chat_id列表包含当前chat_id的记录
# 这里先查询所有然后在Python层面过滤
pass
# 按count降序排序优先匹配出现频率高的
query = query.order_by(Jargon.count.desc())
# 执行查询并匹配
matched_jargon: Dict[str, Dict[str, str]] = {}
query_time = time.time()
for jargon in query:
content = jargon.content or ""
if not content or not content.strip():
continue
# 跳过包含机器人昵称的词条
if contains_bot_self_name(content):
continue
# 检查chat_id如果all_global=False
if not global_config.jargon.all_global:
if jargon.is_global:
# 全局黑话,包含
pass
else:
# 检查chat_id列表是否包含当前chat_id
chat_id_list = parse_chat_id_list(jargon.chat_id)
if not chat_id_list_contains(chat_id_list, self.chat_id):
continue
# 在文本中查找匹配(大小写不敏感)
pattern = re.escape(content)
# 使用单词边界或中文字符边界来匹配,避免部分匹配
# 对于中文使用Unicode字符类对于英文使用单词边界
if re.search(r'[\u4e00-\u9fff]', content):
# 包含中文,使用更宽松的匹配
search_pattern = pattern
else:
# 纯英文/数字,使用单词边界
search_pattern = r'\b' + pattern + r'\b'
if re.search(search_pattern, combined_text, re.IGNORECASE):
# 找到匹配,记录(去重)
if content not in matched_jargon:
matched_jargon[content] = {"content": content}
match_time = time.time()
total_time = match_time - start_time
query_duration = query_time - start_time
match_duration = match_time - query_time
logger.info(
f"黑话匹配完成: 查询耗时 {query_duration:.3f}s, 匹配耗时 {match_duration:.3f}s, "
f"总耗时 {total_time:.3f}s, 匹配到 {len(matched_jargon)} 个黑话"
)
return list(matched_jargon.values())
async def explain_jargon(
self, messages: List[Any], chat_context: str
) -> Optional[str]:
"""
解释上下文中的黑话
Args:
messages: 消息列表
chat_context: 聊天上下文的文本表示
Returns:
Optional[str]: 黑话解释的概括文本如果没有黑话则返回None
"""
if not messages:
return None
# 直接匹配方式从数据库中查询jargon并在消息中匹配
jargon_entries = self.match_jargon_from_messages(messages)
if not jargon_entries:
return None
# 去重按content
unique_jargon: Dict[str, Dict[str, str]] = {}
for entry in jargon_entries:
content = entry["content"]
if content not in unique_jargon:
unique_jargon[content] = entry
jargon_list = list(unique_jargon.values())
logger.info(f"从上下文中提取到 {len(jargon_list)} 个黑话: {[j['content'] for j in jargon_list]}")
# 查询每个黑话的含义
jargon_explanations: List[str] = []
for entry in jargon_list:
content = entry["content"]
# 根据是否开启全局黑话,决定查询方式
if global_config.jargon.all_global:
# 开启全局黑话查询所有is_global=True的记录
results = search_jargon(
keyword=content,
chat_id=None, # 不指定chat_id查询全局黑话
limit=1,
case_sensitive=False,
fuzzy=False, # 精确匹配
)
else:
# 关闭全局黑话:优先查询当前聊天或全局的黑话
results = search_jargon(
keyword=content,
chat_id=self.chat_id,
limit=1,
case_sensitive=False,
fuzzy=False, # 精确匹配
)
if results and len(results) > 0:
meaning = results[0].get("meaning", "").strip()
if meaning:
jargon_explanations.append(f"- {content}: {meaning}")
else:
logger.info(f"黑话 {content} 没有找到含义")
else:
logger.info(f"黑话 {content} 未在数据库中找到")
if not jargon_explanations:
logger.info("没有找到任何黑话的含义,跳过解释")
return None
# 拼接所有黑话解释
explanations_text = "\n".join(jargon_explanations)
# 使用LLM概括黑话解释
summarize_prompt = await global_prompt_manager.format_prompt(
"jargon_explainer_summarize_prompt",
chat_context=chat_context,
jargon_explanations=explanations_text,
)
summary, _ = await self.llm.generate_response_async(summarize_prompt, temperature=0.3)
if not summary:
# 如果LLM概括失败直接返回原始解释
return f"上下文中的黑话解释:\n{explanations_text}"
summary = summary.strip()
if not summary:
return f"上下文中的黑话解释:\n{explanations_text}"
return summary
async def explain_jargon_in_context(
chat_id: str, messages: List[Any], chat_context: str
) -> Optional[str]:
"""
解释上下文中的黑话便捷函数
Args:
chat_id: 聊天ID
messages: 消息列表
chat_context: 聊天上下文的文本表示
Returns:
Optional[str]: 黑话解释的概括文本如果没有黑话则返回None
"""
explainer = JargonExplainer(chat_id)
return await explainer.explain_jargon(messages, chat_context)
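A small standalone illustration (not part of the committed file) of the matching rule match_jargon_from_messages uses above: terms containing CJK characters are matched as plain substrings, while ASCII-only terms get \b word boundaries so short English terms are not matched inside longer words.

```python
import re

def term_matches(term: str, text: str) -> bool:
    pattern = re.escape(term)
    if re.search(r"[\u4e00-\u9fff]", term):
        search_pattern = pattern                    # CJK term: substring match
    else:
        search_pattern = r"\b" + pattern + r"\b"    # ASCII term: whole-word match
    return re.search(search_pattern, text, re.IGNORECASE) is not None

print(term_matches("破防", "他这下彻底破防了"))           # True: CJK term, substring match
print(term_matches("gg", "gg, 下一把"))                    # True: whole word
print(term_matches("gg", "please read the suggestion"))    # False: no word boundary inside "suggestion"
```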

View File

@ -11,127 +11,24 @@ from src.common.database.database_model import Jargon
from src.llm_models.utils_model import LLMRequest
from src.config.config import model_config, global_config
from src.chat.message_receive.chat_stream import get_chat_manager
from src.plugin_system.apis import llm_api
from src.chat.utils.chat_message_builder import (
build_readable_messages,
build_readable_messages_with_id,
get_raw_msg_by_timestamp_with_chat_inclusive,
get_raw_msg_before_timestamp_with_chat,
build_readable_messages_with_list,
)
from src.chat.utils.prompt_builder import Prompt, global_prompt_manager
from src.chat.utils.utils import parse_platform_accounts
from src.jargon.jargon_utils import (
is_bot_message,
build_context_paragraph,
contains_bot_self_name,
parse_chat_id_list,
chat_id_list_contains,
update_chat_id_list
)
logger = get_logger("jargon")
def _contains_bot_self_name(content: str) -> bool:
"""
判断词条是否包含机器人的昵称或别名
"""
if not content:
return False
bot_config = getattr(global_config, "bot", None)
if not bot_config:
return False
target = content.strip().lower()
nickname = str(getattr(bot_config, "nickname", "") or "").strip().lower()
alias_names = [str(alias or "").strip().lower() for alias in getattr(bot_config, "alias_names", []) or []]
candidates = [name for name in [nickname, *alias_names] if name]
return any(name in target for name in candidates if target)
def _build_context_paragraph(messages: List[Any], center_index: int) -> Optional[str]:
"""
构建包含中心消息上下文的段落前3条+后3条使用标准的 readable builder 输出
"""
if not messages or center_index < 0 or center_index >= len(messages):
return None
context_start = max(0, center_index - 3)
context_end = min(len(messages), center_index + 1 + 3)
context_messages = messages[context_start:context_end]
if not context_messages:
return None
try:
paragraph = build_readable_messages(
messages=context_messages,
replace_bot_name=True,
timestamp_mode="relative",
read_mark=0.0,
truncate=False,
show_actions=False,
show_pic=True,
message_id_list=None,
remove_emoji_stickers=False,
pic_single=True,
)
except Exception as e:
logger.warning(f"构建上下文段落失败: {e}")
return None
paragraph = paragraph.strip()
return paragraph or None
def _is_bot_message(msg: Any) -> bool:
"""判断消息是否来自机器人自身"""
if msg is None:
return False
bot_config = getattr(global_config, "bot", None)
if not bot_config:
return False
platform = (
str(getattr(msg, "user_platform", "") or getattr(getattr(msg, "user_info", None), "platform", "") or "")
.strip()
.lower()
)
user_id = (
str(getattr(msg, "user_id", "") or getattr(getattr(msg, "user_info", None), "user_id", "") or "")
.strip()
)
if not platform or not user_id:
return False
platform_accounts = {}
try:
platform_accounts = parse_platform_accounts(getattr(bot_config, "platforms", []) or [])
except Exception:
platform_accounts = {}
bot_accounts: Dict[str, str] = {}
qq_account = str(getattr(bot_config, "qq_account", "") or "").strip()
if qq_account:
bot_accounts["qq"] = qq_account
telegram_account = str(getattr(bot_config, "telegram_account", "") or "").strip()
if telegram_account:
bot_accounts["telegram"] = telegram_account
for plat, account in platform_accounts.items():
if account and plat not in bot_accounts:
bot_accounts[plat] = account
bot_account = bot_accounts.get(platform)
return bool(bot_account and user_id == bot_account)
def _has_adjacent_bot_message(messages: List[Any], center_index: int) -> bool:
"""检查目标消息的上一条或下一条是否为机器人发言"""
for neighbor in (center_index - 1, center_index + 1):
if 0 <= neighbor < len(messages) and _is_bot_message(messages[neighbor]):
return True
return False
def _init_prompt() -> None:
@ -176,6 +73,7 @@ def _init_inference_prompts() -> None:
请根据上下文推断"{content}"这个词条的含义
- 如果这是一个黑话俚语或网络用语请推断其含义
- 如果含义明确常规词汇也请说明
- {bot_name} 的发言内容可能包含错误请不要参考其发言内容
- 如果上下文信息不足无法推断含义请设置 no_info true
JSON 格式输出
@ -228,94 +126,6 @@ _init_prompt()
_init_inference_prompts()
async def _enrich_raw_content_if_needed(
content: str,
raw_content_list: List[str],
chat_id: str,
messages: List[Any],
extraction_start_time: float,
extraction_end_time: float,
) -> List[str]:
"""
检查raw_content是否只包含黑话本身如果是则获取该消息的前三条消息作为原始内容
Args:
content: 黑话内容
raw_content_list: 原始raw_content列表
chat_id: 聊天ID
messages: 当前时间窗口内的消息列表
extraction_start_time: 提取开始时间
extraction_end_time: 提取结束时间
Returns:
处理后的raw_content列表
"""
enriched_list = []
for raw_content in raw_content_list:
# 检查raw_content是否只包含黑话本身去除空白字符后比较
raw_content_clean = raw_content.strip()
content_clean = content.strip()
# 如果raw_content只包含黑话本身可能有一些标点或空白则尝试获取上下文
# 去除所有空白字符后比较,确保只包含黑话本身
raw_content_normalized = raw_content_clean.replace(" ", "").replace("\n", "").replace("\t", "")
content_normalized = content_clean.replace(" ", "").replace("\n", "").replace("\t", "")
if raw_content_normalized == content_normalized:
# 在消息列表中查找只包含该黑话的消息(去除空白后比较)
target_message = None
for msg in messages:
msg_content = (msg.processed_plain_text or msg.display_message or "").strip()
msg_content_normalized = msg_content.replace(" ", "").replace("\n", "").replace("\t", "")
# 检查消息内容是否只包含黑话本身(去除空白后完全匹配)
if msg_content_normalized == content_normalized:
target_message = msg
break
if target_message and target_message.time:
# 获取该消息的前三条消息
try:
previous_messages = get_raw_msg_before_timestamp_with_chat(
chat_id=chat_id, timestamp=target_message.time, limit=3
)
if previous_messages:
# 将前三条消息和当前消息一起格式化
context_messages = previous_messages + [target_message]
# 按时间排序
context_messages.sort(key=lambda x: x.time or 0)
# 格式化为可读消息
formatted_context, _ = await build_readable_messages_with_list(
context_messages,
replace_bot_name=True,
timestamp_mode="relative",
truncate=False,
)
if formatted_context.strip():
enriched_list.append(formatted_context.strip())
logger.warning(f"为黑话 {content} 补充了上下文消息")
else:
# 如果格式化失败使用原始raw_content
enriched_list.append(raw_content)
else:
# 没有找到前三条消息使用原始raw_content
enriched_list.append(raw_content)
except Exception as e:
logger.warning(f"获取黑话 {content} 的上下文消息失败: {e}")
# 出错时使用原始raw_content
enriched_list.append(raw_content)
else:
# 没有找到包含黑话的消息使用原始raw_content
enriched_list.append(raw_content)
else:
# raw_content包含更多内容直接使用
enriched_list.append(raw_content)
return enriched_list
def _should_infer_meaning(jargon_obj: Jargon) -> bool:
"""
@ -402,7 +212,7 @@ class JargonMiner:
for idx, msg in enumerate(messages):
msg_text = (getattr(msg, "display_message", None) or getattr(msg, "processed_plain_text", None) or "").strip()
if not msg_text or _is_bot_message(msg):
if not msg_text or is_bot_message(msg):
continue
for content in self.cache.keys():
@ -411,9 +221,7 @@ class JargonMiner:
if (content, idx) in processed_pairs:
continue
if content in msg_text:
if _has_adjacent_bot_message(messages, idx):
continue
paragraph = _build_context_paragraph(messages, idx)
paragraph = build_context_paragraph(messages, idx)
if not paragraph:
continue
cached_entries.append({"content": content, "raw_content": [paragraph]})
@ -719,7 +527,7 @@ class JargonMiner:
if not content:
continue
if _contains_bot_self_name(content):
if contains_bot_self_name(content):
logger.info(f"解析阶段跳过包含机器人昵称/别名的词条: {content}")
continue
@ -734,16 +542,11 @@ class JargonMiner:
continue
target_msg = messages[msg_index]
if _is_bot_message(target_msg):
if is_bot_message(target_msg):
logger.info(f"解析阶段跳过引用机器人自身消息的词条: content={content}, msg_id={msg_id_str}")
continue
if _has_adjacent_bot_message(messages, msg_index):
logger.info(
f"解析阶段跳过因邻近机器人发言的词条: content={content}, msg_id={msg_id_str}"
)
continue
context_paragraph = _build_context_paragraph(messages, msg_index)
context_paragraph = build_context_paragraph(messages, msg_index)
if not context_paragraph:
logger.warning(f"解析jargon失败上下文为空content={content}, msg_id={msg_id_str}")
continue
@ -785,27 +588,27 @@ class JargonMiner:
content = entry["content"]
raw_content_list = entry["raw_content"] # 已经是列表
# 检查并补充raw_content如果只包含黑话本身则获取前三条消息作为上下文
# raw_content_list = await _enrich_raw_content_if_needed(
# content=content,
# raw_content_list=raw_content_list,
# chat_id=self.chat_id,
# messages=messages,
# extraction_start_time=extraction_start_time,
# extraction_end_time=extraction_end_time,
# )
try:
# 根据all_global配置决定查询逻辑
if global_config.jargon.all_global:
# 开启all_global无视chat_id查询所有content匹配的记录所有记录都是全局的
query = Jargon.select().where(Jargon.content == content)
else:
# 关闭all_global只查询chat_id匹配的记录不考虑is_global
query = Jargon.select().where((Jargon.chat_id == self.chat_id) & (Jargon.content == content))
if query.exists():
obj = query.get()
# 查询所有content匹配的记录
query = Jargon.select().where(Jargon.content == content)
# 查找匹配的记录
matched_obj = None
for obj in query:
if global_config.jargon.all_global:
# 开启all_global所有content匹配的记录都可以
matched_obj = obj
break
else:
# 关闭all_global需要检查chat_id列表是否包含目标chat_id
chat_id_list = parse_chat_id_list(obj.chat_id)
if chat_id_list_contains(chat_id_list, self.chat_id):
matched_obj = obj
break
if matched_obj:
obj = matched_obj
try:
obj.count = (obj.count or 0) + 1
except Exception:
@ -827,6 +630,11 @@ class JargonMiner:
merged_list = list(dict.fromkeys(existing_raw_content + raw_content_list))
obj.raw_content = json.dumps(merged_list, ensure_ascii=False)
# 更新chat_id列表增加当前chat_id的计数
chat_id_list = parse_chat_id_list(obj.chat_id)
updated_chat_id_list = update_chat_id_list(chat_id_list, self.chat_id, increment=1)
obj.chat_id = json.dumps(updated_chat_id_list, ensure_ascii=False)
# 开启all_global时确保记录标记为is_global=True
if global_config.jargon.all_global:
obj.is_global = True
@ -851,10 +659,14 @@ class JargonMiner:
# 关闭all_global新记录is_global=False
is_global_new = False
# 使用新格式创建chat_id列表[[chat_id, count]]
chat_id_list = [[self.chat_id, 1]]
chat_id_json = json.dumps(chat_id_list, ensure_ascii=False)
Jargon.create(
content=content,
raw_content=json.dumps(raw_content_list, ensure_ascii=False),
chat_id=self.chat_id,
chat_id=chat_id_json,
is_global=is_global_new,
count=1,
)
@ -924,8 +736,8 @@ def search_jargon(
keyword = keyword.strip()
# 构建查询
query = Jargon.select(Jargon.content, Jargon.meaning)
# 构建查询(选择所有需要的字段,以便后续过滤)
query = Jargon.select()
# 构建搜索条件
if case_sensitive:
@ -951,102 +763,34 @@ def search_jargon(
if global_config.jargon.all_global:
# 开启all_global所有记录都是全局的查询所有is_global=True的记录无视chat_id
query = query.where(Jargon.is_global)
else:
# 关闭all_global如果提供了chat_id优先搜索该聊天或global的jargon
if chat_id:
query = query.where((Jargon.chat_id == chat_id) | Jargon.is_global)
# 只返回有meaning的记录
query = query.where((Jargon.meaning.is_null(False)) & (Jargon.meaning != ""))
# 注意对于all_global=False的情况chat_id过滤在Python层面进行以便兼容新旧格式
# 注意meaning的过滤移到Python层面因为我们需要先过滤chat_id
# 按count降序排序优先返回出现频率高的
query = query.order_by(Jargon.count.desc())
# 限制结果数量
query = query.limit(limit)
# 执行查询并返回结果
# 限制结果数量(先多取一些,因为后面可能过滤)
query = query.limit(limit * 2)
# 执行查询并返回结果过滤chat_id
results = []
for jargon in query:
# 如果提供了chat_id且all_global=False需要检查chat_id列表是否包含目标chat_id
if chat_id and not global_config.jargon.all_global:
chat_id_list = parse_chat_id_list(jargon.chat_id)
# 如果记录是is_global=True或者chat_id列表包含目标chat_id则包含
if not jargon.is_global and not chat_id_list_contains(chat_id_list, chat_id):
continue
# 只返回有meaning的记录
if not jargon.meaning or jargon.meaning.strip() == "":
continue
results.append({"content": jargon.content or "", "meaning": jargon.meaning or ""})
# 达到限制数量后停止
if len(results) >= limit:
break
return results
async def store_jargon_from_answer(jargon_keyword: str, answer: str, chat_id: str) -> None:
"""将黑话存入jargon系统
Args:
jargon_keyword: 黑话关键词
answer: 答案内容将概括为raw_content
chat_id: 聊天ID
"""
try:
# 概括答案为简短的raw_content
summary_prompt = f"""请将以下答案概括为一句简短的话不超过50字作为黑话"{jargon_keyword}"的使用示例:
答案{answer}
只输出概括后的内容不要输出其他内容"""
success, summary, _, _ = await llm_api.generate_with_model(
summary_prompt,
model_config=model_config.model_task_config.utils_small,
request_type="memory.summarize_jargon",
)
logger.info(f"概括答案提示: {summary_prompt}")
logger.info(f"概括答案: {summary}")
if not success:
logger.warning(f"概括答案失败,使用原始答案: {summary}")
summary = answer[:100] # 截取前100字符作为备用
raw_content = summary.strip()[:200] # 限制长度
# 检查是否已存在
if global_config.jargon.all_global:
query = Jargon.select().where(Jargon.content == jargon_keyword)
else:
query = Jargon.select().where((Jargon.chat_id == chat_id) & (Jargon.content == jargon_keyword))
if query.exists():
# 更新现有记录
obj = query.get()
obj.count = (obj.count or 0) + 1
# 合并raw_content列表
existing_raw_content = []
if obj.raw_content:
try:
existing_raw_content = (
json.loads(obj.raw_content) if isinstance(obj.raw_content, str) else obj.raw_content
)
if not isinstance(existing_raw_content, list):
existing_raw_content = [existing_raw_content] if existing_raw_content else []
except (json.JSONDecodeError, TypeError):
existing_raw_content = [obj.raw_content] if obj.raw_content else []
# 合并并去重
merged_list = list(dict.fromkeys(existing_raw_content + [raw_content]))
obj.raw_content = json.dumps(merged_list, ensure_ascii=False)
if global_config.jargon.all_global:
obj.is_global = True
obj.save()
logger.info(f"更新jargon记录: {jargon_keyword}")
else:
# 创建新记录
is_global_new = True if global_config.jargon.all_global else False
Jargon.create(
content=jargon_keyword,
raw_content=json.dumps([raw_content], ensure_ascii=False),
chat_id=chat_id,
is_global=is_global_new,
count=1,
)
logger.info(f"创建新jargon记录: {jargon_keyword}")
except Exception as e:
logger.error(f"存储jargon失败: {e}")

View File

@ -0,0 +1,199 @@
import json
from typing import List, Dict, Optional, Any
from src.common.logger import get_logger
from src.common.database.database_model import Jargon
from src.config.config import global_config
from src.chat.utils.chat_message_builder import (
build_readable_messages,
build_readable_messages_with_id,
)
from src.chat.utils.utils import parse_platform_accounts
logger = get_logger("jargon")
def parse_chat_id_list(chat_id_value: Any) -> List[List[Any]]:
"""
解析chat_id字段兼容旧格式字符串和新格式JSON列表
Args:
chat_id_value: 可能是字符串旧格式或JSON字符串新格式
Returns:
List[List[Any]]: 格式为 [[chat_id, count], ...] 的列表
"""
if not chat_id_value:
return []
# 如果是字符串尝试解析为JSON
if isinstance(chat_id_value, str):
# 尝试解析JSON
try:
parsed = json.loads(chat_id_value)
if isinstance(parsed, list):
# 新格式:已经是列表
return parsed
elif isinstance(parsed, str):
# 解析后还是字符串,说明是旧格式
return [[parsed, 1]]
else:
# 其他类型,当作旧格式处理
return [[str(chat_id_value), 1]]
except (json.JSONDecodeError, TypeError):
# 解析失败,当作旧格式(纯字符串)
return [[str(chat_id_value), 1]]
elif isinstance(chat_id_value, list):
# 已经是列表格式
return chat_id_value
else:
# 其他类型,转换为旧格式
return [[str(chat_id_value), 1]]
def update_chat_id_list(chat_id_list: List[List[Any]], target_chat_id: str, increment: int = 1) -> List[List[Any]]:
"""
更新chat_id列表如果target_chat_id已存在则增加计数否则添加新条目
Args:
chat_id_list: 当前的chat_id列表格式为 [[chat_id, count], ...]
target_chat_id: 要更新或添加的chat_id
increment: 增加的计数默认为1
Returns:
List[List[Any]]: 更新后的chat_id列表
"""
# 查找是否已存在该chat_id
found = False
for item in chat_id_list:
if isinstance(item, list) and len(item) >= 1 and str(item[0]) == str(target_chat_id):
# 找到匹配的chat_id增加计数
if len(item) >= 2:
item[1] = (item[1] if isinstance(item[1], (int, float)) else 0) + increment
else:
item.append(increment)
found = True
break
if not found:
# 未找到,添加新条目
chat_id_list.append([target_chat_id, increment])
return chat_id_list
def chat_id_list_contains(chat_id_list: List[List[Any]], target_chat_id: str) -> bool:
"""
检查chat_id列表中是否包含指定的chat_id
Args:
chat_id_list: chat_id列表格式为 [[chat_id, count], ...]
target_chat_id: 要查找的chat_id
Returns:
bool: 如果包含则返回True
"""
for item in chat_id_list:
if isinstance(item, list) and len(item) >= 1 and str(item[0]) == str(target_chat_id):
return True
return False
def contains_bot_self_name(content: str) -> bool:
"""
判断词条是否包含机器人的昵称或别名
"""
if not content:
return False
bot_config = getattr(global_config, "bot", None)
if not bot_config:
return False
target = content.strip().lower()
nickname = str(getattr(bot_config, "nickname", "") or "").strip().lower()
alias_names = [str(alias or "").strip().lower() for alias in getattr(bot_config, "alias_names", []) or []]
candidates = [name for name in [nickname, *alias_names] if name]
return any(name in target for name in candidates if target)
def build_context_paragraph(messages: List[Any], center_index: int) -> Optional[str]:
"""
构建包含中心消息上下文的段落前3条+后3条使用标准的 readable builder 输出
"""
if not messages or center_index < 0 or center_index >= len(messages):
return None
context_start = max(0, center_index - 3)
context_end = min(len(messages), center_index + 1 + 3)
context_messages = messages[context_start:context_end]
if not context_messages:
return None
try:
paragraph = build_readable_messages(
messages=context_messages,
replace_bot_name=True,
timestamp_mode="relative",
read_mark=0.0,
truncate=False,
show_actions=False,
show_pic=True,
message_id_list=None,
remove_emoji_stickers=False,
pic_single=True,
)
except Exception as e:
logger.warning(f"构建上下文段落失败: {e}")
return None
paragraph = paragraph.strip()
return paragraph or None
def is_bot_message(msg: Any) -> bool:
"""判断消息是否来自机器人自身"""
if msg is None:
return False
bot_config = getattr(global_config, "bot", None)
if not bot_config:
return False
platform = (
str(getattr(msg, "user_platform", "") or getattr(getattr(msg, "user_info", None), "platform", "") or "")
.strip()
.lower()
)
user_id = (
str(getattr(msg, "user_id", "") or getattr(getattr(msg, "user_info", None), "user_id", "") or "")
.strip()
)
if not platform or not user_id:
return False
platform_accounts = {}
try:
platform_accounts = parse_platform_accounts(getattr(bot_config, "platforms", []) or [])
except Exception:
platform_accounts = {}
bot_accounts: Dict[str, str] = {}
qq_account = str(getattr(bot_config, "qq_account", "") or "").strip()
if qq_account:
bot_accounts["qq"] = qq_account
telegram_account = str(getattr(bot_config, "telegram_account", "") or "").strip()
if telegram_account:
bot_accounts["telegram"] = telegram_account
for plat, account in platform_accounts.items():
if account and plat not in bot_accounts:
bot_accounts[plat] = account
bot_account = bot_accounts.get(platform)
return bool(bot_account and user_id == bot_account)
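To illustrate the chat_id bookkeeping the helpers above implement (a self-contained sketch with simplified names, not the project code): the chat_id column migrates from a bare string to a JSON list of [chat_id, count] pairs, and reads/writes have to accept both forms.

```python
import json
from typing import Any, List

def parse_ids(value: Any) -> List[List[Any]]:
    """Accept the legacy bare chat_id string or the new JSON list of [chat_id, count] pairs."""
    if not value:
        return []
    if isinstance(value, str):
        try:
            parsed = json.loads(value)
            return parsed if isinstance(parsed, list) else [[str(parsed), 1]]
        except json.JSONDecodeError:
            return [[value, 1]]  # legacy format: plain chat_id string
    return value if isinstance(value, list) else [[str(value), 1]]

def bump(chat_ids: List[List[Any]], chat_id: str) -> List[List[Any]]:
    """Increment the counter for chat_id, appending [chat_id, 1] if it is missing."""
    for item in chat_ids:
        if str(item[0]) == chat_id:
            item[1] = int(item[1]) + 1
            return chat_ids
    chat_ids.append([chat_id, 1])
    return chat_ids

legacy_row = "qq:12345"  # value stored by older versions
new_row = json.dumps(bump(parse_ids(legacy_row), "qq:67890"), ensure_ascii=False)
print(new_row)  # [["qq:12345", 1], ["qq:67890", 1]]
```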

View File

@ -8,11 +8,12 @@ from src.common.logger import get_logger
from src.config.config import global_config, model_config
from src.chat.utils.prompt_builder import Prompt, global_prompt_manager
from src.plugin_system.apis import llm_api
from src.common.database.database_model import ThinkingBack
from src.common.database.database_model import ThinkingBack, Jargon
from json_repair import repair_json
from src.memory_system.retrieval_tools import get_tool_registry, init_all_tools
from src.memory_system.retrieval_tools.query_lpmm_knowledge import query_lpmm_knowledge
from src.llm_models.payload_content.message import MessageBuilder, RoleType, Message
from src.jargon.jargon_utils import parse_chat_id_list, chat_id_list_contains, contains_bot_self_name
logger = get_logger("memory_retrieval")
@ -63,27 +64,23 @@ def init_memory_retrieval_prompt():
2. 是否有需要回忆的内容比如"之前说过""上次""以前"
3. 是否有需要查找历史信息的问题
4. 是否有问题可以搜集信息帮助你聊天
5. 对话中是否包含黑话俚语缩写等可能需要查询的概念
重要提示
- **每次只能提出一个问题**选择最需要查询的关键问题
- 如果"最近已查询的问题和结果"中已经包含了类似的问题并得到了答案请避免重复生成相同或相似的问题不需要重复查询
- 如果之前已经查询过某个问题但未找到答案可以尝试用不同的方式提问或更具体的问题
如果你认为需要从记忆中检索信息来回答
1. 识别对话中可能需要查询的概念黑话/俚语/缩写/专有名词等关键词放入"concepts"字段
2. 根据上下文提出**一个**最关键的问题来帮助你回复目标消息放入"questions"字段
如果你认为需要从记忆中检索信息来回答请根据上下文提出**一个**最关键的问题来帮助你回复目标消息放入"questions"字段
问题格式示例
- "xxx在前几天干了什么"
- "xxx是什么"
- "xxx是什么,在什么时候提到过?"
- "xxxx和xxx的关系是什么"
- "xxx在某个时间点发生了什么"
输出格式示例需要检索时
```json
{{
"concepts": ["AAA", "BBB", "CCC"], #需要检索的概念列表(字符串数组),如果不需要检索概念则输出空数组[]
"questions": ["张三在前几天干了什么"] #问题数组(字符串数组),如果不需要检索记忆则输出空数组[],如果需要检索则只输出包含一个问题的数组 "questions": ["张三在前几天干了什么"] #问题数组(字符串数组),如果不需要检索记忆则输出空数组[],如果需要检索则只输出包含一个问题的数组
}} }}
``` ```
@ -91,7 +88,6 @@ def init_memory_retrieval_prompt():
输出格式示例不需要检索时
```json
{{
"concepts": [],
"questions": [] "questions": []
}} }}
``` ```
@ -280,6 +276,54 @@ async def _retrieve_concepts_with_jargon(concepts: List[str], chat_id: str) -> s
return "" return ""
def _match_jargon_from_text(chat_text: str, chat_id: str) -> List[str]:
"""直接在聊天文本中匹配已知的jargon返回出现过的黑话列表"""
if not chat_text or not chat_text.strip():
return []
start_time = time.time()
query = Jargon.select().where((Jargon.meaning.is_null(False)) & (Jargon.meaning != ""))
if global_config.jargon.all_global:
query = query.where(Jargon.is_global)
query = query.order_by(Jargon.count.desc())
query_time = time.time()
matched: Dict[str, None] = {}
for jargon in query:
content = (jargon.content or "").strip()
if not content:
continue
if contains_bot_self_name(content):
continue
if not global_config.jargon.all_global and not jargon.is_global:
chat_id_list = parse_chat_id_list(jargon.chat_id)
if not chat_id_list_contains(chat_id_list, chat_id):
continue
pattern = re.escape(content)
if re.search(r"[\u4e00-\u9fff]", content):
search_pattern = pattern
else:
search_pattern = r"\b" + pattern + r"\b"
if re.search(search_pattern, chat_text, re.IGNORECASE):
matched[content] = None
end_time = time.time()
logger.info(
f"记忆检索黑话匹配: 查询耗时 {(query_time - start_time):.3f}s, "
f"匹配耗时 {(end_time - query_time):.3f}s, 总耗时 {(end_time - start_time):.3f}s, "
f"匹配到 {len(matched)} 个黑话"
)
return list(matched.keys())
async def _react_agent_solve_question(
question: str, chat_id: str, max_iterations: int = 5, timeout: float = 30.0, initial_info: str = ""
) -> Tuple[bool, str, List[Dict[str, Any]], bool]:
@ -991,11 +1035,17 @@ async def build_memory_retrieval_prompt(
return "" return ""
# 解析概念列表和问题列表 # 解析概念列表和问题列表
concepts, questions = _parse_questions_json(response) _, questions = _parse_questions_json(response)
logger.info(f"解析到 {len(concepts)} 个概念: {concepts}")
logger.info(f"解析到 {len(questions)} 个问题: {questions}") logger.info(f"解析到 {len(questions)} 个问题: {questions}")
# 对概念进行jargon检索作为初始信息 # 使用匹配逻辑自动识别聊天中的黑话概念
concepts = _match_jargon_from_text(message, chat_id)
if concepts:
logger.info(f"黑话匹配命中 {len(concepts)} 个概念: {concepts}")
else:
logger.info("黑话匹配未命中任何概念")
# 对匹配到的概念进行jargon检索作为初始信息
initial_info = "" initial_info = ""
if concepts: if concepts:
logger.info(f"开始对 {len(concepts)} 个概念进行jargon检索") logger.info(f"开始对 {len(concepts)} 个概念进行jargon检索")
@ -1026,8 +1076,6 @@ async def build_memory_retrieval_prompt(
else:
return ""
logger.info(f"解析到 {len(questions)} 个问题: {questions}")
# 第二步:并行处理所有问题(使用配置的最大迭代次数/120秒超时
max_iterations = global_config.memory.max_agent_iterations
logger.info(f"问题数量: {len(questions)},设置最大迭代次数: {max_iterations},超时时间: 120秒")

View File

@ -1,5 +1,5 @@
[inner]
version = "6.23.1"
version = "6.23.4"
#----以下是给开发人员阅读的,如果你只是部署了麦麦,不需要阅读----
#如果你想要修改配置文件请递增version的值