mirror of https://github.com/Mai-with-u/MaiBot.git
feat: upgrade jargon, faster and more precise extraction
parent a1dd26d578
commit b684a95fbc
@@ -22,6 +22,7 @@ MaiMBot-LPMM
*.zip
run_bot.bat
run_na.bat
run_all_in_wt.bat
run.bat
log_debug/
run_amds.bat

@@ -352,6 +352,7 @@ def _build_readable_messages_internal(
     pic_counter: int = 1,
     show_pic: bool = True,
     message_id_list: Optional[List[Tuple[str, DatabaseMessages]]] = None,
+    pic_single: bool = False,
 ) -> Tuple[str, List[Tuple[float, str, str]], Dict[str, str], int]:
     # sourcery skip: use-getitem-for-re-match-groups
     """
@@ -378,6 +379,7 @@ def _build_readable_messages_internal(
     if pic_id_mapping is None:
         pic_id_mapping = {}
     current_pic_counter = pic_counter
+    pic_description_cache: Dict[str, str] = {}
 
     # 创建时间戳到消息ID的映射,用于在消息前添加[id]标识符
     timestamp_to_id_mapping: Dict[float, str] = {}
@@ -400,6 +402,17 @@ def _build_readable_messages_internal(
         nonlocal current_pic_counter
         nonlocal pic_counter
         pic_id = match.group(1)
+        if pic_single:
+            if pic_id not in pic_description_cache:
+                description = "内容正在阅读,请稍等"
+                try:
+                    image = Images.get_or_none(Images.image_id == pic_id)
+                    if image and image.description:
+                        description = image.description
+                except Exception:
+                    pass
+                pic_description_cache[pic_id] = description
+            return f"[图片:{pic_description_cache[pic_id]}]"
         if pic_id not in pic_id_mapping:
             pic_id_mapping[pic_id] = f"图片{current_pic_counter}"
             current_pic_counter += 1
@@ -603,6 +616,7 @@ async def build_readable_messages_with_list(
     replace_bot_name: bool = True,
     timestamp_mode: str = "relative",
     truncate: bool = False,
+    pic_single: bool = False,
 ) -> Tuple[str, List[Tuple[float, str, str]]]:
     """
     将消息列表转换为可读的文本格式,并返回原始(时间戳, 昵称, 内容)列表。
@@ -613,10 +627,16 @@ async def build_readable_messages_with_list(
         replace_bot_name,
         timestamp_mode,
         truncate,
+        pic_id_mapping=None,
+        pic_counter=1,
+        show_pic=True,
+        message_id_list=None,
+        pic_single=pic_single,
     )
 
-    if pic_mapping_info := build_pic_mapping_info(pic_id_mapping):
-        formatted_string = f"{pic_mapping_info}\n\n{formatted_string}"
+    if not pic_single:
+        if pic_mapping_info := build_pic_mapping_info(pic_id_mapping):
+            formatted_string = f"{pic_mapping_info}\n\n{formatted_string}"
 
     return formatted_string, details_list
 
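Illustration (editor's sketch, not part of the diff): with the new pic_single flag, build_readable_messages_with_list renders each image inline as [图片:描述] and skips the prepended 图片N mapping block; with the default pic_single=False the old behaviour is kept. A minimal call sketch, assuming the remaining keyword arguments keep the defaults shown above:

    # inline image descriptions, no mapping header
    formatted, details = await build_readable_messages_with_list(messages, pic_single=True)

    # default: images become 图片1, 图片2, ... and a mapping block is prepended
    formatted, details = await build_readable_messages_with_list(messages)
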
@@ -630,6 +650,7 @@ def build_readable_messages_with_id(
     show_actions: bool = False,
     show_pic: bool = True,
     remove_emoji_stickers: bool = False,
+    pic_single: bool = False,
 ) -> Tuple[str, List[Tuple[str, DatabaseMessages]]]:
     """
     将消息列表转换为可读的文本格式,并返回原始(时间戳, 昵称, 内容)列表。
@@ -647,6 +668,7 @@ def build_readable_messages_with_id(
         read_mark=read_mark,
         message_id_list=message_id_list,
         remove_emoji_stickers=remove_emoji_stickers,
+        pic_single=pic_single,
     )
 
     return formatted_string, message_id_list
@@ -662,6 +684,7 @@ def build_readable_messages(
     show_pic: bool = True,
     message_id_list: Optional[List[Tuple[str, DatabaseMessages]]] = None,
     remove_emoji_stickers: bool = False,
+    pic_single: bool = False,
 ) -> str: # sourcery skip: extract-method
     """
     将消息列表转换为可读的文本格式。
@@ -769,14 +792,14 @@ def build_readable_messages(
             truncate,
             show_pic=show_pic,
             message_id_list=message_id_list,
+            pic_single=pic_single,
         )
 
-        # 生成图片映射信息并添加到最前面
-        pic_mapping_info = build_pic_mapping_info(pic_id_mapping)
-        if pic_mapping_info:
-            return f"{pic_mapping_info}\n\n{formatted_string}"
-        else:
-            return formatted_string
+        if not pic_single:
+            pic_mapping_info = build_pic_mapping_info(pic_id_mapping)
+            if pic_mapping_info:
+                return f"{pic_mapping_info}\n\n{formatted_string}"
+        return formatted_string
     else:
         # 按 read_mark 分割消息
         messages_before_mark = [msg for msg in copy_messages if (msg.time or 0) <= read_mark]
@@ -796,6 +819,7 @@ def build_readable_messages(
             pic_counter,
             show_pic=show_pic,
             message_id_list=message_id_list,
+            pic_single=pic_single,
         )
         formatted_after, _, pic_id_mapping, _ = _build_readable_messages_internal(
             messages_after_mark,
@@ -806,15 +830,19 @@ def build_readable_messages(
             pic_counter,
             show_pic=show_pic,
             message_id_list=message_id_list,
+            pic_single=pic_single,
         )
 
         read_mark_line = "\n--- 以上消息是你已经看过,请关注以下未读的新消息---\n"
 
         # 生成图片映射信息
-        if pic_id_mapping:
-            pic_mapping_info = f"图片信息:\n{build_pic_mapping_info(pic_id_mapping)}\n聊天记录信息:\n"
+        if not pic_single:
+            if pic_id_mapping:
+                pic_mapping_info = f"图片信息:\n{build_pic_mapping_info(pic_id_mapping)}\n聊天记录信息:\n"
+            else:
+                pic_mapping_info = "聊天记录信息:\n"
         else:
-            pic_mapping_info = "聊天记录信息:\n"
+            pic_mapping_info = ""
 
         # 组合结果
         result_parts = []
@@ -832,7 +860,7 @@ def build_readable_messages(
         return "".join(result_parts)
 
 
-async def build_anonymous_messages(messages: List[DatabaseMessages]) -> str:
+async def build_anonymous_messages(messages: List[DatabaseMessages], show_ids: bool = False) -> str:
     """
     构建匿名可读消息,将不同人的名称转为唯一占位符(A、B、C...),bot自己用SELF。
     处理 回复<aaa:bbb> 和 @<aaa:bbb> 字段,将bbb映射为匿名占位符。
@@ -889,7 +917,7 @@ async def build_anonymous_messages(messages: List[DatabaseMessages]) -> str:
             current_char += 1
         return person_map[person_id]
 
-    for msg in messages:
+    for i, msg in enumerate(messages):
         try:
             platform = msg.chat_info.platform
             user_id = msg.user_info.user_id
@@ -910,7 +938,12 @@ async def build_anonymous_messages(messages: List[DatabaseMessages]) -> str:
 
             content = replace_user_references(content, platform, anon_name_resolver, replace_bot_name=False)
 
-            header = f"{anon_name}说 "
+            # 构建消息头,如果启用show_ids则添加序号
+            if show_ids:
+                header = f"[{i + 1}] {anon_name}说 "
+            else:
+                header = f"{anon_name}说 "
+
             output_lines.append(header)
             stripped_line = content.strip()
             if stripped_line:

@@ -330,8 +330,6 @@ class Jargon(BaseModel):

    content = TextField()
    raw_content = TextField(null=True)
    type = TextField(null=True)
    translation = TextField(null=True)
    meaning = TextField(null=True)
    chat_id = TextField(index=True)
    is_global = BooleanField(default=False)

@@ -39,9 +39,19 @@ class ExpressionReflector:
 
         # 检查是否在允许列表中
         allow_reflect = global_config.expression.allow_reflect
-        if allow_reflect and self.chat_id not in allow_reflect:
-            logger.info(f"[Expression Reflection] 当前聊天流 {self.chat_id} 不在允许列表中,跳过")
-            return False
+        if allow_reflect:
+            # 将 allow_reflect 中的 platform:id:type 格式转换为 chat_id 列表
+            allow_reflect_chat_ids = []
+            for stream_config in allow_reflect:
+                parsed_chat_id = global_config.expression._parse_stream_config_to_chat_id(stream_config)
+                if parsed_chat_id:
+                    allow_reflect_chat_ids.append(parsed_chat_id)
+                else:
+                    logger.warning(f"[Expression Reflection] 无法解析 allow_reflect 配置项: {stream_config}")
+
+            if self.chat_id not in allow_reflect_chat_ids:
+                logger.info(f"[Expression Reflection] 当前聊天流 {self.chat_id} 不在允许列表中,跳过")
+                return False
 
         # 检查上一次提问时间
         current_time = time.time()
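Note (illustrative, not from the diff): per the comment above, each allow_reflect entry is expected in platform:id:type form and is now resolved to a chat_id via _parse_stream_config_to_chat_id before the membership check, so the raw string no longer has to equal self.chat_id. A hypothetical value, written as the equivalent Python literal (the ID is made up):

    # each entry is "platform:id:type"; unparseable entries are logged and skipped
    allow_reflect = ["qq:123456789:group"]
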
@@ -1,6 +1,7 @@
import time
import json
import asyncio
from collections import OrderedDict
from typing import List, Dict, Optional, Any
from json_repair import repair_json
from peewee import fn
@@ -12,12 +13,13 @@ from src.config.config import model_config, global_config
 from src.chat.message_receive.chat_stream import get_chat_manager
 from src.plugin_system.apis import llm_api
 from src.chat.utils.chat_message_builder import (
-    build_anonymous_messages,
+    build_readable_messages_with_id,
     get_raw_msg_by_timestamp_with_chat_inclusive,
     get_raw_msg_before_timestamp_with_chat,
     build_readable_messages_with_list,
 )
 from src.chat.utils.prompt_builder import Prompt, global_prompt_manager
+from src.chat.utils.utils import parse_platform_accounts
 
 
 logger = get_logger("jargon")
@@ -43,9 +45,107 @@ def _contains_bot_self_name(content: str) -> bool:
     return any(name in target for name in candidates if target)
 
 
+def _format_context_message(msg: Any, seq_index: int) -> str:
+    """
+    将单条消息格式化为带序号的上下文行
+    """
+    if msg is None:
+        return ""
+
+    text = (getattr(msg, "display_message", None) or getattr(msg, "processed_plain_text", None) or "").strip()
+    if not text:
+        return ""
+
+    user_info = getattr(msg, "user_info", None)
+    nickname = ""
+    if user_info:
+        nickname = getattr(user_info, "user_nickname", "") or getattr(user_info, "user_id", "")
+
+    if not nickname:
+        nickname = getattr(msg, "user_nickname", "") or getattr(msg, "user_id", "") or "某人"
+
+    return f"{nickname}: {text}"
+
+
+def _build_context_paragraph(messages: List[Any], center_index: int) -> Optional[str]:
+    """
+    构建包含中心消息上下文的段落(前3条+后3条)
+    """
+    if not messages or center_index < 0 or center_index >= len(messages):
+        return None
+
+    context_start = max(0, center_index - 3)
+    context_end = min(len(messages), center_index + 1 + 3)
+
+    context_lines: List[str] = []
+    for idx in range(context_start, context_end):
+        formatted_line = _format_context_message(messages[idx], idx + 1)
+        if formatted_line:
+            context_lines.append(formatted_line)
+
+    if not context_lines:
+        return None
+
+    paragraph = "\n".join(context_lines).strip()
+    return paragraph or None
+
+
+def _is_bot_message(msg: Any) -> bool:
+    """判断消息是否来自机器人自身"""
+    if msg is None:
+        return False
+
+    bot_config = getattr(global_config, "bot", None)
+    if not bot_config:
+        return False
+
+    platform = (
+        str(getattr(msg, "user_platform", "") or getattr(getattr(msg, "user_info", None), "platform", "") or "")
+        .strip()
+        .lower()
+    )
+    user_id = (
+        str(getattr(msg, "user_id", "") or getattr(getattr(msg, "user_info", None), "user_id", "") or "")
+        .strip()
+    )
+
+    if not platform or not user_id:
+        return False
+
+    platform_accounts = {}
+    try:
+        platform_accounts = parse_platform_accounts(getattr(bot_config, "platforms", []) or [])
+    except Exception:
+        platform_accounts = {}
+
+    bot_accounts: Dict[str, str] = {}
+    qq_account = str(getattr(bot_config, "qq_account", "") or "").strip()
+    if qq_account:
+        bot_accounts["qq"] = qq_account
+
+    telegram_account = str(getattr(bot_config, "telegram_account", "") or "").strip()
+    if telegram_account:
+        bot_accounts["telegram"] = telegram_account
+
+    for plat, account in platform_accounts.items():
+        if account and plat not in bot_accounts:
+            bot_accounts[plat] = account
+
+    bot_account = bot_accounts.get(platform)
+    return bool(bot_account and user_id == bot_account)
+
+
+def _has_adjacent_bot_message(messages: List[Any], center_index: int) -> bool:
+    """检查目标消息的上一条或下一条是否为机器人发言"""
+    for neighbor in (center_index - 1, center_index + 1):
+        if 0 <= neighbor < len(messages) and _is_bot_message(messages[neighbor]):
+            return True
+    return False
+
+
 def _init_prompt() -> None:
     prompt_str = """
-**聊天内容,其中的SELF是你自己的发言**
+**聊天内容,其中的{bot_name}的发言内容是你自己的发言,[msg_id] 是消息ID**
 {chat_str}
 
 请从上面这段聊天内容中提取"可能是黑话"的候选项(黑话/俚语/网络缩写/口头禅)。
@@ -62,9 +162,10 @@ def _init_prompt() -> None:
 - 中文词语的缩写,用几个汉字概括一个词汇或含义,例如:社死、内卷
 
 以 JSON 数组输出,元素为对象(严格按以下结构):
+请你提取出可能的黑话,最多10
 [
-{{"content": "词条", "raw_content": "包含该词条的完整对话上下文原文"}},
-{{"content": "词条2", "raw_content": "包含该词条的完整对话上下文原文"}}
+{{"content": "词条", "msg_id": "m12"}}, // msg_id 必须与上方聊天中展示的ID完全一致
+{{"content": "词条2", "msg_id": "m15"}}
 ]
 
 现在请输出:
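For reference (editor's example, not from the diff), a model response in the new format would look like the JSON below; the terms and IDs are made up, and each msg_id must match an identifier shown in the chat text, since the extractor now rebuilds the context from that message instead of trusting copied raw_content:

    [
        {"content": "yyds", "msg_id": "m3"},
        {"content": "栓Q", "msg_id": "m7"}
    ]
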
@@ -78,10 +179,10 @@ def _init_inference_prompts() -> None:
     prompt1_str = """
 **词条内容**
 {content}
-**词条出现的上下文(raw_content)其中的SELF是你自己的发言**
+**词条出现的上下文。其中的{bot_name}的发言内容是你自己的发言**
 {raw_content_list}
 
-请根据以上词条内容和上下文,推断这个词条的含义。
+请根据上下文,推断"{content}"这个词条的含义。
 - 如果这是一个黑话、俚语或网络用语,请推断其含义
 - 如果含义明确(常规词汇),也请说明
 - 如果上下文信息不足,无法推断含义,请设置 no_info 为 true
@ -240,7 +341,7 @@ def _should_infer_meaning(jargon_obj: Jargon) -> bool:
|
|||
last_inference = jargon_obj.last_inference_count or 0
|
||||
|
||||
# 阈值列表:3,6, 10, 20, 40, 60, 100
|
||||
thresholds = [3, 6, 10, 20, 40, 60, 100]
|
||||
thresholds = [2, 4, 8, 12, 24, 60, 100]
|
||||
|
||||
if count < thresholds[0]:
|
||||
return False
|
||||
|
|
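A hedged reading of the threshold change (editor's note, not from the diff): inference now starts at 2 occurrences instead of 3 and re-fires sooner in the low-count range (2, 4, 8, 12, 24 instead of 3, 6, 10, 20, 40), in line with the commit's goal of faster extraction. A minimal sketch of a crossing check, assuming the rest of _should_infer_meaning re-triggers when count passes a threshold that last_inference had not yet reached (the helper below is hypothetical, not the repo's code):

    def crossed_new_threshold(count: int, last_inference: int) -> bool:
        thresholds = [2, 4, 8, 12, 24, 60, 100]
        if count < thresholds[0]:
            return False
        # true if some threshold lies in (last_inference, count]
        return any(last_inference < t <= count for t in thresholds)
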
@@ -281,6 +382,53 @@ class JargonMiner:
         chat_manager = get_chat_manager()
         stream_name = chat_manager.get_stream_name(self.chat_id)
         self.stream_name = stream_name if stream_name else self.chat_id
+        self.cache_limit = 100
+        self.cache: OrderedDict[str, None] = OrderedDict()
+
+    def _add_to_cache(self, content: str) -> None:
+        """将提取到的黑话加入缓存,保持LRU语义"""
+        if not content:
+            return
+
+        key = content.strip()
+        if not key:
+            return
+
+        if key in self.cache:
+            self.cache.move_to_end(key)
+        else:
+            self.cache[key] = None
+            if len(self.cache) > self.cache_limit:
+                self.cache.popitem(last=False)
+
+    def _collect_cached_entries(self, messages: List[Any]) -> List[Dict[str, List[str]]]:
+        """检查缓存中的黑话是否出现在当前消息窗口,生成对应上下文"""
+        if not self.cache or not messages:
+            return []
+
+        cached_entries: List[Dict[str, List[str]]] = []
+        processed_pairs = set()
+
+        for idx, msg in enumerate(messages):
+            msg_text = (getattr(msg, "display_message", None) or getattr(msg, "processed_plain_text", None) or "").strip()
+            if not msg_text or _is_bot_message(msg):
+                continue
+
+            for content in self.cache.keys():
+                if not content:
+                    continue
+                if (content, idx) in processed_pairs:
+                    continue
+                if content in msg_text:
+                    if _has_adjacent_bot_message(messages, idx):
+                        continue
+                    paragraph = _build_context_paragraph(messages, idx)
+                    if not paragraph:
+                        continue
+                    cached_entries.append({"content": content, "raw_content": [paragraph]})
+                    processed_pairs.add((content, idx))
+
+        return cached_entries
 
     async def _infer_meaning_by_id(self, jargon_id: int) -> None:
         """通过ID加载对象并推断"""
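Illustration (editor's sketch, not from the diff): the cache added above is an OrderedDict used as an LRU set, so re-adding a known term refreshes its position and the oldest term is evicted once cache_limit is exceeded. A short walk-through with a hypothetical miner instance and a limit of 2:

    miner.cache_limit = 2
    miner._add_to_cache("yyds")     # cache: yyds
    miner._add_to_cache("绝绝子")    # cache: yyds, 绝绝子
    miner._add_to_cache("yyds")     # hit: yyds moves to the most-recent end
    miner._add_to_cache("栓Q")      # over limit: 绝绝子 (least recently used) is evicted
    # cache now holds yyds and 栓Q
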
@@ -323,6 +471,7 @@ class JargonMiner:
         prompt1 = await global_prompt_manager.format_prompt(
             "jargon_inference_with_context_prompt",
             content=content,
+            bot_name = global_config.bot.nickname,
             raw_content_list=raw_content_text,
         )
 
@@ -441,8 +590,8 @@ class JargonMiner:
                 # 是黑话,使用推断1的结果(基于上下文,更准确)
                 jargon_obj.meaning = inference1.get("meaning", "")
             else:
-                # 不是黑话,也记录含义(使用推断2的结果,因为含义明确)
-                jargon_obj.meaning = inference2.get("meaning", "")
+                # 不是黑话,清空含义,不再存储任何内容
+                jargon_obj.meaning = ""
 
             # 更新最后一次判定的count值,避免重启后重复判定
             jargon_obj.last_inference_count = jargon_obj.count or 0
@@ -511,12 +660,33 @@ class JargonMiner:
         if not messages:
             return
 
-        chat_str: str = await build_anonymous_messages(messages)
+        # 按时间排序,确保编号与上下文一致
+        messages = sorted(messages, key=lambda msg: msg.time or 0)
+
+        chat_str, message_id_list = build_readable_messages_with_id(
+            messages=messages,
+            replace_bot_name=True,
+            timestamp_mode="relative",
+            truncate=False,
+            show_actions=False,
+            show_pic=True,
+            pic_single=True,
+        )
         if not chat_str.strip():
             return
 
+        msg_id_to_index: Dict[str, int] = {}
+        for idx, (msg_id, _msg) in enumerate(message_id_list or []):
+            if not msg_id:
+                continue
+            msg_id_to_index[msg_id] = idx
+        if not msg_id_to_index:
+            logger.warning("未能生成消息ID映射,跳过本次提取")
+            return
+
         prompt: str = await global_prompt_manager.format_prompt(
             "extract_jargon_prompt",
+            bot_name=global_config.bot.nickname,
             chat_str=chat_str,
         )
 
@@ -551,25 +721,46 @@ class JargonMiner:
             for item in parsed:
                 if not isinstance(item, dict):
                     continue
 
                 content = str(item.get("content", "")).strip()
-                raw_content_value = item.get("raw_content", "")
+                msg_id_value = item.get("msg_id")
 
-                # 处理raw_content:可能是字符串或列表
-                raw_content_list = []
-                if isinstance(raw_content_value, list):
-                    raw_content_list = [str(rc).strip() for rc in raw_content_value if str(rc).strip()]
-                    # 去重
-                    raw_content_list = list(dict.fromkeys(raw_content_list))
-                elif isinstance(raw_content_value, str):
-                    raw_content_str = raw_content_value.strip()
-                    if raw_content_str:
-                        raw_content_list = [raw_content_str]
+                if not content:
+                    continue
 
-                if content and raw_content_list:
-                    if _contains_bot_self_name(content):
-                        logger.debug(f"解析阶段跳过包含机器人昵称/别名的词条: {content}")
-                        continue
-                    entries.append({"content": content, "raw_content": raw_content_list})
+                if _contains_bot_self_name(content):
+                    logger.debug(f"解析阶段跳过包含机器人昵称/别名的词条: {content}")
+                    continue
+
+                msg_id_str = str(msg_id_value or "").strip()
+                if not msg_id_str:
+                    logger.warning(f"解析jargon失败:msg_id缺失,content={content}")
+                    continue
+
+                msg_index = msg_id_to_index.get(msg_id_str)
+                if msg_index is None:
+                    logger.warning(f"解析jargon失败:msg_id未找到,content={content}, msg_id={msg_id_str}")
+                    continue
+
+                target_msg = messages[msg_index]
+                if _is_bot_message(target_msg):
+                    logger.debug(f"解析阶段跳过引用机器人自身消息的词条: content={content}, msg_id={msg_id_str}")
+                    continue
+                if _has_adjacent_bot_message(messages, msg_index):
+                    logger.debug(
+                        f"解析阶段跳过因邻近机器人发言的词条: content={content}, msg_id={msg_id_str}"
+                    )
+                    continue
+
+                context_paragraph = _build_context_paragraph(messages, msg_index)
+                if not context_paragraph:
+                    logger.warning(f"解析jargon失败:上下文为空,content={content}, msg_id={msg_id_str}")
+                    continue
+
+                entries.append({"content": content, "raw_content": [context_paragraph]})
+            cached_entries = self._collect_cached_entries(messages)
+            if cached_entries:
+                entries.extend(cached_entries)
         except Exception as e:
             logger.error(f"解析jargon JSON失败: {e}; 原始: {response}")
             return
@@ -577,15 +768,25 @@ class JargonMiner:
         if not entries:
             return
 
-        # 去重并写入DB(按 chat_id + content 去重)
-        # 使用content作为去重键
-        seen = set()
-        uniq_entries = []
+        # 去重并合并raw_content(按 content 聚合)
+        merged_entries: OrderedDict[str, Dict[str, List[str]]] = OrderedDict()
         for entry in entries:
             content_key = entry["content"]
-            if content_key not in seen:
-                seen.add(content_key)
-                uniq_entries.append(entry)
+            raw_list = entry.get("raw_content", []) or []
+            if content_key in merged_entries:
+                merged_entries[content_key]["raw_content"].extend(raw_list)
+            else:
+                merged_entries[content_key] = {
+                    "content": content_key,
+                    "raw_content": list(raw_list),
+                }
+
+        uniq_entries = []
+        for merged_entry in merged_entries.values():
+            raw_content_list = merged_entry["raw_content"]
+            if raw_content_list:
+                merged_entry["raw_content"] = list(dict.fromkeys(raw_content_list))
+            uniq_entries.append(merged_entry)
 
         saved = 0
         updated = 0
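Illustration (editor's example, not from the diff): duplicate content entries are now merged rather than dropped, with their raw_content lists concatenated and de-duplicated in order. With made-up input:

    entries = [
        {"content": "栓Q", "raw_content": ["A: 今天栓Q"]},
        {"content": "栓Q", "raw_content": ["B: 真的栓Q", "A: 今天栓Q"]},
    ]
    # after the merge loop above, uniq_entries is:
    # [{"content": "栓Q", "raw_content": ["A: 今天栓Q", "B: 真的栓Q"]}]
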
@@ -670,6 +871,8 @@ class JargonMiner:
             except Exception as e:
                 logger.error(f"保存jargon失败: chat_id={self.chat_id}, content={content}, err={e}")
                 continue
+            finally:
+                self._add_to_cache(content)
 
         # 固定输出提取的jargon结果,格式化为可读形式(只要有提取结果就输出)
         if uniq_entries: