feat: 升级 jargon,提取更快速、更精准

pull/1385/head
SengokuCola 2025-11-25 01:58:31 +08:00
parent a1dd26d578
commit b684a95fbc
5 changed files with 297 additions and 52 deletions

1
.gitignore vendored
View File

@ -22,6 +22,7 @@ MaiMBot-LPMM
*.zip *.zip
run_bot.bat run_bot.bat
run_na.bat run_na.bat
run_all_in_wt.bat
run.bat run.bat
log_debug/ log_debug/
run_amds.bat run_amds.bat

View File

@ -352,6 +352,7 @@ def _build_readable_messages_internal(
pic_counter: int = 1, pic_counter: int = 1,
show_pic: bool = True, show_pic: bool = True,
message_id_list: Optional[List[Tuple[str, DatabaseMessages]]] = None, message_id_list: Optional[List[Tuple[str, DatabaseMessages]]] = None,
pic_single: bool = False,
) -> Tuple[str, List[Tuple[float, str, str]], Dict[str, str], int]: ) -> Tuple[str, List[Tuple[float, str, str]], Dict[str, str], int]:
# sourcery skip: use-getitem-for-re-match-groups # sourcery skip: use-getitem-for-re-match-groups
""" """
@ -378,6 +379,7 @@ def _build_readable_messages_internal(
if pic_id_mapping is None: if pic_id_mapping is None:
pic_id_mapping = {} pic_id_mapping = {}
current_pic_counter = pic_counter current_pic_counter = pic_counter
pic_description_cache: Dict[str, str] = {}
# 创建时间戳到消息ID的映射用于在消息前添加[id]标识符 # 创建时间戳到消息ID的映射用于在消息前添加[id]标识符
timestamp_to_id_mapping: Dict[float, str] = {} timestamp_to_id_mapping: Dict[float, str] = {}
@ -400,6 +402,17 @@ def _build_readable_messages_internal(
nonlocal current_pic_counter nonlocal current_pic_counter
nonlocal pic_counter nonlocal pic_counter
pic_id = match.group(1) pic_id = match.group(1)
if pic_single:
if pic_id not in pic_description_cache:
description = "内容正在阅读,请稍等"
try:
image = Images.get_or_none(Images.image_id == pic_id)
if image and image.description:
description = image.description
except Exception:
pass
pic_description_cache[pic_id] = description
return f"[图片:{pic_description_cache[pic_id]}]"
if pic_id not in pic_id_mapping: if pic_id not in pic_id_mapping:
pic_id_mapping[pic_id] = f"图片{current_pic_counter}" pic_id_mapping[pic_id] = f"图片{current_pic_counter}"
current_pic_counter += 1 current_pic_counter += 1
@ -603,6 +616,7 @@ async def build_readable_messages_with_list(
replace_bot_name: bool = True, replace_bot_name: bool = True,
timestamp_mode: str = "relative", timestamp_mode: str = "relative",
truncate: bool = False, truncate: bool = False,
pic_single: bool = False,
) -> Tuple[str, List[Tuple[float, str, str]]]: ) -> Tuple[str, List[Tuple[float, str, str]]]:
""" """
将消息列表转换为可读的文本格式并返回原始(时间戳, 昵称, 内容)列表 将消息列表转换为可读的文本格式并返回原始(时间戳, 昵称, 内容)列表
@ -613,10 +627,16 @@ async def build_readable_messages_with_list(
replace_bot_name, replace_bot_name,
timestamp_mode, timestamp_mode,
truncate, truncate,
pic_id_mapping=None,
pic_counter=1,
show_pic=True,
message_id_list=None,
pic_single=pic_single,
) )
if pic_mapping_info := build_pic_mapping_info(pic_id_mapping): if not pic_single:
formatted_string = f"{pic_mapping_info}\n\n{formatted_string}" if pic_mapping_info := build_pic_mapping_info(pic_id_mapping):
formatted_string = f"{pic_mapping_info}\n\n{formatted_string}"
return formatted_string, details_list return formatted_string, details_list
@ -630,6 +650,7 @@ def build_readable_messages_with_id(
show_actions: bool = False, show_actions: bool = False,
show_pic: bool = True, show_pic: bool = True,
remove_emoji_stickers: bool = False, remove_emoji_stickers: bool = False,
pic_single: bool = False,
) -> Tuple[str, List[Tuple[str, DatabaseMessages]]]: ) -> Tuple[str, List[Tuple[str, DatabaseMessages]]]:
""" """
将消息列表转换为可读的文本格式并返回原始(时间戳, 昵称, 内容)列表 将消息列表转换为可读的文本格式并返回原始(时间戳, 昵称, 内容)列表
@ -647,6 +668,7 @@ def build_readable_messages_with_id(
read_mark=read_mark, read_mark=read_mark,
message_id_list=message_id_list, message_id_list=message_id_list,
remove_emoji_stickers=remove_emoji_stickers, remove_emoji_stickers=remove_emoji_stickers,
pic_single=pic_single,
) )
return formatted_string, message_id_list return formatted_string, message_id_list
@ -662,6 +684,7 @@ def build_readable_messages(
show_pic: bool = True, show_pic: bool = True,
message_id_list: Optional[List[Tuple[str, DatabaseMessages]]] = None, message_id_list: Optional[List[Tuple[str, DatabaseMessages]]] = None,
remove_emoji_stickers: bool = False, remove_emoji_stickers: bool = False,
pic_single: bool = False,
) -> str: # sourcery skip: extract-method ) -> str: # sourcery skip: extract-method
""" """
将消息列表转换为可读的文本格式 将消息列表转换为可读的文本格式
@ -769,14 +792,14 @@ def build_readable_messages(
truncate, truncate,
show_pic=show_pic, show_pic=show_pic,
message_id_list=message_id_list, message_id_list=message_id_list,
pic_single=pic_single,
) )
# 生成图片映射信息并添加到最前面 if not pic_single:
pic_mapping_info = build_pic_mapping_info(pic_id_mapping) pic_mapping_info = build_pic_mapping_info(pic_id_mapping)
if pic_mapping_info: if pic_mapping_info:
return f"{pic_mapping_info}\n\n{formatted_string}" return f"{pic_mapping_info}\n\n{formatted_string}"
else: return formatted_string
return formatted_string
else: else:
# 按 read_mark 分割消息 # 按 read_mark 分割消息
messages_before_mark = [msg for msg in copy_messages if (msg.time or 0) <= read_mark] messages_before_mark = [msg for msg in copy_messages if (msg.time or 0) <= read_mark]
@ -796,6 +819,7 @@ def build_readable_messages(
pic_counter, pic_counter,
show_pic=show_pic, show_pic=show_pic,
message_id_list=message_id_list, message_id_list=message_id_list,
pic_single=pic_single,
) )
formatted_after, _, pic_id_mapping, _ = _build_readable_messages_internal( formatted_after, _, pic_id_mapping, _ = _build_readable_messages_internal(
messages_after_mark, messages_after_mark,
@ -806,15 +830,19 @@ def build_readable_messages(
pic_counter, pic_counter,
show_pic=show_pic, show_pic=show_pic,
message_id_list=message_id_list, message_id_list=message_id_list,
pic_single=pic_single,
) )
read_mark_line = "\n--- 以上消息是你已经看过,请关注以下未读的新消息---\n" read_mark_line = "\n--- 以上消息是你已经看过,请关注以下未读的新消息---\n"
# 生成图片映射信息 # 生成图片映射信息
if pic_id_mapping: if not pic_single:
pic_mapping_info = f"图片信息:\n{build_pic_mapping_info(pic_id_mapping)}\n聊天记录信息:\n" if pic_id_mapping:
pic_mapping_info = f"图片信息:\n{build_pic_mapping_info(pic_id_mapping)}\n聊天记录信息:\n"
else:
pic_mapping_info = "聊天记录信息:\n"
else: else:
pic_mapping_info = "聊天记录信息:\n" pic_mapping_info = ""
# 组合结果 # 组合结果
result_parts = [] result_parts = []
@ -832,7 +860,7 @@ def build_readable_messages(
return "".join(result_parts) return "".join(result_parts)
async def build_anonymous_messages(messages: List[DatabaseMessages]) -> str: async def build_anonymous_messages(messages: List[DatabaseMessages], show_ids: bool = False) -> str:
""" """
构建匿名可读消息将不同人的名称转为唯一占位符ABC...bot自己用SELF 构建匿名可读消息将不同人的名称转为唯一占位符ABC...bot自己用SELF
处理 回复<aaa:bbb> @<aaa:bbb> 字段将bbb映射为匿名占位符 处理 回复<aaa:bbb> @<aaa:bbb> 字段将bbb映射为匿名占位符
@ -889,7 +917,7 @@ async def build_anonymous_messages(messages: List[DatabaseMessages]) -> str:
current_char += 1 current_char += 1
return person_map[person_id] return person_map[person_id]
for msg in messages: for i, msg in enumerate(messages):
try: try:
platform = msg.chat_info.platform platform = msg.chat_info.platform
user_id = msg.user_info.user_id user_id = msg.user_info.user_id
@ -910,7 +938,12 @@ async def build_anonymous_messages(messages: List[DatabaseMessages]) -> str:
content = replace_user_references(content, platform, anon_name_resolver, replace_bot_name=False) content = replace_user_references(content, platform, anon_name_resolver, replace_bot_name=False)
header = f"{anon_name}" # 构建消息头如果启用show_ids则添加序号
if show_ids:
header = f"[{i + 1}] {anon_name}"
else:
header = f"{anon_name}"
output_lines.append(header) output_lines.append(header)
stripped_line = content.strip() stripped_line = content.strip()
if stripped_line: if stripped_line:

View File

@ -330,8 +330,6 @@ class Jargon(BaseModel):
content = TextField() content = TextField()
raw_content = TextField(null=True) raw_content = TextField(null=True)
type = TextField(null=True)
translation = TextField(null=True)
meaning = TextField(null=True) meaning = TextField(null=True)
chat_id = TextField(index=True) chat_id = TextField(index=True)
is_global = BooleanField(default=False) is_global = BooleanField(default=False)

View File

@ -39,9 +39,19 @@ class ExpressionReflector:
# 检查是否在允许列表中 # 检查是否在允许列表中
allow_reflect = global_config.expression.allow_reflect allow_reflect = global_config.expression.allow_reflect
if allow_reflect and self.chat_id not in allow_reflect: if allow_reflect:
logger.info(f"[Expression Reflection] 当前聊天流 {self.chat_id} 不在允许列表中,跳过") # 将 allow_reflect 中的 platform:id:type 格式转换为 chat_id 列表
return False allow_reflect_chat_ids = []
for stream_config in allow_reflect:
parsed_chat_id = global_config.expression._parse_stream_config_to_chat_id(stream_config)
if parsed_chat_id:
allow_reflect_chat_ids.append(parsed_chat_id)
else:
logger.warning(f"[Expression Reflection] 无法解析 allow_reflect 配置项: {stream_config}")
if self.chat_id not in allow_reflect_chat_ids:
logger.info(f"[Expression Reflection] 当前聊天流 {self.chat_id} 不在允许列表中,跳过")
return False
# 检查上一次提问时间 # 检查上一次提问时间
current_time = time.time() current_time = time.time()

View File

@ -1,6 +1,7 @@
import time import time
import json import json
import asyncio import asyncio
from collections import OrderedDict
from typing import List, Dict, Optional, Any from typing import List, Dict, Optional, Any
from json_repair import repair_json from json_repair import repair_json
from peewee import fn from peewee import fn
@ -12,12 +13,13 @@ from src.config.config import model_config, global_config
from src.chat.message_receive.chat_stream import get_chat_manager from src.chat.message_receive.chat_stream import get_chat_manager
from src.plugin_system.apis import llm_api from src.plugin_system.apis import llm_api
from src.chat.utils.chat_message_builder import ( from src.chat.utils.chat_message_builder import (
build_anonymous_messages, build_readable_messages_with_id,
get_raw_msg_by_timestamp_with_chat_inclusive, get_raw_msg_by_timestamp_with_chat_inclusive,
get_raw_msg_before_timestamp_with_chat, get_raw_msg_before_timestamp_with_chat,
build_readable_messages_with_list, build_readable_messages_with_list,
) )
from src.chat.utils.prompt_builder import Prompt, global_prompt_manager from src.chat.utils.prompt_builder import Prompt, global_prompt_manager
from src.chat.utils.utils import parse_platform_accounts
logger = get_logger("jargon") logger = get_logger("jargon")
@ -43,9 +45,107 @@ def _contains_bot_self_name(content: str) -> bool:
return any(name in target for name in candidates if target) return any(name in target for name in candidates if target)
def _format_context_message(msg: Any, seq_index: int) -> str:
"""
将单条消息格式化为带序号的上下文行
"""
if msg is None:
return ""
text = (getattr(msg, "display_message", None) or getattr(msg, "processed_plain_text", None) or "").strip()
if not text:
return ""
user_info = getattr(msg, "user_info", None)
nickname = ""
if user_info:
nickname = getattr(user_info, "user_nickname", "") or getattr(user_info, "user_id", "")
if not nickname:
nickname = getattr(msg, "user_nickname", "") or getattr(msg, "user_id", "") or "某人"
return f"{nickname}: {text}"
def _build_context_paragraph(messages: List[Any], center_index: int) -> Optional[str]:
    """Build a context paragraph around one message (3 before + 3 after).

    Returns the joined, stripped paragraph, or ``None`` when the index is out
    of range or no line in the window renders to non-empty text.
    """
    if not messages:
        return None
    if not 0 <= center_index < len(messages):
        return None

    # Window of up to 7 messages centred on center_index, clamped to bounds.
    lo = max(0, center_index - 3)
    hi = min(len(messages), center_index + 4)

    lines = [
        line
        for idx in range(lo, hi)
        if (line := _format_context_message(messages[idx], idx + 1))
    ]
    if not lines:
        return None

    paragraph = "\n".join(lines).strip()
    return paragraph if paragraph else None
def _is_bot_message(msg: Any) -> bool:
    """Return True when *msg* was sent by the bot itself.

    The message's platform (lower-cased) and user id are compared against the
    per-platform bot accounts collected from the global bot config.
    """
    if msg is None:
        return False
    bot_config = getattr(global_config, "bot", None)
    if not bot_config:
        # No bot config available -> cannot identify the bot; treat as human.
        return False

    # Platform/user id may live directly on the message or on its nested
    # user_info object; normalise both before comparing.
    platform = (
        str(getattr(msg, "user_platform", "") or getattr(getattr(msg, "user_info", None), "platform", "") or "")
        .strip()
        .lower()
    )
    user_id = (
        str(getattr(msg, "user_id", "") or getattr(getattr(msg, "user_info", None), "user_id", "") or "")
        .strip()
    )
    if not platform or not user_id:
        return False

    # Best-effort parse of the configured platform->account mapping; any
    # failure falls back to an empty mapping rather than raising.
    platform_accounts = {}
    try:
        platform_accounts = parse_platform_accounts(getattr(bot_config, "platforms", []) or [])
    except Exception:
        platform_accounts = {}

    # Explicit qq/telegram accounts take precedence over parsed entries.
    # NOTE(review): assumes parse_platform_accounts yields lower-case
    # platform keys matching the normalised `platform` above — confirm.
    bot_accounts: Dict[str, str] = {}
    qq_account = str(getattr(bot_config, "qq_account", "") or "").strip()
    if qq_account:
        bot_accounts["qq"] = qq_account
    telegram_account = str(getattr(bot_config, "telegram_account", "") or "").strip()
    if telegram_account:
        bot_accounts["telegram"] = telegram_account
    for plat, account in platform_accounts.items():
        if account and plat not in bot_accounts:
            bot_accounts[plat] = account

    bot_account = bot_accounts.get(platform)
    return bool(bot_account and user_id == bot_account)
def _has_adjacent_bot_message(messages: List[Any], center_index: int) -> bool:
    """Return True if the message directly before or after the target one was
    sent by the bot itself."""
    total = len(messages)
    return any(
        _is_bot_message(messages[idx])
        for idx in (center_index - 1, center_index + 1)
        if 0 <= idx < total
    )
def _init_prompt() -> None: def _init_prompt() -> None:
prompt_str = """ prompt_str = """
**聊天内容其中的SELF是你自己的发言** **聊天内容其中的{bot_name}的发言内容是你自己的发言[msg_id] 是消息ID**
{chat_str} {chat_str}
请从上面这段聊天内容中提取"可能是黑话"的候选项黑话/俚语/网络缩写/口头禅 请从上面这段聊天内容中提取"可能是黑话"的候选项黑话/俚语/网络缩写/口头禅
@ -62,9 +162,10 @@ def _init_prompt() -> None:
- 中文词语的缩写用几个汉字概括一个词汇或含义例如社死内卷 - 中文词语的缩写用几个汉字概括一个词汇或含义例如社死内卷
JSON 数组输出元素为对象严格按以下结构 JSON 数组输出元素为对象严格按以下结构
请你提取出可能的黑话最多10
[ [
{{"content": "词条", "raw_content": "包含该词条的完整对话上下文原文"}}, {{"content": "词条", "msg_id": "m12"}}, // msg_id 必须与上方聊天中展示的ID完全一致
{{"content": "词条2", "raw_content": "包含该词条的完整对话上下文原文"}} {{"content": "词条2", "msg_id": "m15"}}
] ]
现在请输出 现在请输出
@ -78,10 +179,10 @@ def _init_inference_prompts() -> None:
prompt1_str = """ prompt1_str = """
**词条内容** **词条内容**
{content} {content}
**词条出现的上下文raw_content其中的SELF是你自己的发言** **词条出现的上下文其中的{bot_name}的发言内容是你自己的发言**
{raw_content_list} {raw_content_list}
请根据以上词条内容和上下文推断这个词条的含义 请根据上下文推断"{content}"这个词条的含义
- 如果这是一个黑话俚语或网络用语请推断其含义 - 如果这是一个黑话俚语或网络用语请推断其含义
- 如果含义明确常规词汇也请说明 - 如果含义明确常规词汇也请说明
- 如果上下文信息不足无法推断含义请设置 no_info true - 如果上下文信息不足无法推断含义请设置 no_info true
@ -240,7 +341,7 @@ def _should_infer_meaning(jargon_obj: Jargon) -> bool:
last_inference = jargon_obj.last_inference_count or 0 last_inference = jargon_obj.last_inference_count or 0
# 阈值列表3,6, 10, 20, 40, 60, 100 # 阈值列表3,6, 10, 20, 40, 60, 100
thresholds = [3, 6, 10, 20, 40, 60, 100] thresholds = [2, 4, 8, 12, 24, 60, 100]
if count < thresholds[0]: if count < thresholds[0]:
return False return False
@ -281,6 +382,53 @@ class JargonMiner:
chat_manager = get_chat_manager() chat_manager = get_chat_manager()
stream_name = chat_manager.get_stream_name(self.chat_id) stream_name = chat_manager.get_stream_name(self.chat_id)
self.stream_name = stream_name if stream_name else self.chat_id self.stream_name = stream_name if stream_name else self.chat_id
self.cache_limit = 100
self.cache: OrderedDict[str, None] = OrderedDict()
def _add_to_cache(self, content: str) -> None:
    """Record an extracted jargon term in the LRU cache.

    Blank/whitespace-only terms are ignored.  Re-adding an existing term
    refreshes its recency; once the cache grows past ``self.cache_limit``
    the least recently used entry is evicted.
    """
    key = (content or "").strip()
    if not key:
        return
    if key in self.cache:
        # Already cached: just mark it as most recently used.
        self.cache.move_to_end(key)
        return
    self.cache[key] = None
    if len(self.cache) > self.cache_limit:
        # Drop the least recently used entry.
        self.cache.popitem(last=False)
def _collect_cached_entries(self, messages: List[Any]) -> List[Dict[str, List[str]]]:
    """Re-detect cached jargon terms inside the current message window.

    For every cached term that appears verbatim in a non-bot message, emit an
    entry ``{"content": term, "raw_content": [context]}`` where *context* is
    the surrounding conversation paragraph.  Matches adjacent to a bot
    message, or without usable context, are skipped.
    """
    if not messages or not self.cache:
        return []

    entries: List[Dict[str, List[str]]] = []
    seen: set = set()

    for idx, msg in enumerate(messages):
        text = (getattr(msg, "display_message", None) or getattr(msg, "processed_plain_text", None) or "").strip()
        if not text:
            continue
        if _is_bot_message(msg):
            continue

        for term in self.cache:
            if not term or (term, idx) in seen or term not in text:
                continue
            # Skip matches sitting right next to one of the bot's own
            # messages to avoid learning from self-talk.
            if _has_adjacent_bot_message(messages, idx):
                continue
            paragraph = _build_context_paragraph(messages, idx)
            if paragraph:
                entries.append({"content": term, "raw_content": [paragraph]})
                seen.add((term, idx))

    return entries
async def _infer_meaning_by_id(self, jargon_id: int) -> None: async def _infer_meaning_by_id(self, jargon_id: int) -> None:
"""通过ID加载对象并推断""" """通过ID加载对象并推断"""
@ -323,6 +471,7 @@ class JargonMiner:
prompt1 = await global_prompt_manager.format_prompt( prompt1 = await global_prompt_manager.format_prompt(
"jargon_inference_with_context_prompt", "jargon_inference_with_context_prompt",
content=content, content=content,
bot_name = global_config.bot.nickname,
raw_content_list=raw_content_text, raw_content_list=raw_content_text,
) )
@ -441,8 +590,8 @@ class JargonMiner:
# 是黑话使用推断1的结果基于上下文更准确 # 是黑话使用推断1的结果基于上下文更准确
jargon_obj.meaning = inference1.get("meaning", "") jargon_obj.meaning = inference1.get("meaning", "")
else: else:
# 不是黑话,也记录含义使用推断2的结果因为含义明确 # 不是黑话,清空含义,不再存储任何内容
jargon_obj.meaning = inference2.get("meaning", "") jargon_obj.meaning = ""
# 更新最后一次判定的count值避免重启后重复判定 # 更新最后一次判定的count值避免重启后重复判定
jargon_obj.last_inference_count = jargon_obj.count or 0 jargon_obj.last_inference_count = jargon_obj.count or 0
@ -511,12 +660,33 @@ class JargonMiner:
if not messages: if not messages:
return return
chat_str: str = await build_anonymous_messages(messages) # 按时间排序,确保编号与上下文一致
messages = sorted(messages, key=lambda msg: msg.time or 0)
chat_str, message_id_list = build_readable_messages_with_id(
messages=messages,
replace_bot_name=True,
timestamp_mode="relative",
truncate=False,
show_actions=False,
show_pic=True,
pic_single=True,
)
if not chat_str.strip(): if not chat_str.strip():
return return
msg_id_to_index: Dict[str, int] = {}
for idx, (msg_id, _msg) in enumerate(message_id_list or []):
if not msg_id:
continue
msg_id_to_index[msg_id] = idx
if not msg_id_to_index:
logger.warning("未能生成消息ID映射跳过本次提取")
return
prompt: str = await global_prompt_manager.format_prompt( prompt: str = await global_prompt_manager.format_prompt(
"extract_jargon_prompt", "extract_jargon_prompt",
bot_name=global_config.bot.nickname,
chat_str=chat_str, chat_str=chat_str,
) )
@ -551,25 +721,46 @@ class JargonMiner:
for item in parsed: for item in parsed:
if not isinstance(item, dict): if not isinstance(item, dict):
continue continue
content = str(item.get("content", "")).strip() content = str(item.get("content", "")).strip()
raw_content_value = item.get("raw_content", "") msg_id_value = item.get("msg_id")
# 处理raw_content可能是字符串或列表 if not content:
raw_content_list = [] continue
if isinstance(raw_content_value, list):
raw_content_list = [str(rc).strip() for rc in raw_content_value if str(rc).strip()]
# 去重
raw_content_list = list(dict.fromkeys(raw_content_list))
elif isinstance(raw_content_value, str):
raw_content_str = raw_content_value.strip()
if raw_content_str:
raw_content_list = [raw_content_str]
if content and raw_content_list: if _contains_bot_self_name(content):
if _contains_bot_self_name(content): logger.debug(f"解析阶段跳过包含机器人昵称/别名的词条: {content}")
logger.debug(f"解析阶段跳过包含机器人昵称/别名的词条: {content}") continue
continue
entries.append({"content": content, "raw_content": raw_content_list}) msg_id_str = str(msg_id_value or "").strip()
if not msg_id_str:
logger.warning(f"解析jargon失败msg_id缺失content={content}")
continue
msg_index = msg_id_to_index.get(msg_id_str)
if msg_index is None:
logger.warning(f"解析jargon失败msg_id未找到content={content}, msg_id={msg_id_str}")
continue
target_msg = messages[msg_index]
if _is_bot_message(target_msg):
logger.debug(f"解析阶段跳过引用机器人自身消息的词条: content={content}, msg_id={msg_id_str}")
continue
if _has_adjacent_bot_message(messages, msg_index):
logger.debug(
f"解析阶段跳过因邻近机器人发言的词条: content={content}, msg_id={msg_id_str}"
)
continue
context_paragraph = _build_context_paragraph(messages, msg_index)
if not context_paragraph:
logger.warning(f"解析jargon失败上下文为空content={content}, msg_id={msg_id_str}")
continue
entries.append({"content": content, "raw_content": [context_paragraph]})
cached_entries = self._collect_cached_entries(messages)
if cached_entries:
entries.extend(cached_entries)
except Exception as e: except Exception as e:
logger.error(f"解析jargon JSON失败: {e}; 原始: {response}") logger.error(f"解析jargon JSON失败: {e}; 原始: {response}")
return return
@ -577,15 +768,25 @@ class JargonMiner:
if not entries: if not entries:
return return
# 去重并写入DB按 chat_id + content 去重) # 去重并合并raw_content按 content 聚合)
# 使用content作为去重键 merged_entries: OrderedDict[str, Dict[str, List[str]]] = OrderedDict()
seen = set()
uniq_entries = []
for entry in entries: for entry in entries:
content_key = entry["content"] content_key = entry["content"]
if content_key not in seen: raw_list = entry.get("raw_content", []) or []
seen.add(content_key) if content_key in merged_entries:
uniq_entries.append(entry) merged_entries[content_key]["raw_content"].extend(raw_list)
else:
merged_entries[content_key] = {
"content": content_key,
"raw_content": list(raw_list),
}
uniq_entries = []
for merged_entry in merged_entries.values():
raw_content_list = merged_entry["raw_content"]
if raw_content_list:
merged_entry["raw_content"] = list(dict.fromkeys(raw_content_list))
uniq_entries.append(merged_entry)
saved = 0 saved = 0
updated = 0 updated = 0
@ -670,6 +871,8 @@ class JargonMiner:
except Exception as e: except Exception as e:
logger.error(f"保存jargon失败: chat_id={self.chat_id}, content={content}, err={e}") logger.error(f"保存jargon失败: chat_id={self.chat_id}, content={content}, err={e}")
continue continue
finally:
self._add_to_cache(content)
# 固定输出提取的jargon结果格式化为可读形式只要有提取结果就输出 # 固定输出提取的jargon结果格式化为可读形式只要有提取结果就输出
if uniq_entries: if uniq_entries: