mirror of https://github.com/Mai-with-u/MaiBot.git
feat:升级jargon,提取更快速,更精准
parent
a1dd26d578
commit
b684a95fbc
|
|
@ -22,6 +22,7 @@ MaiMBot-LPMM
|
||||||
*.zip
|
*.zip
|
||||||
run_bot.bat
|
run_bot.bat
|
||||||
run_na.bat
|
run_na.bat
|
||||||
|
run_all_in_wt.bat
|
||||||
run.bat
|
run.bat
|
||||||
log_debug/
|
log_debug/
|
||||||
run_amds.bat
|
run_amds.bat
|
||||||
|
|
|
||||||
|
|
@ -352,6 +352,7 @@ def _build_readable_messages_internal(
|
||||||
pic_counter: int = 1,
|
pic_counter: int = 1,
|
||||||
show_pic: bool = True,
|
show_pic: bool = True,
|
||||||
message_id_list: Optional[List[Tuple[str, DatabaseMessages]]] = None,
|
message_id_list: Optional[List[Tuple[str, DatabaseMessages]]] = None,
|
||||||
|
pic_single: bool = False,
|
||||||
) -> Tuple[str, List[Tuple[float, str, str]], Dict[str, str], int]:
|
) -> Tuple[str, List[Tuple[float, str, str]], Dict[str, str], int]:
|
||||||
# sourcery skip: use-getitem-for-re-match-groups
|
# sourcery skip: use-getitem-for-re-match-groups
|
||||||
"""
|
"""
|
||||||
|
|
@ -378,6 +379,7 @@ def _build_readable_messages_internal(
|
||||||
if pic_id_mapping is None:
|
if pic_id_mapping is None:
|
||||||
pic_id_mapping = {}
|
pic_id_mapping = {}
|
||||||
current_pic_counter = pic_counter
|
current_pic_counter = pic_counter
|
||||||
|
pic_description_cache: Dict[str, str] = {}
|
||||||
|
|
||||||
# 创建时间戳到消息ID的映射,用于在消息前添加[id]标识符
|
# 创建时间戳到消息ID的映射,用于在消息前添加[id]标识符
|
||||||
timestamp_to_id_mapping: Dict[float, str] = {}
|
timestamp_to_id_mapping: Dict[float, str] = {}
|
||||||
|
|
@ -400,6 +402,17 @@ def _build_readable_messages_internal(
|
||||||
nonlocal current_pic_counter
|
nonlocal current_pic_counter
|
||||||
nonlocal pic_counter
|
nonlocal pic_counter
|
||||||
pic_id = match.group(1)
|
pic_id = match.group(1)
|
||||||
|
if pic_single:
|
||||||
|
if pic_id not in pic_description_cache:
|
||||||
|
description = "内容正在阅读,请稍等"
|
||||||
|
try:
|
||||||
|
image = Images.get_or_none(Images.image_id == pic_id)
|
||||||
|
if image and image.description:
|
||||||
|
description = image.description
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
pic_description_cache[pic_id] = description
|
||||||
|
return f"[图片:{pic_description_cache[pic_id]}]"
|
||||||
if pic_id not in pic_id_mapping:
|
if pic_id not in pic_id_mapping:
|
||||||
pic_id_mapping[pic_id] = f"图片{current_pic_counter}"
|
pic_id_mapping[pic_id] = f"图片{current_pic_counter}"
|
||||||
current_pic_counter += 1
|
current_pic_counter += 1
|
||||||
|
|
@ -603,6 +616,7 @@ async def build_readable_messages_with_list(
|
||||||
replace_bot_name: bool = True,
|
replace_bot_name: bool = True,
|
||||||
timestamp_mode: str = "relative",
|
timestamp_mode: str = "relative",
|
||||||
truncate: bool = False,
|
truncate: bool = False,
|
||||||
|
pic_single: bool = False,
|
||||||
) -> Tuple[str, List[Tuple[float, str, str]]]:
|
) -> Tuple[str, List[Tuple[float, str, str]]]:
|
||||||
"""
|
"""
|
||||||
将消息列表转换为可读的文本格式,并返回原始(时间戳, 昵称, 内容)列表。
|
将消息列表转换为可读的文本格式,并返回原始(时间戳, 昵称, 内容)列表。
|
||||||
|
|
@ -613,10 +627,16 @@ async def build_readable_messages_with_list(
|
||||||
replace_bot_name,
|
replace_bot_name,
|
||||||
timestamp_mode,
|
timestamp_mode,
|
||||||
truncate,
|
truncate,
|
||||||
|
pic_id_mapping=None,
|
||||||
|
pic_counter=1,
|
||||||
|
show_pic=True,
|
||||||
|
message_id_list=None,
|
||||||
|
pic_single=pic_single,
|
||||||
)
|
)
|
||||||
|
|
||||||
if pic_mapping_info := build_pic_mapping_info(pic_id_mapping):
|
if not pic_single:
|
||||||
formatted_string = f"{pic_mapping_info}\n\n{formatted_string}"
|
if pic_mapping_info := build_pic_mapping_info(pic_id_mapping):
|
||||||
|
formatted_string = f"{pic_mapping_info}\n\n{formatted_string}"
|
||||||
|
|
||||||
return formatted_string, details_list
|
return formatted_string, details_list
|
||||||
|
|
||||||
|
|
@ -630,6 +650,7 @@ def build_readable_messages_with_id(
|
||||||
show_actions: bool = False,
|
show_actions: bool = False,
|
||||||
show_pic: bool = True,
|
show_pic: bool = True,
|
||||||
remove_emoji_stickers: bool = False,
|
remove_emoji_stickers: bool = False,
|
||||||
|
pic_single: bool = False,
|
||||||
) -> Tuple[str, List[Tuple[str, DatabaseMessages]]]:
|
) -> Tuple[str, List[Tuple[str, DatabaseMessages]]]:
|
||||||
"""
|
"""
|
||||||
将消息列表转换为可读的文本格式,并返回原始(时间戳, 昵称, 内容)列表。
|
将消息列表转换为可读的文本格式,并返回原始(时间戳, 昵称, 内容)列表。
|
||||||
|
|
@ -647,6 +668,7 @@ def build_readable_messages_with_id(
|
||||||
read_mark=read_mark,
|
read_mark=read_mark,
|
||||||
message_id_list=message_id_list,
|
message_id_list=message_id_list,
|
||||||
remove_emoji_stickers=remove_emoji_stickers,
|
remove_emoji_stickers=remove_emoji_stickers,
|
||||||
|
pic_single=pic_single,
|
||||||
)
|
)
|
||||||
|
|
||||||
return formatted_string, message_id_list
|
return formatted_string, message_id_list
|
||||||
|
|
@ -662,6 +684,7 @@ def build_readable_messages(
|
||||||
show_pic: bool = True,
|
show_pic: bool = True,
|
||||||
message_id_list: Optional[List[Tuple[str, DatabaseMessages]]] = None,
|
message_id_list: Optional[List[Tuple[str, DatabaseMessages]]] = None,
|
||||||
remove_emoji_stickers: bool = False,
|
remove_emoji_stickers: bool = False,
|
||||||
|
pic_single: bool = False,
|
||||||
) -> str: # sourcery skip: extract-method
|
) -> str: # sourcery skip: extract-method
|
||||||
"""
|
"""
|
||||||
将消息列表转换为可读的文本格式。
|
将消息列表转换为可读的文本格式。
|
||||||
|
|
@ -769,14 +792,14 @@ def build_readable_messages(
|
||||||
truncate,
|
truncate,
|
||||||
show_pic=show_pic,
|
show_pic=show_pic,
|
||||||
message_id_list=message_id_list,
|
message_id_list=message_id_list,
|
||||||
|
pic_single=pic_single,
|
||||||
)
|
)
|
||||||
|
|
||||||
# 生成图片映射信息并添加到最前面
|
if not pic_single:
|
||||||
pic_mapping_info = build_pic_mapping_info(pic_id_mapping)
|
pic_mapping_info = build_pic_mapping_info(pic_id_mapping)
|
||||||
if pic_mapping_info:
|
if pic_mapping_info:
|
||||||
return f"{pic_mapping_info}\n\n{formatted_string}"
|
return f"{pic_mapping_info}\n\n{formatted_string}"
|
||||||
else:
|
return formatted_string
|
||||||
return formatted_string
|
|
||||||
else:
|
else:
|
||||||
# 按 read_mark 分割消息
|
# 按 read_mark 分割消息
|
||||||
messages_before_mark = [msg for msg in copy_messages if (msg.time or 0) <= read_mark]
|
messages_before_mark = [msg for msg in copy_messages if (msg.time or 0) <= read_mark]
|
||||||
|
|
@ -796,6 +819,7 @@ def build_readable_messages(
|
||||||
pic_counter,
|
pic_counter,
|
||||||
show_pic=show_pic,
|
show_pic=show_pic,
|
||||||
message_id_list=message_id_list,
|
message_id_list=message_id_list,
|
||||||
|
pic_single=pic_single,
|
||||||
)
|
)
|
||||||
formatted_after, _, pic_id_mapping, _ = _build_readable_messages_internal(
|
formatted_after, _, pic_id_mapping, _ = _build_readable_messages_internal(
|
||||||
messages_after_mark,
|
messages_after_mark,
|
||||||
|
|
@ -806,15 +830,19 @@ def build_readable_messages(
|
||||||
pic_counter,
|
pic_counter,
|
||||||
show_pic=show_pic,
|
show_pic=show_pic,
|
||||||
message_id_list=message_id_list,
|
message_id_list=message_id_list,
|
||||||
|
pic_single=pic_single,
|
||||||
)
|
)
|
||||||
|
|
||||||
read_mark_line = "\n--- 以上消息是你已经看过,请关注以下未读的新消息---\n"
|
read_mark_line = "\n--- 以上消息是你已经看过,请关注以下未读的新消息---\n"
|
||||||
|
|
||||||
# 生成图片映射信息
|
# 生成图片映射信息
|
||||||
if pic_id_mapping:
|
if not pic_single:
|
||||||
pic_mapping_info = f"图片信息:\n{build_pic_mapping_info(pic_id_mapping)}\n聊天记录信息:\n"
|
if pic_id_mapping:
|
||||||
|
pic_mapping_info = f"图片信息:\n{build_pic_mapping_info(pic_id_mapping)}\n聊天记录信息:\n"
|
||||||
|
else:
|
||||||
|
pic_mapping_info = "聊天记录信息:\n"
|
||||||
else:
|
else:
|
||||||
pic_mapping_info = "聊天记录信息:\n"
|
pic_mapping_info = ""
|
||||||
|
|
||||||
# 组合结果
|
# 组合结果
|
||||||
result_parts = []
|
result_parts = []
|
||||||
|
|
@ -832,7 +860,7 @@ def build_readable_messages(
|
||||||
return "".join(result_parts)
|
return "".join(result_parts)
|
||||||
|
|
||||||
|
|
||||||
async def build_anonymous_messages(messages: List[DatabaseMessages]) -> str:
|
async def build_anonymous_messages(messages: List[DatabaseMessages], show_ids: bool = False) -> str:
|
||||||
"""
|
"""
|
||||||
构建匿名可读消息,将不同人的名称转为唯一占位符(A、B、C...),bot自己用SELF。
|
构建匿名可读消息,将不同人的名称转为唯一占位符(A、B、C...),bot自己用SELF。
|
||||||
处理 回复<aaa:bbb> 和 @<aaa:bbb> 字段,将bbb映射为匿名占位符。
|
处理 回复<aaa:bbb> 和 @<aaa:bbb> 字段,将bbb映射为匿名占位符。
|
||||||
|
|
@ -889,7 +917,7 @@ async def build_anonymous_messages(messages: List[DatabaseMessages]) -> str:
|
||||||
current_char += 1
|
current_char += 1
|
||||||
return person_map[person_id]
|
return person_map[person_id]
|
||||||
|
|
||||||
for msg in messages:
|
for i, msg in enumerate(messages):
|
||||||
try:
|
try:
|
||||||
platform = msg.chat_info.platform
|
platform = msg.chat_info.platform
|
||||||
user_id = msg.user_info.user_id
|
user_id = msg.user_info.user_id
|
||||||
|
|
@ -910,7 +938,12 @@ async def build_anonymous_messages(messages: List[DatabaseMessages]) -> str:
|
||||||
|
|
||||||
content = replace_user_references(content, platform, anon_name_resolver, replace_bot_name=False)
|
content = replace_user_references(content, platform, anon_name_resolver, replace_bot_name=False)
|
||||||
|
|
||||||
header = f"{anon_name}说 "
|
# 构建消息头,如果启用show_ids则添加序号
|
||||||
|
if show_ids:
|
||||||
|
header = f"[{i + 1}] {anon_name}说 "
|
||||||
|
else:
|
||||||
|
header = f"{anon_name}说 "
|
||||||
|
|
||||||
output_lines.append(header)
|
output_lines.append(header)
|
||||||
stripped_line = content.strip()
|
stripped_line = content.strip()
|
||||||
if stripped_line:
|
if stripped_line:
|
||||||
|
|
|
||||||
|
|
@ -330,8 +330,6 @@ class Jargon(BaseModel):
|
||||||
|
|
||||||
content = TextField()
|
content = TextField()
|
||||||
raw_content = TextField(null=True)
|
raw_content = TextField(null=True)
|
||||||
type = TextField(null=True)
|
|
||||||
translation = TextField(null=True)
|
|
||||||
meaning = TextField(null=True)
|
meaning = TextField(null=True)
|
||||||
chat_id = TextField(index=True)
|
chat_id = TextField(index=True)
|
||||||
is_global = BooleanField(default=False)
|
is_global = BooleanField(default=False)
|
||||||
|
|
|
||||||
|
|
@ -39,9 +39,19 @@ class ExpressionReflector:
|
||||||
|
|
||||||
# 检查是否在允许列表中
|
# 检查是否在允许列表中
|
||||||
allow_reflect = global_config.expression.allow_reflect
|
allow_reflect = global_config.expression.allow_reflect
|
||||||
if allow_reflect and self.chat_id not in allow_reflect:
|
if allow_reflect:
|
||||||
logger.info(f"[Expression Reflection] 当前聊天流 {self.chat_id} 不在允许列表中,跳过")
|
# 将 allow_reflect 中的 platform:id:type 格式转换为 chat_id 列表
|
||||||
return False
|
allow_reflect_chat_ids = []
|
||||||
|
for stream_config in allow_reflect:
|
||||||
|
parsed_chat_id = global_config.expression._parse_stream_config_to_chat_id(stream_config)
|
||||||
|
if parsed_chat_id:
|
||||||
|
allow_reflect_chat_ids.append(parsed_chat_id)
|
||||||
|
else:
|
||||||
|
logger.warning(f"[Expression Reflection] 无法解析 allow_reflect 配置项: {stream_config}")
|
||||||
|
|
||||||
|
if self.chat_id not in allow_reflect_chat_ids:
|
||||||
|
logger.info(f"[Expression Reflection] 当前聊天流 {self.chat_id} 不在允许列表中,跳过")
|
||||||
|
return False
|
||||||
|
|
||||||
# 检查上一次提问时间
|
# 检查上一次提问时间
|
||||||
current_time = time.time()
|
current_time = time.time()
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
import time
|
import time
|
||||||
import json
|
import json
|
||||||
import asyncio
|
import asyncio
|
||||||
|
from collections import OrderedDict
|
||||||
from typing import List, Dict, Optional, Any
|
from typing import List, Dict, Optional, Any
|
||||||
from json_repair import repair_json
|
from json_repair import repair_json
|
||||||
from peewee import fn
|
from peewee import fn
|
||||||
|
|
@ -12,12 +13,13 @@ from src.config.config import model_config, global_config
|
||||||
from src.chat.message_receive.chat_stream import get_chat_manager
|
from src.chat.message_receive.chat_stream import get_chat_manager
|
||||||
from src.plugin_system.apis import llm_api
|
from src.plugin_system.apis import llm_api
|
||||||
from src.chat.utils.chat_message_builder import (
|
from src.chat.utils.chat_message_builder import (
|
||||||
build_anonymous_messages,
|
build_readable_messages_with_id,
|
||||||
get_raw_msg_by_timestamp_with_chat_inclusive,
|
get_raw_msg_by_timestamp_with_chat_inclusive,
|
||||||
get_raw_msg_before_timestamp_with_chat,
|
get_raw_msg_before_timestamp_with_chat,
|
||||||
build_readable_messages_with_list,
|
build_readable_messages_with_list,
|
||||||
)
|
)
|
||||||
from src.chat.utils.prompt_builder import Prompt, global_prompt_manager
|
from src.chat.utils.prompt_builder import Prompt, global_prompt_manager
|
||||||
|
from src.chat.utils.utils import parse_platform_accounts
|
||||||
|
|
||||||
|
|
||||||
logger = get_logger("jargon")
|
logger = get_logger("jargon")
|
||||||
|
|
@ -43,9 +45,107 @@ def _contains_bot_self_name(content: str) -> bool:
|
||||||
return any(name in target for name in candidates if target)
|
return any(name in target for name in candidates if target)
|
||||||
|
|
||||||
|
|
||||||
|
def _format_context_message(msg: Any, seq_index: int) -> str:
|
||||||
|
"""
|
||||||
|
将单条消息格式化为带序号的上下文行
|
||||||
|
"""
|
||||||
|
if msg is None:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
text = (getattr(msg, "display_message", None) or getattr(msg, "processed_plain_text", None) or "").strip()
|
||||||
|
if not text:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
user_info = getattr(msg, "user_info", None)
|
||||||
|
nickname = ""
|
||||||
|
if user_info:
|
||||||
|
nickname = getattr(user_info, "user_nickname", "") or getattr(user_info, "user_id", "")
|
||||||
|
|
||||||
|
if not nickname:
|
||||||
|
nickname = getattr(msg, "user_nickname", "") or getattr(msg, "user_id", "") or "某人"
|
||||||
|
|
||||||
|
return f"{nickname}: {text}"
|
||||||
|
|
||||||
|
|
||||||
|
def _build_context_paragraph(messages: List[Any], center_index: int) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
构建包含中心消息上下文的段落(前3条+后3条)
|
||||||
|
"""
|
||||||
|
if not messages or center_index < 0 or center_index >= len(messages):
|
||||||
|
return None
|
||||||
|
|
||||||
|
context_start = max(0, center_index - 3)
|
||||||
|
context_end = min(len(messages), center_index + 1 + 3)
|
||||||
|
|
||||||
|
context_lines: List[str] = []
|
||||||
|
for idx in range(context_start, context_end):
|
||||||
|
formatted_line = _format_context_message(messages[idx], idx + 1)
|
||||||
|
if formatted_line:
|
||||||
|
context_lines.append(formatted_line)
|
||||||
|
|
||||||
|
if not context_lines:
|
||||||
|
return None
|
||||||
|
|
||||||
|
paragraph = "\n".join(context_lines).strip()
|
||||||
|
return paragraph or None
|
||||||
|
|
||||||
|
|
||||||
|
def _is_bot_message(msg: Any) -> bool:
|
||||||
|
"""判断消息是否来自机器人自身"""
|
||||||
|
if msg is None:
|
||||||
|
return False
|
||||||
|
|
||||||
|
bot_config = getattr(global_config, "bot", None)
|
||||||
|
if not bot_config:
|
||||||
|
return False
|
||||||
|
|
||||||
|
platform = (
|
||||||
|
str(getattr(msg, "user_platform", "") or getattr(getattr(msg, "user_info", None), "platform", "") or "")
|
||||||
|
.strip()
|
||||||
|
.lower()
|
||||||
|
)
|
||||||
|
user_id = (
|
||||||
|
str(getattr(msg, "user_id", "") or getattr(getattr(msg, "user_info", None), "user_id", "") or "")
|
||||||
|
.strip()
|
||||||
|
)
|
||||||
|
|
||||||
|
if not platform or not user_id:
|
||||||
|
return False
|
||||||
|
|
||||||
|
platform_accounts = {}
|
||||||
|
try:
|
||||||
|
platform_accounts = parse_platform_accounts(getattr(bot_config, "platforms", []) or [])
|
||||||
|
except Exception:
|
||||||
|
platform_accounts = {}
|
||||||
|
|
||||||
|
bot_accounts: Dict[str, str] = {}
|
||||||
|
qq_account = str(getattr(bot_config, "qq_account", "") or "").strip()
|
||||||
|
if qq_account:
|
||||||
|
bot_accounts["qq"] = qq_account
|
||||||
|
|
||||||
|
telegram_account = str(getattr(bot_config, "telegram_account", "") or "").strip()
|
||||||
|
if telegram_account:
|
||||||
|
bot_accounts["telegram"] = telegram_account
|
||||||
|
|
||||||
|
for plat, account in platform_accounts.items():
|
||||||
|
if account and plat not in bot_accounts:
|
||||||
|
bot_accounts[plat] = account
|
||||||
|
|
||||||
|
bot_account = bot_accounts.get(platform)
|
||||||
|
return bool(bot_account and user_id == bot_account)
|
||||||
|
|
||||||
|
|
||||||
|
def _has_adjacent_bot_message(messages: List[Any], center_index: int) -> bool:
|
||||||
|
"""检查目标消息的上一条或下一条是否为机器人发言"""
|
||||||
|
for neighbor in (center_index - 1, center_index + 1):
|
||||||
|
if 0 <= neighbor < len(messages) and _is_bot_message(messages[neighbor]):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
def _init_prompt() -> None:
|
def _init_prompt() -> None:
|
||||||
prompt_str = """
|
prompt_str = """
|
||||||
**聊天内容,其中的SELF是你自己的发言**
|
**聊天内容,其中的{bot_name}的发言内容是你自己的发言,[msg_id] 是消息ID**
|
||||||
{chat_str}
|
{chat_str}
|
||||||
|
|
||||||
请从上面这段聊天内容中提取"可能是黑话"的候选项(黑话/俚语/网络缩写/口头禅)。
|
请从上面这段聊天内容中提取"可能是黑话"的候选项(黑话/俚语/网络缩写/口头禅)。
|
||||||
|
|
@ -62,9 +162,10 @@ def _init_prompt() -> None:
|
||||||
- 中文词语的缩写,用几个汉字概括一个词汇或含义,例如:社死、内卷
|
- 中文词语的缩写,用几个汉字概括一个词汇或含义,例如:社死、内卷
|
||||||
|
|
||||||
以 JSON 数组输出,元素为对象(严格按以下结构):
|
以 JSON 数组输出,元素为对象(严格按以下结构):
|
||||||
|
请你提取出可能的黑话,最多10
|
||||||
[
|
[
|
||||||
{{"content": "词条", "raw_content": "包含该词条的完整对话上下文原文"}},
|
{{"content": "词条", "msg_id": "m12"}}, // msg_id 必须与上方聊天中展示的ID完全一致
|
||||||
{{"content": "词条2", "raw_content": "包含该词条的完整对话上下文原文"}}
|
{{"content": "词条2", "msg_id": "m15"}}
|
||||||
]
|
]
|
||||||
|
|
||||||
现在请输出:
|
现在请输出:
|
||||||
|
|
@ -78,10 +179,10 @@ def _init_inference_prompts() -> None:
|
||||||
prompt1_str = """
|
prompt1_str = """
|
||||||
**词条内容**
|
**词条内容**
|
||||||
{content}
|
{content}
|
||||||
**词条出现的上下文(raw_content)其中的SELF是你自己的发言**
|
**词条出现的上下文。其中的{bot_name}的发言内容是你自己的发言**
|
||||||
{raw_content_list}
|
{raw_content_list}
|
||||||
|
|
||||||
请根据以上词条内容和上下文,推断这个词条的含义。
|
请根据上下文,推断"{content}"这个词条的含义。
|
||||||
- 如果这是一个黑话、俚语或网络用语,请推断其含义
|
- 如果这是一个黑话、俚语或网络用语,请推断其含义
|
||||||
- 如果含义明确(常规词汇),也请说明
|
- 如果含义明确(常规词汇),也请说明
|
||||||
- 如果上下文信息不足,无法推断含义,请设置 no_info 为 true
|
- 如果上下文信息不足,无法推断含义,请设置 no_info 为 true
|
||||||
|
|
@ -240,7 +341,7 @@ def _should_infer_meaning(jargon_obj: Jargon) -> bool:
|
||||||
last_inference = jargon_obj.last_inference_count or 0
|
last_inference = jargon_obj.last_inference_count or 0
|
||||||
|
|
||||||
# 阈值列表:3,6, 10, 20, 40, 60, 100
|
# 阈值列表:2, 4, 8, 12, 24, 60, 100
|
||||||
thresholds = [3, 6, 10, 20, 40, 60, 100]
|
thresholds = [2, 4, 8, 12, 24, 60, 100]
|
||||||
|
|
||||||
if count < thresholds[0]:
|
if count < thresholds[0]:
|
||||||
return False
|
return False
|
||||||
|
|
@ -281,6 +382,53 @@ class JargonMiner:
|
||||||
chat_manager = get_chat_manager()
|
chat_manager = get_chat_manager()
|
||||||
stream_name = chat_manager.get_stream_name(self.chat_id)
|
stream_name = chat_manager.get_stream_name(self.chat_id)
|
||||||
self.stream_name = stream_name if stream_name else self.chat_id
|
self.stream_name = stream_name if stream_name else self.chat_id
|
||||||
|
self.cache_limit = 100
|
||||||
|
self.cache: OrderedDict[str, None] = OrderedDict()
|
||||||
|
|
||||||
|
def _add_to_cache(self, content: str) -> None:
|
||||||
|
"""将提取到的黑话加入缓存,保持LRU语义"""
|
||||||
|
if not content:
|
||||||
|
return
|
||||||
|
|
||||||
|
key = content.strip()
|
||||||
|
if not key:
|
||||||
|
return
|
||||||
|
|
||||||
|
if key in self.cache:
|
||||||
|
self.cache.move_to_end(key)
|
||||||
|
else:
|
||||||
|
self.cache[key] = None
|
||||||
|
if len(self.cache) > self.cache_limit:
|
||||||
|
self.cache.popitem(last=False)
|
||||||
|
|
||||||
|
def _collect_cached_entries(self, messages: List[Any]) -> List[Dict[str, List[str]]]:
|
||||||
|
"""检查缓存中的黑话是否出现在当前消息窗口,生成对应上下文"""
|
||||||
|
if not self.cache or not messages:
|
||||||
|
return []
|
||||||
|
|
||||||
|
cached_entries: List[Dict[str, List[str]]] = []
|
||||||
|
processed_pairs = set()
|
||||||
|
|
||||||
|
for idx, msg in enumerate(messages):
|
||||||
|
msg_text = (getattr(msg, "display_message", None) or getattr(msg, "processed_plain_text", None) or "").strip()
|
||||||
|
if not msg_text or _is_bot_message(msg):
|
||||||
|
continue
|
||||||
|
|
||||||
|
for content in self.cache.keys():
|
||||||
|
if not content:
|
||||||
|
continue
|
||||||
|
if (content, idx) in processed_pairs:
|
||||||
|
continue
|
||||||
|
if content in msg_text:
|
||||||
|
if _has_adjacent_bot_message(messages, idx):
|
||||||
|
continue
|
||||||
|
paragraph = _build_context_paragraph(messages, idx)
|
||||||
|
if not paragraph:
|
||||||
|
continue
|
||||||
|
cached_entries.append({"content": content, "raw_content": [paragraph]})
|
||||||
|
processed_pairs.add((content, idx))
|
||||||
|
|
||||||
|
return cached_entries
|
||||||
|
|
||||||
async def _infer_meaning_by_id(self, jargon_id: int) -> None:
|
async def _infer_meaning_by_id(self, jargon_id: int) -> None:
|
||||||
"""通过ID加载对象并推断"""
|
"""通过ID加载对象并推断"""
|
||||||
|
|
@ -323,6 +471,7 @@ class JargonMiner:
|
||||||
prompt1 = await global_prompt_manager.format_prompt(
|
prompt1 = await global_prompt_manager.format_prompt(
|
||||||
"jargon_inference_with_context_prompt",
|
"jargon_inference_with_context_prompt",
|
||||||
content=content,
|
content=content,
|
||||||
|
bot_name = global_config.bot.nickname,
|
||||||
raw_content_list=raw_content_text,
|
raw_content_list=raw_content_text,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -441,8 +590,8 @@ class JargonMiner:
|
||||||
# 是黑话,使用推断1的结果(基于上下文,更准确)
|
# 是黑话,使用推断1的结果(基于上下文,更准确)
|
||||||
jargon_obj.meaning = inference1.get("meaning", "")
|
jargon_obj.meaning = inference1.get("meaning", "")
|
||||||
else:
|
else:
|
||||||
# 不是黑话,也记录含义(使用推断2的结果,因为含义明确)
|
# 不是黑话,清空含义,不再存储任何内容
|
||||||
jargon_obj.meaning = inference2.get("meaning", "")
|
jargon_obj.meaning = ""
|
||||||
|
|
||||||
# 更新最后一次判定的count值,避免重启后重复判定
|
# 更新最后一次判定的count值,避免重启后重复判定
|
||||||
jargon_obj.last_inference_count = jargon_obj.count or 0
|
jargon_obj.last_inference_count = jargon_obj.count or 0
|
||||||
|
|
@ -511,12 +660,33 @@ class JargonMiner:
|
||||||
if not messages:
|
if not messages:
|
||||||
return
|
return
|
||||||
|
|
||||||
chat_str: str = await build_anonymous_messages(messages)
|
# 按时间排序,确保编号与上下文一致
|
||||||
|
messages = sorted(messages, key=lambda msg: msg.time or 0)
|
||||||
|
|
||||||
|
chat_str, message_id_list = build_readable_messages_with_id(
|
||||||
|
messages=messages,
|
||||||
|
replace_bot_name=True,
|
||||||
|
timestamp_mode="relative",
|
||||||
|
truncate=False,
|
||||||
|
show_actions=False,
|
||||||
|
show_pic=True,
|
||||||
|
pic_single=True,
|
||||||
|
)
|
||||||
if not chat_str.strip():
|
if not chat_str.strip():
|
||||||
return
|
return
|
||||||
|
|
||||||
|
msg_id_to_index: Dict[str, int] = {}
|
||||||
|
for idx, (msg_id, _msg) in enumerate(message_id_list or []):
|
||||||
|
if not msg_id:
|
||||||
|
continue
|
||||||
|
msg_id_to_index[msg_id] = idx
|
||||||
|
if not msg_id_to_index:
|
||||||
|
logger.warning("未能生成消息ID映射,跳过本次提取")
|
||||||
|
return
|
||||||
|
|
||||||
prompt: str = await global_prompt_manager.format_prompt(
|
prompt: str = await global_prompt_manager.format_prompt(
|
||||||
"extract_jargon_prompt",
|
"extract_jargon_prompt",
|
||||||
|
bot_name=global_config.bot.nickname,
|
||||||
chat_str=chat_str,
|
chat_str=chat_str,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -551,25 +721,46 @@ class JargonMiner:
|
||||||
for item in parsed:
|
for item in parsed:
|
||||||
if not isinstance(item, dict):
|
if not isinstance(item, dict):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
content = str(item.get("content", "")).strip()
|
content = str(item.get("content", "")).strip()
|
||||||
raw_content_value = item.get("raw_content", "")
|
msg_id_value = item.get("msg_id")
|
||||||
|
|
||||||
# 处理raw_content:可能是字符串或列表
|
if not content:
|
||||||
raw_content_list = []
|
continue
|
||||||
if isinstance(raw_content_value, list):
|
|
||||||
raw_content_list = [str(rc).strip() for rc in raw_content_value if str(rc).strip()]
|
|
||||||
# 去重
|
|
||||||
raw_content_list = list(dict.fromkeys(raw_content_list))
|
|
||||||
elif isinstance(raw_content_value, str):
|
|
||||||
raw_content_str = raw_content_value.strip()
|
|
||||||
if raw_content_str:
|
|
||||||
raw_content_list = [raw_content_str]
|
|
||||||
|
|
||||||
if content and raw_content_list:
|
if _contains_bot_self_name(content):
|
||||||
if _contains_bot_self_name(content):
|
logger.debug(f"解析阶段跳过包含机器人昵称/别名的词条: {content}")
|
||||||
logger.debug(f"解析阶段跳过包含机器人昵称/别名的词条: {content}")
|
continue
|
||||||
continue
|
|
||||||
entries.append({"content": content, "raw_content": raw_content_list})
|
msg_id_str = str(msg_id_value or "").strip()
|
||||||
|
if not msg_id_str:
|
||||||
|
logger.warning(f"解析jargon失败:msg_id缺失,content={content}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
msg_index = msg_id_to_index.get(msg_id_str)
|
||||||
|
if msg_index is None:
|
||||||
|
logger.warning(f"解析jargon失败:msg_id未找到,content={content}, msg_id={msg_id_str}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
target_msg = messages[msg_index]
|
||||||
|
if _is_bot_message(target_msg):
|
||||||
|
logger.debug(f"解析阶段跳过引用机器人自身消息的词条: content={content}, msg_id={msg_id_str}")
|
||||||
|
continue
|
||||||
|
if _has_adjacent_bot_message(messages, msg_index):
|
||||||
|
logger.debug(
|
||||||
|
f"解析阶段跳过因邻近机器人发言的词条: content={content}, msg_id={msg_id_str}"
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
context_paragraph = _build_context_paragraph(messages, msg_index)
|
||||||
|
if not context_paragraph:
|
||||||
|
logger.warning(f"解析jargon失败:上下文为空,content={content}, msg_id={msg_id_str}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
entries.append({"content": content, "raw_content": [context_paragraph]})
|
||||||
|
cached_entries = self._collect_cached_entries(messages)
|
||||||
|
if cached_entries:
|
||||||
|
entries.extend(cached_entries)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"解析jargon JSON失败: {e}; 原始: {response}")
|
logger.error(f"解析jargon JSON失败: {e}; 原始: {response}")
|
||||||
return
|
return
|
||||||
|
|
@ -577,15 +768,25 @@ class JargonMiner:
|
||||||
if not entries:
|
if not entries:
|
||||||
return
|
return
|
||||||
|
|
||||||
# 去重并写入DB(按 chat_id + content 去重)
|
# 去重并合并raw_content(按 content 聚合)
|
||||||
# 使用content作为去重键
|
merged_entries: OrderedDict[str, Dict[str, List[str]]] = OrderedDict()
|
||||||
seen = set()
|
|
||||||
uniq_entries = []
|
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
content_key = entry["content"]
|
content_key = entry["content"]
|
||||||
if content_key not in seen:
|
raw_list = entry.get("raw_content", []) or []
|
||||||
seen.add(content_key)
|
if content_key in merged_entries:
|
||||||
uniq_entries.append(entry)
|
merged_entries[content_key]["raw_content"].extend(raw_list)
|
||||||
|
else:
|
||||||
|
merged_entries[content_key] = {
|
||||||
|
"content": content_key,
|
||||||
|
"raw_content": list(raw_list),
|
||||||
|
}
|
||||||
|
|
||||||
|
uniq_entries = []
|
||||||
|
for merged_entry in merged_entries.values():
|
||||||
|
raw_content_list = merged_entry["raw_content"]
|
||||||
|
if raw_content_list:
|
||||||
|
merged_entry["raw_content"] = list(dict.fromkeys(raw_content_list))
|
||||||
|
uniq_entries.append(merged_entry)
|
||||||
|
|
||||||
saved = 0
|
saved = 0
|
||||||
updated = 0
|
updated = 0
|
||||||
|
|
@ -670,6 +871,8 @@ class JargonMiner:
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"保存jargon失败: chat_id={self.chat_id}, content={content}, err={e}")
|
logger.error(f"保存jargon失败: chat_id={self.chat_id}, content={content}, err={e}")
|
||||||
continue
|
continue
|
||||||
|
finally:
|
||||||
|
self._add_to_cache(content)
|
||||||
|
|
||||||
# 固定输出提取的jargon结果,格式化为可读形式(只要有提取结果就输出)
|
# 固定输出提取的jargon结果,格式化为可读形式(只要有提取结果就输出)
|
||||||
if uniq_entries:
|
if uniq_entries:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue