fix:修复黑话提取的学习缓存

pull/1467/head
SengokuCola 2025-12-27 19:43:46 +08:00
parent a6e1a0e7d3
commit 8e7a2aecae
6 changed files with 103 additions and 313 deletions

View File

@ -92,7 +92,7 @@ class ExpressionLearner:
model_set=model_config.model_task_config.utils, request_type="expression.learner"
)
self.summary_model: LLMRequest = LLMRequest(
model_set=model_config.model_task_config.utils, request_type="expression.summary"
model_set=model_config.model_task_config.tool_use, request_type="expression.summary"
)
self.check_model: Optional[LLMRequest] = None # 检查用的 LLM 实例,延迟初始化
self.chat_id = chat_id
@ -142,6 +142,17 @@ class ExpressionLearner:
jargon_entries: List[Tuple[str, str]] # (content, source_id)
expressions, jargon_entries = parse_expression_response(response)
# 从缓存中检查 jargon 是否出现在 messages 中
cached_jargon_entries = self._check_cached_jargons_in_messages(random_msg)
if cached_jargon_entries:
# 合并缓存中的 jargon 条目(去重:如果 content 已存在则跳过)
existing_contents = {content for content, _ in jargon_entries}
for content, source_id in cached_jargon_entries:
if content not in existing_contents:
jargon_entries.append((content, source_id))
existing_contents.add(content)
logger.info(f"从缓存中检查到黑话: {content}")
# Abandon this round of expression learning when more than 20 expressions were extracted (the check below uses 20, not 10)
if len(expressions) > 20:
logger.info(f"表达方式提取数量超过10个实际{len(expressions)}个),放弃本次表达学习")
@ -483,6 +494,68 @@ class ExpressionLearner:
logger.error(f"立即检查表达方式失败 [ID: {expr_obj.id}]: {e}", exc_info=True)
# 检查失败时,保持 checked=False等待后续自动检查任务处理
def _check_cached_jargons_in_messages(self, messages: List[Any]) -> List[Tuple[str, str]]:
"""
检查缓存中的 jargon 是否出现在 messages
Args:
messages: 消息列表
Returns:
List[Tuple[str, str]]: 匹配到的黑话条目列表每个元素是 (content, source_id)
"""
if not messages:
return []
# 获取 jargon_miner 实例
jargon_miner = miner_manager.get_miner(self.chat_id)
# 获取缓存中的所有 jargon
cached_jargons = jargon_miner.get_cached_jargons()
if not cached_jargons:
return []
matched_entries: List[Tuple[str, str]] = []
# 遍历 messages检查缓存中的 jargon 是否出现
for i, msg in enumerate(messages):
# 跳过机器人自己的消息
if is_bot_message(msg):
continue
# 获取消息文本
msg_text = (
getattr(msg, "processed_plain_text", None) or
""
).strip()
if not msg_text:
continue
# 检查每个缓存中的 jargon 是否出现在消息文本中
for jargon in cached_jargons:
if not jargon or not jargon.strip():
continue
jargon_content = jargon.strip()
# 使用正则匹配,考虑单词边界(类似 jargon_explainer 中的逻辑)
pattern = re.escape(jargon_content)
# 对于中文,使用更宽松的匹配;对于英文/数字,使用单词边界
if re.search(r"[\u4e00-\u9fff]", jargon_content):
# 包含中文,使用更宽松的匹配
search_pattern = pattern
else:
# 纯英文/数字,使用单词边界
search_pattern = r"\b" + pattern + r"\b"
if re.search(search_pattern, msg_text, re.IGNORECASE):
# 找到匹配构建条目source_id 从 1 开始,因为 build_anonymous_messages 的编号从 1 开始)
source_id = str(i + 1)
matched_entries.append((jargon_content, source_id))
return matched_entries
async def _process_jargon_entries(self, jargon_entries: List[Tuple[str, str]], messages: List[Any]) -> None:
"""
处理从 expression learner 提取的黑话条目路由到 jargon_miner

View File

@ -45,7 +45,7 @@ class JargonExplainer:
def __init__(self, chat_id: str) -> None:
self.chat_id = chat_id
self.llm = LLMRequest(
model_set=model_config.model_task_config.utils,
model_set=model_config.model_task_config.tool_use,
request_type="jargon.explain",
)

View File

@ -51,32 +51,32 @@ def _is_single_char_jargon(content: str) -> bool:
)
def _init_prompt() -> None:
prompt_str = """
**聊天内容其中的{bot_name}的发言内容是你自己的发言[msg_id] 是消息ID**
{chat_str}
# def _init_prompt() -> None:
# prompt_str = """
# **聊天内容,其中的{bot_name}的发言内容是你自己的发言,[msg_id] 是消息ID**
# {chat_str}
请从上面这段聊天内容中提取"可能是黑话"的候选项黑话/俚语/网络缩写/口头禅
- 必须为对话中真实出现过的短词或短语
- 必须是你无法理解含义的词语没有明确含义的词语请不要选择有明确含义或者含义清晰的词语
- 排除人名@表情包/图片中的内容纯标点常规功能词如的啊等
- 每个词条长度建议 2-8 个字符不强制尽量短小
# 请从上面这段聊天内容中提取"可能是黑话"的候选项(黑话/俚语/网络缩写/口头禅)
# - 必须为对话中真实出现过的短词或短语
# - 必须是你无法理解含义的词语,没有明确含义的词语,请不要选择有明确含义,或者含义清晰的词语
# - 排除:人名、@、表情包/图片中的内容、纯标点、常规功能词(如的、了、呢、啊等
# - 每个词条长度建议 2-8 个字符(不强制),尽量短小
黑话必须为以下几种类型
- 由字母构成的汉语拼音首字母的简写词例如nbyydsxswl
- 英文词语的缩写用英文字母概括一个词汇或含义例如CPUGPUAPI
- 中文词语的缩写用几个汉字概括一个词汇或含义例如社死内卷
# 黑话必须为以下几种类型
# - 由字母构成的汉语拼音首字母的简写词例如nb、yyds、xswl
# - 英文词语的缩写用英文字母概括一个词汇或含义例如CPU、GPU、API
# - 中文词语的缩写,用几个汉字概括一个词汇或含义,例如:社死、内卷
JSON 数组输出元素为对象严格按以下结构
请你提取出可能的黑话最多30个黑话请尽量提取所有
[
{{"content": "词条", "msg_id": "m12"}}, // msg_id 必须与上方聊天中展示的ID完全一致
{{"content": "词条2", "msg_id": "m15"}}
]
# 以 JSON 数组输出,元素为对象(严格按以下结构)
# 请你提取出可能的黑话最多30个黑话请尽量提取所有
# [
# {{"content": "词条", "msg_id": "m12"}}, // msg_id 必须与上方聊天中展示的ID完全一致
# {{"content": "词条2", "msg_id": "m15"}}
# ]
现在请输出
"""
Prompt(prompt_str, "extract_jargon_prompt")
# 现在请输出
# """
# Prompt(prompt_str, "extract_jargon_prompt")
def _init_inference_prompts() -> None:
@ -142,7 +142,6 @@ def _init_inference_prompts() -> None:
Prompt(prompt3_str, "jargon_compare_inference_prompt")
_init_prompt()
_init_inference_prompts()
@ -229,34 +228,9 @@ class JargonMiner:
if len(self.cache) > self.cache_limit:
self.cache.popitem(last=False)
def _collect_cached_entries(self, messages: List[Any]) -> List[Dict[str, List[str]]]:
"""检查缓存中的黑话是否出现在当前消息窗口,生成对应上下文"""
if not self.cache or not messages:
return []
cached_entries: List[Dict[str, List[str]]] = []
processed_pairs = set()
for idx, msg in enumerate(messages):
msg_text = (
getattr(msg, "display_message", None) or getattr(msg, "processed_plain_text", None) or ""
).strip()
if not msg_text or is_bot_message(msg):
continue
for content in self.cache.keys():
if not content:
continue
if (content, idx) in processed_pairs:
continue
if content in msg_text:
paragraph = build_context_paragraph(messages, idx)
if not paragraph:
continue
cached_entries.append({"content": content, "raw_content": [paragraph]})
processed_pairs.add((content, idx))
return cached_entries
def get_cached_jargons(self) -> List[str]:
    """Return a new list holding the keys of ``self.cache`` in iteration order."""
    jargon_terms = [*self.cache.keys()]
    return jargon_terms
async def _infer_meaning_by_id(self, jargon_id: int) -> None:
"""通过ID加载对象并推断"""
@ -480,263 +454,6 @@ class JargonMiner:
traceback.print_exc()
async def run_once(
self,
messages: List[Any],
person_name_filter: Optional[Callable[[str], bool]] = None
) -> None:
"""
Run one jargon-extraction pass over ``messages``.

The messages are rendered with IDs, sent to the LLM through the
``extract_jargon_prompt`` prompt, and the JSON answer is parsed into
candidate entries. Cached terms found in the same messages are appended,
entries are merged per term, and each term is then updated in or inserted
into the ``Jargon`` table. Every processed term is added to the cache.

Args:
messages: Message list supplied by the caller (required); an empty
list makes the call a no-op.
person_name_filter: Optional predicate; a candidate term is dropped
when it returns a truthy value for that term.
"""
# Hold the async lock so that extraction passes never run concurrently
async with self._extraction_lock:
try:
if not messages:
return
# Sort by time so that numbering stays consistent with the context
messages = sorted(messages, key=lambda msg: msg.time or 0)
chat_str, message_id_list = build_readable_messages_with_id(
messages=messages,
replace_bot_name=True,
timestamp_mode="relative",
truncate=False,
show_actions=False,
show_pic=True,
pic_single=True,
)
if not chat_str.strip():
return
# Map every displayed message ID to its position in message_id_list.
# NOTE(review): this position is later used to index `messages`, which
# assumes message_id_list follows the same order as `messages` - confirm.
msg_id_to_index: Dict[str, int] = {}
for idx, (msg_id, _msg) in enumerate(message_id_list or []):
if not msg_id:
continue
msg_id_to_index[msg_id] = idx
if not msg_id_to_index:
logger.warning("未能生成消息ID映射跳过本次提取")
return
prompt: str = await global_prompt_manager.format_prompt(
"extract_jargon_prompt",
bot_name=global_config.bot.nickname,
chat_str=chat_str,
)
response, _ = await self.llm.generate_response_async(prompt, temperature=0.2)
if not response:
return
# Prompt and response are only logged when the debug switch is on
if global_config.debug.show_jargon_prompt:
logger.info(f"jargon提取提示词: {prompt}")
logger.info(f"jargon提取结果: {response}")
# Parse the response as JSON
entries: List[dict] = []
try:
resp = response.strip()
parsed = None
# A response that already looks like a JSON array is parsed directly;
# anything else goes through repair_json first
if resp.startswith("[") and resp.endswith("]"):
parsed = json.loads(resp)
else:
repaired = repair_json(resp)
if isinstance(repaired, str):
parsed = json.loads(repaired)
else:
parsed = repaired
# A single object is treated as a one-element list
if isinstance(parsed, dict):
parsed = [parsed]
if not isinstance(parsed, list):
return
for item in parsed:
if not isinstance(item, dict):
continue
content = str(item.get("content", "")).strip()
msg_id_value = item.get("msg_id")
if not content:
continue
if contains_bot_self_name(content):
logger.info(f"解析阶段跳过包含机器人昵称/别名的词条: {content}")
continue
# Check whether the term contains a person name
if person_name_filter and person_name_filter(content):
logger.info(f"解析阶段跳过包含人物名称的词条: {content}")
continue
msg_id_str = str(msg_id_value or "").strip()
if not msg_id_str:
logger.warning(f"解析jargon失败msg_id缺失content={content}")
continue
msg_index = msg_id_to_index.get(msg_id_str)
if msg_index is None:
logger.warning(f"解析jargon失败msg_id未找到content={content}, msg_id={msg_id_str}")
continue
target_msg = messages[msg_index]
# Terms that point at one of the bot's own messages are ignored
if is_bot_message(target_msg):
logger.info(f"解析阶段跳过引用机器人自身消息的词条: content={content}, msg_id={msg_id_str}")
continue
context_paragraph = build_context_paragraph(messages, msg_index)
if not context_paragraph:
logger.warning(f"解析jargon失败上下文为空content={content}, msg_id={msg_id_str}")
continue
entries.append({"content": content, "raw_content": [context_paragraph]})
# Also pick up terms already in the cache that occur in these messages
cached_entries = self._collect_cached_entries(messages)
if cached_entries:
entries.extend(cached_entries)
except Exception as e:
logger.error(f"解析jargon JSON失败: {e}; 原始: {response}")
return
if not entries:
return
# De-duplicate and merge raw_content (grouped by content)
merged_entries: OrderedDict[str, Dict[str, List[str]]] = OrderedDict()
for entry in entries:
content_key = entry["content"]
raw_list = entry.get("raw_content", []) or []
if content_key in merged_entries:
merged_entries[content_key]["raw_content"].extend(raw_list)
else:
merged_entries[content_key] = {
"content": content_key,
"raw_content": list(raw_list),
}
uniq_entries = []
for merged_entry in merged_entries.values():
raw_content_list = merged_entry["raw_content"]
if raw_content_list:
# dict.fromkeys drops duplicate paragraphs while keeping their order
merged_entry["raw_content"] = list(dict.fromkeys(raw_content_list))
uniq_entries.append(merged_entry)
saved = 0
updated = 0
for entry in uniq_entries:
content = entry["content"]
raw_content_list = entry["raw_content"] # already a list
try:
# Query all records whose content matches
query = Jargon.select().where(Jargon.content == content)
# Find the matching record
matched_obj = None
for obj in query:
if global_config.expression.all_global_jargon:
# all_global enabled: any record with matching content qualifies
matched_obj = obj
break
else:
# all_global disabled: the record's chat_id list must contain this chat_id
chat_id_list = parse_chat_id_list(obj.chat_id)
if chat_id_list_contains(chat_id_list, self.chat_id):
matched_obj = obj
break
if matched_obj:
obj = matched_obj
try:
obj.count = (obj.count or 0) + 1
except Exception:
obj.count = 1
# Merge the raw_content lists: read the stored list, append the new values, de-duplicate
existing_raw_content = []
if obj.raw_content:
try:
existing_raw_content = (
json.loads(obj.raw_content)
if isinstance(obj.raw_content, str)
else obj.raw_content
)
if not isinstance(existing_raw_content, list):
existing_raw_content = [existing_raw_content] if existing_raw_content else []
except (json.JSONDecodeError, TypeError):
existing_raw_content = [obj.raw_content] if obj.raw_content else []
# Merge and de-duplicate
merged_list = list(dict.fromkeys(existing_raw_content + raw_content_list))
obj.raw_content = json.dumps(merged_list, ensure_ascii=False)
# Update the chat_id list, incrementing the counter of the current chat_id
chat_id_list = parse_chat_id_list(obj.chat_id)
updated_chat_id_list = update_chat_id_list(chat_id_list, self.chat_id, increment=1)
obj.chat_id = json.dumps(updated_chat_id_list, ensure_ascii=False)
# all_global enabled: make sure the record is flagged is_global=True
if global_config.expression.all_global_jargon:
obj.is_global = True
# all_global disabled: leave the existing is_global value untouched
obj.save()
# Check whether inference is due (threshold reached and above the last judged value)
if _should_infer_meaning(obj):
# Trigger inference asynchronously without blocking the main flow
# Only the ID is handed over so the task works from a freshly loaded record
jargon_id = obj.id
# NOTE(review): the task returned by create_task is not stored anywhere
asyncio.create_task(self._infer_meaning_by_id(jargon_id))
updated += 1
else:
# No matching record found: create a new one
if global_config.expression.all_global_jargon:
# all_global enabled: new records default to is_global=True
is_global_new = True
else:
# all_global disabled: new records get is_global=False
is_global_new = False
# Create the chat_id list in the new format: [[chat_id, count]]
chat_id_list = [[self.chat_id, 1]]
chat_id_json = json.dumps(chat_id_list, ensure_ascii=False)
Jargon.create(
content=content,
raw_content=json.dumps(raw_content_list, ensure_ascii=False),
chat_id=chat_id_json,
is_global=is_global_new,
count=1,
)
saved += 1
except Exception as e:
logger.error(f"保存jargon失败: chat_id={self.chat_id}, content={content}, err={e}")
continue
finally:
# Runs on success and on failure alike, so the term is cached either way
self._add_to_cache(content)
# Always report the extracted jargon in readable form whenever there are results
if uniq_entries:
# Collect the content of every extracted jargon entry
jargon_list = [entry["content"] for entry in uniq_entries]
jargon_str = ",".join(jargon_list)
# Emit the formatted result through logger.info
logger.info(f"[{self.stream_name}]疑似黑话: {jargon_str}")
if saved or updated:
logger.info(f"jargon写入: 新增 {saved} 条,更新 {updated}chat_id={self.chat_id}")
except Exception as e:
logger.error(f"JargonMiner 运行失败: {e}")
# Original note: keep the timestamp updated even on failure to avoid frequent retries
# NOTE(review): no timestamp update is visible in this method - confirm where it happens
async def process_extracted_entries(
self,
entries: List[Dict[str, List[str]]],

View File

@ -28,7 +28,7 @@ class ReflectTracker:
self.max_duration = 15 * 60 # 15 minutes
# LLM for judging response
self.judge_model = LLMRequest(model_set=model_config.model_task_config.utils, request_type="reflect.tracker")
self.judge_model = LLMRequest(model_set=model_config.model_task_config.tool_use, request_type="reflect.tracker")
self._init_prompts()

View File

@ -382,7 +382,7 @@ class EmojiManager:
self.vlm = LLMRequest(model_set=model_config.model_task_config.vlm, request_type="emoji.see")
self.llm_emotion_judge = LLMRequest(
model_set=model_config.model_task_config.utils, request_type="emoji"
) # 更高的温度更少的token后续可以根据情绪来调整温度
)
self.emoji_num = 0
self.emoji_num_max = global_config.emoji.max_reg_num

View File

@ -50,7 +50,7 @@ def get_dream_summary_model() -> LLMRequest:
global _dream_summary_model
if _dream_summary_model is None:
_dream_summary_model = LLMRequest(
model_set=model_config.model_task_config.utils,
model_set=model_config.model_task_config.replyer,
request_type="dream.summary",
)
return _dream_summary_model