From 8e7a2aecae13d527ad357d66abeaa8fe834223f1 Mon Sep 17 00:00:00 2001 From: SengokuCola <1026294844@qq.com> Date: Sat, 27 Dec 2025 19:43:46 +0800 Subject: [PATCH] =?UTF-8?q?fix=EF=BC=9A=E4=BF=AE=E5=A4=8D=E9=BB=91?= =?UTF-8?q?=E8=AF=9D=E6=8F=90=E5=8F=96=E7=9A=84=E5=AD=A6=E4=B9=A0=E7=BC=93?= =?UTF-8?q?=E5=AD=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/bw_learner/expression_learner.py | 75 +++++- src/bw_learner/jargon_explainer.py | 2 +- src/bw_learner/jargon_miner.py | 333 ++----------------------- src/bw_learner/reflect_tracker.py | 2 +- src/chat/emoji_system/emoji_manager.py | 2 +- src/dream/dream_generator.py | 2 +- 6 files changed, 103 insertions(+), 313 deletions(-) diff --git a/src/bw_learner/expression_learner.py b/src/bw_learner/expression_learner.py index 1e19ee46..b644263e 100644 --- a/src/bw_learner/expression_learner.py +++ b/src/bw_learner/expression_learner.py @@ -92,7 +92,7 @@ class ExpressionLearner: model_set=model_config.model_task_config.utils, request_type="expression.learner" ) self.summary_model: LLMRequest = LLMRequest( - model_set=model_config.model_task_config.utils, request_type="expression.summary" + model_set=model_config.model_task_config.tool_use, request_type="expression.summary" ) self.check_model: Optional[LLMRequest] = None # 检查用的 LLM 实例,延迟初始化 self.chat_id = chat_id @@ -142,6 +142,17 @@ class ExpressionLearner: jargon_entries: List[Tuple[str, str]] # (content, source_id) expressions, jargon_entries = parse_expression_response(response) + # 从缓存中检查 jargon 是否出现在 messages 中 + cached_jargon_entries = self._check_cached_jargons_in_messages(random_msg) + if cached_jargon_entries: + # 合并缓存中的 jargon 条目(去重:如果 content 已存在则跳过) + existing_contents = {content for content, _ in jargon_entries} + for content, source_id in cached_jargon_entries: + if content not in existing_contents: + jargon_entries.append((content, source_id)) + existing_contents.add(content) + logger.info(f"从缓存中检查到黑话: {content}") + # 检查表达方式数量,如果超过10个则放弃本次表达学习 if len(expressions) > 20: logger.info(f"表达方式提取数量超过10个(实际{len(expressions)}个),放弃本次表达学习") @@ -483,6 +494,68 @@ class ExpressionLearner: logger.error(f"立即检查表达方式失败 [ID: {expr_obj.id}]: {e}", exc_info=True) # 检查失败时,保持 checked=False,等待后续自动检查任务处理 + def _check_cached_jargons_in_messages(self, messages: List[Any]) -> List[Tuple[str, str]]: + """ + 检查缓存中的 jargon 是否出现在 messages 中 + + Args: + messages: 消息列表 + + Returns: + List[Tuple[str, str]]: 匹配到的黑话条目列表,每个元素是 (content, source_id) + """ + if not messages: + return [] + + # 获取 jargon_miner 实例 + jargon_miner = miner_manager.get_miner(self.chat_id) + + # 获取缓存中的所有 jargon + cached_jargons = jargon_miner.get_cached_jargons() + if not cached_jargons: + return [] + + matched_entries: List[Tuple[str, str]] = [] + + # 遍历 messages,检查缓存中的 jargon 是否出现 + for i, msg in enumerate(messages): + # 跳过机器人自己的消息 + if is_bot_message(msg): + continue + + # 获取消息文本 + msg_text = ( + getattr(msg, "processed_plain_text", None) or + "" + ).strip() + + if not msg_text: + continue + + # 检查每个缓存中的 jargon 是否出现在消息文本中 + for jargon in cached_jargons: + if not jargon or not jargon.strip(): + continue + + jargon_content = jargon.strip() + + # 使用正则匹配,考虑单词边界(类似 jargon_explainer 中的逻辑) + pattern = re.escape(jargon_content) + # 对于中文,使用更宽松的匹配;对于英文/数字,使用单词边界 + if re.search(r"[\u4e00-\u9fff]", jargon_content): + # 包含中文,使用更宽松的匹配 + search_pattern = pattern + else: + # 纯英文/数字,使用单词边界 + search_pattern = r"\b" + pattern + r"\b" + + if re.search(search_pattern, msg_text, re.IGNORECASE): + # 找到匹配,构建条目(source_id 从 1 开始,因为 build_anonymous_messages 的编号从 1 开始) + source_id = str(i + 1) + matched_entries.append((jargon_content, source_id)) + + return matched_entries + async def _process_jargon_entries(self, jargon_entries: List[Tuple[str, str]], messages: List[Any]) -> None: """ 处理从 expression learner 提取的黑话条目,路由到 jargon_miner diff --git a/src/bw_learner/jargon_explainer.py b/src/bw_learner/jargon_explainer.py index 31e33cdc..252d1e40 100644 --- a/src/bw_learner/jargon_explainer.py +++ b/src/bw_learner/jargon_explainer.py @@ -45,7 +45,7 @@ class JargonExplainer: def __init__(self, chat_id: str) -> None: self.chat_id = chat_id self.llm = LLMRequest( - model_set=model_config.model_task_config.utils, + model_set=model_config.model_task_config.tool_use, request_type="jargon.explain", ) diff --git a/src/bw_learner/jargon_miner.py b/src/bw_learner/jargon_miner.py index f1580fc4..947d311e 100644 --- a/src/bw_learner/jargon_miner.py +++ b/src/bw_learner/jargon_miner.py @@ -51,32 +51,32 @@ def _is_single_char_jargon(content: str) -> bool: ) -def _init_prompt() -> None: - prompt_str = """ -**聊天内容,其中的{bot_name}的发言内容是你自己的发言,[msg_id] 是消息ID** -{chat_str} +# def _init_prompt() -> None: +# prompt_str = """ +# **聊天内容,其中的{bot_name}的发言内容是你自己的发言,[msg_id] 是消息ID** +# {chat_str} -请从上面这段聊天内容中提取"可能是黑话"的候选项(黑话/俚语/网络缩写/口头禅)。 -- 必须为对话中真实出现过的短词或短语 -- 必须是你无法理解含义的词语,没有明确含义的词语,请不要选择有明确含义,或者含义清晰的词语 -- 排除:人名、@、表情包/图片中的内容、纯标点、常规功能词(如的、了、呢、啊等) -- 每个词条长度建议 2-8 个字符(不强制),尽量短小 +# 请从上面这段聊天内容中提取"可能是黑话"的候选项(黑话/俚语/网络缩写/口头禅)。 +# - 必须为对话中真实出现过的短词或短语 +# - 必须是你无法理解含义的词语,没有明确含义的词语,请不要选择有明确含义,或者含义清晰的词语 +# - 排除:人名、@、表情包/图片中的内容、纯标点、常规功能词(如的、了、呢、啊等) +# - 每个词条长度建议 2-8 个字符(不强制),尽量短小 -黑话必须为以下几种类型: -- 由字母构成的,汉语拼音首字母的简写词,例如:nb、yyds、xswl -- 英文词语的缩写,用英文字母概括一个词汇或含义,例如:CPU、GPU、API -- 中文词语的缩写,用几个汉字概括一个词汇或含义,例如:社死、内卷 +# 黑话必须为以下几种类型: +# - 由字母构成的,汉语拼音首字母的简写词,例如:nb、yyds、xswl +# - 英文词语的缩写,用英文字母概括一个词汇或含义,例如:CPU、GPU、API +# - 中文词语的缩写,用几个汉字概括一个词汇或含义,例如:社死、内卷 -以 JSON 数组输出,元素为对象(严格按以下结构): -请你提取出可能的黑话,最多30个黑话,请尽量提取所有 -[ - {{"content": "词条", "msg_id": "m12"}}, // msg_id 必须与上方聊天中展示的ID完全一致 - {{"content": "词条2", "msg_id": "m15"}} -] +# 以 JSON 数组输出,元素为对象(严格按以下结构): +# 请你提取出可能的黑话,最多30个黑话,请尽量提取所有 +# [ +# {{"content": "词条", "msg_id": "m12"}}, // msg_id 必须与上方聊天中展示的ID完全一致 +# {{"content": "词条2", "msg_id": "m15"}} +# ] -现在请输出: -""" - Prompt(prompt_str, "extract_jargon_prompt") +# 现在请输出: +# """ +# Prompt(prompt_str, "extract_jargon_prompt") def _init_inference_prompts() -> None: @@ -142,7 +142,6 @@ def _init_inference_prompts() -> None: Prompt(prompt3_str, "jargon_compare_inference_prompt") -_init_prompt() _init_inference_prompts() @@ -229,34 +228,9 @@ class JargonMiner: if len(self.cache) > self.cache_limit: self.cache.popitem(last=False) - def _collect_cached_entries(self, messages: List[Any]) -> List[Dict[str, List[str]]]: - """检查缓存中的黑话是否出现在当前消息窗口,生成对应上下文""" - if not self.cache or not messages: - return [] - - cached_entries: List[Dict[str, List[str]]] = [] - processed_pairs = set() - - for idx, msg in enumerate(messages): - msg_text = ( - getattr(msg, "display_message", None) or getattr(msg, "processed_plain_text", None) or "" - ).strip() - if not msg_text or is_bot_message(msg): - continue - - for content in self.cache.keys(): - if not content: - continue - if (content, idx) in processed_pairs: - continue - if content in msg_text: - paragraph = build_context_paragraph(messages, idx) - if not paragraph: - continue - cached_entries.append({"content": content, "raw_content": [paragraph]}) - processed_pairs.add((content, idx)) - - return cached_entries + def get_cached_jargons(self) -> List[str]: + """获取缓存中的所有黑话列表""" + return list(self.cache.keys()) async def _infer_meaning_by_id(self, jargon_id: int) -> None: """通过ID加载对象并推断""" @@ -480,263 +454,6 @@ class JargonMiner: traceback.print_exc() - async def run_once( - self, - messages: List[Any], - person_name_filter: Optional[Callable[[str], bool]] = None - ) -> None: - """ - 运行一次黑话提取 - - Args: - messages: 外部传入的消息列表(必需) - person_name_filter: 可选的过滤函数,用于检查内容是否包含人物名称 - """ - # 使用异步锁防止并发执行 - async with self._extraction_lock: - try: - if not messages: - return - - # 按时间排序,确保编号与上下文一致 - messages = sorted(messages, key=lambda msg: msg.time or 0) - - chat_str, message_id_list = build_readable_messages_with_id( - messages=messages, - replace_bot_name=True, - timestamp_mode="relative", - truncate=False, - show_actions=False, - show_pic=True, - pic_single=True, - ) - if not chat_str.strip(): - return - - msg_id_to_index: Dict[str, int] = {} - for idx, (msg_id, _msg) in enumerate(message_id_list or []): - if not msg_id: - continue - msg_id_to_index[msg_id] = idx - if not msg_id_to_index: - logger.warning("未能生成消息ID映射,跳过本次提取") - return - - prompt: str = await global_prompt_manager.format_prompt( - "extract_jargon_prompt", - bot_name=global_config.bot.nickname, - chat_str=chat_str, - ) - - response, _ = await self.llm.generate_response_async(prompt, temperature=0.2) - if not response: - return - - if global_config.debug.show_jargon_prompt: - logger.info(f"jargon提取提示词: {prompt}") - logger.info(f"jargon提取结果: {response}") - - # 解析为JSON - entries: List[dict] = [] - try: - resp = response.strip() - parsed = None - if resp.startswith("[") and resp.endswith("]"): - parsed = json.loads(resp) - else: - repaired = repair_json(resp) - if isinstance(repaired, str): - parsed = json.loads(repaired) - else: - parsed = repaired - - if isinstance(parsed, dict): - parsed = [parsed] - - if not isinstance(parsed, list): - return - - for item in parsed: - if not isinstance(item, dict): - continue - - content = str(item.get("content", "")).strip() - msg_id_value = item.get("msg_id") - - if not content: - continue - - if contains_bot_self_name(content): - logger.info(f"解析阶段跳过包含机器人昵称/别名的词条: {content}") - continue - - # 检查是否包含人物名称 - if person_name_filter and person_name_filter(content): - logger.info(f"解析阶段跳过包含人物名称的词条: {content}") - continue - - msg_id_str = str(msg_id_value or "").strip() - if not msg_id_str: - logger.warning(f"解析jargon失败:msg_id缺失,content={content}") - continue - - msg_index = msg_id_to_index.get(msg_id_str) - if msg_index is None: - logger.warning(f"解析jargon失败:msg_id未找到,content={content}, msg_id={msg_id_str}") - continue - - target_msg = messages[msg_index] - if is_bot_message(target_msg): - logger.info(f"解析阶段跳过引用机器人自身消息的词条: content={content}, msg_id={msg_id_str}") - continue - - context_paragraph = build_context_paragraph(messages, msg_index) - if not context_paragraph: - logger.warning(f"解析jargon失败:上下文为空,content={content}, msg_id={msg_id_str}") - continue - - entries.append({"content": content, "raw_content": [context_paragraph]}) - cached_entries = self._collect_cached_entries(messages) - if cached_entries: - entries.extend(cached_entries) - except Exception as e: - logger.error(f"解析jargon JSON失败: {e}; 原始: {response}") - return - - if not entries: - return - - # 去重并合并raw_content(按 content 聚合) - merged_entries: OrderedDict[str, Dict[str, List[str]]] = OrderedDict() - for entry in entries: - content_key = entry["content"] - raw_list = entry.get("raw_content", []) or [] - if content_key in merged_entries: - merged_entries[content_key]["raw_content"].extend(raw_list) - else: - merged_entries[content_key] = { - "content": content_key, - "raw_content": list(raw_list), - } - - uniq_entries = [] - for merged_entry in merged_entries.values(): - raw_content_list = merged_entry["raw_content"] - if raw_content_list: - merged_entry["raw_content"] = list(dict.fromkeys(raw_content_list)) - uniq_entries.append(merged_entry) - - saved = 0 - updated = 0 - for entry in uniq_entries: - content = entry["content"] - raw_content_list = entry["raw_content"] # 已经是列表 - - try: - # 查询所有content匹配的记录 - query = Jargon.select().where(Jargon.content == content) - - # 查找匹配的记录 - matched_obj = None - for obj in query: - if global_config.expression.all_global_jargon: - # 开启all_global:所有content匹配的记录都可以 - matched_obj = obj - break - else: - # 关闭all_global:需要检查chat_id列表是否包含目标chat_id - chat_id_list = parse_chat_id_list(obj.chat_id) - if chat_id_list_contains(chat_id_list, self.chat_id): - matched_obj = obj - break - - if matched_obj: - obj = matched_obj - try: - obj.count = (obj.count or 0) + 1 - except Exception: - obj.count = 1 - - # 合并raw_content列表:读取现有列表,追加新值,去重 - existing_raw_content = [] - if obj.raw_content: - try: - existing_raw_content = ( - json.loads(obj.raw_content) - if isinstance(obj.raw_content, str) - else obj.raw_content - ) - if not isinstance(existing_raw_content, list): - existing_raw_content = [existing_raw_content] if existing_raw_content else [] - except (json.JSONDecodeError, TypeError): - existing_raw_content = [obj.raw_content] if obj.raw_content else [] - - # 合并并去重 - merged_list = list(dict.fromkeys(existing_raw_content + raw_content_list)) - obj.raw_content = json.dumps(merged_list, ensure_ascii=False) - - # 更新chat_id列表:增加当前chat_id的计数 - chat_id_list = parse_chat_id_list(obj.chat_id) - updated_chat_id_list = update_chat_id_list(chat_id_list, self.chat_id, increment=1) - obj.chat_id = json.dumps(updated_chat_id_list, ensure_ascii=False) - - # 开启all_global时,确保记录标记为is_global=True - if global_config.expression.all_global_jargon: - obj.is_global = True - # 关闭all_global时,保持原有is_global不变(不修改) - - obj.save() - - # 检查是否需要推断(达到阈值且超过上次判定值) - if _should_infer_meaning(obj): - # 异步触发推断,不阻塞主流程 - # 重新加载对象以确保数据最新 - jargon_id = obj.id - asyncio.create_task(self._infer_meaning_by_id(jargon_id)) - - updated += 1 - else: - # 没找到匹配记录,创建新记录 - if global_config.expression.all_global_jargon: - # 开启all_global:新记录默认为is_global=True - is_global_new = True - else: - # 关闭all_global:新记录is_global=False - is_global_new = False - - # 使用新格式创建chat_id列表:[[chat_id, count]] - chat_id_list = [[self.chat_id, 1]] - chat_id_json = json.dumps(chat_id_list, ensure_ascii=False) - - Jargon.create( - content=content, - raw_content=json.dumps(raw_content_list, ensure_ascii=False), - chat_id=chat_id_json, - is_global=is_global_new, - count=1, - ) - saved += 1 - except Exception as e: - logger.error(f"保存jargon失败: chat_id={self.chat_id}, content={content}, err={e}") - continue - finally: - self._add_to_cache(content) - - # 固定输出提取的jargon结果,格式化为可读形式(只要有提取结果就输出) - if uniq_entries: - # 收集所有提取的jargon内容 - jargon_list = [entry["content"] for entry in uniq_entries] - jargon_str = ",".join(jargon_list) - - # 输出格式化的结果(使用logger.info会自动应用jargon模块的颜色) - logger.info(f"[{self.stream_name}]疑似黑话: {jargon_str}") - - if saved or updated: - logger.info(f"jargon写入: 新增 {saved} 条,更新 {updated} 条,chat_id={self.chat_id}") - except Exception as e: - logger.error(f"JargonMiner 运行失败: {e}") - # 即使失败也保持时间戳更新,避免频繁重试 - async def process_extracted_entries( self, entries: List[Dict[str, List[str]]], diff --git a/src/bw_learner/reflect_tracker.py b/src/bw_learner/reflect_tracker.py index c792679d..aa012534 100644 --- a/src/bw_learner/reflect_tracker.py +++ b/src/bw_learner/reflect_tracker.py @@ -28,7 +28,7 @@ class ReflectTracker: self.max_duration = 15 * 60 # 15 minutes # LLM for judging response - self.judge_model = LLMRequest(model_set=model_config.model_task_config.utils, request_type="reflect.tracker") + self.judge_model = LLMRequest(model_set=model_config.model_task_config.tool_use, request_type="reflect.tracker") self._init_prompts() diff --git a/src/chat/emoji_system/emoji_manager.py b/src/chat/emoji_system/emoji_manager.py index af12bb1a..e0c4d103 100644 --- a/src/chat/emoji_system/emoji_manager.py +++ b/src/chat/emoji_system/emoji_manager.py @@ -382,7 +382,7 @@ class EmojiManager: self.vlm = LLMRequest(model_set=model_config.model_task_config.vlm, request_type="emoji.see") self.llm_emotion_judge = LLMRequest( model_set=model_config.model_task_config.utils, request_type="emoji" - ) # 更高的温度,更少的token(后续可以根据情绪来调整温度) + ) self.emoji_num = 0 self.emoji_num_max = global_config.emoji.max_reg_num diff --git a/src/dream/dream_generator.py b/src/dream/dream_generator.py index 8c4b5bc3..3fe20f1c 100644 --- a/src/dream/dream_generator.py +++ b/src/dream/dream_generator.py @@ -50,7 +50,7 @@ def get_dream_summary_model() -> LLMRequest: global _dream_summary_model if _dream_summary_model is None: _dream_summary_model = LLMRequest( - model_set=model_config.model_task_config.utils, + model_set=model_config.model_task_config.replyer, request_type="dream.summary", ) return _dream_summary_model