From e52a81e90b2b41c7b6d920dbb115df93395735d9 Mon Sep 17 00:00:00 2001
From: SengokuCola <1026294844@qq.com>
Date: Thu, 13 Nov 2025 17:45:32 +0800
Subject: [PATCH] =?UTF-8?q?better=EF=BC=9A=E4=BC=98=E5=8C=96jargon?=
 =?UTF-8?q?=E6=9F=A5=E8=AF=A2=EF=BC=8C=E5=B9=B6=E4=B8=94=E9=BB=98=E8=AE=A4?=
 =?UTF-8?q?=E5=85=A8=E5=B1=80=E5=AD=A6=E4=B9=A0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/config/config.py              |   2 +
 src/config/official_configs.py    |  11 ++
 src/jargon/jargon_miner.py        | 184 ++++++++++--------------------
 template/bot_config_template.toml |   7 +-
 4 files changed, 79 insertions(+), 125 deletions(-)

diff --git a/src/config/config.py b/src/config/config.py
index c379be66..d9ec2b52 100644
--- a/src/config/config.py
+++ b/src/config/config.py
@@ -33,6 +33,7 @@ from src.config.official_configs import (
     MoodConfig,
     MemoryConfig,
     DebugConfig,
+    JargonConfig,
 )
 
 from .api_ada_configs import (
@@ -354,6 +355,7 @@ class Config(ConfigBase):
     debug: DebugConfig
     mood: MoodConfig
     voice: VoiceConfig
+    jargon: JargonConfig
 
 
 @dataclass
diff --git a/src/config/official_configs.py b/src/config/official_configs.py
index e21d8f96..d7646121 100644
--- a/src/config/official_configs.py
+++ b/src/config/official_configs.py
@@ -720,6 +720,9 @@ class LPMMKnowledgeConfig(ConfigBase):
 
     enable: bool = True
     """是否启用LPMM知识库"""
+
+    lpmm_mode: Literal["classic", "agent"] = "classic"
+    """LPMM知识库模式,可选:classic经典模式,agent 模式,结合最新的记忆一同使用"""
 
     rag_synonym_search_top_k: int = 10
     """RAG同义词搜索的Top K数量"""
@@ -753,3 +756,11 @@ class LPMMKnowledgeConfig(ConfigBase):
 
     embedding_dimension: int = 1024
     """嵌入向量维度,应该与模型的输出维度一致"""
+
+
+@dataclass
+class JargonConfig(ConfigBase):
+    """Jargon配置类"""
+
+    all_global: bool = False
+    """是否将所有新增的jargon项目默认为全局(is_global=True),chat_id记录第一次存储时的id"""
\ No newline at end of file
diff --git a/src/jargon/jargon_miner.py b/src/jargon/jargon_miner.py
index 3d983521..e41aac32 100644
--- a/src/jargon/jargon_miner.py
+++ b/src/jargon/jargon_miner.py
@@ -29,20 +29,19 @@ def _init_prompt() -> None:
 - 必须为对话中真实出现过的短词或短语
 - 必须是你无法理解含义的词语,没有明确含义的词语
 - 请不要选择有明确含义,或者含义清晰的词语
-- 必须是这几种类别之一:英文或中文缩写、中文拼音短语
 - 排除:人名、@、表情包/图片中的内容、纯标点、常规功能词(如的、了、呢、啊等)
 - 每个词条长度建议 2-8 个字符(不强制),尽量短小
 - 合并重复项,去重
 
-分类规则,type必须根据规则填写:
-- p(拼音缩写):由字母构成的,汉语拼音首字母的简写词,例如:nb、yyds、xswl
-- e(英文缩写):英文词语的缩写,用英文字母概括一个词汇或含义,例如:CPU、GPU、API
-- c(中文缩写):中文词语的缩写,用几个汉字概括一个词汇或含义,例如:社死、内卷
+黑话必须为以下几种类型:
+- 由字母构成的,汉语拼音首字母的简写词,例如:nb、yyds、xswl
+- 英文词语的缩写,用英文字母概括一个词汇或含义,例如:CPU、GPU、API
+- 中文词语的缩写,用几个汉字概括一个词汇或含义,例如:社死、内卷
 
 以 JSON 数组输出,元素为对象(严格按以下结构):
 [
-  {{"content": "词条", "raw_content": "包含该词条的完整对话上下文原文", "type": "p"}},
-  {{"content": "词条2", "raw_content": "包含该词条的完整对话上下文原文", "type": "c"}}
+  {{"content": "词条", "raw_content": "包含该词条的完整对话上下文原文"}},
+  {{"content": "词条2", "raw_content": "包含该词条的完整对话上下文原文"}}
 ]
 
 现在请输出:
@@ -154,8 +153,8 @@ class JargonMiner:
         self.chat_id = chat_id
         self.last_learning_time: float = time.time()
         # 频率控制,可按需调整
-        self.min_messages_for_learning: int = 20
-        self.min_learning_interval: float = 30
+        self.min_messages_for_learning: int = 15
+        self.min_learning_interval: float = 20
 
         self.llm = LLMRequest(
             model_set=model_config.model_task_config.utils,
@@ -427,17 +426,10 @@ class JargonMiner:
                     if raw_content_str:
                         raw_content_list = [raw_content_str]
 
-                    type_str = str(item.get("type", "")).strip().lower()
-
-                    # 验证type是否为有效值
-                    if type_str not in ["p", "c", "e"]:
-                        type_str = "p"  # 默认值
-
                     if content and raw_content_list:
                         entries.append({
                             "content": content,
-                            "raw_content": raw_content_list,
-                            "type": type_str
+                            "raw_content": raw_content_list
                         })
         except Exception as e:
             logger.error(f"解析jargon JSON失败: {e}; 原始: {response}")
@@ -458,21 +450,27 @@ class JargonMiner:
 
         saved = 0
         updated = 0
-        merged = 0
         for entry in uniq_entries:
             content = entry["content"]
             raw_content_list = entry["raw_content"]  # 已经是列表
-            type_str = entry["type"]
             try:
-                # 步骤1: 检查同chat_id的记录,默认纳入global项目
-                # 查询条件:chat_id匹配 OR (is_global为True且content匹配)
-                query = (
-                    Jargon.select()
-                    .where(
-                        ((Jargon.chat_id == self.chat_id) | Jargon.is_global) &
-                        (Jargon.content == content)
+                # 根据all_global配置决定查询逻辑
+                if global_config.jargon.all_global:
+                    # 开启all_global:无视chat_id,查询所有content匹配的记录(所有记录都是全局的)
+                    query = (
+                        Jargon.select()
+                        .where(Jargon.content == content)
                     )
-                )
+                else:
+                    # 关闭all_global:只查询chat_id匹配的记录(不考虑is_global)
+                    query = (
+                        Jargon.select()
+                        .where(
+                            (Jargon.chat_id == self.chat_id) &
+                            (Jargon.content == content)
+                        )
+                    )
+
                 if query.exists():
                     obj = query.get()
                     try:
@@ -494,9 +492,11 @@ class JargonMiner:
                         merged_list = list(dict.fromkeys(existing_raw_content + raw_content_list))
                         obj.raw_content = json.dumps(merged_list, ensure_ascii=False)
 
-                        # 更新type(如果为空)
-                        if type_str and not obj.type:
-                            obj.type = type_str
+                        # 开启all_global时,确保记录标记为is_global=True
+                        if global_config.jargon.all_global:
+                            obj.is_global = True
+                        # 关闭all_global时,保持原有is_global不变(不修改)
+
                         obj.save()
 
                         # 检查是否需要推断(达到阈值且超过上次判定值)
@@ -508,93 +508,22 @@ class JargonMiner:
 
                     updated += 1
                 else:
-                    # 步骤2: 同chat_id没有找到,检查所有chat_id中是否有相同content的记录
-                    # 查询所有非global的记录(global的已经在步骤1检查过了)
-                    all_content_query = (
-                        Jargon.select()
-                        .where(
-                            (Jargon.content == content) &
-                            (~Jargon.is_global)
-                        )
-                    )
-                    all_matching = list(all_content_query)
-
-                    # 如果找到3个或更多相同content的记录,合并它们
-                    if len(all_matching) >= 3:
-                        # 找到3个或更多已有记录,合并它们(新条目也会被包含在合并中)
-                        total_count = sum((obj.count or 0) for obj in all_matching) + 1  # +1 是因为当前新条目
-
-                        # 合并所有raw_content列表
-                        all_raw_content = []
-                        for obj in all_matching:
-                            if obj.raw_content:
-                                try:
-                                    obj_raw = json.loads(obj.raw_content) if isinstance(obj.raw_content, str) else obj.raw_content
-                                    if not isinstance(obj_raw, list):
-                                        obj_raw = [obj_raw] if obj_raw else []
-                                    all_raw_content.extend(obj_raw)
-                                except (json.JSONDecodeError, TypeError):
-                                    if obj.raw_content:
-                                        all_raw_content.append(obj.raw_content)
-
-                        # 添加当前新条目的raw_content
-                        all_raw_content.extend(raw_content_list)
-                        # 去重
-                        merged_raw_content = list(dict.fromkeys(all_raw_content))
-
-                        # 合并type:优先使用非空的值
-                        merged_type = type_str
-                        for obj in all_matching:
-                            if obj.type and not merged_type:
-                                merged_type = obj.type
-                                break
-
-                        # 合并其他字段:优先使用已有值
-                        merged_meaning = None
-                        merged_is_jargon = None
-                        merged_last_inference_count = None
-                        merged_is_complete = False
-
-                        for obj in all_matching:
-                            if obj.meaning and not merged_meaning:
-                                merged_meaning = obj.meaning
-                            if obj.is_jargon is not None and merged_is_jargon is None:
-                                merged_is_jargon = obj.is_jargon
-                            if obj.last_inference_count is not None and merged_last_inference_count is None:
-                                merged_last_inference_count = obj.last_inference_count
-                            if obj.is_complete:
-                                merged_is_complete = True
-
-                        # 删除旧的记录
-                        for obj in all_matching:
-                            obj.delete_instance()
-
-                        # 创建新的global记录
-                        Jargon.create(
-                            content=content,
-                            raw_content=json.dumps(merged_raw_content, ensure_ascii=False),
-                            type=merged_type,
-                            chat_id="global",
-                            is_global=True,
-                            count=total_count,
-                            meaning=merged_meaning,
-                            is_jargon=merged_is_jargon,
-                            last_inference_count=merged_last_inference_count,
-                            is_complete=merged_is_complete
-                        )
-                        merged += 1
-                        logger.info(f"合并jargon为global: content={content}, 合并了{len(all_matching)}条已有记录+1条新记录(共{len(all_matching)+1}条),总count={total_count}")
+                    # 没找到匹配记录,创建新记录
+                    if global_config.jargon.all_global:
+                        # 开启all_global:新记录默认为is_global=True
+                        is_global_new = True
                     else:
-                        # 找到少于3个已有记录,正常创建新记录
-                        Jargon.create(
-                            content=content,
-                            raw_content=json.dumps(raw_content_list, ensure_ascii=False),
-                            type=type_str,
-                            chat_id=self.chat_id,
-                            is_global=False,
-                            count=1
-                        )
-                        saved += 1
+                        # 关闭all_global:新记录is_global=False
+                        is_global_new = False
+
+                    Jargon.create(
+                        content=content,
+                        raw_content=json.dumps(raw_content_list, ensure_ascii=False),
+                        chat_id=self.chat_id,
+                        is_global=is_global_new,
+                        count=1
+                    )
+                    saved += 1
             except Exception as e:
                 logger.error(f"保存jargon失败: chat_id={self.chat_id}, content={content}, err={e}")
                 continue
@@ -611,8 +540,8 @@ class JargonMiner:
             # 更新为本次提取的结束时间,确保不会重复提取相同的消息窗口
             self.last_learning_time = extraction_end_time
 
-            if saved or updated or merged:
-                logger.info(f"jargon写入: 新增 {saved} 条,更新 {updated} 条,合并为global {merged} 条,chat_id={self.chat_id}")
+            if saved or updated:
+                logger.info(f"jargon写入: 新增 {saved} 条,更新 {updated} 条,chat_id={self.chat_id}")
 
         except Exception as e:
             logger.error(f"JargonMiner 运行失败: {e}")
@@ -647,7 +576,9 @@ def search_jargon(
 
     Args:
         keyword: 搜索关键词
-        chat_id: 可选的聊天ID,如果提供则优先搜索该聊天或global的jargon
+        chat_id: 可选的聊天ID
+            - 如果开启了all_global:此参数被忽略,查询所有is_global=True的记录
+            - 如果关闭了all_global:如果提供则优先搜索该聊天或global的jargon
         limit: 返回结果数量限制,默认10
         case_sensitive: 是否大小写敏感,默认False(不敏感)
         fuzzy: 是否模糊搜索,默认True(使用LIKE匹配)
@@ -686,11 +617,16 @@ def search_jargon(
 
         query = query.where(search_condition)
 
-    # 如果提供了chat_id,优先搜索该聊天或global的jargon
-    if chat_id:
-        query = query.where(
-            (Jargon.chat_id == chat_id) | Jargon.is_global
-        )
+    # 根据all_global配置决定查询逻辑
+    if global_config.jargon.all_global:
+        # 开启all_global:所有记录都是全局的,查询所有is_global=True的记录(无视chat_id)
+        query = query.where(Jargon.is_global)
+    else:
+        # 关闭all_global:如果提供了chat_id,优先搜索该聊天或global的jargon
+        if chat_id:
+            query = query.where(
+                (Jargon.chat_id == chat_id) | Jargon.is_global
+            )
 
     # 只返回有meaning的记录
     query = query.where(
diff --git a/template/bot_config_template.toml b/template/bot_config_template.toml
index 52ace70d..4ab75552 100644
--- a/template/bot_config_template.toml
+++ b/template/bot_config_template.toml
@@ -1,5 +1,5 @@
 [inner]
-version = "6.20.3"
+version = "6.21.1"
 
 #----以下是给开发人员阅读的,如果你只是部署了麦麦,不需要阅读----
 #如果你想要修改配置文件,请递增version的值
@@ -124,6 +124,8 @@ max_memory_number = 100 # 记忆最大数量
 max_memory_size = 2048 # 记忆最大大小
 memory_build_frequency = 1 # 记忆构建频率
 
+[jargon]
+all_global = true # 是否开启全局黑话模式,注意,此功能关闭后,已经记录的全局黑话不会改变,需要手动删除
 
 [tool]
 enable_tool = true # 是否启用工具
@@ -161,6 +163,8 @@ ban_msgs_regex = [
 
 [lpmm_knowledge] # lpmm知识库配置
 enable = false # 是否启用lpmm知识库
+lpmm_mode = "agent"
+# 可选:classic经典模式,agent 模式,结合最新的记忆一同使用
 rag_synonym_search_top_k = 10 # 同义词搜索TopK
 rag_synonym_threshold = 0.8 # 同义词阈值(相似度高于此阈值的词语会被认为是同义词)
 info_extraction_workers = 3 # 实体提取同时执行线程数,非Pro模型不要设置超过5
@@ -255,3 +259,4 @@ chat_prompts = [] #此系统暂时移除,无效配置
 
 [relationship]
 enable_relationship = true # 是否启用关系系统
+
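
Note (not part of the patch): a minimal sketch of how the new [jargon] all_global switch steers the lookup that this patch adds to JargonMiner. The Jargon model, global_config, and the jargon.all_global field are taken from the hunks above; the import paths are assumptions.

# Hedged sketch only; import paths are assumed, the branch logic mirrors the patch.
from src.common.database.database_model import Jargon  # assumed location of the peewee Jargon model
from src.config.config import global_config             # assumed location of global_config

def find_existing_jargon(content: str, chat_id: str):
    if global_config.jargon.all_global:
        # all_global = true: every entry is treated as global, so match on content only
        return Jargon.select().where(Jargon.content == content)
    # all_global = false: match only entries recorded under this chat_id
    return Jargon.select().where(
        (Jargon.chat_id == chat_id) & (Jargon.content == content)
    )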
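
Similarly, a hedged usage sketch of search_jargon under the semantics documented in the updated docstring; the function name and parameters appear in the hunks above, but the module path and the return shape are not shown there, so treat this as illustrative only.

# Assumed module path; search_jargon(keyword, chat_id=..., limit=...) is defined in the patched file.
from src.jargon.jargon_miner import search_jargon

# all_global = true: chat_id is ignored and only is_global=True entries with a meaning are searched.
# all_global = false: entries for this chat_id, plus already-global ones, are searched.
results = search_jargon("yyds", chat_id="some_chat_id", limit=5)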