mirror of https://github.com/Mai-with-u/MaiBot.git
better:优化jargon查询,并且默认全局学习
parent
e78a070fbd
commit
e52a81e90b
|
|
@ -33,6 +33,7 @@ from src.config.official_configs import (
|
||||||
MoodConfig,
|
MoodConfig,
|
||||||
MemoryConfig,
|
MemoryConfig,
|
||||||
DebugConfig,
|
DebugConfig,
|
||||||
|
JargonConfig,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .api_ada_configs import (
|
from .api_ada_configs import (
|
||||||
|
|
@ -354,6 +355,7 @@ class Config(ConfigBase):
|
||||||
debug: DebugConfig
|
debug: DebugConfig
|
||||||
mood: MoodConfig
|
mood: MoodConfig
|
||||||
voice: VoiceConfig
|
voice: VoiceConfig
|
||||||
|
jargon: JargonConfig
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|
|
||||||
|
|
@ -720,6 +720,9 @@ class LPMMKnowledgeConfig(ConfigBase):
|
||||||
|
|
||||||
enable: bool = True
|
enable: bool = True
|
||||||
"""是否启用LPMM知识库"""
|
"""是否启用LPMM知识库"""
|
||||||
|
|
||||||
|
lpmm_mode: Literal["classic", "agent"] = "classic"
|
||||||
|
"""LPMM知识库模式,可选:classic经典模式,agent 模式,结合最新的记忆一同使用"""
|
||||||
|
|
||||||
rag_synonym_search_top_k: int = 10
|
rag_synonym_search_top_k: int = 10
|
||||||
"""RAG同义词搜索的Top K数量"""
|
"""RAG同义词搜索的Top K数量"""
|
||||||
|
|
@ -753,3 +756,11 @@ class LPMMKnowledgeConfig(ConfigBase):
|
||||||
|
|
||||||
embedding_dimension: int = 1024
|
embedding_dimension: int = 1024
|
||||||
"""嵌入向量维度,应该与模型的输出维度一致"""
|
"""嵌入向量维度,应该与模型的输出维度一致"""
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class JargonConfig(ConfigBase):
|
||||||
|
"""Jargon配置类"""
|
||||||
|
|
||||||
|
all_global: bool = False
|
||||||
|
"""是否将所有新增的jargon项目默认为全局(is_global=True),chat_id记录第一次存储时的id"""
|
||||||
|
|
@ -29,20 +29,19 @@ def _init_prompt() -> None:
|
||||||
- 必须为对话中真实出现过的短词或短语
|
- 必须为对话中真实出现过的短词或短语
|
||||||
- 必须是你无法理解含义的词语,没有明确含义的词语
|
- 必须是你无法理解含义的词语,没有明确含义的词语
|
||||||
- 请不要选择有明确含义,或者含义清晰的词语
|
- 请不要选择有明确含义,或者含义清晰的词语
|
||||||
- 必须是这几种类别之一:英文或中文缩写、中文拼音短语
|
|
||||||
- 排除:人名、@、表情包/图片中的内容、纯标点、常规功能词(如的、了、呢、啊等)
|
- 排除:人名、@、表情包/图片中的内容、纯标点、常规功能词(如的、了、呢、啊等)
|
||||||
- 每个词条长度建议 2-8 个字符(不强制),尽量短小
|
- 每个词条长度建议 2-8 个字符(不强制),尽量短小
|
||||||
- 合并重复项,去重
|
- 合并重复项,去重
|
||||||
|
|
||||||
分类规则,type必须根据规则填写:
|
黑话必须为以下几种类型:
|
||||||
- p(拼音缩写):由字母构成的,汉语拼音首字母的简写词,例如:nb、yyds、xswl
|
- 由字母构成的,汉语拼音首字母的简写词,例如:nb、yyds、xswl
|
||||||
- e(英文缩写):英文词语的缩写,用英文字母概括一个词汇或含义,例如:CPU、GPU、API
|
- 英文词语的缩写,用英文字母概括一个词汇或含义,例如:CPU、GPU、API
|
||||||
- c(中文缩写):中文词语的缩写,用几个汉字概括一个词汇或含义,例如:社死、内卷
|
- 中文词语的缩写,用几个汉字概括一个词汇或含义,例如:社死、内卷
|
||||||
|
|
||||||
以 JSON 数组输出,元素为对象(严格按以下结构):
|
以 JSON 数组输出,元素为对象(严格按以下结构):
|
||||||
[
|
[
|
||||||
{{"content": "词条", "raw_content": "包含该词条的完整对话上下文原文", "type": "p"}},
|
{{"content": "词条", "raw_content": "包含该词条的完整对话上下文原文"}},
|
||||||
{{"content": "词条2", "raw_content": "包含该词条的完整对话上下文原文", "type": "c"}}
|
{{"content": "词条2", "raw_content": "包含该词条的完整对话上下文原文"}}
|
||||||
]
|
]
|
||||||
|
|
||||||
现在请输出:
|
现在请输出:
|
||||||
|
|
@ -154,8 +153,8 @@ class JargonMiner:
|
||||||
self.chat_id = chat_id
|
self.chat_id = chat_id
|
||||||
self.last_learning_time: float = time.time()
|
self.last_learning_time: float = time.time()
|
||||||
# 频率控制,可按需调整
|
# 频率控制,可按需调整
|
||||||
self.min_messages_for_learning: int = 20
|
self.min_messages_for_learning: int = 15
|
||||||
self.min_learning_interval: float = 30
|
self.min_learning_interval: float = 20
|
||||||
|
|
||||||
self.llm = LLMRequest(
|
self.llm = LLMRequest(
|
||||||
model_set=model_config.model_task_config.utils,
|
model_set=model_config.model_task_config.utils,
|
||||||
|
|
@ -427,17 +426,10 @@ class JargonMiner:
|
||||||
if raw_content_str:
|
if raw_content_str:
|
||||||
raw_content_list = [raw_content_str]
|
raw_content_list = [raw_content_str]
|
||||||
|
|
||||||
type_str = str(item.get("type", "")).strip().lower()
|
|
||||||
|
|
||||||
# 验证type是否为有效值
|
|
||||||
if type_str not in ["p", "c", "e"]:
|
|
||||||
type_str = "p" # 默认值
|
|
||||||
|
|
||||||
if content and raw_content_list:
|
if content and raw_content_list:
|
||||||
entries.append({
|
entries.append({
|
||||||
"content": content,
|
"content": content,
|
||||||
"raw_content": raw_content_list,
|
"raw_content": raw_content_list
|
||||||
"type": type_str
|
|
||||||
})
|
})
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"解析jargon JSON失败: {e}; 原始: {response}")
|
logger.error(f"解析jargon JSON失败: {e}; 原始: {response}")
|
||||||
|
|
@ -458,21 +450,27 @@ class JargonMiner:
|
||||||
|
|
||||||
saved = 0
|
saved = 0
|
||||||
updated = 0
|
updated = 0
|
||||||
merged = 0
|
|
||||||
for entry in uniq_entries:
|
for entry in uniq_entries:
|
||||||
content = entry["content"]
|
content = entry["content"]
|
||||||
raw_content_list = entry["raw_content"] # 已经是列表
|
raw_content_list = entry["raw_content"] # 已经是列表
|
||||||
type_str = entry["type"]
|
|
||||||
try:
|
try:
|
||||||
# 步骤1: 检查同chat_id的记录,默认纳入global项目
|
# 根据all_global配置决定查询逻辑
|
||||||
# 查询条件:chat_id匹配 OR (is_global为True且content匹配)
|
if global_config.jargon.all_global:
|
||||||
query = (
|
# 开启all_global:无视chat_id,查询所有content匹配的记录(所有记录都是全局的)
|
||||||
Jargon.select()
|
query = (
|
||||||
.where(
|
Jargon.select()
|
||||||
((Jargon.chat_id == self.chat_id) | Jargon.is_global) &
|
.where(Jargon.content == content)
|
||||||
(Jargon.content == content)
|
|
||||||
)
|
)
|
||||||
)
|
else:
|
||||||
|
# 关闭all_global:只查询chat_id匹配的记录(不考虑is_global)
|
||||||
|
query = (
|
||||||
|
Jargon.select()
|
||||||
|
.where(
|
||||||
|
(Jargon.chat_id == self.chat_id) &
|
||||||
|
(Jargon.content == content)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
if query.exists():
|
if query.exists():
|
||||||
obj = query.get()
|
obj = query.get()
|
||||||
try:
|
try:
|
||||||
|
|
@ -494,9 +492,11 @@ class JargonMiner:
|
||||||
merged_list = list(dict.fromkeys(existing_raw_content + raw_content_list))
|
merged_list = list(dict.fromkeys(existing_raw_content + raw_content_list))
|
||||||
obj.raw_content = json.dumps(merged_list, ensure_ascii=False)
|
obj.raw_content = json.dumps(merged_list, ensure_ascii=False)
|
||||||
|
|
||||||
# 更新type(如果为空)
|
# 开启all_global时,确保记录标记为is_global=True
|
||||||
if type_str and not obj.type:
|
if global_config.jargon.all_global:
|
||||||
obj.type = type_str
|
obj.is_global = True
|
||||||
|
# 关闭all_global时,保持原有is_global不变(不修改)
|
||||||
|
|
||||||
obj.save()
|
obj.save()
|
||||||
|
|
||||||
# 检查是否需要推断(达到阈值且超过上次判定值)
|
# 检查是否需要推断(达到阈值且超过上次判定值)
|
||||||
|
|
@ -508,93 +508,22 @@ class JargonMiner:
|
||||||
|
|
||||||
updated += 1
|
updated += 1
|
||||||
else:
|
else:
|
||||||
# 步骤2: 同chat_id没有找到,检查所有chat_id中是否有相同content的记录
|
# 没找到匹配记录,创建新记录
|
||||||
# 查询所有非global的记录(global的已经在步骤1检查过了)
|
if global_config.jargon.all_global:
|
||||||
all_content_query = (
|
# 开启all_global:新记录默认为is_global=True
|
||||||
Jargon.select()
|
is_global_new = True
|
||||||
.where(
|
|
||||||
(Jargon.content == content) &
|
|
||||||
(~Jargon.is_global)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
all_matching = list(all_content_query)
|
|
||||||
|
|
||||||
# 如果找到3个或更多相同content的记录,合并它们
|
|
||||||
if len(all_matching) >= 3:
|
|
||||||
# 找到3个或更多已有记录,合并它们(新条目也会被包含在合并中)
|
|
||||||
total_count = sum((obj.count or 0) for obj in all_matching) + 1 # +1 是因为当前新条目
|
|
||||||
|
|
||||||
# 合并所有raw_content列表
|
|
||||||
all_raw_content = []
|
|
||||||
for obj in all_matching:
|
|
||||||
if obj.raw_content:
|
|
||||||
try:
|
|
||||||
obj_raw = json.loads(obj.raw_content) if isinstance(obj.raw_content, str) else obj.raw_content
|
|
||||||
if not isinstance(obj_raw, list):
|
|
||||||
obj_raw = [obj_raw] if obj_raw else []
|
|
||||||
all_raw_content.extend(obj_raw)
|
|
||||||
except (json.JSONDecodeError, TypeError):
|
|
||||||
if obj.raw_content:
|
|
||||||
all_raw_content.append(obj.raw_content)
|
|
||||||
|
|
||||||
# 添加当前新条目的raw_content
|
|
||||||
all_raw_content.extend(raw_content_list)
|
|
||||||
# 去重
|
|
||||||
merged_raw_content = list(dict.fromkeys(all_raw_content))
|
|
||||||
|
|
||||||
# 合并type:优先使用非空的值
|
|
||||||
merged_type = type_str
|
|
||||||
for obj in all_matching:
|
|
||||||
if obj.type and not merged_type:
|
|
||||||
merged_type = obj.type
|
|
||||||
break
|
|
||||||
|
|
||||||
# 合并其他字段:优先使用已有值
|
|
||||||
merged_meaning = None
|
|
||||||
merged_is_jargon = None
|
|
||||||
merged_last_inference_count = None
|
|
||||||
merged_is_complete = False
|
|
||||||
|
|
||||||
for obj in all_matching:
|
|
||||||
if obj.meaning and not merged_meaning:
|
|
||||||
merged_meaning = obj.meaning
|
|
||||||
if obj.is_jargon is not None and merged_is_jargon is None:
|
|
||||||
merged_is_jargon = obj.is_jargon
|
|
||||||
if obj.last_inference_count is not None and merged_last_inference_count is None:
|
|
||||||
merged_last_inference_count = obj.last_inference_count
|
|
||||||
if obj.is_complete:
|
|
||||||
merged_is_complete = True
|
|
||||||
|
|
||||||
# 删除旧的记录
|
|
||||||
for obj in all_matching:
|
|
||||||
obj.delete_instance()
|
|
||||||
|
|
||||||
# 创建新的global记录
|
|
||||||
Jargon.create(
|
|
||||||
content=content,
|
|
||||||
raw_content=json.dumps(merged_raw_content, ensure_ascii=False),
|
|
||||||
type=merged_type,
|
|
||||||
chat_id="global",
|
|
||||||
is_global=True,
|
|
||||||
count=total_count,
|
|
||||||
meaning=merged_meaning,
|
|
||||||
is_jargon=merged_is_jargon,
|
|
||||||
last_inference_count=merged_last_inference_count,
|
|
||||||
is_complete=merged_is_complete
|
|
||||||
)
|
|
||||||
merged += 1
|
|
||||||
logger.info(f"合并jargon为global: content={content}, 合并了{len(all_matching)}条已有记录+1条新记录(共{len(all_matching)+1}条),总count={total_count}")
|
|
||||||
else:
|
else:
|
||||||
# 找到少于3个已有记录,正常创建新记录
|
# 关闭all_global:新记录is_global=False
|
||||||
Jargon.create(
|
is_global_new = False
|
||||||
content=content,
|
|
||||||
raw_content=json.dumps(raw_content_list, ensure_ascii=False),
|
Jargon.create(
|
||||||
type=type_str,
|
content=content,
|
||||||
chat_id=self.chat_id,
|
raw_content=json.dumps(raw_content_list, ensure_ascii=False),
|
||||||
is_global=False,
|
chat_id=self.chat_id,
|
||||||
count=1
|
is_global=is_global_new,
|
||||||
)
|
count=1
|
||||||
saved += 1
|
)
|
||||||
|
saved += 1
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"保存jargon失败: chat_id={self.chat_id}, content={content}, err={e}")
|
logger.error(f"保存jargon失败: chat_id={self.chat_id}, content={content}, err={e}")
|
||||||
continue
|
continue
|
||||||
|
|
@ -611,8 +540,8 @@ class JargonMiner:
|
||||||
# 更新为本次提取的结束时间,确保不会重复提取相同的消息窗口
|
# 更新为本次提取的结束时间,确保不会重复提取相同的消息窗口
|
||||||
self.last_learning_time = extraction_end_time
|
self.last_learning_time = extraction_end_time
|
||||||
|
|
||||||
if saved or updated or merged:
|
if saved or updated:
|
||||||
logger.info(f"jargon写入: 新增 {saved} 条,更新 {updated} 条,合并为global {merged} 条,chat_id={self.chat_id}")
|
logger.info(f"jargon写入: 新增 {saved} 条,更新 {updated} 条,chat_id={self.chat_id}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"JargonMiner 运行失败: {e}")
|
logger.error(f"JargonMiner 运行失败: {e}")
|
||||||
|
|
||||||
|
|
@ -647,7 +576,9 @@ def search_jargon(
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
keyword: 搜索关键词
|
keyword: 搜索关键词
|
||||||
chat_id: 可选的聊天ID,如果提供则优先搜索该聊天或global的jargon
|
chat_id: 可选的聊天ID
|
||||||
|
- 如果开启了all_global:此参数被忽略,查询所有is_global=True的记录
|
||||||
|
- 如果关闭了all_global:如果提供则优先搜索该聊天或global的jargon
|
||||||
limit: 返回结果数量限制,默认10
|
limit: 返回结果数量限制,默认10
|
||||||
case_sensitive: 是否大小写敏感,默认False(不敏感)
|
case_sensitive: 是否大小写敏感,默认False(不敏感)
|
||||||
fuzzy: 是否模糊搜索,默认True(使用LIKE匹配)
|
fuzzy: 是否模糊搜索,默认True(使用LIKE匹配)
|
||||||
|
|
@ -686,11 +617,16 @@ def search_jargon(
|
||||||
|
|
||||||
query = query.where(search_condition)
|
query = query.where(search_condition)
|
||||||
|
|
||||||
# 如果提供了chat_id,优先搜索该聊天或global的jargon
|
# 根据all_global配置决定查询逻辑
|
||||||
if chat_id:
|
if global_config.jargon.all_global:
|
||||||
query = query.where(
|
# 开启all_global:所有记录都是全局的,查询所有is_global=True的记录(无视chat_id)
|
||||||
(Jargon.chat_id == chat_id) | Jargon.is_global
|
query = query.where(Jargon.is_global)
|
||||||
)
|
else:
|
||||||
|
# 关闭all_global:如果提供了chat_id,优先搜索该聊天或global的jargon
|
||||||
|
if chat_id:
|
||||||
|
query = query.where(
|
||||||
|
(Jargon.chat_id == chat_id) | Jargon.is_global
|
||||||
|
)
|
||||||
|
|
||||||
# 只返回有meaning的记录
|
# 只返回有meaning的记录
|
||||||
query = query.where(
|
query = query.where(
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
[inner]
|
[inner]
|
||||||
version = "6.20.3"
|
version = "6.21.1"
|
||||||
|
|
||||||
#----以下是给开发人员阅读的,如果你只是部署了麦麦,不需要阅读----
|
#----以下是给开发人员阅读的,如果你只是部署了麦麦,不需要阅读----
|
||||||
#如果你想要修改配置文件,请递增version的值
|
#如果你想要修改配置文件,请递增version的值
|
||||||
|
|
@ -124,6 +124,8 @@ max_memory_number = 100 # 记忆最大数量
|
||||||
max_memory_size = 2048 # 记忆最大大小
|
max_memory_size = 2048 # 记忆最大大小
|
||||||
memory_build_frequency = 1 # 记忆构建频率
|
memory_build_frequency = 1 # 记忆构建频率
|
||||||
|
|
||||||
|
[jargon]
|
||||||
|
all_global = true # 是否开启全局黑话模式,注意,此功能关闭后,已经记录的全局黑话不会改变,需要手动删除
|
||||||
|
|
||||||
[tool]
|
[tool]
|
||||||
enable_tool = true # 是否启用工具
|
enable_tool = true # 是否启用工具
|
||||||
|
|
@ -161,6 +163,8 @@ ban_msgs_regex = [
|
||||||
|
|
||||||
[lpmm_knowledge] # lpmm知识库配置
|
[lpmm_knowledge] # lpmm知识库配置
|
||||||
enable = false # 是否启用lpmm知识库
|
enable = false # 是否启用lpmm知识库
|
||||||
|
lpmm_mode = "agent"
|
||||||
|
# 可选:classic经典模式,agent 模式,结合最新的记忆一同使用
|
||||||
rag_synonym_search_top_k = 10 # 同义词搜索TopK
|
rag_synonym_search_top_k = 10 # 同义词搜索TopK
|
||||||
rag_synonym_threshold = 0.8 # 同义词阈值(相似度高于此阈值的词语会被认为是同义词)
|
rag_synonym_threshold = 0.8 # 同义词阈值(相似度高于此阈值的词语会被认为是同义词)
|
||||||
info_extraction_workers = 3 # 实体提取同时执行线程数,非Pro模型不要设置超过5
|
info_extraction_workers = 3 # 实体提取同时执行线程数,非Pro模型不要设置超过5
|
||||||
|
|
@ -255,3 +259,4 @@ chat_prompts = []
|
||||||
#此系统暂时移除,无效配置
|
#此系统暂时移除,无效配置
|
||||||
[relationship]
|
[relationship]
|
||||||
enable_relationship = true # 是否启用关系系统
|
enable_relationship = true # 是否启用关系系统
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue