From a41f4694cf92545c2df722f9bfdda93f7ffbb0bb Mon Sep 17 00:00:00 2001 From: SengokuCola <1026294844@qq.com> Date: Tue, 18 Nov 2025 01:21:46 +0800 Subject: [PATCH] =?UTF-8?q?better=EF=BC=9A=E4=BC=98=E5=8C=96=E8=A1=A8?= =?UTF-8?q?=E8=BE=BE=E6=96=B9=E5=BC=8F=EF=BC=8C=E7=8E=B0=E5=9C=A8=E8=A1=A8?= =?UTF-8?q?=E8=BE=BE=E6=96=B9=E5=BC=8F=E4=BC=9A=E9=9A=8F=E6=97=B6=E9=97=B4?= =?UTF-8?q?=E6=9B=B4=E5=8A=A0=E7=B2=BE=E5=87=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- changelogs/changelog.md | 4 + src/common/database/database_model.py | 2 + src/express/expression_learner.py | 178 +++++++++++++++++++++++--- 3 files changed, 166 insertions(+), 18 deletions(-) diff --git a/changelogs/changelog.md b/changelogs/changelog.md index a1de221a..f8f30857 100644 --- a/changelogs/changelog.md +++ b/changelogs/changelog.md @@ -3,8 +3,12 @@ ## [0.11.3] - 2025-11-17 ### 功能更改和修复 - 优化记忆提取策略 +- 优化表达方式学习 - 修改readme +提示:清理旧的记忆数据和表达方式,表现更好 +方法:删除数据库中 expression jargon 和 thinking_back 的全部内容 + ## [0.11.2] - 2025-11-16 ### 🌟 主要功能更改 - "海马体Agent"记忆系统上线,最新最好的记忆系统,默认已接入lpmm diff --git a/src/common/database/database_model.py b/src/common/database/database_model.py index 3673e6d2..c97c0b72 100644 --- a/src/common/database/database_model.py +++ b/src/common/database/database_model.py @@ -311,6 +311,8 @@ class Expression(BaseModel): context = TextField(null=True) up_content = TextField(null=True) + content_list = TextField(null=True) + count = IntegerField(default=1) last_active_time = FloatField() chat_id = TextField(index=True) create_date = FloatField(null=True) # 创建日期,允许为空以兼容老数据 diff --git a/src/express/expression_learner.py b/src/express/expression_learner.py index 72dd831a..2a83e028 100644 --- a/src/express/expression_learner.py +++ b/src/express/expression_learner.py @@ -77,6 +77,9 @@ class ExpressionLearner: self.express_learn_model: LLMRequest = LLMRequest( model_set=model_config.model_task_config.utils, request_type="expression.learner" ) + self.summary_model: LLMRequest = LLMRequest( + model_set=model_config.model_task_config.utils_small, request_type="expression.summary" + ) self.embedding_model: LLMRequest = LLMRequest( model_set=model_config.model_task_config.embedding, request_type="expression.embedding" ) @@ -186,25 +189,13 @@ class ExpressionLearner: context, up_content, ) in learnt_expressions: - # 查找是否已存在相似表达方式 - query = Expression.select().where( - (Expression.chat_id == self.chat_id) & (Expression.situation == situation) & (Expression.style == style) + await self._upsert_expression_record( + situation=situation, + style=style, + context=context, + up_content=up_content, + current_time=current_time, ) - if query.exists(): - # 表达方式完全相同,只更新时间戳 - expr_obj = query.get() - expr_obj.last_active_time = current_time - expr_obj.save() - else: - Expression.create( - situation=situation, - style=style, - last_active_time=current_time, - chat_id=self.chat_id, - create_date=current_time, # 手动设置创建日期 - context=context, - up_content=up_content, - ) return learnt_expressions @@ -362,6 +353,10 @@ class ExpressionLearner: logger.error(f"学习表达方式失败,模型生成出错: {e}") return None expressions: List[Tuple[str, str]] = self.parse_expression_response(response) + expressions = self._filter_self_reference_styles(expressions) + if not expressions: + logger.info("过滤后没有可用的表达方式(style 与机器人名称重复)") + return None # logger.debug(f"学习{type_str}的response: {response}") # 对表达方式溯源 @@ -433,6 +428,153 @@ class ExpressionLearner: expressions.append((situation, style)) return expressions + def _filter_self_reference_styles(self, expressions: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + """ + 过滤掉style与机器人名称/昵称重复的表达 + """ + banned_names = set() + bot_nickname = (global_config.bot.nickname or "").strip() + if bot_nickname: + banned_names.add(bot_nickname) + + alias_names = global_config.bot.alias_names or [] + for alias in alias_names: + alias = alias.strip() + if alias: + banned_names.add(alias) + + banned_casefold = {name.casefold() for name in banned_names if name} + + filtered: List[Tuple[str, str]] = [] + removed_count = 0 + for situation, style in expressions: + normalized_style = (style or "").strip() + if normalized_style and normalized_style.casefold() not in banned_casefold: + filtered.append((situation, style)) + else: + removed_count += 1 + + if removed_count: + logger.debug(f"已过滤 {removed_count} 条style与机器人名称重复的表达方式") + + return filtered + + async def _upsert_expression_record( + self, + situation: str, + style: str, + context: str, + up_content: str, + current_time: float, + ) -> None: + expr_obj = ( + Expression.select() + .where((Expression.chat_id == self.chat_id) & (Expression.style == style)) + .first() + ) + + if expr_obj: + await self._update_existing_expression( + expr_obj=expr_obj, + situation=situation, + context=context, + up_content=up_content, + current_time=current_time, + ) + return + + await self._create_expression_record( + situation=situation, + style=style, + context=context, + up_content=up_content, + current_time=current_time, + ) + + async def _create_expression_record( + self, + situation: str, + style: str, + context: str, + up_content: str, + current_time: float, + ) -> None: + content_list = [situation] + formatted_situation = await self._compose_situation_text(content_list, 1, situation) + + Expression.create( + situation=formatted_situation, + style=style, + content_list=json.dumps(content_list, ensure_ascii=False), + count=1, + last_active_time=current_time, + chat_id=self.chat_id, + create_date=current_time, + context=context, + up_content=up_content, + ) + + async def _update_existing_expression( + self, + expr_obj: Expression, + situation: str, + context: str, + up_content: str, + current_time: float, + ) -> None: + content_list = self._parse_content_list(expr_obj.content_list) + content_list.append(situation) + + expr_obj.content_list = json.dumps(content_list, ensure_ascii=False) + expr_obj.count = (expr_obj.count or 0) + 1 + expr_obj.last_active_time = current_time + expr_obj.context = context + expr_obj.up_content = up_content + + new_situation = await self._compose_situation_text( + content_list=content_list, + count=expr_obj.count, + fallback=expr_obj.situation, + ) + expr_obj.situation = new_situation + + expr_obj.save() + + def _parse_content_list(self, stored_list: Optional[str]) -> List[str]: + if not stored_list: + return [] + try: + data = json.loads(stored_list) + except json.JSONDecodeError: + return [] + return [str(item) for item in data if isinstance(item, str)] if isinstance(data, list) else [] + + async def _compose_situation_text(self, content_list: List[str], count: int, fallback: str = "") -> str: + sanitized = [c.strip() for c in content_list if c.strip()] + summary = await self._summarize_situations(sanitized) + if summary: + return summary + return "/".join(sanitized) if sanitized else fallback + + async def _summarize_situations(self, situations: List[str]) -> Optional[str]: + if not situations: + return None + + prompt = ( + "请阅读以下多个聊天情境描述,并将它们概括成一句简短的话," + "长度不超过20个字,保留共同特点:\n" + f"{chr(10).join(f'- {s}' for s in situations[-10:])}\n只输出概括内容。" + ) + + try: + summary, _ = await self.summary_model.generate_response_async(prompt, temperature=0.2) + summary = summary.strip() + if summary: + return summary + except Exception as e: + logger.error(f"概括表达情境失败: {e}") + return None + def _build_bare_lines(self, messages: List) -> List[Tuple[int, str]]: """ 为每条消息构建精简文本列表,保留到原消息索引的映射