mirror of https://github.com/Mai-with-u/MaiBot.git
feat:黑化和表达不会将名字总结进去
parent
b73a748f52
commit
3db9fafe65
|
|
@ -3,7 +3,7 @@ import json
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import asyncio
|
import asyncio
|
||||||
from typing import List, Optional, Tuple, Any, Dict
|
from typing import List, Optional, Tuple, Any, Dict, Callable
|
||||||
from src.common.logger import get_logger
|
from src.common.logger import get_logger
|
||||||
from src.common.database.database_model import Expression
|
from src.common.database.database_model import Expression
|
||||||
from src.llm_models.utils_model import LLMRequest
|
from src.llm_models.utils_model import LLMRequest
|
||||||
|
|
@ -97,14 +97,14 @@ class ExpressionLearner:
|
||||||
async def learn_and_store(
|
async def learn_and_store(
|
||||||
self,
|
self,
|
||||||
messages: List[Any],
|
messages: List[Any],
|
||||||
|
person_name_filter: Optional[Callable[[str], bool]] = None,
|
||||||
) -> List[Tuple[str, str, str]]:
|
) -> List[Tuple[str, str, str]]:
|
||||||
"""
|
"""
|
||||||
学习并存储表达方式
|
学习并存储表达方式
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
messages: 外部传入的消息列表(必需)
|
messages: 外部传入的消息列表(必需)
|
||||||
num: 学习数量
|
person_name_filter: 可选的过滤函数,用于检查内容是否包含人物名称
|
||||||
timestamp_start: 学习开始的时间戳,如果为None则使用self.last_learning_time
|
|
||||||
"""
|
"""
|
||||||
if not messages:
|
if not messages:
|
||||||
return None
|
return None
|
||||||
|
|
@ -135,6 +135,17 @@ class ExpressionLearner:
|
||||||
expressions, jargon_entries = self.parse_expression_response(response)
|
expressions, jargon_entries = self.parse_expression_response(response)
|
||||||
expressions = self._filter_self_reference_styles(expressions)
|
expressions = self._filter_self_reference_styles(expressions)
|
||||||
|
|
||||||
|
# 过滤掉包含人物名称的表达方式
|
||||||
|
if person_name_filter:
|
||||||
|
filtered_expressions = []
|
||||||
|
for situation, style, source_id in expressions:
|
||||||
|
# 检查 situation 和 style 是否包含人物名称
|
||||||
|
if person_name_filter(situation) or person_name_filter(style):
|
||||||
|
logger.info(f"跳过包含人物名称的表达方式: situation={situation}, style={style}")
|
||||||
|
continue
|
||||||
|
filtered_expressions.append((situation, style, source_id))
|
||||||
|
expressions = filtered_expressions
|
||||||
|
|
||||||
# 检查表达方式数量,如果超过10个则放弃本次表达学习
|
# 检查表达方式数量,如果超过10个则放弃本次表达学习
|
||||||
if len(expressions) > 10:
|
if len(expressions) > 10:
|
||||||
logger.info(f"表达方式提取数量超过10个(实际{len(expressions)}个),放弃本次表达学习")
|
logger.info(f"表达方式提取数量超过10个(实际{len(expressions)}个),放弃本次表达学习")
|
||||||
|
|
@ -147,7 +158,7 @@ class ExpressionLearner:
|
||||||
|
|
||||||
# 处理黑话条目,路由到 jargon_miner(即使没有表达方式也要处理黑话)
|
# 处理黑话条目,路由到 jargon_miner(即使没有表达方式也要处理黑话)
|
||||||
if jargon_entries:
|
if jargon_entries:
|
||||||
await self._process_jargon_entries(jargon_entries, random_msg)
|
await self._process_jargon_entries(jargon_entries, random_msg, person_name_filter)
|
||||||
|
|
||||||
# 如果没有表达方式,直接返回
|
# 如果没有表达方式,直接返回
|
||||||
if not expressions:
|
if not expressions:
|
||||||
|
|
@ -500,13 +511,19 @@ class ExpressionLearner:
|
||||||
logger.error(f"概括表达情境失败: {e}")
|
logger.error(f"概括表达情境失败: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
async def _process_jargon_entries(self, jargon_entries: List[Tuple[str, str]], messages: List[Any]) -> None:
|
async def _process_jargon_entries(
|
||||||
|
self,
|
||||||
|
jargon_entries: List[Tuple[str, str]],
|
||||||
|
messages: List[Any],
|
||||||
|
person_name_filter: Optional[Callable[[str], bool]] = None
|
||||||
|
) -> None:
|
||||||
"""
|
"""
|
||||||
处理从 expression learner 提取的黑话条目,路由到 jargon_miner
|
处理从 expression learner 提取的黑话条目,路由到 jargon_miner
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
jargon_entries: 黑话条目列表,每个元素是 (content, source_id)
|
jargon_entries: 黑话条目列表,每个元素是 (content, source_id)
|
||||||
messages: 消息列表,用于构建上下文
|
messages: 消息列表,用于构建上下文
|
||||||
|
person_name_filter: 可选的过滤函数,用于检查内容是否包含人物名称
|
||||||
"""
|
"""
|
||||||
if not jargon_entries or not messages:
|
if not jargon_entries or not messages:
|
||||||
return
|
return
|
||||||
|
|
@ -527,6 +544,11 @@ class ExpressionLearner:
|
||||||
logger.info(f"跳过包含机器人昵称/别名的黑话: {content}")
|
logger.info(f"跳过包含机器人昵称/别名的黑话: {content}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# 检查是否包含人物名称
|
||||||
|
if person_name_filter and person_name_filter(content):
|
||||||
|
logger.info(f"跳过包含人物名称的黑话: {content}")
|
||||||
|
continue
|
||||||
|
|
||||||
# 解析 source_id
|
# 解析 source_id
|
||||||
source_id_str = (source_id or "").strip()
|
source_id_str = (source_id or "").strip()
|
||||||
if not source_id_str.isdigit():
|
if not source_id_str.isdigit():
|
||||||
|
|
@ -557,7 +579,7 @@ class ExpressionLearner:
|
||||||
return
|
return
|
||||||
|
|
||||||
# 调用 jargon_miner 处理这些条目
|
# 调用 jargon_miner 处理这些条目
|
||||||
await jargon_miner.process_extracted_entries(entries)
|
await jargon_miner.process_extracted_entries(entries, person_name_filter)
|
||||||
|
|
||||||
|
|
||||||
init_prompt()
|
init_prompt()
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ import json
|
||||||
import asyncio
|
import asyncio
|
||||||
import random
|
import random
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from typing import List, Dict, Optional, Any
|
from typing import List, Dict, Optional, Any, Callable
|
||||||
from json_repair import repair_json
|
from json_repair import repair_json
|
||||||
from peewee import fn
|
from peewee import fn
|
||||||
|
|
||||||
|
|
@ -478,12 +478,17 @@ class JargonMiner:
|
||||||
|
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
||||||
async def run_once(self, messages: List[Any]) -> None:
|
async def run_once(
|
||||||
|
self,
|
||||||
|
messages: List[Any],
|
||||||
|
person_name_filter: Optional[Callable[[str], bool]] = None
|
||||||
|
) -> None:
|
||||||
"""
|
"""
|
||||||
运行一次黑话提取
|
运行一次黑话提取
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
messages: 外部传入的消息列表(必需)
|
messages: 外部传入的消息列表(必需)
|
||||||
|
person_name_filter: 可选的过滤函数,用于检查内容是否包含人物名称
|
||||||
"""
|
"""
|
||||||
# 使用异步锁防止并发执行
|
# 使用异步锁防止并发执行
|
||||||
async with self._extraction_lock:
|
async with self._extraction_lock:
|
||||||
|
|
@ -563,6 +568,11 @@ class JargonMiner:
|
||||||
logger.info(f"解析阶段跳过包含机器人昵称/别名的词条: {content}")
|
logger.info(f"解析阶段跳过包含机器人昵称/别名的词条: {content}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# 检查是否包含人物名称
|
||||||
|
if person_name_filter and person_name_filter(content):
|
||||||
|
logger.info(f"解析阶段跳过包含人物名称的词条: {content}")
|
||||||
|
continue
|
||||||
|
|
||||||
msg_id_str = str(msg_id_value or "").strip()
|
msg_id_str = str(msg_id_value or "").strip()
|
||||||
if not msg_id_str:
|
if not msg_id_str:
|
||||||
logger.warning(f"解析jargon失败:msg_id缺失,content={content}")
|
logger.warning(f"解析jargon失败:msg_id缺失,content={content}")
|
||||||
|
|
@ -723,12 +733,17 @@ class JargonMiner:
|
||||||
logger.error(f"JargonMiner 运行失败: {e}")
|
logger.error(f"JargonMiner 运行失败: {e}")
|
||||||
# 即使失败也保持时间戳更新,避免频繁重试
|
# 即使失败也保持时间戳更新,避免频繁重试
|
||||||
|
|
||||||
async def process_extracted_entries(self, entries: List[Dict[str, List[str]]]) -> None:
|
async def process_extracted_entries(
|
||||||
|
self,
|
||||||
|
entries: List[Dict[str, List[str]]],
|
||||||
|
person_name_filter: Optional[Callable[[str], bool]] = None
|
||||||
|
) -> None:
|
||||||
"""
|
"""
|
||||||
处理已提取的黑话条目(从 expression_learner 路由过来的)
|
处理已提取的黑话条目(从 expression_learner 路由过来的)
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
entries: 黑话条目列表,每个元素格式为 {"content": "...", "raw_content": [...]}
|
entries: 黑话条目列表,每个元素格式为 {"content": "...", "raw_content": [...]}
|
||||||
|
person_name_filter: 可选的过滤函数,用于检查内容是否包含人物名称
|
||||||
"""
|
"""
|
||||||
if not entries:
|
if not entries:
|
||||||
return
|
return
|
||||||
|
|
@ -738,6 +753,14 @@ class JargonMiner:
|
||||||
merged_entries: OrderedDict[str, Dict[str, List[str]]] = OrderedDict()
|
merged_entries: OrderedDict[str, Dict[str, List[str]]] = OrderedDict()
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
content_key = entry["content"]
|
content_key = entry["content"]
|
||||||
|
|
||||||
|
# 检查是否包含人物名称
|
||||||
|
logger.info(f"process_extracted_entries 检查是否包含人物名称: {content_key}")
|
||||||
|
logger.info(f"person_name_filter: {person_name_filter}")
|
||||||
|
if person_name_filter and person_name_filter(content_key):
|
||||||
|
logger.info(f"process_extracted_entries 跳过包含人物名称的黑话: {content_key}")
|
||||||
|
continue
|
||||||
|
|
||||||
raw_list = entry.get("raw_content", []) or []
|
raw_list = entry.get("raw_content", []) or []
|
||||||
if content_key in merged_entries:
|
if content_key in merged_entries:
|
||||||
merged_entries[content_key]["raw_content"].extend(raw_list)
|
merged_entries[content_key]["raw_content"].extend(raw_list)
|
||||||
|
|
|
||||||
|
|
@ -1,16 +1,34 @@
|
||||||
import time
|
import time
|
||||||
import asyncio
|
import asyncio
|
||||||
from typing import List, Any
|
from typing import List, Any, Optional
|
||||||
|
from collections import OrderedDict
|
||||||
|
from dataclasses import dataclass
|
||||||
from src.common.logger import get_logger
|
from src.common.logger import get_logger
|
||||||
from src.config.config import global_config
|
from src.config.config import global_config
|
||||||
from src.chat.message_receive.chat_stream import get_chat_manager
|
from src.chat.message_receive.chat_stream import get_chat_manager
|
||||||
from src.chat.utils.chat_message_builder import get_raw_msg_by_timestamp_with_chat_inclusive
|
from src.chat.utils.chat_message_builder import get_raw_msg_by_timestamp_with_chat_inclusive
|
||||||
from src.bw_learner.expression_learner import expression_learner_manager
|
from src.bw_learner.expression_learner import expression_learner_manager
|
||||||
from src.bw_learner.jargon_miner import miner_manager
|
from src.bw_learner.jargon_miner import miner_manager
|
||||||
|
from src.person_info.person_info import Person
|
||||||
|
|
||||||
logger = get_logger("bw_learner")
|
logger = get_logger("bw_learner")
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class PersonInfo:
    """Information about one person taking part in a chat."""

    user_id: str
    user_platform: str
    user_nickname: str
    user_cardname: Optional[str]
    person_name: str
    # Time of this person's most recent message.
    last_seen_time: float

    def get_unique_key(self) -> str:
        """Return the ``platform:user_id`` identifier used for de-duplication."""
        return "{}:{}".format(self.user_platform, self.user_id)
|
||||||
|
|
||||||
|
|
||||||
class MessageRecorder:
|
class MessageRecorder:
|
||||||
"""
|
"""
|
||||||
统一的消息记录器,负责管理时间窗口和消息提取,并将消息分发给 expression_learner 和 jargon_miner
|
统一的消息记录器,负责管理时间窗口和消息提取,并将消息分发给 expression_learner 和 jargon_miner
|
||||||
|
|
@ -27,6 +45,11 @@ class MessageRecorder:
|
||||||
# 提取锁,防止并发执行
|
# 提取锁,防止并发执行
|
||||||
self._extraction_lock = asyncio.Lock()
|
self._extraction_lock = asyncio.Lock()
|
||||||
|
|
||||||
|
# 维护参与该chat_id的人物列表(最多30个,使用OrderedDict保持插入顺序)
|
||||||
|
# key: f"{platform}:{user_id}", value: PersonInfo
|
||||||
|
self._person_list: OrderedDict[str, PersonInfo] = OrderedDict()
|
||||||
|
self._max_person_count = 30
|
||||||
|
|
||||||
# 获取 expression 和 jargon 的配置参数
|
# 获取 expression 和 jargon 的配置参数
|
||||||
self._init_parameters()
|
self._init_parameters()
|
||||||
|
|
||||||
|
|
@ -111,6 +134,11 @@ class MessageRecorder:
|
||||||
# 按时间排序,确保顺序一致
|
# 按时间排序,确保顺序一致
|
||||||
messages = sorted(messages, key=lambda msg: msg.time or 0)
|
messages = sorted(messages, key=lambda msg: msg.time or 0)
|
||||||
|
|
||||||
|
# 更新参与聊天的人物列表
|
||||||
|
self._update_person_list(messages)
|
||||||
|
|
||||||
|
logger.info(f"聊天流 {self.chat_name} 的人物列表: {self._person_list}")
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"聊天流 {self.chat_name} 提取到 {len(messages)} 条消息,"
|
f"聊天流 {self.chat_name} 提取到 {len(messages)} 条消息,"
|
||||||
f"时间窗口: {extraction_start_time:.2f} - {extraction_end_time:.2f}"
|
f"时间窗口: {extraction_start_time:.2f} - {extraction_end_time:.2f}"
|
||||||
|
|
@ -152,8 +180,11 @@ class MessageRecorder:
|
||||||
messages: 消息列表
|
messages: 消息列表
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
# 传递消息给 ExpressionLearner(必需参数)
|
# 传递消息和过滤函数给 ExpressionLearner
|
||||||
learnt_style = await self.expression_learner.learn_and_store(messages=messages)
|
learnt_style = await self.expression_learner.learn_and_store(
|
||||||
|
messages=messages,
|
||||||
|
person_name_filter=self.contains_person_name
|
||||||
|
)
|
||||||
|
|
||||||
if learnt_style:
|
if learnt_style:
|
||||||
logger.info(f"聊天流 {self.chat_name} 表达学习完成")
|
logger.info(f"聊天流 {self.chat_name} 表达学习完成")
|
||||||
|
|
@ -179,14 +210,124 @@ class MessageRecorder:
|
||||||
messages: 消息列表
|
messages: 消息列表
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
# 传递消息给 JargonMiner,避免它重复获取
|
# 传递消息和过滤函数给 JargonMiner
|
||||||
await self.jargon_miner.run_once(messages=messages)
|
await self.jargon_miner.run_once(
|
||||||
|
messages=messages,
|
||||||
|
person_name_filter=self.contains_person_name
|
||||||
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"为聊天流 {self.chat_name} 触发黑话提取失败: {e}")
|
logger.error(f"为聊天流 {self.chat_name} 触发黑话提取失败: {e}")
|
||||||
import traceback
|
import traceback
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
||||||
|
def _update_person_list(self, messages: List[Any]) -> None:
|
||||||
|
"""
|
||||||
|
从消息中提取人物信息并更新人物列表
|
||||||
|
|
||||||
|
Args:
|
||||||
|
messages: 消息列表
|
||||||
|
"""
|
||||||
|
for msg in messages:
|
||||||
|
# 获取消息发送者信息
|
||||||
|
# 消息对象可能是 DatabaseMessages,它有 user_info 属性
|
||||||
|
if hasattr(msg, 'user_info'):
|
||||||
|
# DatabaseMessages 类型
|
||||||
|
user_info = msg.user_info
|
||||||
|
user_id = getattr(user_info, 'user_id', None) or ''
|
||||||
|
user_platform = getattr(user_info, 'platform', None) or ''
|
||||||
|
user_nickname = getattr(user_info, 'user_nickname', None) or ''
|
||||||
|
user_cardname = getattr(user_info, 'user_cardname', None)
|
||||||
|
else:
|
||||||
|
# 直接属性访问
|
||||||
|
user_id = getattr(msg, 'user_id', None) or ''
|
||||||
|
user_platform = getattr(msg, 'user_platform', None) or ''
|
||||||
|
user_nickname = getattr(msg, 'user_nickname', None) or ''
|
||||||
|
user_cardname = getattr(msg, 'user_cardname', None)
|
||||||
|
|
||||||
|
msg_time = getattr(msg, 'time', time.time())
|
||||||
|
|
||||||
|
# 检查必要信息
|
||||||
|
if not user_id or not user_platform:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# 获取 person_name
|
||||||
|
try:
|
||||||
|
person = Person(platform=user_platform, user_id=str(user_id))
|
||||||
|
person_name = person.person_name or user_nickname or (user_cardname if user_cardname else "未知用户")
|
||||||
|
except Exception as e:
|
||||||
|
logger.info(f"获取person_name失败: {e}, 使用nickname")
|
||||||
|
person_name = user_nickname or (user_cardname if user_cardname else "未知用户")
|
||||||
|
|
||||||
|
# 生成唯一key
|
||||||
|
unique_key = f"{user_platform}:{user_id}"
|
||||||
|
|
||||||
|
# 如果已存在,更新最后发言时间
|
||||||
|
if unique_key in self._person_list:
|
||||||
|
self._person_list[unique_key].last_seen_time = msg_time
|
||||||
|
# 移动到末尾(表示最近活跃)
|
||||||
|
self._person_list.move_to_end(unique_key)
|
||||||
|
else:
|
||||||
|
# 如果超过最大数量,移除最早的(最前面的)
|
||||||
|
if len(self._person_list) >= self._max_person_count:
|
||||||
|
oldest_key = next(iter(self._person_list))
|
||||||
|
del self._person_list[oldest_key]
|
||||||
|
logger.info(f"人物列表已满,移除最早的人物: {oldest_key}")
|
||||||
|
|
||||||
|
# 添加新人物
|
||||||
|
person_info = PersonInfo(
|
||||||
|
user_id=str(user_id),
|
||||||
|
user_platform=user_platform,
|
||||||
|
user_nickname=user_nickname or "",
|
||||||
|
user_cardname=user_cardname,
|
||||||
|
person_name=person_name,
|
||||||
|
last_seen_time=msg_time
|
||||||
|
)
|
||||||
|
self._person_list[unique_key] = person_info
|
||||||
|
logger.info(f"添加新人物到列表: {unique_key}, person_name={person_name}")
|
||||||
|
|
||||||
|
def contains_person_name(self, content: str) -> bool:
    """
    Check whether ``content`` mentions any tracked chat participant.

    The comparison is a case-insensitive substring match of each
    participant's person name, nickname and card name against the stripped
    content. The placeholder person name stored for participants without
    any known name is not treated as a real name, so text that merely
    contains that phrase is not flagged.

    Args:
        content: Text to inspect.

    Returns:
        bool: True if any participant name occurs in ``content``; False
        otherwise, including when ``content`` is empty or no participants
        are tracked.
    """
    if not content or not self._person_list:
        return False

    haystack = content.strip().lower()
    if not haystack:
        return False

    # Fallback person_name used when a participant has no usable name.
    placeholder_name = "未知用户"

    for person_info in self._person_list.values():
        candidates = (
            ("person_name", person_info.person_name),
            ("nickname", person_info.user_nickname),
            ("cardname", person_info.user_cardname),
        )
        for label, raw_name in candidates:
            if not raw_name:
                continue
            stripped = raw_name.strip()
            # The placeholder only ever appears as person_name; a nickname
            # or card name with that text is a real name and still matches.
            if label == "person_name" and stripped == placeholder_name:
                continue
            needle = stripped.lower()
            if needle and needle in haystack:
                logger.debug(f"内容包含{label}: {raw_name} in {content}")
                return True

    return False
|
||||||
|
|
||||||
|
|
||||||
class MessageRecorderManager:
|
class MessageRecorderManager:
|
||||||
"""MessageRecorder 管理器"""
|
"""MessageRecorder 管理器"""
|
||||||
|
|
|
||||||
|
|
@ -36,7 +36,7 @@ def init_replyer_prompt():
|
||||||
{reply_target_block}。
|
{reply_target_block}。
|
||||||
{planner_reasoning}
|
{planner_reasoning}
|
||||||
{identity}
|
{identity}
|
||||||
{chat_prompt}你正在群里聊天,现在请你读读之前的聊天记录,把握当前的话题,然后给出口语化回复,
|
{chat_prompt}你正在群里聊天,现在请你读读之前的聊天记录,把握当前的话题,然后给出日常且简短的回复,。
|
||||||
{keywords_reaction_prompt}
|
{keywords_reaction_prompt}
|
||||||
请注意把握聊天内容。
|
请注意把握聊天内容。
|
||||||
{reply_style}
|
{reply_style}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue