better:优化错别字生成和分段

pull/1354/head
SengokuCola 2025-11-10 01:13:02 +08:00
parent 70cffcc387
commit 10cd2474af
2 changed files with 77 additions and 41 deletions

View File

@ -221,13 +221,13 @@ def split_into_sentences_w_remove_punctuation(text: str) -> list[str]:
while i < len(text): while i < len(text):
char = text[i] char = text[i]
if char in separators: if char in separators:
# 检查分割条件:如果分隔符左右都是英文字母,则不分割 # 检查分割条件:如果空格左右都是英文字母,则不分割(仅对空格应用此规则)
can_split = True can_split = True
if 0 < i < len(text) - 1: if 0 < i < len(text) - 1:
prev_char = text[i - 1] prev_char = text[i - 1]
next_char = text[i + 1] next_char = text[i + 1]
# if is_english_letter(prev_char) and is_english_letter(next_char) and char == ' ': # 原计划只对空格应用此规则,现应用于所有分隔符 # 只对空格应用"不分割两个英文之间的空格"规则
if is_english_letter(prev_char) and is_english_letter(next_char): if char == ' ' and is_english_letter(prev_char) and is_english_letter(next_char):
can_split = False can_split = False
if can_split: if can_split:
@ -388,9 +388,16 @@ def process_llm_response(text: str, enable_splitter: bool = True, enable_chinese
for sentence in split_sentences: for sentence in split_sentences:
if global_config.chinese_typo.enable and enable_chinese_typo: if global_config.chinese_typo.enable and enable_chinese_typo:
typoed_text, typo_corrections = typo_generator.create_typo_sentence(sentence) typoed_text, typo_corrections = typo_generator.create_typo_sentence(sentence)
sentences.append(typoed_text)
if typo_corrections: if typo_corrections:
sentences.append(typo_corrections) # 50%概率新增正确字/词,50%概率用正确分句替换错别字分句
if random.random() < 0.5:
sentences.append(typoed_text)
sentences.append(typo_corrections)
else:
# 用正确的分句替换错别字分句
sentences.append(sentence)
else:
sentences.append(typoed_text)
else: else:
sentences.append(sentence) sentences.append(sentence)

View File

@ -1,6 +1,7 @@
import time import time
import json import json
import re import re
import random
from typing import List, Dict, Any, Optional, Tuple from typing import List, Dict, Any, Optional, Tuple
from src.common.logger import get_logger from src.common.logger import get_logger
from src.config.config import global_config, model_config from src.config.config import global_config, model_config
@ -63,8 +64,7 @@ def init_memory_retrieval_prompt():
# 第二步ReAct Agent prompt工具描述会在运行时动态生成 # 第二步ReAct Agent prompt工具描述会在运行时动态生成
Prompt( Prompt(
""" """你需要通过思考(Think)、行动(Action)、观察(Observation)的循环来回答问题。
你是一个记忆检索助手需要通过思考(Think)行动(Action)观察(Observation)的循环来回答问题
当前问题{question} 当前问题{question}
已收集的信息 已收集的信息
@ -77,14 +77,13 @@ def init_memory_retrieval_prompt():
```json ```json
{{ {{
"thought": "你的思考过程,分析当前情况,决定下一步行动", "thought": "你的思考过程,分析当前情况,决定下一步行动",
"action": "要执行的动作,格式为:工具名(参数)",
"action_type": {action_types_list}, "action_type": {action_types_list},
"action_params": {{参数名: 参数值}} null "action_params": {{参数名: 参数值}} null
}} }}
``` ```
你可以选择以下动作 你可以选择以下动作
1. 如果已经收集到足够的信息可以回答问题请设置action_type为"final_answer"并在thought中说明答案 1. 如果已经收集到足够的信息可以回答问题请设置action_type为"final_answer"并在thought中说明答案除非明确找到答案否则不要设置为final_answer
2. 如果经过多次查询后确认无法找到相关信息或答案请设置action_type为"no_answer"并在thought中说明原因 2. 如果经过多次查询后确认无法找到相关信息或答案请设置action_type为"no_answer"并在thought中说明原因
请只输出JSON不要输出其他内容 请只输出JSON不要输出其他内容
@ -341,17 +340,18 @@ def _query_thinking_back(chat_id: str, question: str) -> Optional[Tuple[bool, st
question: 问题 question: 问题
Returns: Returns:
Optional[Tuple[bool, str]]: 如果找到答案返回(True, answer)否则返回None Optional[Tuple[bool, str]]: 如果找到记录返回(found_answer, answer)否则返回None
found_answer: 是否找到答案True表示found_answer=1False表示found_answer=0
answer: 答案内容
""" """
try: try:
# 查询相同chat_id和问题且found_answer为True的记录 # 查询相同chat_id和问题的所有记录包括found_answer为0和1的
# 按更新时间倒序,获取最新的答案 # 按更新时间倒序,获取最新的记录
records = ( records = (
ThinkingBack.select() ThinkingBack.select()
.where( .where(
(ThinkingBack.chat_id == chat_id) & (ThinkingBack.chat_id == chat_id) &
(ThinkingBack.question == question) & (ThinkingBack.question == question)
(ThinkingBack.found_answer == 1)
) )
.order_by(ThinkingBack.update_time.desc()) .order_by(ThinkingBack.update_time.desc())
.limit(1) .limit(1)
@ -359,8 +359,10 @@ def _query_thinking_back(chat_id: str, question: str) -> Optional[Tuple[bool, st
if records.exists(): if records.exists():
record = records.get() record = records.get()
logger.info(f"在thinking_back中找到现成答案问题: {question[:50]}...") found_answer = bool(record.found_answer)
return True, record.answer or "" answer = record.answer or ""
logger.info(f"在thinking_back中找到记录问题: {question[:50]}...found_answer: {found_answer}")
return found_answer, answer
return None return None
@ -503,34 +505,61 @@ async def build_memory_retrieval_prompt(
# 先检查thinking_back数据库中是否有现成答案 # 先检查thinking_back数据库中是否有现成答案
cached_result = _query_thinking_back(chat_id, question) cached_result = _query_thinking_back(chat_id, question)
should_requery = False
if cached_result: if cached_result:
found_answer, answer = cached_result cached_found_answer, cached_answer = cached_result
# 根据found_answer的值决定是否重新查询
if cached_found_answer: # found_answer == 1 (True)
# found_answer == 1:20%概率重新查询
if random.random() < 0.2:
should_requery = True
logger.info(f"found_answer=1触发20%概率重新查询,问题: {question[:50]}...")
else:
# 使用缓存答案
if cached_answer:
logger.info(f"从thinking_back缓存中获取答案found_answer=1问题: {question[:50]}...")
all_results.append(f"问题:{question}\n答案:{cached_answer}")
continue # 跳过ReAct Agent查询
else: # found_answer == 0 (False)
# found_answer == 0:40%概率重新查询
if random.random() < 0.4:
should_requery = True
logger.info(f"found_answer=0触发40%概率重新查询,问题: {question[:50]}...")
else:
# 使用缓存答案(即使found_answer=0,也可能有部分答案)
if cached_answer:
logger.info(f"从thinking_back缓存中获取答案found_answer=0问题: {question[:50]}...")
all_results.append(f"问题:{question}\n答案:{cached_answer}")
continue # 跳过ReAct Agent查询
# 如果没有缓存答案或需要重新查询,使用ReAct Agent查询
if not cached_result or should_requery:
if should_requery:
logger.info(f"概率触发重新查询使用ReAct Agent查询问题: {question[:50]}...")
else:
logger.info(f"未找到缓存答案使用ReAct Agent查询问题: {question[:50]}...")
found_answer, answer, thinking_steps = await _react_agent_solve_question(
question=question,
chat_id=chat_id,
max_iterations=5,
timeout=30.0
)
# 存储到数据库
_store_thinking_back(
chat_id=chat_id,
question=question,
context=message, # 只存储前500字符作为上下文
found_answer=found_answer,
answer=answer,
thinking_steps=thinking_steps
)
if found_answer and answer: if found_answer and answer:
logger.info(f"从thinking_back缓存中获取答案问题: {question[:50]}...")
all_results.append(f"问题:{question}\n答案:{answer}") all_results.append(f"问题:{question}\n答案:{answer}")
continue # 跳过ReAct Agent查询
# 如果没有缓存答案使用ReAct Agent查询
logger.info(f"未找到缓存答案使用ReAct Agent查询问题: {question[:50]}...")
found_answer, answer, thinking_steps = await _react_agent_solve_question(
question=question,
chat_id=chat_id,
max_iterations=5,
timeout=30.0
)
# 存储到数据库
_store_thinking_back(
chat_id=chat_id,
question=question,
context=message, # 只存储前500字符作为上下文
found_answer=found_answer,
answer=answer,
thinking_steps=thinking_steps
)
if found_answer and answer:
all_results.append(f"问题:{question}\n答案:{answer}")
end_time = time.time() end_time = time.time()