fix. 书名号恢复

pull/937/head
Bakadax 2025-05-17 03:18:02 +08:00
parent e7206e4da6
commit 2080c02f54
1 changed files with 339 additions and 229 deletions

View File

@ -19,141 +19,174 @@ from ...config.config import global_config
logger = get_module_logger("chat_utils")

# --- Global constants and precompiled regular expressions ---
# \p{L} matches any kind of letter from any language.
_L_REGEX = regex.compile(r"\p{L}")
# \p{Han} matches Han (Chinese) characters.
_HAN_CHAR_REGEX = regex.compile(r"\p{Han}")
# \p{Nd} matches decimal digit characters.
_Nd_REGEX = regex.compile(r"\p{Nd}")

# Prefix for the placeholders that temporarily replace 《...》 book-title spans.
BOOK_TITLE_PLACEHOLDER_PREFIX = "__BOOKTITLE_"

# Sentence separator characters.
# NOTE(review): several full-width punctuation characters in this set were
# lost during text extraction (they rendered as empty strings).  The members
# below are a best-effort reconstruction -- confirm against version control.
SEPARATORS = {"。", "，", ",", " ", ";", "\xa0", "\n", ".", "！", "？", "…"}

# Known English abbreviations ending with a dot, used so the trailing dot of
# an abbreviation is not mistaken for a sentence terminator.
KNOWN_ABBREVIATIONS_ENDING_WITH_DOT = {
    "Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "St.", "Messrs.", "Mmes.", "Capt.", "Gov.",
    "Inc.", "Ltd.", "Corp.", "Co.", "PLC", "vs.", "etc.", "i.e.", "e.g.", "viz.",
    "al.", "et al.", "ca.", "cf.", "No.", "Vol.", "pp.", "fig.", "figs.", "ed.",
    "Ph.D.", "M.D.", "B.A.", "M.A.", "Jan.", "Feb.", "Mar.", "Apr.", "Jun.", "Jul.",
    "Aug.", "Sep.", "Oct.", "Nov.", "Dec.", "Mon.", "Tue.", "Wed.", "Thu.", "Fri.",
    "Sat.", "Sun.", "U.S.", "U.K.", "E.U.", "U.S.A.", "U.S.S.R.", "Ave.", "Blvd.",
    "Rd.", "Ln.", "approx.", "dept.", "appt.", "श्री.",  # Hindi "Shri."
}
# --- Helper functions ---
def is_letter_not_han(char_str: str) -> bool:
    """
    Check whether a single character is a letter but NOT a Han character.

    Latin, Cyrillic, Hangul letters etc. return True; Han characters,
    digits, punctuation and whitespace return False.

    Args:
        char_str: The single character to check.

    Returns:
        bool: True if the character is a non-Han letter, False otherwise.
    """
    if not isinstance(char_str, str) or len(char_str) != 1:
        return False  # input must be a one-character string
    if _L_REGEX.fullmatch(char_str) is None:
        return False  # not a letter at all
    # It is a letter; a Han character does not count.
    return _HAN_CHAR_REGEX.fullmatch(char_str) is None
def is_han_character(char_str: str) -> bool:
    r"""
    Check whether a single character is a Han character (Unicode \p{Han}).

    Args:
        char_str: The single character to check.

    Returns:
        bool: True if the character is a Han character, False otherwise.
    """
    if not isinstance(char_str, str) or len(char_str) != 1:
        return False
    return _HAN_CHAR_REGEX.fullmatch(char_str) is not None
def is_digit(char_str: str) -> bool:
    r"""
    Check whether a single character is a Unicode decimal digit (\p{Nd}).

    Args:
        char_str: The single character to check.

    Returns:
        bool: True if the character is a decimal digit, False otherwise.
    """
    if not isinstance(char_str, str) or len(char_str) != 1:
        return False
    return _Nd_REGEX.fullmatch(char_str) is not None
def is_relevant_word_char(char_str: str) -> bool:
    """
    Check whether a character is a "word" character: a non-Han letter or a digit.

    Used to decide, in non-Chinese context, whether the characters on both
    sides of a space belong to one continuous word -- in which case the space
    must not be used as a split point.

    Args:
        char_str: The single character to check.

    Returns:
        bool: True for non-Han letters and decimal digits, False otherwise.
    """
    if not isinstance(char_str, str) or len(char_str) != 1:
        return False
    if _L_REGEX.fullmatch(char_str):
        # A letter counts only when it is not a Han character.
        return not _HAN_CHAR_REGEX.fullmatch(char_str)
    if _Nd_REGEX.fullmatch(char_str):
        return True  # digits are word characters as well
    return False
def is_english_letter(char: str) -> bool:
    """
    Check whether a single character is an English (ASCII) letter, case-insensitive.

    Args:
        char: The single character to check.

    Returns:
        bool: True if the character is A-Z or a-z, False otherwise.
    """
    # Explicit ASCII check instead of the old `"a" <= char.lower() <= "z"`:
    # that comparison wrongly accepted some non-ASCII letters whose lowercase
    # form sorts inside a-z (e.g. "İ".lower() == "i̇") as well as multi-character
    # strings such as "ab".
    return len(char) == 1 and char.isascii() and char.isalpha()
def protect_book_titles(text: str) -> tuple[str, dict[str, str]]:
    """
    Replace every 《...》 book-title span in *text* with a unique placeholder.

    Args:
        text: The original input text.

    Returns:
        tuple[str, dict[str, str]]: ``(protected_text, mapping)`` where
        ``mapping`` maps each placeholder back to the original span,
        《》 marks included.
    """
    mapping: dict[str, str] = {}
    # Non-greedy body so consecutive titles like 《A》《B》 match one by one.
    pattern = re.compile(r"《(.*?)》")

    def _substitute(match) -> str:
        # One unique placeholder per matched title, numbered by insertion order.
        key = f"{BOOK_TITLE_PLACEHOLDER_PREFIX}{len(mapping)}__"
        mapping[key] = match.group(0)  # keep the 《》 marks for exact restoration
        return key

    return pattern.sub(_substitute, text), mapping
def recover_book_titles(sentences: list[str], book_title_mapping: dict[str, str]) -> list[str]:
    """
    Restore book-title placeholders in each sentence to the original 《...》 text.

    Args:
        sentences: Sentences that may contain book-title placeholders.
        book_title_mapping: Placeholder -> original book-title span.

    Returns:
        list[str]: Sentences with every placeholder substituted back.
    """
    if not sentences:
        return []
    restored = []
    for item in sentences:
        # Non-string elements are passed through untouched.
        if isinstance(item, str):
            for key, original in book_title_mapping.items():
                item = item.replace(key, original)
        restored.append(item)
    return restored
def db_message_to_str(message_dict: dict) -> str:
logger.debug(f"message_dict: {message_dict}")
time_str = time.strftime("%m-%d %H:%M:%S", time.localtime(message_dict["time"]))
@ -305,180 +338,279 @@ def get_recent_group_speaker(chat_stream_id: int, sender, limit: int = 12) -> li
def split_into_sentences_w_remove_punctuation(original_text: str) -> list[str]:
    """
    Split the input text into a list of sentences.

    Pipeline:
      1. Protect book titles (《...》) with placeholders.
      2. Pre-process the text (normalize newlines).
      3. Cut the text into preliminary (content, separator) segments.
      4. Assemble preliminary sentences from the segments; a space between
         two Han characters is also treated as a split point.
      5. Randomly merge adjacent sentences (probability scales with length).
      6. Randomly strip punctuation from each resulting sentence.
      7. Restore the protected book titles.

    Args:
        original_text: The raw input text.

    Returns:
        list[str]: The processed sentence list.

    NOTE(review): this body was reconstructed from a garbled diff; several
    full-width punctuation literals below were lost in extraction and are
    best-effort restorations -- confirm against version control.
    """
    # Step 1: protect book titles so the splitter never cuts inside them.
    text, local_book_title_mapping = protect_book_titles(original_text)
    perform_book_title_recovery_here = True  # gate for the final recovery step (debug aid)

    # Step 2: pre-processing.
    text = regex.sub(r"\n\s*\n+", "\n", text)  # collapse runs of blank lines into one newline
    text = regex.sub(r"\n\s*([—。.,;\s\xa0])", r"\1", text)  # drop newline (and spaces) before a separator
    text = regex.sub(r"([—。.,;\s\xa0])\s*\n", r"\1", text)  # drop newline (and spaces) after a separator

    def replace_han_newline(match):
        """Replace a single newline between two Han characters with a comma."""
        char1 = match.group(1)
        char2 = match.group(2)
        if is_han_character(char1) and is_han_character(char2):
            return char1 + "，" + char2  # newline between Han characters becomes a comma
        return match.group(0)  # anything else is left untouched

    text = regex.sub(r"(.)\n(.)", replace_han_newline, text)

    len_text = len(text)  # length measured after book-title protection

    # Early return for very short text that contains no book titles.
    if len_text < 3 and not local_book_title_mapping:
        stripped_text = text.strip()
        if not stripped_text:  # nothing but whitespace
            return []
        if len(stripped_text) == 1 and stripped_text in SEPARATORS:
            return []  # a lone separator counts as empty input
        # Too short for random punctuation removal; return as-is.
        return [stripped_text]

    # Step 3: cut the text into (content, separator) segments.
    segments = []
    current_segment = ""  # content accumulated for the segment being built
    i = 0
    while i < len(text):
        char = text[i]
        if char in SEPARATORS:
            can_split_current_char = True  # by default a separator splits

            if char == ".":
                can_split_this_dot = True
                # Rule 1: decimal point (digit.digit) -- do not split.
                if 0 < i < len_text - 1 and is_digit(text[i - 1]) and is_digit(text[i + 1]):
                    can_split_this_dot = False
                # Rule 2: dot inside a Western word/domain (letter.letter) -- do not split.
                elif 0 < i < len_text - 1 and is_letter_not_han(text[i - 1]) and is_letter_not_han(text[i + 1]):
                    can_split_this_dot = False
                # Rule 3: trailing dot of a known abbreviation ("e.g. ", "U.S.A. ") -- do not split.
                else:
                    potential_abbreviation_word = current_segment + char
                    is_followed_by_space = i + 1 < len_text and text[i + 1] == " "
                    is_at_end_of_text = i + 1 == len_text
                    if potential_abbreviation_word in KNOWN_ABBREVIATIONS_ENDING_WITH_DOT and \
                            (is_followed_by_space or is_at_end_of_text):
                        can_split_this_dot = False
                can_split_current_char = can_split_this_dot
            elif char == " " or char == "\xa0":
                # A space inside a non-Chinese word ("hello world") must not split.
                if 0 < i < len_text - 1:
                    prev_char = text[i - 1]
                    next_char = text[i + 1]
                    if is_relevant_word_char(prev_char) and is_relevant_word_char(next_char):
                        can_split_current_char = False

            if can_split_current_char:
                if current_segment:  # flush the accumulated content with its separator
                    segments.append((current_segment, char))
                # Keep a separator even with empty content when it is meaningful
                # (anything but a plain layout space, or an explicit newline).
                elif char not in [" ", "\xa0"] or char == "\n":
                    segments.append(("", char))
                current_segment = ""
            else:
                current_segment += char  # non-splitting separator stays inside the content
        else:
            current_segment += char
        i += 1
    if current_segment:  # trailing content with no following separator
        segments.append((current_segment, ""))

    # Step 3.1: drop whitespace-only contents but keep meaningful separators.
    filtered_segments = []
    for content, sep in segments:
        stripped_content = content.strip()
        if stripped_content:
            filtered_segments.append((stripped_content, sep))
        elif sep and (sep not in [" ", "\xa0"] or sep == "\n"):
            filtered_segments.append(("", sep))
    segments = filtered_segments

    # Step 4: assemble preliminary sentences from the segments.
    preliminary_final_sentences = []
    current_sentence_build = ""
    num_segments = len(segments)
    for k, (content, sep) in enumerate(segments):
        current_sentence_build += content

        # Classify the separator.
        # NOTE(review): four members of this set were garbled in extraction;
        # 。！？… is the best-effort restoration -- TODO confirm.
        is_strong_terminator = sep in {"。", ".", "！", "？", "\n", "…"}
        is_space_separator = sep in [" ", "\xa0"]

        append_sep_to_current = is_strong_terminator  # only strong terminators stick to the sentence
        should_split_now = False
        if is_strong_terminator:
            should_split_now = True
        elif is_space_separator:
            # Split on the Han-space-Han pattern.
            if current_sentence_build:
                last_char_of_build_stripped = current_sentence_build.strip()
                if last_char_of_build_stripped and is_han_character(last_char_of_build_stripped[-1]):
                    if k + 1 < num_segments:
                        next_content_tuple = segments[k + 1]
                        if next_content_tuple:
                            next_content = next_content_tuple[0]
                            if next_content and is_han_character(next_content[0]):
                                should_split_now = True
                                append_sep_to_current = False  # the space itself is the boundary
            if not should_split_now:
                # Plain joining space: add once, normalized to " ".
                if current_sentence_build and not current_sentence_build.endswith(" ") \
                        and not current_sentence_build.endswith("\xa0"):
                    current_sentence_build += " "
                append_sep_to_current = False

        if should_split_now:
            if append_sep_to_current and sep:
                current_sentence_build += sep
            stripped_sentence = current_sentence_build.strip()
            if stripped_sentence:
                preliminary_final_sentences.append(stripped_sentence)
            current_sentence_build = ""
        elif sep and not is_space_separator:
            # Weak separators (commas etc.) stay attached and keep building...
            current_sentence_build += sep
            # ...unless this is the very last segment, which closes the sentence.
            if k == num_segments - 1 and current_sentence_build.strip():
                preliminary_final_sentences.append(current_sentence_build.strip())
                current_sentence_build = ""

    if current_sentence_build.strip():  # flush whatever is still being built
        preliminary_final_sentences.append(current_sentence_build.strip())
    preliminary_final_sentences = [s for s in preliminary_final_sentences if s.strip()]

    # --- Merging, punctuation removal and book-title recovery ---
    intermediate_sentences_placeholders = []
    if not preliminary_final_sentences:
        # Case 1: nothing was produced.  If the whole (protected) text is a
        # single book-title placeholder, keep it so the recovery step can
        # restore it; otherwise the result stays empty.
        if local_book_title_mapping and text in local_book_title_mapping:
            intermediate_sentences_placeholders = [text]
    elif len(preliminary_final_sentences) == 1:
        # Case 2: a single sentence -- only random punctuation removal applies.
        s = preliminary_final_sentences[0].strip()
        if s:
            s = random_remove_punctuation(s)
            intermediate_sentences_placeholders = [s] if s else []
    else:
        # Case 3: several sentences -- randomly merge, then clean each one.
        final_sentences_merged = []
        # Merge strength scales with the ORIGINAL (unprotected) text length.
        original_len_for_strength = len(original_text)
        if original_len_for_strength < 12:
            split_strength = 0.5
        elif original_len_for_strength < 32:
            split_strength = 0.7
        else:
            split_strength = 0.9
        actual_merge_probability = 1.0 - split_strength  # merging complements splitting

        temp_sentence = preliminary_final_sentences[0]
        for i_merge in range(1, len(preliminary_final_sentences)):
            current_sentence_to_merge = preliminary_final_sentences[i_merge]
            should_merge_based_on_punctuation = True
            # A sentence already ending with a strong terminator is never
            # merged with the next one.
            # NOTE(review): "。" and "…" here were garbled in extraction;
            # best-effort restoration -- TODO confirm.
            if temp_sentence and \
                    (temp_sentence.endswith("。") or temp_sentence.endswith(".") or
                     temp_sentence.endswith("!") or temp_sentence.endswith("?") or
                     temp_sentence.endswith("…")):
                should_merge_based_on_punctuation = False
            if random.random() < actual_merge_probability and temp_sentence and should_merge_based_on_punctuation:
                # Join with a single space when neither side provides one.
                if not temp_sentence.endswith(" ") and not current_sentence_to_merge.startswith(" "):
                    temp_sentence += " "
                temp_sentence += current_sentence_to_merge
            else:
                if temp_sentence:
                    final_sentences_merged.append(temp_sentence)
                temp_sentence = current_sentence_to_merge
        if temp_sentence:  # flush the last (possibly merged) sentence
            final_sentences_merged.append(temp_sentence)

        # Clean each merged sentence and apply random punctuation removal.
        processed_temp = []
        for sentence_val in final_sentences_merged:
            s_loop = sentence_val.strip()
            # Drop a trailing comma (half- or full-width).
            if s_loop.endswith(",") or s_loop.endswith("，"):
                s_loop = s_loop[:-1].strip()
            if s_loop:
                s_loop = random_remove_punctuation(s_loop)
                if s_loop:
                    processed_temp.append(s_loop)
        intermediate_sentences_placeholders = processed_temp

    # Final step: restore the protected book titles.
    if perform_book_title_recovery_here and local_book_title_mapping:
        final_sentences_recovered = recover_book_titles(
            intermediate_sentences_placeholders, local_book_title_mapping
        )
    else:
        final_sentences_recovered = intermediate_sentences_placeholders
    return [s for s in final_sentences_recovered if s.strip()]
def random_remove_punctuation(text: str) -> str:
@ -497,13 +629,13 @@ def random_remove_punctuation(text: str) -> str:
if char == "" and i == text_len - 1: # 结尾的句号
if random.random() > 0.1: # 90%概率删除结尾句号
continue
elif char == "":
rand = random.random()
if rand < 0.25: # 25%概率删除逗号
continue
elif rand < 0.2: # 20%概率把逗号变成空格
result += " "
continue
# elif char == "":
# rand = random.random()
# if rand < 0.25: # 25%概率删除逗号
# continue
# elif rand < 0.2: # 20%概率把逗号变成空格
# result += " "
# continue
result += char
return result
@ -921,26 +1053,4 @@ def parse_text_timestamps(text: str, mode: str = "normal") -> str:
pattern_instance = re.escape(match.group(0))
result_text = re.sub(pattern_instance, readable_time, result_text, count=1)
return result_text
# NOTE(review): this trailing definition duplicates (and, being later in the
# module, shadows) the annotated protect_book_titles defined earlier; it looks
# like leftover code from the refactor and should be removed.
def protect_book_titles(text):
    """Replace every 《...》 span in *text* with a unique placeholder.

    Returns:
        tuple: (protected_text, mapping) where mapping maps each placeholder
        to the original span, 《》 marks included.
    """
    book_title_mapping = {}
    book_title_pattern = re.compile(r"《(.*?)》")  # non-greedy match

    def replace_func(match):
        # Generate a unique placeholder for this match.
        placeholder = f"{BOOK_TITLE_PLACEHOLDER_PREFIX}{len(book_title_mapping)}__"
        # Store the full match, book-title marks included.
        book_title_mapping[placeholder] = match.group(0)
        return placeholder

    protected_text = book_title_pattern.sub(replace_func, text)
    return protected_text, book_title_mapping
# NOTE(review): duplicate of the annotated recover_book_titles defined earlier
# in the module; unlike that version it has no guard for non-string elements.
# Leftover from the refactor -- should be removed.
def recover_book_titles(sentences, book_title_mapping):
    """Replace book-title placeholders in each sentence with the original 《...》 text."""
    recovered_sentences = []
    for sentence in sentences:
        # Substitute every known placeholder back into this sentence.
        for placeholder, original_content in book_title_mapping.items():
            sentence = sentence.replace(placeholder, original_content)
        recovered_sentences.append(sentence)
    return recovered_sentences
return result_text