better. 分割函数考虑省略号存在

pull/937/head
Bakadax 2025-05-19 10:26:16 +08:00
parent 09ec1502cd
commit 9ccdc11bc6
1 changed files with 184 additions and 172 deletions

View File

@ -29,6 +29,8 @@ _Nd_REGEX = regex.compile(r"\p{Nd}")
# 书名号占位符的前缀,用于在处理文本时临时替换书名号。 # 书名号占位符的前缀,用于在处理文本时临时替换书名号。
BOOK_TITLE_PLACEHOLDER_PREFIX = "__BOOKTITLE_" BOOK_TITLE_PLACEHOLDER_PREFIX = "__BOOKTITLE_"
# 省略号占位符的前缀
ELLIPSIS_PLACEHOLDER_PREFIX = "__ELLIPSIS_"
# 定义句子分隔符集合。 # 定义句子分隔符集合。
SEPARATORS = {"", "", ",", " ", ";", "\xa0", "\n", ".", "", "", ""} SEPARATORS = {"", "", ",", " ", ";", "\xa0", "\n", ".", "", "", ""}
# 已知的以点号结尾的英文缩写词,用于避免错误地将缩写词中的点号作为句子结束符。 # 已知的以点号结尾的英文缩写词,用于避免错误地将缩写词中的点号作为句子结束符。
@ -57,13 +59,12 @@ def is_letter_not_han(char_str: str) -> bool:
bool: 如果字符是字母且非汉字则为True否则为False bool: 如果字符是字母且非汉字则为True否则为False
""" """
if not isinstance(char_str, str) or len(char_str) != 1: if not isinstance(char_str, str) or len(char_str) != 1:
return False # 输入必须是单个字符的字符串 return False
is_letter = _L_REGEX.fullmatch(char_str) is not None is_letter = _L_REGEX.fullmatch(char_str) is not None
if not is_letter: if not is_letter:
return False # 如果不是字母直接返回False return False
# 是字母,则进一步判断是否为汉字
is_han = _HAN_CHAR_REGEX.fullmatch(char_str) is not None is_han = _HAN_CHAR_REGEX.fullmatch(char_str) is not None
return not is_han # 是字母且不是汉字 return not is_han
def is_han_character(char_str: str) -> bool: def is_han_character(char_str: str) -> bool:
@ -112,13 +113,10 @@ def is_relevant_word_char(char_str: str) -> bool:
""" """
if not isinstance(char_str, str) or len(char_str) != 1: if not isinstance(char_str, str) or len(char_str) != 1:
return False return False
# 检查是否为Unicode字母
if _L_REGEX.fullmatch(char_str): if _L_REGEX.fullmatch(char_str):
# 如果是字母,则检查是否非汉字
return not _HAN_CHAR_REGEX.fullmatch(char_str) return not _HAN_CHAR_REGEX.fullmatch(char_str)
# 检查是否为Unicode数字
if _Nd_REGEX.fullmatch(char_str): if _Nd_REGEX.fullmatch(char_str):
return True # 数字本身被视为相关词语字符 return True
return False return False
@ -149,13 +147,10 @@ def protect_book_titles(text: str) -> tuple[str, dict[str, str]]:
- book_title_mapping (dict): 占位符到原始书名号内容含书名号本身的映射 - book_title_mapping (dict): 占位符到原始书名号内容含书名号本身的映射
""" """
book_title_mapping = {} book_title_mapping = {}
# 正则表达式匹配《内容》形式的书名号,使用非贪婪匹配 (.*?) 以正确处理。
book_title_pattern = re.compile(r"《(.*?)》") book_title_pattern = re.compile(r"《(.*?)》")
def replace_func(match): def replace_func(match):
# 为每个匹配到的书名号生成一个唯一的占位符。
placeholder = f"{BOOK_TITLE_PLACEHOLDER_PREFIX}{len(book_title_mapping)}__" placeholder = f"{BOOK_TITLE_PLACEHOLDER_PREFIX}{len(book_title_mapping)}__"
# 存储占位符和原始书名号(包括《》)的映射关系。
book_title_mapping[placeholder] = match.group(0) book_title_mapping[placeholder] = match.group(0)
return placeholder return placeholder
@ -174,18 +169,70 @@ def recover_book_titles(sentences: list[str], book_title_mapping: dict[str, str]
list[str]: 书名号占位符被恢复后的句子列表 list[str]: 书名号占位符被恢复后的句子列表
""" """
recovered_sentences = [] recovered_sentences = []
if not sentences: # 如果输入句子列表为空,直接返回空列表 if not sentences:
return [] return []
for sentence in sentences: for sentence in sentences:
if not isinstance(sentence, str): # 添加类型检查,确保每个元素都是字符串 if not isinstance(sentence, str):
recovered_sentences.append(sentence) # 如果不是字符串,直接添加(或选择跳过/记录错误) recovered_sentences.append(sentence)
continue continue
# 遍历映射,将句子中的每个占位符替换回其原始书名号内容。
for placeholder, original_content in book_title_mapping.items(): for placeholder, original_content in book_title_mapping.items():
sentence = sentence.replace(placeholder, original_content) sentence = sentence.replace(placeholder, original_content)
recovered_sentences.append(sentence) recovered_sentences.append(sentence)
return recovered_sentences return recovered_sentences
def protect_ellipsis(text: str) -> tuple[str, dict[str, str]]:
    """Replace every ellipsis run in *text* with a unique placeholder.

    An ellipsis is either a run of three or more ASCII dots ("...", "....",
    and so on) or a single Unicode horizontal-ellipsis character (U+2026 '…').

    Args:
        text: The original input text.

    Returns:
        tuple[str, dict[str, str]]: A pair of
            - the text with each ellipsis swapped for a placeholder, and
            - a mapping from each placeholder back to the original ellipsis
              string, used later to restore the text.
    """
    placeholder_map: dict[str, str] = {}
    # Three-or-more literal dots, or the one-character Unicode ellipsis.
    pattern = re.compile(r"(\.{3,}|\u2026)")

    def _substitute(match):
        # The placeholder index grows with the map size, so every
        # placeholder in a single call is unique.
        key = f"{ELLIPSIS_PLACEHOLDER_PREFIX}{len(placeholder_map)}__"
        placeholder_map[key] = match.group(0)
        return key

    return pattern.sub(_substitute, text), placeholder_map
def recover_ellipsis(sentences: list[str], ellipsis_mapping: dict[str, str]) -> list[str]:
    """Restore ellipsis placeholders in each sentence to the original text.

    Non-string list items are passed through unchanged.

    Args:
        sentences: Sentences that may contain ellipsis placeholders.
        ellipsis_mapping: Mapping from placeholder to the original ellipsis
            string.

    Returns:
        list[str]: The sentences with every known placeholder replaced.
    """
    if not sentences:
        return []

    def _restore(item):
        # Only strings can carry placeholders; anything else is kept as-is.
        if not isinstance(item, str):
            return item
        for key, original in ellipsis_mapping.items():
            item = item.replace(key, original)
        return item

    return [_restore(sentence) for sentence in sentences]
def db_message_to_str(message_dict: dict) -> str: def db_message_to_str(message_dict: dict) -> str:
logger.debug(f"message_dict: {message_dict}") logger.debug(f"message_dict: {message_dict}")
@ -341,14 +388,14 @@ def split_into_sentences_w_remove_punctuation(original_text: str) -> list[str]:
""" """
将输入文本分割成句子列表 将输入文本分割成句子列表
此过程包括 此过程包括
1. 保护书名号 1. 保护书名号和省略号
2. 文本预处理如处理换行符 2. 文本预处理如处理换行符
3. 基于分隔符将文本切分为初步的段落(segments) 3. 基于分隔符将文本切分为初步的段落(segments)
4. 根据段落内容和分隔符类型构建初步的句子列表(preliminary_final_sentences) 4. 根据段落内容和分隔符类型构建初步的句子列表(preliminary_final_sentences)
特别处理汉字间的空格作为分割点 特别处理汉字间的空格作为分割点
5. 对初步句子列表进行可能的合并基于随机概率和文本长度 5. 对初步句子列表进行可能的合并基于随机概率和文本长度
6. 对合并后的句子进行随机标点移除 6. 对合并后的句子进行随机标点移除
7. 恢复书名号 7. 恢复书名号和省略号
8. 返回最终处理过的句子列表 8. 返回最终处理过的句子列表
Args: Args:
@ -357,259 +404,224 @@ def split_into_sentences_w_remove_punctuation(original_text: str) -> list[str]:
Returns: Returns:
list[str]: 分割和处理后的句子列表 list[str]: 分割和处理后的句子列表
""" """
# 步骤1: 保护书名号,将其替换为占位符,并获取映射关系。 # 步骤1: 保护特殊序列
text, local_book_title_mapping = protect_book_titles(original_text) text, local_book_title_mapping = protect_book_titles(original_text)
perform_book_title_recovery_here = True # 控制是否在本函数末尾执行恢复,主要用于调试 text, local_ellipsis_mapping = protect_ellipsis(text) # 新增:保护省略号
perform_book_title_recovery_here = True
# 步骤2: 文本预处理 # 步骤2: 文本预处理
text = regex.sub(r"\n\s*\n+", "\n", text) # 合并多个连续的换行符(及其间的空格)为一个换行符。 text = regex.sub(r"\n\s*\n+", "\n", text)
text = regex.sub(r"\n\s*([—。.,;\s\xa0])", r"\1", text) # 移除分隔符前的换行符和空格。 text = regex.sub(r"\n\s*([—。.,;\s\xa0])", r"\1", text)
text = regex.sub(r"([—。.,;\s\xa0])\s*\n", r"\1", text) # 移除分隔符后的换行符和空格。 text = regex.sub(r"([—。.,;\s\xa0])\s*\n", r"\1", text)
def replace_han_newline(match): def replace_han_newline(match):
"""辅助函数,用于将汉字之间的单个换行符替换为逗号。"""
char1 = match.group(1) char1 = match.group(1)
char2 = match.group(2) char2 = match.group(2)
if is_han_character(char1) and is_han_character(char2): if is_han_character(char1) and is_han_character(char2):
return char1 + "" + char2 # 汉字间的换行符替换为逗号 return char1 + "" + char2
return match.group(0) # 其他情况保持不变 return match.group(0)
text = regex.sub(r"(.)\n(.)", replace_han_newline, text) # 应用上述替换规则 text = regex.sub(r"(.)\n(.)", replace_han_newline, text)
len_text = len(text) # 使用保护书名号后的文本长度进行后续判断 len_text = len(text)
# 特殊情况处理:如果原始文本(保护后)本身就是一个书名号占位符, # 检查文本是否仅由占位符组成
# 后续逻辑可能会将其作为单个元素处理,这里先标记,确保它能被正确恢复。 is_only_placeholder = False
# (此处的 'pass' 意味着具体处理逻辑在后续的 restructured section 中统一进行) if local_book_title_mapping and text in local_book_title_mapping: # text is a book title placeholder
if local_book_title_mapping and text in local_book_title_mapping: # 注意:这里应该是 text in local_book_title_mapping.keys() is_only_placeholder = True
pass if not is_only_placeholder and local_ellipsis_mapping and text in local_ellipsis_mapping: # text is an ellipsis placeholder
is_only_placeholder = True
# 对于非常短且不含书名号的文本的提前返回逻辑。 if len_text < 3 and not local_book_title_mapping and not local_ellipsis_mapping:
if len_text < 3 and not local_book_title_mapping:
stripped_text = text.strip() stripped_text = text.strip()
if not stripped_text: # 如果剥离空格后为空,返回空列表 if not stripped_text:
return [] return []
# 如果剥离后只有一个字符且该字符是分隔符,也视为空(或无效)输入
if len(stripped_text) == 1 and stripped_text in SEPARATORS: if len(stripped_text) == 1 and stripped_text in SEPARATORS:
return [] return []
# 对于极短文本,不应用随机标点移除,直接返回其剥离空格后的内容
return [stripped_text] return [stripped_text]
# 步骤3: 基于分隔符将文本切分为初步的段落(segments) # 步骤3: 基于分隔符将文本切分为初步的段落(segments)
# segments 列表中的每个元素是一个元组 (content, separator_char)
segments = [] segments = []
current_segment = "" # 当前正在构建的段落内容 current_segment = ""
i = 0 i = 0
while i < len(text): while i < len(text):
char = text[i] # 当前字符 char = text[i]
if char in SEPARATORS: # 如果当前字符是分隔符 if char in SEPARATORS:
can_split_current_char = True # 默认情况下,当前分隔符可以用于分割 can_split_current_char = True
# 特殊分隔符处理逻辑 if char == ".":
if char == ".": # 处理点号 '.' can_split_this_dot = True
can_split_this_dot = True # 默认点号可以分割
# 规则1: 小数点 (数字.数字) - 不分割
if 0 < i < len_text - 1 and is_digit(text[i - 1]) and is_digit(text[i + 1]): if 0 < i < len_text - 1 and is_digit(text[i - 1]) and is_digit(text[i + 1]):
can_split_this_dot = False can_split_this_dot = False
# 规则2: 西文缩写/域名内部的点 (非汉字字母.非汉字字母) - 不分割
elif 0 < i < len_text - 1 and is_letter_not_han(text[i - 1]) and is_letter_not_han(text[i + 1]): elif 0 < i < len_text - 1 and is_letter_not_han(text[i - 1]) and is_letter_not_han(text[i + 1]):
can_split_this_dot = False can_split_this_dot = False
# 规则3: 已知缩写词的末尾点 (例如 "e.g. ", "U.S.A. ") - 不分割
else: else:
potential_abbreviation_word = current_segment + char # 构造包含当前点号的潜在词语 potential_abbreviation_word = current_segment + char
# 检查是否是已知缩写词,并且其后是空格或文本末尾
is_followed_by_space = i + 1 < len_text and text[i + 1] == " " is_followed_by_space = i + 1 < len_text and text[i + 1] == " "
is_at_end_of_text = i + 1 == len_text is_at_end_of_text = i + 1 == len_text
if potential_abbreviation_word in KNOWN_ABBREVIATIONS_ENDING_WITH_DOT and \ if potential_abbreviation_word in KNOWN_ABBREVIATIONS_ENDING_WITH_DOT and \
(is_followed_by_space or is_at_end_of_text): (is_followed_by_space or is_at_end_of_text):
can_split_this_dot = False can_split_this_dot = False
can_split_current_char = can_split_this_dot can_split_current_char = can_split_this_dot
elif char == " " or char == "\xa0": # 处理空格或NBSP (非断行空格) elif char == " " or char == "\xa0":
# 规则:非中文单词内部的空格不分割 (例如 "hello world", "слово1 слово2") if 0 < i < len_text - 1:
if 0 < i < len_text - 1: # 确保空格前后都有字符
prev_char = text[i - 1] prev_char = text[i - 1]
next_char = text[i + 1] next_char = text[i + 1]
# 如果空格前后都是“相关词语字符”(非汉字字母或数字),则不分割
if is_relevant_word_char(prev_char) and is_relevant_word_char(next_char): if is_relevant_word_char(prev_char) and is_relevant_word_char(next_char):
can_split_current_char = False can_split_current_char = False
# 特殊分隔符处理逻辑结束
if can_split_current_char: # 如果决定在此处分割 if can_split_current_char:
if current_segment: # 如果当前段落有内容则将其与分隔符一起存入segments if current_segment:
segments.append((current_segment, char)) segments.append((current_segment, char))
# 如果当前段落为空,但分隔符不是简单的排版空格 (除非是换行符这种有意义的空行分隔)
# 这用于处理连续分隔符或以分隔符开头的情况
elif char not in [" ", "\xa0"] or char == "\n": elif char not in [" ", "\xa0"] or char == "\n":
segments.append(("", char)) # 添加空内容和该分隔符 segments.append(("", char))
current_segment = "" # 重置当前段落内容 current_segment = ""
else: # 如果不分割,则将当前分隔符加入到当前段落内容中 else:
current_segment += char current_segment += char
else: # 如果当前字符不是分隔符,则加入当前段落内容 else:
current_segment += char current_segment += char
i += 1 i += 1
if current_segment: # 处理文本末尾剩余的段落内容(它没有后续分隔符) if current_segment:
segments.append((current_segment, "")) # 使用空字符串作为其分隔符标记 segments.append((current_segment, ""))
# 步骤3.1: 过滤segments列表 # 步骤3.1: 过滤segments列表
# 移除仅由空格组成的segment内容但保留其后的有效分隔符如换行符
filtered_segments = [] filtered_segments = []
for content, sep in segments: for content, sep in segments:
stripped_content = content.strip() # 移除内容两端的空白 stripped_content = content.strip()
if stripped_content: # 如果剥离后仍有内容,则保留 if stripped_content:
filtered_segments.append((stripped_content, sep)) filtered_segments.append((stripped_content, sep))
# 如果内容为空,但分隔符本身有意义(不是普通空格,或者是换行符)
elif sep and (sep not in [" ", "\xa0"] or sep == "\n"): elif sep and (sep not in [" ", "\xa0"] or sep == "\n"):
filtered_segments.append(("", sep)) # 保留空内容和该有意义的分隔符 filtered_segments.append(("", sep))
segments = filtered_segments # 更新segments为过滤后的列表 segments = filtered_segments
# 步骤4: 构建初步的句子列表 (preliminary_final_sentences) # 步骤4: 构建初步的句子列表 (preliminary_final_sentences)
# 此阶段基于segments中的内容和分隔符类型尝试组装成句子。
# 关键逻辑:识别强终止符,并特别处理汉字间的空格作为分割点。
preliminary_final_sentences = [] preliminary_final_sentences = []
current_sentence_build = "" # 当前正在构建的句子 current_sentence_build = ""
num_segments = len(segments) num_segments = len(segments)
for k, (content, sep) in enumerate(segments): # 遍历每个 (内容, 分隔符) 对 for k, (content, sep) in enumerate(segments):
current_sentence_build += content # 首先将段落内容加入当前句子构建 current_sentence_build += content
# 判断分隔符的类型 is_strong_terminator = sep in {"", ".", "", "", "\n", ""}
is_strong_terminator = sep in {"", ".", "", "", "\n", ""} # 是否为强句子终止符 is_space_separator = sep in [" ", "\xa0"]
is_space_separator = sep in [" ", "\xa0"] # 是否为空格类分隔符
append_sep_to_current = is_strong_terminator # 默认只有强终止符会附加到句子末尾 append_sep_to_current = is_strong_terminator
should_split_now = False # 标记是否应在当前分隔符处立即分割句子 should_split_now = False
if is_strong_terminator: # 如果是强终止符,则应立即分割 if is_strong_terminator:
should_split_now = True should_split_now = True
elif is_space_separator: # 如果分隔符是空格 elif is_space_separator:
# 检查是否为“汉字-空格-汉字”模式,若是,则也应分割 if current_sentence_build:
if current_sentence_build: # 确保当前构建的句子有内容 last_char_of_build_stripped = current_sentence_build.strip()
last_char_of_build_stripped = current_sentence_build.strip() # 获取去除尾部空格的句子内容
# 检查当前句子末尾字符是否为汉字
if last_char_of_build_stripped and is_han_character(last_char_of_build_stripped[-1]): if last_char_of_build_stripped and is_han_character(last_char_of_build_stripped[-1]):
# 检查下一个segment (如果存在) 的内容的第一个字符是否是汉字
if k + 1 < num_segments: if k + 1 < num_segments:
next_content_tuple = segments[k+1] next_content_tuple = segments[k+1]
if next_content_tuple: # 确保元组存在 if next_content_tuple:
next_content = next_content_tuple[0] # 获取下一个段落的内容 next_content = next_content_tuple[0]
if next_content and is_han_character(next_content[0]): if next_content and is_han_character(next_content[0]):
should_split_now = True # 满足汉字-空格-汉字,应分割 should_split_now = True
append_sep_to_current = False # 此时,该空格作为分割符,不应附加到句子末尾 append_sep_to_current = False
if not should_split_now: # 如果不是因汉字间空格而分割(即普通空格连接) if not should_split_now:
# 避免在句子开头或已存在尾部空格时重复添加空格
if current_sentence_build and not current_sentence_build.endswith(" ") and not current_sentence_build.endswith("\xa0"): if current_sentence_build and not current_sentence_build.endswith(" ") and not current_sentence_build.endswith("\xa0"):
current_sentence_build += " " # 将此空格作为连接符加入(统一用普通空格) current_sentence_build += " "
append_sep_to_current = False # 该空格已作为连接符处理,不作为独立分隔符附加 append_sep_to_current = False
if should_split_now: # 如果决定在当前位置分割句子 if should_split_now:
if append_sep_to_current and sep: # 如果需要附加分隔符(通常是强终止符) if append_sep_to_current and sep:
current_sentence_build += sep current_sentence_build += sep
stripped_sentence = current_sentence_build.strip() # 清理句子两端空格 stripped_sentence = current_sentence_build.strip()
if stripped_sentence: # 确保句子不为空 if stripped_sentence:
preliminary_final_sentences.append(stripped_sentence) preliminary_final_sentences.append(stripped_sentence)
current_sentence_build = "" # 重置句子构建器 current_sentence_build = ""
elif sep and not is_space_separator: # 如果是其他弱分隔符 (如 ',', ';') elif sep and not is_space_separator:
current_sentence_build += sep # 将其加入当前句子 current_sentence_build += sep
# 如果这是最后一个segment且当前构建的句子有内容则也视为一个完整句子
if k == num_segments - 1 and current_sentence_build.strip(): if k == num_segments - 1 and current_sentence_build.strip():
preliminary_final_sentences.append(current_sentence_build.strip()) preliminary_final_sentences.append(current_sentence_build.strip())
current_sentence_build = "" current_sentence_build = ""
# 如果 sep 是空字符串 (通常是最后一个 segment 的情况),则 current_sentence_build 已有内容,
# 等待循环结束后的统一处理。
if current_sentence_build.strip(): # 处理循环结束后剩余的正在构建的句子 if current_sentence_build.strip():
preliminary_final_sentences.append(current_sentence_build.strip()) preliminary_final_sentences.append(current_sentence_build.strip())
# 再次清理,确保列表中的句子都是非空字符串
preliminary_final_sentences = [s for s in preliminary_final_sentences if s.strip()] preliminary_final_sentences = [s for s in preliminary_final_sentences if s.strip()]
# --- RESTRUCTURED SECTION FOR MERGING, PUNCTUATION REMOVAL, AND BOOK TITLE RECOVERY --- intermediate_sentences_placeholders = []
# 此部分统一处理句子的合并、随机标点移除和最终的书名号恢复。
intermediate_sentences_placeholders = [] # 存储待恢复书名号的中间句子列表
if not preliminary_final_sentences: if not preliminary_final_sentences:
# 情况1: 初步句子列表为空 # 如果初步句子列表为空,检查原始文本(保护后)是否是单个占位符
# 这可能发生在原始文本非常短、仅包含分隔符,或者仅包含一个书名号(已被替换为占位符) if is_only_placeholder: # is_only_placeholder was set earlier
# 如果原文是单个书名号占位符,则应保留它以供恢复。 intermediate_sentences_placeholders = [text] # text is the placeholder itself
# 注意: text 是经过 protect_book_titles 处理后的文本。
# local_book_title_mapping.keys() 包含所有占位符。
if local_book_title_mapping and text in local_book_title_mapping.keys():
intermediate_sentences_placeholders = [text] # 此时 text 就是那个占位符
# else (其他导致 preliminary_final_sentences 为空的情况)intermediate_sentences_placeholders 保持为空列表
elif len(preliminary_final_sentences) == 1: elif len(preliminary_final_sentences) == 1:
# 情况2: 初步句子列表只有一个句子 s = preliminary_final_sentences[0].strip()
s = preliminary_final_sentences[0].strip() # 获取该句子并去除两端空格
if s: if s:
s = random_remove_punctuation(s) # 对该句子进行随机标点移除 s = random_remove_punctuation(s)
intermediate_sentences_placeholders = [s] if s else [] # 如果处理后仍有内容,则加入列表 intermediate_sentences_placeholders = [s] if s else []
else: # 情况3: 初步句子列表有多个句子,需要进行可能的随机合并 else:
final_sentences_merged = [] # 存储合并后的句子 final_sentences_merged = []
# 根据原始文本长度(未保护书名号前)决定合并强度
original_len_for_strength = len(original_text) original_len_for_strength = len(original_text)
split_strength = 0.5 # 默认分割强度 split_strength = 0.5
if original_len_for_strength < 12: if original_len_for_strength < 12:
split_strength = 0.5 split_strength = 0.5
elif original_len_for_strength < 32: elif original_len_for_strength < 32:
split_strength = 0.7 split_strength = 0.7
else: else:
split_strength = 0.9 split_strength = 0.9
actual_merge_probability = 1.0 - split_strength # 合并概率与分割强度互补 actual_merge_probability = 1.0 - split_strength
temp_sentence = "" # 临时存储正在合并的句子 temp_sentence = ""
if preliminary_final_sentences: # 确保有句子可以合并 if preliminary_final_sentences:
temp_sentence = preliminary_final_sentences[0] # 从第一个句子开始 temp_sentence = preliminary_final_sentences[0]
for i_merge in range(1, len(preliminary_final_sentences)): # 遍历后续句子 for i_merge in range(1, len(preliminary_final_sentences)):
current_sentence_to_merge = preliminary_final_sentences[i_merge] current_sentence_to_merge = preliminary_final_sentences[i_merge]
should_merge_based_on_punctuation = True # 默认可以合并 should_merge_based_on_punctuation = True
# 如果前一个句子以强终止符结尾,则不应与后一个句子合并
if temp_sentence and \ if temp_sentence and \
(temp_sentence.endswith("") or temp_sentence.endswith(".") or \ (temp_sentence.endswith("") or temp_sentence.endswith(".") or \
temp_sentence.endswith("!") or temp_sentence.endswith("?") or \ temp_sentence.endswith("!") or temp_sentence.endswith("?") or \
temp_sentence.endswith("")): temp_sentence.endswith("")):
should_merge_based_on_punctuation = False should_merge_based_on_punctuation = False
# 根据合并概率和标点规则决定是否合并
if random.random() < actual_merge_probability and temp_sentence and should_merge_based_on_punctuation: if random.random() < actual_merge_probability and temp_sentence and should_merge_based_on_punctuation:
# 合并时,如果需要,在两句子间添加空格
if not temp_sentence.endswith(" ") and not current_sentence_to_merge.startswith(" "): if not temp_sentence.endswith(" ") and not current_sentence_to_merge.startswith(" "):
temp_sentence += " " temp_sentence += " "
temp_sentence += current_sentence_to_merge temp_sentence += current_sentence_to_merge
else: # 不合并,则将已构建的 temp_sentence 加入列表,并开始新的 temp_sentence else:
if temp_sentence: if temp_sentence:
final_sentences_merged.append(temp_sentence) final_sentences_merged.append(temp_sentence)
temp_sentence = current_sentence_to_merge temp_sentence = current_sentence_to_merge
if temp_sentence: # 将最后一个构建的(或未合并的)句子加入列表 if temp_sentence:
final_sentences_merged.append(temp_sentence) final_sentences_merged.append(temp_sentence)
# 对合并后的每个句子进行清理和随机标点移除
processed_temp = [] processed_temp = []
for sentence_val in final_sentences_merged: for sentence_val in final_sentences_merged:
s_loop = sentence_val.strip() s_loop = sentence_val.strip()
# 移除句末可能存在的逗号
if s_loop.endswith(",") or s_loop.endswith(""): if s_loop.endswith(",") or s_loop.endswith(""):
s_loop = s_loop[:-1].strip() s_loop = s_loop[:-1].strip()
if s_loop: # 确保句子不为空 if s_loop:
s_loop = random_remove_punctuation(s_loop) # 随机标点移除 s_loop = random_remove_punctuation(s_loop)
if s_loop: # 再次确保句子不为空 if s_loop:
processed_temp.append(s_loop) processed_temp.append(s_loop)
intermediate_sentences_placeholders = processed_temp intermediate_sentences_placeholders = processed_temp
# 统一的书名号恢复步骤 # 1. 恢复书名号
final_sentences_recovered = [] sentences_after_book_title_recovery = []
if perform_book_title_recovery_here and local_book_title_mapping: if perform_book_title_recovery_here and local_book_title_mapping:
# 如果有书名号映射且需要恢复,则调用恢复函数 sentences_after_book_title_recovery = recover_book_titles(intermediate_sentences_placeholders, local_book_title_mapping)
final_sentences_recovered = recover_book_titles(intermediate_sentences_placeholders, local_book_title_mapping) else:
else: # 否则,直接使用中间结果 sentences_after_book_title_recovery = intermediate_sentences_placeholders
final_sentences_recovered = intermediate_sentences_placeholders
# 2. 恢复省略号 (在书名号恢复之后)
final_sentences_recovered = []
if local_ellipsis_mapping: # 检查是否有省略号需要恢复
final_sentences_recovered = recover_ellipsis(sentences_after_book_title_recovery, local_ellipsis_mapping)
else:
final_sentences_recovered = sentences_after_book_title_recovery
# 返回最终结果,并再次过滤空字符串
return [s for s in final_sentences_recovered if s.strip()] return [s for s in final_sentences_recovered if s.strip()]