fix. 书名号回复

2025-05-17 03:18:02 +08:00 · 2025-05-17 03:18:02 +08:00 · 2080c02f54
parent e7206e4da6
commit 2080c02f54
1 changed files with 339 additions and 229 deletions
--- a/src/chat/utils/utils.py
+++ b/src/chat/utils/utils.py
@@ -19,141 +19,174 @@ from ...config.config import global_config

 logger = get_module_logger("chat_utils")

-# 预编译正则表达式以提高性能
-_L_REGEX = regex.compile(r"\p{L}")  # 匹配任何Unicode字母
-_HAN_CHAR_REGEX = regex.compile(r"\p{Han}")  # 匹配汉字 (Unicode属性)
-_Nd_REGEX = regex.compile(r"\p{Nd}")  # 新增：匹配Unicode数字 (Nd = Number, decimal digit)
+# --- 全局常量和预编译正则表达式 ---
+# \p{L} 匹配任何语言中的任何种类的字母字符。
+_L_REGEX = regex.compile(r"\p{L}")
+# \p{Han} 匹配汉字。
+_HAN_CHAR_REGEX = regex.compile(r"\p{Han}")
+# \p{Nd} 匹配十进制数字字符。
+_Nd_REGEX = regex.compile(r"\p{Nd}")

+# 书名号占位符的前缀，用于在处理文本时临时替换书名号。
 BOOK_TITLE_PLACEHOLDER_PREFIX = "__BOOKTITLE_"
+# 定义句子分隔符集合。
 SEPARATORS = {"。", "，", ",", " ", ";", "\xa0", "\n", ".", "—", "！", "？"}
+# 已知的以点号结尾的英文缩写词，用于避免错误地将缩写词中的点号作为句子结束符。
 KNOWN_ABBREVIATIONS_ENDING_WITH_DOT = {
-    "Mr.",
-    "Mrs.",
-    "Ms.",
-    "Dr.",
-    "Prof.",
-    "St.",
-    "Messrs.",
-    "Mmes.",
-    "Capt.",
-    "Gov.",
-    "Inc.",
-    "Ltd.",
-    "Corp.",
-    "Co.",
-    "PLC",  # PLC通常不带点，但有些可能
-    "vs.",
-    "etc.",
-    "i.e.",
-    "e.g.",
-    "viz.",
-    "al.",
-    "et al.",
-    "ca.",
-    "cf.",
-    "No.",
-    "Vol.",
-    "pp.",
-    "fig.",
-    "figs.",
-    "ed.",
-    "Ph.D.",
-    "M.D.",
-    "B.A.",
-    "M.A.",
-    "Jan.",
-    "Feb.",
-    "Mar.",
-    "Apr.",
-    "Jun.",
-    "Jul.",
-    "Aug.",
-    "Sep.",
-    "Oct.",
-    "Nov.",
-    "Dec.",  # May. 通常不用点
-    "Mon.",
-    "Tue.",
-    "Wed.",
-    "Thu.",
-    "Fri.",
-    "Sat.",
-    "Sun.",
-    "U.S.",
-    "U.K.",
-    "E.U.",
-    "U.S.A.",
-    "U.S.S.R.",
-    "Ave.",
-    "Blvd.",
-    "Rd.",
-    "Ln.",  # Street suffixes
-    "approx.",
-    "dept.",
-    "appt.",
-    "श्री.",  # Hindi Shri.
+    "Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "St.", "Messrs.", "Mmes.", "Capt.", "Gov.",
+    "Inc.", "Ltd.", "Corp.", "Co.", "PLC", "vs.", "etc.", "i.e.", "e.g.", "viz.",
+    "al.", "et al.", "ca.", "cf.", "No.", "Vol.", "pp.", "fig.", "figs.", "ed.",
+    "Ph.D.", "M.D.", "B.A.", "M.A.", "Jan.", "Feb.", "Mar.", "Apr.", "Jun.", "Jul.",
+    "Aug.", "Sep.", "Oct.", "Nov.", "Dec.", "Mon.", "Tue.", "Wed.", "Thu.", "Fri.",
+    "Sat.", "Sun.", "U.S.", "U.K.", "E.U.", "U.S.A.", "U.S.S.R.", "Ave.", "Blvd.",
+    "Rd.", "Ln.", "approx.", "dept.", "appt.", "श्री.", # 印地语中的 Shri.
 }

+# --- 辅助函数 ---

 def is_letter_not_han(char_str: str) -> bool:
    """
-    检查字符是否为“字母”且“非汉字”。
+    检查单个字符是否为“字母”且“非汉字”。
    例如拉丁字母、西里尔字母、韩文等返回True。
    汉字、数字、标点、空格等返回False。
+
+    Args:
+        char_str:待检查的单个字符。
+
+    Returns:
+        bool: 如果字符是字母且非汉字则为True，否则为False。
    """
    if not isinstance(char_str, str) or len(char_str) != 1:
-        return False
-
+        return False # 输入必须是单个字符的字符串
    is_letter = _L_REGEX.fullmatch(char_str) is not None
    if not is_letter:
-        return False
-
-    # 使用 \p{Han} 属性进行汉字判断，更为准确
+        return False # 如果不是字母，直接返回False
+    # 是字母，则进一步判断是否为汉字
    is_han = _HAN_CHAR_REGEX.fullmatch(char_str) is not None
-    return not is_han
+    return not is_han # 是字母且不是汉字


 def is_han_character(char_str: str) -> bool:
-    """检查字符是否为汉字 (使用 \p{Han} Unicode 属性)"""
+    """
+    检查单个字符是否为汉字 (使用 Unicode \p{Han} 属性)。
+
+    Args:
+        char_str: 待检查的单个字符。
+
+    Returns:
+        bool: 如果字符是汉字则为True，否则为False。
+    """
    if not isinstance(char_str, str) or len(char_str) != 1:
        return False
    return _HAN_CHAR_REGEX.fullmatch(char_str) is not None


 def is_digit(char_str: str) -> bool:
-    """检查字符是否为Unicode数字"""
+    """
+    检查单个字符是否为Unicode数字 (十进制数字)。
+
+    Args:
+        char_str: 待检查的单个字符。
+
+    Returns:
+        bool: 如果字符是Unicode数字则为True，否则为False。
+    """
    if not isinstance(char_str, str) or len(char_str) != 1:
        return False
    return _Nd_REGEX.fullmatch(char_str) is not None


-def is_relevant_word_char(char_str: str) -> bool:  # 新增辅助函数
+def is_relevant_word_char(char_str: str) -> bool:
    """
-    检查字符是否为“相关词语字符”（非汉字字母 或 数字）。
-    用于判断在非中文语境下，空格两侧是否应被视为一个词内部的部分。
+    检查字符是否为“相关词语字符”（即非汉字字母或数字）。
+    此函数用于判断在非中文语境下，空格两侧的字符是否应被视为构成一个连续词语的部分，
+    从而决定该空格是否作为分割点。
    例如拉丁字母、西里尔字母、数字等返回True。
    汉字、标点、纯空格等返回False。
+
+    Args:
+        char_str: 待检查的单个字符。
+
+    Returns:
+        bool: 如果字符是非汉字字母或数字则为True，否则为False。
    """
    if not isinstance(char_str, str) or len(char_str) != 1:
        return False
-
    # 检查是否为Unicode字母
    if _L_REGEX.fullmatch(char_str):
        # 如果是字母，则检查是否非汉字
        return not _HAN_CHAR_REGEX.fullmatch(char_str)
-
    # 检查是否为Unicode数字
    if _Nd_REGEX.fullmatch(char_str):
        return True  # 数字本身被视为相关词语字符
-
    return False


 def is_english_letter(char: str) -> bool:
-    """检查字符是否为英文字母（忽略大小写）"""
+    """
+    检查单个字符是否为英文字母（忽略大小写）。
+
+    Args:
+        char: 待检查的单个字符。
+
+    Returns:
+        bool: 如果字符是英文字母则为True，否则为False。
+    """
    return "a" <= char.lower() <= "z"


+def protect_book_titles(text: str) -> tuple[str, dict[str, str]]:
+    """
+    保护文本中的书名号内容，将其替换为唯一的占位符。
+    返回保护后的文本和占位符到原始内容的映射。
+
+    Args:
+        text: 原始输入文本。
+
+    Returns:
+        tuple[str, dict[str, str]]: 一个元组，包含：
+            - protected_text (str): 书名号被占位符替换后的文本。
+            - book_title_mapping (dict): 占位符到原始书名号内容（含书名号本身）的映射。
+    """
+    book_title_mapping = {}
+    # 正则表达式匹配《内容》形式的书名号，使用非贪婪匹配 (.*?) 以正确处理。
+    book_title_pattern = re.compile(r"《(.*?)》")
+
+    def replace_func(match):
+        # 为每个匹配到的书名号生成一个唯一的占位符。
+        placeholder = f"{BOOK_TITLE_PLACEHOLDER_PREFIX}{len(book_title_mapping)}__"
+        # 存储占位符和原始书名号（包括《》）的映射关系。
+        book_title_mapping[placeholder] = match.group(0)
+        return placeholder
+
+    protected_text = book_title_pattern.sub(replace_func, text)
+    return protected_text, book_title_mapping
+
+def recover_book_titles(sentences: list[str], book_title_mapping: dict[str, str]) -> list[str]:
+    """
+    将句子列表中的书名号占位符恢复为原始的书名号内容。
+
+    Args:
+        sentences: 包含可能书名号占位符的句子列表。
+        book_title_mapping: 占位符到原始书名号内容的映射。
+
+    Returns:
+        list[str]: 书名号占位符被恢复后的句子列表。
+    """
+    recovered_sentences = []
+    if not sentences: # 如果输入句子列表为空，直接返回空列表
+        return []
+    for sentence in sentences:
+        if not isinstance(sentence, str): # 添加类型检查，确保每个元素都是字符串
+            recovered_sentences.append(sentence) # 如果不是字符串，直接添加（或选择跳过/记录错误）
+            continue
+        # 遍历映射，将句子中的每个占位符替换回其原始书名号内容。
+        for placeholder, original_content in book_title_mapping.items():
+            sentence = sentence.replace(placeholder, original_content)
+        recovered_sentences.append(sentence)
+    return recovered_sentences
+
+
 def db_message_to_str(message_dict: dict) -> str:
    logger.debug(f"message_dict: {message_dict}")
    time_str = time.strftime("%m-%d %H:%M:%S", time.localtime(message_dict["time"]))
@@ -305,180 +338,279 @@ def get_recent_group_speaker(chat_stream_id: int, sender, limit: int = 12) -> li


 def split_into_sentences_w_remove_punctuation(original_text: str) -> list[str]:
-    """将文本分割成句子，并根据概率合并"""
-    # print(f"DEBUG: 输入文本 (repr): {repr(text)}")
+    """
+    将输入文本分割成句子列表。
+    此过程包括：
+    1. 保护书名号。
+    2. 文本预处理（如处理换行符）。
+    3. 基于分隔符将文本切分为初步的段落(segments)。
+    4. 根据段落内容和分隔符类型，构建初步的句子列表(preliminary_final_sentences)，
+       特别处理汉字间的空格作为分割点。
+    5. 对初步句子列表进行可能的合并（基于随机概率和文本长度）。
+    6. 对合并后的句子进行随机标点移除。
+    7. 恢复书名号。
+    8. 返回最终处理过的句子列表。
+
+    Args:
+        original_text: 原始输入文本。
+
+    Returns:
+        list[str]: 分割和处理后的句子列表。
+    """
+    # 步骤1: 保护书名号，将其替换为占位符，并获取映射关系。
    text, local_book_title_mapping = protect_book_titles(original_text)
-    perform_book_title_recovery_here = True
-    # 预处理
-    text = regex.sub(r"\n\s*\n+", "\n", text)  # 合并多个换行符
-    text = regex.sub(r"\n\s*([—。.,，;\s\xa0！？])", r"\1", text)
-    text = regex.sub(r"([—。.,，;\s\xa0！？])\s*\n", r"\1", text)
+    perform_book_title_recovery_here = True # 控制是否在本函数末尾执行恢复，主要用于调试
+
+    # 步骤2: 文本预处理
+    text = regex.sub(r"\n\s*\n+", "\n", text)  # 合并多个连续的换行符（及其间的空格）为一个换行符。
+    text = regex.sub(r"\n\s*([—。.,，;\s\xa0！？])", r"\1", text) # 移除分隔符前的换行符和空格。
+    text = regex.sub(r"([—。.,，;\s\xa0！？])\s*\n", r"\1", text) # 移除分隔符后的换行符和空格。

    def replace_han_newline(match):
+        """辅助函数，用于将汉字之间的单个换行符替换为逗号。"""
        char1 = match.group(1)
        char2 = match.group(2)
        if is_han_character(char1) and is_han_character(char2):
            return char1 + "，" + char2  # 汉字间的换行符替换为逗号
-        return match.group(0)
+        return match.group(0) # 其他情况保持不变

-    text = regex.sub(r"(.)\n(.)", replace_han_newline, text)
+    text = regex.sub(r"(.)\n(.)", replace_han_newline, text) # 应用上述替换规则

-    len_text = len(text)
-    if len_text < 3:
+    len_text = len(text) # 使用保护书名号后的文本长度进行后续判断
+
+    # 特殊情况处理：如果原始文本（保护后）本身就是一个书名号占位符，
+    # 后续逻辑可能会将其作为单个元素处理，这里先标记，确保它能被正确恢复。
+    # (此处的 'pass' 意味着具体处理逻辑在后续的 restructured section 中统一进行)
+    if local_book_title_mapping and text in local_book_title_mapping: # 注意：这里应该是 text in local_book_title_mapping.keys()
+        pass
+
+
+    # 对于非常短且不含书名号的文本的提前返回逻辑。
+    if len_text < 3 and not local_book_title_mapping:
        stripped_text = text.strip()
-        if not stripped_text:
+        if not stripped_text: # 如果剥离空格后为空，返回空列表
            return []
+        # 如果剥离后只有一个字符且该字符是分隔符，也视为空（或无效）输入
        if len(stripped_text) == 1 and stripped_text in SEPARATORS:
            return []
+        # 对于极短文本，不应用随机标点移除，直接返回其剥离空格后的内容
        return [stripped_text]

+
+    # 步骤3: 基于分隔符将文本切分为初步的段落(segments)
+    # segments 列表中的每个元素是一个元组 (content, separator_char)
    segments = []
-    current_segment = ""
+    current_segment = "" # 当前正在构建的段落内容
    i = 0
    while i < len(text):
-        char = text[i]
-        if char in SEPARATORS:
-            can_split_current_char = True
+        char = text[i] # 当前字符
+        if char in SEPARATORS: # 如果当前字符是分隔符
+            can_split_current_char = True # 默认情况下，当前分隔符可以用于分割

-            if char == ".":
-                can_split_this_dot = True
-                # 规则1: 小数点 (数字.数字)
+            # 特殊分隔符处理逻辑
+            if char == ".": # 处理点号 '.'
+                can_split_this_dot = True # 默认点号可以分割
+                # 规则1: 小数点 (数字.数字) - 不分割
                if 0 < i < len_text - 1 and is_digit(text[i - 1]) and is_digit(text[i + 1]):
                    can_split_this_dot = False
-                # 规则2: 西文缩写/域名内部的点 (西文字母.西文字母)
+                # 规则2: 西文缩写/域名内部的点 (非汉字字母.非汉字字母) - 不分割
                elif 0 < i < len_text - 1 and is_letter_not_han(text[i - 1]) and is_letter_not_han(text[i + 1]):
                    can_split_this_dot = False
-                # 规则3: 已知缩写词的末尾点 (例如 "e.g. ", "U.S.A. ")
+                # 规则3: 已知缩写词的末尾点 (例如 "e.g. ", "U.S.A. ") - 不分割
                else:
-                    potential_abbreviation_word = current_segment + char
+                    potential_abbreviation_word = current_segment + char # 构造包含当前点号的潜在词语
+                    # 检查是否是已知缩写词，并且其后是空格或文本末尾
                    is_followed_by_space = i + 1 < len_text and text[i + 1] == " "
                    is_at_end_of_text = i + 1 == len_text
-
-                    if potential_abbreviation_word in KNOWN_ABBREVIATIONS_ENDING_WITH_DOT and (
-                        is_followed_by_space or is_at_end_of_text
-                    ):
+                    if potential_abbreviation_word in KNOWN_ABBREVIATIONS_ENDING_WITH_DOT and \
+                        (is_followed_by_space or is_at_end_of_text):
                        can_split_this_dot = False
                can_split_current_char = can_split_this_dot
-            elif char == " " or char == "\xa0":  # 处理空格/NBSP
-                if 0 < i < len_text - 1:
+            elif char == " " or char == "\xa0":  # 处理空格或NBSP (非断行空格)
+                # 规则：非中文单词内部的空格不分割 (例如 "hello world", "слово1 слово2")
+                if 0 < i < len_text - 1: # 确保空格前后都有字符
                    prev_char = text[i - 1]
                    next_char = text[i + 1]
-                    # 非中文单词内部的空格不分割 (例如 "hello world", "слово1 слово2")
+                    # 如果空格前后都是“相关词语字符”（非汉字字母或数字），则不分割
                    if is_relevant_word_char(prev_char) and is_relevant_word_char(next_char):
                        can_split_current_char = False
+            # 特殊分隔符处理逻辑结束

-            if can_split_current_char:
-                if current_segment:  # 如果当前段落有内容，则添加 (内容, 分隔符)
+            if can_split_current_char: # 如果决定在此处分割
+                if current_segment: # 如果当前段落有内容，则将其与分隔符一起存入segments
                    segments.append((current_segment, char))
                # 如果当前段落为空，但分隔符不是简单的排版空格 (除非是换行符这种有意义的空行分隔)
+                # 这用于处理连续分隔符或以分隔符开头的情况
                elif char not in [" ", "\xa0"] or char == "\n":
-                    segments.append(("", char))  # 添加 ("", 分隔符)
-                current_segment = ""  # 重置当前段落
-            else:
-                current_segment += char  # 不分割，将当前分隔符加入到当前段落
-        else:
-            current_segment += char  # 非分隔符，加入当前段落
+                    segments.append(("", char)) # 添加空内容和该分隔符
+                current_segment = ""  # 重置当前段落内容
+            else: # 如果不分割，则将当前分隔符加入到当前段落内容中
+                current_segment += char
+        else: # 如果当前字符不是分隔符，则加入当前段落内容
+            current_segment += char
        i += 1

-    if current_segment:  # 处理末尾剩余的段落
-        segments.append((current_segment, ""))
+    if current_segment:  # 处理文本末尾剩余的段落内容（它没有后续分隔符）
+        segments.append((current_segment, "")) # 使用空字符串作为其分隔符标记

-    # 过滤掉仅由空格组成的segment，但保留其后的有效分隔符
+    # 步骤3.1: 过滤segments列表
+    # 移除仅由空格组成的segment内容，但保留其后的有效分隔符（如换行符）。
    filtered_segments = []
    for content, sep in segments:
-        stripped_content = content.strip()
-        if stripped_content:
+        stripped_content = content.strip() # 移除内容两端的空白
+        if stripped_content: # 如果剥离后仍有内容，则保留
            filtered_segments.append((stripped_content, sep))
+        # 如果内容为空，但分隔符本身有意义（不是普通空格，或者是换行符）
        elif sep and (sep not in [" ", "\xa0"] or sep == "\n"):
-            filtered_segments.append(("", sep))
-    segments = filtered_segments
-
-    if not segments:
-        return [text.strip()] if text.strip() else []
+            filtered_segments.append(("", sep)) # 保留空内容和该有意义的分隔符
+    segments = filtered_segments # 更新segments为过滤后的列表

+    # 步骤4: 构建初步的句子列表 (preliminary_final_sentences)
+    # 此阶段基于segments中的内容和分隔符类型，尝试组装成句子。
+    # 关键逻辑：识别强终止符，并特别处理汉字间的空格作为分割点。
    preliminary_final_sentences = []
-    current_sentence_build = ""
-    for k, (content, sep) in enumerate(segments):
-        current_sentence_build += content  # 先添加内容部分
+    current_sentence_build = "" # 当前正在构建的句子
+    num_segments = len(segments)
+    for k, (content, sep) in enumerate(segments): # 遍历每个 (内容, 分隔符) 对
+        current_sentence_build += content  # 首先将段落内容加入当前句子构建

-        # 判断分隔符类型
-        is_strong_terminator = sep in {"。", ".", "！", "？", "\n", "—"}
-        is_space_separator = sep in [" ", "\xa0"]
+        # 判断分隔符的类型
+        is_strong_terminator = sep in {"。", ".", "！", "？", "\n", "—"} # 是否为强句子终止符
+        is_space_separator = sep in [" ", "\xa0"] # 是否为空格类分隔符

-        if is_strong_terminator:
-            current_sentence_build += sep  # 将强终止符加入
-            if current_sentence_build.strip():
-                preliminary_final_sentences.append(current_sentence_build.strip())
-            current_sentence_build = ""  # 开始新的句子构建
-        elif is_space_separator:
-            # 如果是空格，并且当前构建的句子不以空格结尾，则添加空格并继续构建
-            if not current_sentence_build.endswith(sep):
+        append_sep_to_current = is_strong_terminator # 默认只有强终止符会附加到句子末尾
+        should_split_now = False # 标记是否应在当前分隔符处立即分割句子
+
+        if is_strong_terminator: # 如果是强终止符，则应立即分割
+            should_split_now = True
+        elif is_space_separator:  # 如果分隔符是空格
+            # 检查是否为“汉字-空格-汉字”模式，若是，则也应分割
+            if current_sentence_build: # 确保当前构建的句子有内容
+                last_char_of_build_stripped = current_sentence_build.strip() # 获取去除尾部空格的句子内容
+                # 检查当前句子末尾字符是否为汉字
+                if last_char_of_build_stripped and is_han_character(last_char_of_build_stripped[-1]):
+                    # 检查下一个segment (如果存在) 的内容的第一个字符是否是汉字
+                    if k + 1 < num_segments:
+                        next_content_tuple = segments[k+1]
+                        if next_content_tuple: # 确保元组存在
+                            next_content = next_content_tuple[0] # 获取下一个段落的内容
+                            if next_content and is_han_character(next_content[0]):
+                                should_split_now = True # 满足汉字-空格-汉字，应分割
+                                append_sep_to_current = False  # 此时，该空格作为分割符，不应附加到句子末尾
+
+            if not should_split_now: # 如果不是因汉字间空格而分割（即普通空格连接）
+                # 避免在句子开头或已存在尾部空格时重复添加空格
+                if current_sentence_build and not current_sentence_build.endswith(" ") and not current_sentence_build.endswith("\xa0"):
+                    current_sentence_build += " "  # 将此空格作为连接符加入（统一用普通空格）
+                append_sep_to_current = False # 该空格已作为连接符处理，不作为独立分隔符附加
+
+        if should_split_now: # 如果决定在当前位置分割句子
+            if append_sep_to_current and sep: # 如果需要附加分隔符（通常是强终止符）
                current_sentence_build += sep
-        elif sep:  # 其他分隔符 (如 ',', ';')
-            current_sentence_build += sep  # 加入并继续构建，这些通常不独立成句
-            # 如果这些弱分隔符后紧跟的就是文本末尾，则它们可能结束一个句子
-            if k == len(segments) - 1 and current_sentence_build.strip():
+
+            stripped_sentence = current_sentence_build.strip() # 清理句子两端空格
+            if stripped_sentence: # 确保句子不为空
+                preliminary_final_sentences.append(stripped_sentence)
+            current_sentence_build = "" # 重置句子构建器
+        elif sep and not is_space_separator:  # 如果是其他弱分隔符 (如 ',', ';')
+            current_sentence_build += sep # 将其加入当前句子
+            # 如果这是最后一个segment，且当前构建的句子有内容，则也视为一个完整句子
+            if k == num_segments - 1 and current_sentence_build.strip():
                preliminary_final_sentences.append(current_sentence_build.strip())
                current_sentence_build = ""
+        # 如果 sep 是空字符串 (通常是最后一个 segment 的情况)，则 current_sentence_build 已有内容，
+        # 等待循环结束后的统一处理。

-    if current_sentence_build.strip():  # 处理最后一个构建中的句子
+    if current_sentence_build.strip(): # 处理循环结束后剩余的正在构建的句子
        preliminary_final_sentences.append(current_sentence_build.strip())

-    preliminary_final_sentences = [s for s in preliminary_final_sentences if s.strip()]  # 清理空字符串
-    # print(f"DEBUG: 初步分割（优化组装后）的句子: {preliminary_final_sentences}")
+    # 再次清理，确保列表中的句子都是非空字符串
+    preliminary_final_sentences = [s for s in preliminary_final_sentences if s.strip()]
+
+    # --- RESTRUCTURED SECTION FOR MERGING, PUNCTUATION REMOVAL, AND BOOK TITLE RECOVERY ---
+    # 此部分统一处理句子的合并、随机标点移除和最终的书名号恢复。
+    intermediate_sentences_placeholders = [] # 存储待恢复书名号的中间句子列表

    if not preliminary_final_sentences:
-        return []
+        # 情况1: 初步句子列表为空
+        # 这可能发生在原始文本非常短、仅包含分隔符，或者仅包含一个书名号（已被替换为占位符）
+        # 如果原文是单个书名号占位符，则应保留它以供恢复。
+        # 注意: text 是经过 protect_book_titles 处理后的文本。
+        # local_book_title_mapping.keys() 包含所有占位符。
+        if local_book_title_mapping and text in local_book_title_mapping.keys():
+            intermediate_sentences_placeholders = [text] # 此时 text 就是那个占位符
+        # else (其他导致 preliminary_final_sentences 为空的情况)，intermediate_sentences_placeholders 保持为空列表

-    if len_text < 12:
-        split_strength = 0.2
-    elif len_text < 32:
-        split_strength = 0.5
-    else:
-        split_strength = 0.7
-    merge_probability = 1.0 - split_strength
-
-    if merge_probability == 1.0 and len(preliminary_final_sentences) > 1:
-        merged_text = " ".join(preliminary_final_sentences).strip()
-        if merged_text.endswith(",") or merged_text.endswith("，"):
-            merged_text = merged_text[:-1].strip()
-        return [merged_text] if merged_text else []
    elif len(preliminary_final_sentences) == 1:
-        s = preliminary_final_sentences[0].strip()
-        if s.endswith(",") or s.endswith("，"):
-            s = s[:-1].strip()
-        return [s] if s else []
-
-    final_sentences_merged = []
-    temp_sentence = ""
-    if preliminary_final_sentences:
-        temp_sentence = preliminary_final_sentences[0]
-        for i_merge in range(1, len(preliminary_final_sentences)):
-            should_merge_based_on_punctuation = True
-            if temp_sentence and temp_sentence[-1] in {"。", ".", "！", "？"}:
-                should_merge_based_on_punctuation = False
-
-            if random.random() < merge_probability and temp_sentence and should_merge_based_on_punctuation:
-                temp_sentence += " " + preliminary_final_sentences[i_merge]
-            else:
-                if temp_sentence:
-                    final_sentences_merged.append(temp_sentence)
-                temp_sentence = preliminary_final_sentences[i_merge]
-        if temp_sentence:
-            final_sentences_merged.append(temp_sentence)
-
-    processed_sentences_after_merge = []
-    for sentence in final_sentences_merged:
-        s = sentence.strip()
-        if s.endswith(",") or s.endswith("，"):
-            s = s[:-1].strip()
+        # 情况2: 初步句子列表只有一个句子
+        s = preliminary_final_sentences[0].strip() # 获取该句子并去除两端空格
        if s:
-            s = random_remove_punctuation(s)
-            processed_sentences_after_merge.append(s)
+            s = random_remove_punctuation(s) # 对该句子进行随机标点移除
+        intermediate_sentences_placeholders = [s] if s else [] # 如果处理后仍有内容，则加入列表

+    else: # 情况3: 初步句子列表有多个句子，需要进行可能的随机合并
+        final_sentences_merged = [] # 存储合并后的句子
+
+        # 根据原始文本长度（未保护书名号前）决定合并强度
+        original_len_for_strength = len(original_text)
+        split_strength = 0.5 # 默认分割强度
+        if original_len_for_strength < 12:
+            split_strength = 0.5
+        elif original_len_for_strength < 32:
+            split_strength = 0.7
+        else:
+            split_strength = 0.9
+        actual_merge_probability = 1.0 - split_strength # 合并概率与分割强度互补
+
+        temp_sentence = "" # 临时存储正在合并的句子
+        if preliminary_final_sentences: # 确保有句子可以合并
+            temp_sentence = preliminary_final_sentences[0] # 从第一个句子开始
+            for i_merge in range(1, len(preliminary_final_sentences)): # 遍历后续句子
+                current_sentence_to_merge = preliminary_final_sentences[i_merge]
+                should_merge_based_on_punctuation = True # 默认可以合并
+                # 如果前一个句子以强终止符结尾，则不应与后一个句子合并
+                if temp_sentence and \
+                    (temp_sentence.endswith("。") or temp_sentence.endswith(".") or \
+                    temp_sentence.endswith("!") or temp_sentence.endswith("?") or \
+                    temp_sentence.endswith("—")):
+                    should_merge_based_on_punctuation = False
+
+                # 根据合并概率和标点规则决定是否合并
+                if random.random() < actual_merge_probability and temp_sentence and should_merge_based_on_punctuation:
+                    # 合并时，如果需要，在两句子间添加空格
+                    if not temp_sentence.endswith(" ") and not current_sentence_to_merge.startswith(" "):
+                        temp_sentence += " " 
+                    temp_sentence += current_sentence_to_merge
+                else: # 不合并，则将已构建的 temp_sentence 加入列表，并开始新的 temp_sentence
+                    if temp_sentence:
+                        final_sentences_merged.append(temp_sentence)
+                    temp_sentence = current_sentence_to_merge
+            if temp_sentence: # 将最后一个构建的（或未合并的）句子加入列表
+                final_sentences_merged.append(temp_sentence)
+
+        # 对合并后的每个句子进行清理和随机标点移除
+        processed_temp = []
+        for sentence_val in final_sentences_merged:
+            s_loop = sentence_val.strip()
+            # 移除句末可能存在的逗号
+            if s_loop.endswith(",") or s_loop.endswith("，"):
+                s_loop = s_loop[:-1].strip()
+            if s_loop: # 确保句子不为空
+                s_loop = random_remove_punctuation(s_loop) # 随机标点移除
+            if s_loop: # 再次确保句子不为空
+                processed_temp.append(s_loop)
+        intermediate_sentences_placeholders = processed_temp
+
+    # 统一的书名号恢复步骤
+    final_sentences_recovered = []
    if perform_book_title_recovery_here and local_book_title_mapping:
-        processed_sentences_after_merge = recover_book_titles(processed_sentences_after_merge, local_book_title_mapping)
-    return processed_sentences_after_merge
+        # 如果有书名号映射且需要恢复，则调用恢复函数
+        final_sentences_recovered = recover_book_titles(intermediate_sentences_placeholders, local_book_title_mapping)
+    else: # 否则，直接使用中间结果
+        final_sentences_recovered = intermediate_sentences_placeholders
+
+    # 返回最终结果，并再次过滤空字符串
+    return [s for s in final_sentences_recovered if s.strip()]


 def random_remove_punctuation(text: str) -> str:
@@ -497,13 +629,13 @@ def random_remove_punctuation(text: str) -> str:
        if char == "。" and i == text_len - 1:  # 结尾的句号
            if random.random() > 0.1:  # 90%概率删除结尾句号
                continue
-        elif char == "，":
-            rand = random.random()
-            if rand < 0.25:  # 25%概率删除逗号
-                continue
-            elif rand < 0.2:  # 20%概率把逗号变成空格
-                result += " "
-                continue
+        # elif char == "，":
+        #     rand = random.random()
+        #     if rand < 0.25:  # 25%概率删除逗号
+        #         continue
+        #     elif rand < 0.2:  # 20%概率把逗号变成空格
+        #         result += " "
+        #         continue
        result += char
    return result

@@ -921,26 +1053,4 @@ def parse_text_timestamps(text: str, mode: str = "normal") -> str:
            pattern_instance = re.escape(match.group(0))
            result_text = re.sub(pattern_instance, readable_time, result_text, count=1)

-        return result_text
-
-def protect_book_titles(text):
-    book_title_mapping = {}
-    book_title_pattern = re.compile(r"《(.*?)》") # 非贪婪匹配
-
-    def replace_func(match):
-        # 生成唯一占位符
-        placeholder = f"{BOOK_TITLE_PLACEHOLDER_PREFIX}{len(book_title_mapping)}__"
-        # 存储映射关系
-        book_title_mapping[placeholder] = match.group(0) # 存储包含书名号的完整匹配
-        return placeholder
-
-    protected_text = book_title_pattern.sub(replace_func, text)
-    return protected_text, book_title_mapping
-
-def recover_book_titles(sentences, book_title_mapping):
-    recovered_sentences = []
-    for sentence in sentences:
-        for placeholder, original_content in book_title_mapping.items():
-            sentence = sentence.replace(placeholder, original_content)
-        recovered_sentences.append(sentence)
-    return recovered_sentences
+        return result_text