From b612de9764f267373b5c0de50f75f82027953001 Mon Sep 17 00:00:00 2001
From: Bakadax <bakadax@qq.com>
Date: Thu, 15 May 2025 14:43:17 +0800
Subject: [PATCH] utils: split sentences on full-width !/?, strip punctuation per sentence, disable random comma removal
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/chat/utils/utils.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/chat/utils/utils.py b/src/chat/utils/utils.py
index cfe45e56..50d69d40 100644
--- a/src/chat/utils/utils.py
+++ b/src/chat/utils/utils.py
@@ -255,7 +255,7 @@ def split_into_sentences_w_remove_punctuation(text: str) -> list[str]:
         else:
             return [text]
 
-    separators = {"。", "，", ",", " ", ";", "\xa0", "\n", ".", "—"} # 保持原有分隔符集合
+    separators = {"。", "，", ",", " ", ";", "\xa0", "\n", ".", "—", "！", "？"}  # Sentence separators: the original set plus full-width "！" and "？"
     # logger.debug(f"DEBUG: 使用的分隔符集合: {separators}")
     segments = []
     current_segment = ""
@@ -365,6 +365,7 @@ def split_into_sentences_w_remove_punctuation(text: str) -> list[str]:
         if s.endswith(',') or s.endswith('，'):
             s = s[:-1].strip()
         if s:
+            s = random_remove_punctuation(s)
             processed_sentences_after_merge.append(s)
 
     return processed_sentences_after_merge
@@ -386,13 +387,13 @@ def random_remove_punctuation(text: str) -> str:
         if char == "。" and i == text_len - 1:  # 结尾的句号
             if random.random() > 0.1:  # 90%概率删除结尾句号
                 continue
-        elif char == "，":
-            rand = random.random()
-            if rand < 0.25:  # 5%概率删除逗号
-                continue
-            elif rand < 0.25:  # 20%概率把逗号变成空格
-                result += " "
-                continue
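+        # elif char == "，":
+        #     rand = random.random()
+        #     if rand < 0.25:  # 25% chance: drop the comma
+        #         continue
+        #     elif rand < 0.2:  # NOTE: unreachable as written (0.2 < 0.25); a 20% comma-to-space chance would need `rand < 0.45`
+        #         result += " "
+        #         continue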
         result += char
     return result