From 7c665d2d04ef30ad3f292d29d5fa924fd90d0542 Mon Sep 17 00:00:00 2001
From: Bakadax <bakadax@qq.com>
Date: Sun, 30 Mar 2025 19:57:44 +0800
Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E5=A5=BD=E7=9A=84=E7=A9=BA=E6=A0=BC?=
 =?UTF-8?q?=E5=88=86=E5=89=B2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/plugins/chat/utils.py | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/src/plugins/chat/utils.py b/src/plugins/chat/utils.py
index 163b5530..f2eee1a3 100644
--- a/src/plugins/chat/utils.py
+++ b/src/plugins/chat/utils.py
@@ -201,15 +201,22 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]:
     else:
         split_strength = 0.7
 
+    # 处理文本，分行区分西文和中文字符
+    new_text = []
+    for i, char in enumerate(text):
+        if char == ' ' and should_split(text, i):
+            new_text.append('|seg|')
+        else:
+            new_text.append(char)
+    text = ''.join(new_text)
+
     # 检查是否为西文字符段落
     if not is_western_paragraph(text):
         # 当语言为中文时，统一将英文逗号转换为中文逗号
         text = text.replace(",", "，")
-        text = text.replace("\n", " ")
-    else:
+        text = text.replace("\n", "|seg|")
         # 用"|seg|"作为分割符分开
-        text = re.sub(r"([.!?]) +", r"\1\|seg\|", text)
-        text = text.replace("\n", "\|seg\|")
+    text = text.replace("\n", "|seg|")
     text, mapping = protect_kaomoji(text)
     # print(f"处理前的文本: {text}")
 
@@ -240,7 +247,7 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]:
                 else:
                     current_sentence += "，" + part
             # 处理空格分割
-            space_parts = current_sentence.split(" ")
+            space_parts = current_sentence.split("|seg|")
             current_sentence = space_parts[0]
             for part in space_parts[1:]:
                 if random.random() < split_strength:
@@ -250,7 +257,7 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]:
                     current_sentence += " " + part
         else:
             # 处理分割符
-            space_parts = current_sentence.split("\|seg\|")
+            space_parts = current_sentence.split("|seg|")
             current_sentence = space_parts[0]
             for part in space_parts[1:]:
                 new_sentences.append(current_sentence.strip())
@@ -484,4 +491,9 @@ def is_western_char(char):
 def is_western_paragraph(paragraph):
     """检测是否为西文字符段落"""
     return all(is_western_char(char) for char in paragraph if char.isalnum())
-  
\ No newline at end of file
+
+def should_split(text, index):
+    """Return True only when neither neighbor of the space at `index` is a western char."""
+    if index in (0, len(text) - 1):
+        return False
+    return not any(is_western_char(text[index + step]) for step in (-1, 1))
\ No newline at end of file