From 7c665d2d04ef30ad3f292d29d5fa924fd90d0542 Mon Sep 17 00:00:00 2001 From: Bakadax Date: Sun, 30 Mar 2025 19:57:44 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E5=A5=BD=E7=9A=84=E7=A9=BA=E6=A0=BC?= =?UTF-8?q?=E5=88=86=E5=89=B2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/plugins/chat/utils.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/plugins/chat/utils.py b/src/plugins/chat/utils.py index 163b5530..f2eee1a3 100644 --- a/src/plugins/chat/utils.py +++ b/src/plugins/chat/utils.py @@ -201,15 +201,22 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]: else: split_strength = 0.7 + # 处理文本,分行区分西文和中文字符 + new_text = [] + for i, char in enumerate(text): + if char == ' ' and should_split(text, i): + new_text.append('|seg|') + else: + new_text.append(char) + text = ''.join(new_text) + # 检查是否为西文字符段落 if not is_western_paragraph(text): # 当语言为中文时,统一将英文逗号转换为中文逗号 text = text.replace(",", ",") - text = text.replace("\n", " ") - else: + text = text.replace("\n", "|seg|") # 用"|seg|"作为分割符分开 - text = re.sub(r"([.!?]) +", r"\1\|seg\|", text) - text = text.replace("\n", "\|seg\|") + text = text.replace("\n", "|seg|") text, mapping = protect_kaomoji(text) # print(f"处理前的文本: {text}") @@ -240,7 +247,7 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]: else: current_sentence += "," + part # 处理空格分割 - space_parts = current_sentence.split(" ") + space_parts = current_sentence.split("|seg|") current_sentence = space_parts[0] for part in space_parts[1:]: if random.random() < split_strength: @@ -250,7 +257,7 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]: current_sentence += " " + part else: # 处理分割符 - space_parts = current_sentence.split("\|seg\|") + space_parts = current_sentence.split("|seg|") current_sentence = space_parts[0] for part in space_parts[1:]: new_sentences.append(current_sentence.strip()) @@ -484,4 +491,9 @@ def is_western_char(char): def is_western_paragraph(paragraph): """检测是否为西文字符段落""" return all(is_western_char(char) for char in paragraph if char.isalnum()) - \ No newline at end of file + +def should_split(text, index): + """检测空格两边的字符是否为西文字符""" + if index == 0 or index == len(text) - 1: + return False + return not (is_western_char(text[index - 1]) or is_western_char(text[index + 1])) \ No newline at end of file