更好的空格分割

pull/619/head
Bakadax 2025-03-30 19:57:44 +08:00
parent 256bfcf5c2
commit 7c665d2d04
1 changed files with 19 additions and 7 deletions

View File

@@ -201,15 +201,22 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]:
else:
split_strength = 0.7
# 处理文本,分行区分西文和中文字符
new_text = []
for i, char in enumerate(text):
if char == ' ' and should_split(text, i):
new_text.append('|seg|')
else:
new_text.append(char)
text = ''.join(new_text)
# 检查是否为西文字符段落
if not is_western_paragraph(text):
# 当语言为中文时,统一将英文逗号转换为中文逗号
text = text.replace(",", "")
text = text.replace("\n", " ")
else:
text = text.replace("\n", "|seg|")
# 用"|seg|"作为分割符分开
text = re.sub(r"([.!?]) +", r"\1\|seg\|", text)
text = text.replace("\n", "\|seg\|")
text = text.replace("\n", "|seg|")
text, mapping = protect_kaomoji(text)
# print(f"处理前的文本: {text}")
@@ -240,7 +247,7 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]:
else:
current_sentence += "" + part
# 处理空格分割
space_parts = current_sentence.split(" ")
space_parts = current_sentence.split("|seg|")
current_sentence = space_parts[0]
for part in space_parts[1:]:
if random.random() < split_strength:
@@ -250,7 +257,7 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]:
current_sentence += " " + part
else:
# 处理分割符
space_parts = current_sentence.split("\|seg\|")
space_parts = current_sentence.split("|seg|")
current_sentence = space_parts[0]
for part in space_parts[1:]:
new_sentences.append(current_sentence.strip())
@@ -484,4 +491,9 @@ def is_western_char(char):
def is_western_paragraph(paragraph):
    """Report whether a paragraph consists only of Western alphanumerics.

    Only characters for which ``str.isalnum()`` is true are examined; spaces,
    punctuation and other symbols are skipped. Each examined character is
    passed to ``is_western_char``. A paragraph with no alphanumeric
    characters at all (including the empty string) therefore yields True.
    """
    for symbol in paragraph:
        # Stop at the first alphanumeric character that is rejected.
        if symbol.isalnum() and not is_western_char(symbol):
            return False
    return True
def should_split(text, index):
    """Decide whether the character at ``text[index]`` is a split point.

    The caller is expected to pass the position of a space; this function
    does not check what ``text[index]`` is and only inspects its neighbours.

    Args:
        text: The full string being scanned.
        index: Position of the candidate space inside ``text``.

    Returns:
        True only when neither the preceding nor the following character is
        accepted by ``is_western_char``. False when either neighbour is
        accepted, or when ``index`` has no neighbour on both sides (first or
        last position, negative, or past the end of ``text``).
    """
    # A range check rather than equality: an out-of-range index would raise
    # IndexError below, and a negative one would wrap around and inspect
    # unrelated characters.
    if index <= 0 or index >= len(text) - 1:
        return False
    return not (is_western_char(text[index - 1]) or is_western_char(text[index + 1]))