更好的空格分割

pull/619/head
Bakadax 2025-03-30 19:57:44 +08:00
parent 256bfcf5c2
commit 7c665d2d04
1 changed file with 19 additions and 7 deletions

View File

@@ -201,15 +201,22 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]:
else: else:
split_strength = 0.7 split_strength = 0.7
# 处理文本,分行区分西文和中文字符
new_text = []
for i, char in enumerate(text):
if char == ' ' and should_split(text, i):
new_text.append('|seg|')
else:
new_text.append(char)
text = ''.join(new_text)
# 检查是否为西文字符段落 # 检查是否为西文字符段落
if not is_western_paragraph(text): if not is_western_paragraph(text):
# 当语言为中文时,统一将英文逗号转换为中文逗号 # 当语言为中文时,统一将英文逗号转换为中文逗号
text = text.replace(",", "") text = text.replace(",", "")
text = text.replace("\n", " ") text = text.replace("\n", "|seg|")
else:
# 用"|seg|"作为分割符分开 # 用"|seg|"作为分割符分开
text = re.sub(r"([.!?]) +", r"\1\|seg\|", text) text = text.replace("\n", "|seg|")
text = text.replace("\n", "\|seg\|")
text, mapping = protect_kaomoji(text) text, mapping = protect_kaomoji(text)
# print(f"处理前的文本: {text}") # print(f"处理前的文本: {text}")
@@ -240,7 +247,7 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]:
else: else:
current_sentence += "" + part current_sentence += "" + part
# 处理空格分割 # 处理空格分割
space_parts = current_sentence.split(" ") space_parts = current_sentence.split("|seg|")
current_sentence = space_parts[0] current_sentence = space_parts[0]
for part in space_parts[1:]: for part in space_parts[1:]:
if random.random() < split_strength: if random.random() < split_strength:
@@ -250,7 +257,7 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]:
current_sentence += " " + part current_sentence += " " + part
else: else:
# 处理分割符 # 处理分割符
space_parts = current_sentence.split("\|seg\|") space_parts = current_sentence.split("|seg|")
current_sentence = space_parts[0] current_sentence = space_parts[0]
for part in space_parts[1:]: for part in space_parts[1:]:
new_sentences.append(current_sentence.strip()) new_sentences.append(current_sentence.strip())
@@ -484,4 +491,9 @@ def is_western_char(char):
def is_western_paragraph(paragraph):
    """Return True when every alphanumeric character in ``paragraph`` is western.

    Characters for which ``str.isalnum()`` is false (punctuation, whitespace,
    symbols) are ignored, so a paragraph containing no alphanumeric
    characters at all is reported as western.
    """
    for ch in paragraph:
        # Only letters and digits take part in the decision.
        if not ch.isalnum():
            continue
        if not is_western_char(ch):
            # A single non-western alphanumeric character settles the answer.
            return False
    return True
def should_split(text, index):
    """Tell whether the character at ``index`` in ``text`` is a split point.

    The caller in this file invokes this for space characters. The result is
    ``False`` when ``index`` is the first or last position of ``text``.
    Otherwise it is ``True`` only when neither the preceding nor the
    following character is western according to ``is_western_char``.
    """
    last_position = len(text) - 1
    if index in (0, last_position):
        # A boundary position has only one neighbour, so it is never a split point.
        return False
    # Neighbours are looked up lazily: the right-hand one is only read when
    # the left-hand one is not western.
    return not is_western_char(text[index - 1]) and not is_western_char(text[index + 1])