mirror of https://github.com/Mai-with-u/MaiBot.git
更好的空格分割
parent
256bfcf5c2
commit
7c665d2d04
|
|
@ -201,15 +201,22 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]:
|
||||||
else:
|
else:
|
||||||
split_strength = 0.7
|
split_strength = 0.7
|
||||||
|
|
||||||
|
# 处理文本,分行区分西文和中文字符
|
||||||
|
new_text = []
|
||||||
|
for i, char in enumerate(text):
|
||||||
|
if char == ' ' and should_split(text, i):
|
||||||
|
new_text.append('|seg|')
|
||||||
|
else:
|
||||||
|
new_text.append(char)
|
||||||
|
text = ''.join(new_text)
|
||||||
|
|
||||||
# 检查是否为西文字符段落
|
# 检查是否为西文字符段落
|
||||||
if not is_western_paragraph(text):
|
if not is_western_paragraph(text):
|
||||||
# 当语言为中文时,统一将英文逗号转换为中文逗号
|
# 当语言为中文时,统一将英文逗号转换为中文逗号
|
||||||
text = text.replace(",", ",")
|
text = text.replace(",", ",")
|
||||||
text = text.replace("\n", " ")
|
text = text.replace("\n", "|seg|")
|
||||||
else:
|
|
||||||
# 用"|seg|"作为分割符分开
|
# 用"|seg|"作为分割符分开
|
||||||
text = re.sub(r"([.!?]) +", r"\1\|seg\|", text)
|
text = text.replace("\n", "|seg|")
|
||||||
text = text.replace("\n", "\|seg\|")
|
|
||||||
text, mapping = protect_kaomoji(text)
|
text, mapping = protect_kaomoji(text)
|
||||||
# print(f"处理前的文本: {text}")
|
# print(f"处理前的文本: {text}")
|
||||||
|
|
||||||
|
|
@ -240,7 +247,7 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]:
|
||||||
else:
|
else:
|
||||||
current_sentence += "," + part
|
current_sentence += "," + part
|
||||||
# 处理空格分割
|
# 处理空格分割
|
||||||
space_parts = current_sentence.split(" ")
|
space_parts = current_sentence.split("|seg|")
|
||||||
current_sentence = space_parts[0]
|
current_sentence = space_parts[0]
|
||||||
for part in space_parts[1:]:
|
for part in space_parts[1:]:
|
||||||
if random.random() < split_strength:
|
if random.random() < split_strength:
|
||||||
|
|
@ -250,7 +257,7 @@ def split_into_sentences_w_remove_punctuation(text: str) -> List[str]:
|
||||||
current_sentence += " " + part
|
current_sentence += " " + part
|
||||||
else:
|
else:
|
||||||
# 处理分割符
|
# 处理分割符
|
||||||
space_parts = current_sentence.split("\|seg\|")
|
space_parts = current_sentence.split("|seg|")
|
||||||
current_sentence = space_parts[0]
|
current_sentence = space_parts[0]
|
||||||
for part in space_parts[1:]:
|
for part in space_parts[1:]:
|
||||||
new_sentences.append(current_sentence.strip())
|
new_sentences.append(current_sentence.strip())
|
||||||
|
|
@ -484,4 +491,9 @@ def is_western_char(char):
|
||||||
def is_western_paragraph(paragraph):
    """Report whether every alphanumeric character of ``paragraph`` is western.

    Characters for which ``str.isalnum()`` is false (spaces, punctuation, ...)
    are ignored. A paragraph with no alphanumeric characters at all therefore
    yields True.
    """
    for symbol in paragraph:
        # Only alphanumeric characters take part in the decision.
        if symbol.isalnum() and not is_western_char(symbol):
            return False
    return True
|
||||||
|
|
||||||
|
def should_split(text, index):
    """Decide whether the character at ``index`` in ``text`` is a split point.

    Returns False when ``index`` is the first or the last position of
    ``text``. Otherwise returns True only when neither the character just
    before ``index`` nor the one just after it satisfies ``is_western_char``.
    """
    # A position at either end of the text has only one neighbour.
    if index in (0, len(text) - 1):
        return False
    # A western character on the left rules the split out immediately; the
    # right-hand neighbour is only inspected when the left one is not western.
    if is_western_char(text[index - 1]):
        return False
    return not is_western_char(text[index + 1])
|
||||||
Loading…
Reference in New Issue