mirror of https://github.com/Mai-with-u/MaiBot.git
分割书名号保护
parent
c7584f743f
commit
742d45b419
|
|
@ -23,6 +23,8 @@ logger = get_module_logger("chat_utils")
|
||||||
_L_REGEX = regex.compile(r"\p{L}") # 匹配任何Unicode字母
|
_L_REGEX = regex.compile(r"\p{L}") # 匹配任何Unicode字母
|
||||||
_HAN_CHAR_REGEX = regex.compile(r"\p{Han}") # 匹配汉字 (Unicode属性)
|
_HAN_CHAR_REGEX = regex.compile(r"\p{Han}") # 匹配汉字 (Unicode属性)
|
||||||
_Nd_REGEX = regex.compile(r"\p{Nd}") # 新增:匹配Unicode数字 (Nd = Number, decimal digit)
|
_Nd_REGEX = regex.compile(r"\p{Nd}") # 新增:匹配Unicode数字 (Nd = Number, decimal digit)
|
||||||
|
|
||||||
|
BOOK_TITLE_PLACEHOLDER_PREFIX = "__BOOKTITLE_"
|
||||||
SEPARATORS = {"。", ",", ",", " ", ";", "\xa0", "\n", ".", "—", "!", "?"}
|
SEPARATORS = {"。", ",", ",", " ", ";", "\xa0", "\n", ".", "—", "!", "?"}
|
||||||
KNOWN_ABBREVIATIONS_ENDING_WITH_DOT = {
|
KNOWN_ABBREVIATIONS_ENDING_WITH_DOT = {
|
||||||
"Mr.",
|
"Mr.",
|
||||||
|
|
@ -301,10 +303,11 @@ def get_recent_group_speaker(chat_stream_id: int, sender, limit: int = 12) -> li
|
||||||
return who_chat_in_group
|
return who_chat_in_group
|
||||||
|
|
||||||
|
|
||||||
def split_into_sentences_w_remove_punctuation(text: str) -> list[str]:
|
def split_into_sentences_w_remove_punctuation(original_text: str) -> list[str]:
|
||||||
"""将文本分割成句子,并根据概率合并"""
|
"""将文本分割成句子,并根据概率合并"""
|
||||||
# print(f"DEBUG: 输入文本 (repr): {repr(text)}")
|
# print(f"DEBUG: 输入文本 (repr): {repr(text)}")
|
||||||
|
text, local_book_title_mapping = protect_book_titles(original_text)
|
||||||
|
perform_book_title_recovery_here = True
|
||||||
# 预处理
|
# 预处理
|
||||||
text = regex.sub(r"\n\s*\n+", "\n", text) # 合并多个换行符
|
text = regex.sub(r"\n\s*\n+", "\n", text) # 合并多个换行符
|
||||||
text = regex.sub(r"\n\s*([—。.,,;\s\xa0!?])", r"\1", text)
|
text = regex.sub(r"\n\s*([—。.,,;\s\xa0!?])", r"\1", text)
|
||||||
|
|
@ -472,6 +475,9 @@ def split_into_sentences_w_remove_punctuation(text: str) -> list[str]:
|
||||||
s = random_remove_punctuation(s)
|
s = random_remove_punctuation(s)
|
||||||
processed_sentences_after_merge.append(s)
|
processed_sentences_after_merge.append(s)
|
||||||
|
|
||||||
|
if perform_book_title_recovery_here and local_book_title_mapping:
|
||||||
|
# 假设 processed_sentences_after_merge 是最终的句子列表
|
||||||
|
processed_sentences_after_merge = recover_book_titles(processed_sentences_after_merge, local_book_title_mapping)
|
||||||
return processed_sentences_after_merge
|
return processed_sentences_after_merge
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -917,3 +923,25 @@ def parse_text_timestamps(text: str, mode: str = "normal") -> str:
|
||||||
result_text = re.sub(pattern_instance, readable_time, result_text, count=1)
|
result_text = re.sub(pattern_instance, readable_time, result_text, count=1)
|
||||||
|
|
||||||
return result_text
|
return result_text
|
||||||
|
|
||||||
|
def protect_book_titles(text):
|
||||||
|
book_title_mapping = {}
|
||||||
|
book_title_pattern = re.compile(r"《(.*?)》") # 非贪婪匹配
|
||||||
|
|
||||||
|
def replace_func(match):
|
||||||
|
# 生成唯一占位符
|
||||||
|
placeholder = f"{BOOK_TITLE_PLACEHOLDER_PREFIX}{len(book_title_mapping)}__"
|
||||||
|
# 存储映射关系
|
||||||
|
book_title_mapping[placeholder] = match.group(0) # 存储包含书名号的完整匹配
|
||||||
|
return placeholder
|
||||||
|
|
||||||
|
protected_text = book_title_pattern.sub(replace_func, text)
|
||||||
|
return protected_text, book_title_mapping
|
||||||
|
|
||||||
|
def recover_book_titles(sentences, book_title_mapping):
|
||||||
|
recovered_sentences = []
|
||||||
|
for sentence in sentences:
|
||||||
|
for placeholder, original_content in book_title_mapping.items():
|
||||||
|
sentence = sentence.replace(placeholder, original_content)
|
||||||
|
recovered_sentences.append(sentence)
|
||||||
|
return recovered_sentences
|
||||||
Loading…
Reference in New Issue