# mirror of https://github.com/Mai-with-u/MaiBot.git
import re
from typing import List, Optional, Set

# jieba is optional; fall back to simple English tokenization when it is unavailable.
try:
    import jieba

    _HAS_JIEBA = True
except Exception:
    _HAS_JIEBA = False

# Matches runs of ASCII letters, digits, and underscores.
_WORD_RE = re.compile(r"[A-Za-z0-9_]+")

# Matches tokens that consist purely of symbols (no word characters or CJK ideographs).
_SYMBOL_RE = re.compile(r"^[^\w\u4e00-\u9fff]+$")

def simple_en_tokenize(text: str) -> List[str]:
    """Lowercase the text and extract ASCII word tokens."""
    return _WORD_RE.findall(text.lower())

class Tokenizer:
    """Tokenizer with optional jieba-backed Chinese word segmentation."""

    def __init__(self, stopwords: Optional[Set[str]] = None, use_jieba: bool = True):
        self.stopwords = stopwords or set()
        # Only use jieba when it was requested and successfully imported.
        self.use_jieba = use_jieba and _HAS_JIEBA

    def tokenize(self, text: str) -> List[str]:
        text = (text or "").strip()
        if not text:
            return []
        if self.use_jieba:
            toks = [t.strip().lower() for t in jieba.cut(text) if t.strip()]
        else:
            toks = simple_en_tokenize(text)
        # Filter out pure-symbol tokens and stopwords.
        return [t for t in toks if t not in self.stopwords and not _SYMBOL_RE.match(t)]
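
# A minimal usage sketch (the stopword set below is hypothetical, not from the
# repository); it exercises both the jieba path, when installed, and the
# ASCII-only fallback.
if __name__ == "__main__":
    tok = Tokenizer(stopwords={"the", "a"})
    # With jieba installed the Chinese text is segmented; without it, only the
    # ASCII words survive simple_en_tokenize.
    print(tok.tokenize("Hello, the world! 你好世界"))
    print(tok.tokenize("!!! ..."))  # pure-symbol input is filtered out -> []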