MaiBot/src/express/expressor_model/tokenizer.py

import re
from typing import List, Optional, Set
# jieba is an optional dependency, used for Chinese word segmentation.
try:
    import jieba
    _HAS_JIEBA = True
except Exception:
    _HAS_JIEBA = False

# Runs of ASCII word characters: English words, digits, underscores.
_WORD_RE = re.compile(r"[A-Za-z0-9_]+")
# Tokens made up entirely of symbols (no word characters, no CJK).
_SYMBOL_RE = re.compile(r"^[^\w\u4e00-\u9fff]+$")


def simple_en_tokenize(text: str) -> List[str]:
    # Fallback tokenizer: lowercase the text and extract ASCII word runs.
    return _WORD_RE.findall(text.lower())


class Tokenizer:
    """Tokenizes mixed Chinese/English text with jieba when available,
    falling back to the regex-based English tokenizer otherwise."""

    def __init__(self, stopwords: Optional[Set[str]] = None, use_jieba: bool = True):
        self.stopwords = stopwords or set()
        # Only use jieba if it was requested *and* actually importable.
        self.use_jieba = use_jieba and _HAS_JIEBA

    def tokenize(self, text: str) -> List[str]:
        text = (text or "").strip()
        if not text:
            return []
        if self.use_jieba:
            toks = [t.strip().lower() for t in jieba.cut(text) if t.strip()]
        else:
            toks = simple_en_tokenize(text)
        # Filter out stopwords and tokens that are purely symbols/punctuation.
        return [t for t in toks if t not in self.stopwords and not _SYMBOL_RE.match(t)]
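

# --- Minimal usage sketch (not part of the original module) -----------------
# The stopword set and sample sentence below are illustrative assumptions;
# jieba's exact segmentation also depends on its dictionary version.
if __name__ == "__main__":
    tok = Tokenizer(stopwords={"the", "的"})
    # With jieba installed, the Chinese span is segmented into words, e.g.
    # roughly ['我', '爱', '自然语言', '处理', 'nlp']. Without jieba, only
    # ASCII word runs survive the fallback tokenizer, leaving ['nlp'].
    print(tok.tokenize("我爱自然语言处理, the NLP!"))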