feat(regulatory): 增加法规核查基础服务

This commit is contained in:
2026-06-07 00:36:18 +08:00
parent 44d31d2a14
commit ec89e62661
11 changed files with 327 additions and 0 deletions

View File

@@ -0,0 +1,31 @@
from __future__ import annotations
import hashlib
from dataclasses import dataclass
from pathlib import Path
from .rag_index import extract_text_from_path
@dataclass(frozen=True)
class ExtractedText:
    """Immutable record describing the outcome of one text-extraction attempt.

    Instances are frozen, so a result cannot be modified after it has been
    created. Only ``path``, ``text`` and ``status`` are required; the
    remaining fields default to the empty string.
    """

    # File the extraction was attempted on.
    path: Path
    # Extracted text; extract_text() in this module stores "" here for
    # unsupported or failed files.
    text: str
    # Outcome label; extract_text() in this module sets "success", "failed"
    # or "unsupported".
    status: str
    # SHA-256 hex digest of the UTF-8 encoded text, or "" when not computed.
    content_hash: str = ""
    # Description of the exception for a failed extraction, "" otherwise.
    error_message: str = ""
# Lower-case file suffixes (leading dot included) that extract_text() will
# hand to the extractor; any other suffix is reported as "unsupported".
SUPPORTED_EXTENSIONS = {
    ".txt",
    ".md",
    ".pdf",
    ".docx",
    ".pptx",
    ".xlsx",
    ".doc",
}
def extract_text(path: str | Path) -> ExtractedText:
    """Extract the text of the file at *path* and describe the outcome.

    Extraction problems are reported through the ``status`` of the returned
    record instead of being raised:

    * ``"unsupported"`` - the file suffix, compared case-insensitively, is
      not in ``SUPPORTED_EXTENSIONS``; extraction is not attempted.
    * ``"failed"`` - ``extract_text_from_path`` raised an exception;
      ``error_message`` carries its text, or the exception class name when
      the exception has no text of its own.
    * ``"success"`` - extraction returned; ``content_hash`` is the SHA-256
      hex digest of the UTF-8 encoded text, or ``""`` when the text is empty.

    Args:
        path: Location of the file, as a string or a ``Path``.

    Returns:
        An ``ExtractedText`` whose ``path`` is always a ``Path`` object.
    """
    file_path = Path(path)
    if file_path.suffix.lower() not in SUPPORTED_EXTENSIONS:
        return ExtractedText(path=file_path, text="", status="unsupported")

    try:
        text = extract_text_from_path(file_path)
    except Exception as exc:
        # Deliberately broad: whatever the extractor raises is turned into a
        # "failed" result. Exceptions raised without arguments stringify to
        # "", so fall back to the class name to keep the failure diagnosable.
        message = str(exc) or type(exc).__name__
        return ExtractedText(
            path=file_path, text="", status="failed", error_message=message
        )

    if text:
        # "surrogatepass" yields the same bytes as strict UTF-8 for all
        # well-formed text, but does not raise UnicodeEncodeError when the
        # extracted text contains a lone surrogate code point.
        encoded = text.encode("utf-8", errors="surrogatepass")
        content_hash = hashlib.sha256(encoded).hexdigest()
    else:
        content_hash = ""
    return ExtractedText(
        path=file_path, text=text, status="success", content_hash=content_hash
    )