feat(regulatory): 增加法规核查基础服务
This commit is contained in:
31
review_agent/regulatory_review/services/text_extract.py
Normal file
31
review_agent/regulatory_review/services/text_extract.py
Normal file
@@ -0,0 +1,31 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
from .rag_index import extract_text_from_path
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ExtractedText:
    """Immutable record describing one text-extraction attempt on a file.

    The dataclass is frozen, so instances cannot be modified after creation.
    ``content_hash`` and ``error_message`` default to the empty string so a
    caller only has to supply the fields relevant to the outcome it reports.
    """

    # File the extraction was attempted on.
    path: Path
    # Extracted text; ``extract_text`` in this module stores "" here when it
    # reports a non-success status.
    text: str
    # Outcome label; ``extract_text`` in this module uses "unsupported",
    # "failed" and "success".
    status: str
    # Hex digest identifying the extracted text; "" when none was computed.
    content_hash: str = ""
    # Description of the failure; "" when nothing went wrong.
    error_message: str = ""
|
||||
|
||||
|
||||
# Lower-case file suffixes (leading dot included) that extract_text will hand
# to the extractor; any other suffix is reported as "unsupported".
SUPPORTED_EXTENSIONS = {
    ".txt",
    ".md",
    ".pdf",
    ".docx",
    ".pptx",
    ".xlsx",
    ".doc",
}
|
||||
|
||||
|
||||
def extract_text(path: str | Path) -> ExtractedText:
    """Extract text from *path* and report the outcome as an ``ExtractedText``.

    Args:
        path: Location of the file, as a string or ``Path``.

    Returns:
        An ``ExtractedText`` whose ``status`` is:

        * ``"unsupported"`` - the lower-cased suffix is not in
          ``SUPPORTED_EXTENSIONS``; the extractor is not called.
        * ``"failed"`` - the extractor raised; ``error_message`` holds the
          exception text, or the exception class name when that text is empty.
        * ``"success"`` - the extractor returned; ``content_hash`` is the
          SHA-256 hex digest of the UTF-8 encoded text, or ``""`` when the
          extracted text is empty.

    Extractor exceptions derived from ``Exception`` are converted into a
    ``"failed"`` result instead of propagating to the caller.
    """
    file_path = Path(path)

    # Only the suffix is inspected here, case-insensitively.
    if file_path.suffix.lower() not in SUPPORTED_EXTENSIONS:
        return ExtractedText(path=file_path, text="", status="unsupported")

    try:
        text = extract_text_from_path(file_path)
    except Exception as exc:
        # Deliberate best-effort boundary: one unreadable file becomes a
        # "failed" record rather than an exception. str(exc) is empty for
        # exceptions raised without arguments, so fall back to the class name
        # to keep every failure diagnosable.
        message = str(exc) or type(exc).__name__
        return ExtractedText(
            path=file_path,
            text="",
            status="failed",
            error_message=message,
        )

    # Empty text gets no digest, so "nothing extracted" stays distinguishable
    # from real content.
    content_hash = hashlib.sha256(text.encode("utf-8")).hexdigest() if text else ""
    return ExtractedText(
        path=file_path,
        text=text,
        status="success",
        content_hash=content_hash,
    )
|
||||
Reference in New Issue
Block a user