32 lines
970 B
Python
32 lines
970 B
Python
from __future__ import annotations
|
|
|
|
import hashlib
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
|
|
from .rag_index import extract_text_from_path
|
|
|
|
|
|
@dataclass(frozen=True)
class ExtractedText:
    """Immutable record describing one text-extraction attempt.

    ``extract_text`` in this module produces instances whose ``status`` is
    ``"success"``, ``"unsupported"`` or ``"failed"``.
    """

    # File the extraction was attempted on.
    path: Path
    # Extracted text; ``extract_text`` leaves it empty unless it succeeded.
    text: str
    # Outcome label for the attempt (see class docstring).
    status: str
    # Hex digest of the text as filled in by ``extract_text``; "" by default.
    content_hash: str = ""
    # Description of the error for a failed attempt; "" by default.
    error_message: str = ""
|
|
|
|
|
|
# File suffixes (lowercase, leading dot included) that extract_text accepts;
# any other suffix is reported as "unsupported".
SUPPORTED_EXTENSIONS = {
    ".txt",
    ".md",
    ".pdf",
    ".docx",
    ".pptx",
    ".xlsx",
    ".doc",
}
|
|
|
|
|
|
def extract_text(path: str | Path) -> ExtractedText:
    """Extract text from the file at *path* and report the outcome.

    Errors raised by the extractor are captured in the returned record
    instead of being propagated to the caller.

    Args:
        path: Location of the file to extract text from.

    Returns:
        An ``ExtractedText`` whose ``status`` is one of:

        * ``"unsupported"`` -- the suffix (compared case-insensitively) is
          not in ``SUPPORTED_EXTENSIONS``; the extractor is not called.
        * ``"failed"`` -- ``extract_text_from_path`` raised;
          ``error_message`` holds the exception text, or the exception
          class name when that text is empty.
        * ``"success"`` -- the extractor returned; ``content_hash`` is the
          SHA-256 hex digest of the UTF-8 encoded text, or ``""`` when the
          text is empty.
    """
    file_path = Path(path)
    if file_path.suffix.lower() not in SUPPORTED_EXTENSIONS:
        return ExtractedText(path=file_path, text="", status="unsupported")

    try:
        text = extract_text_from_path(file_path)
    except Exception as exc:
        # Deliberate catch-all: any extractor failure becomes a "failed"
        # record. Some exceptions stringify to "" (e.g. a bare KeyError()),
        # so fall back to the class name to keep the record diagnosable.
        message = str(exc) or type(exc).__name__
        return ExtractedText(
            path=file_path, text="", status="failed", error_message=message
        )

    if text:
        # "surrogatepass" keeps hashing from raising UnicodeEncodeError on
        # lone surrogates; digests of well-formed text are unaffected.
        encoded = text.encode("utf-8", errors="surrogatepass")
        content_hash = hashlib.sha256(encoded).hexdigest()
    else:
        content_hash = ""
    return ExtractedText(
        path=file_path, text=text, status="success", content_hash=content_hash
    )
|