"""Status-reporting wrapper around ``rag_index.extract_text_from_path``."""

from __future__ import annotations

import hashlib
from dataclasses import dataclass
from pathlib import Path

from .rag_index import extract_text_from_path


@dataclass(frozen=True)
class ExtractedText:
    """Immutable record describing the outcome of one extraction attempt.

    ``status`` is one of the strings produced by :func:`extract_text`:
    ``"success"``, ``"failed"`` or ``"unsupported"``.
    """

    # File the extraction was attempted on.
    path: Path
    # Extracted text; empty for "failed" and "unsupported" results.
    text: str
    # Outcome marker, see the class docstring.
    status: str
    # SHA-256 hex digest of ``text``; empty when there is no text to hash.
    content_hash: str = ""
    # Description of the error for "failed" results; empty otherwise.
    error_message: str = ""


# File suffixes (lower-case, including the leading dot) that extract_text
# hands to the extractor; any other suffix is reported as "unsupported".
SUPPORTED_EXTENSIONS = {".txt", ".md", ".pdf", ".docx", ".pptx", ".xlsx", ".doc"}


def extract_text(path: str | Path) -> ExtractedText:
    """Extract text from *path* and report the outcome as a record.

    Errors raised by the extractor are captured in the returned record
    rather than propagated.

    Args:
        path: Location of the file. The suffix is compared
            case-insensitively against ``SUPPORTED_EXTENSIONS``.

    Returns:
        An ``ExtractedText`` whose ``status`` is:

        * ``"unsupported"`` - the suffix is not in ``SUPPORTED_EXTENSIONS``;
          the extractor is not called.
        * ``"failed"`` - the extractor raised; ``error_message`` holds the
          exception text, or the exception class name when that text is
          empty.
        * ``"success"`` - ``text`` holds the extractor's result and
          ``content_hash`` its SHA-256 hex digest (``""`` for empty text).
    """
    file_path = Path(path)

    if file_path.suffix.lower() not in SUPPORTED_EXTENSIONS:
        return ExtractedText(path=file_path, text="", status="unsupported")

    try:
        text = extract_text_from_path(file_path)
    except Exception as exc:  # boundary: any extractor error becomes a status
        # Exceptions raised without arguments stringify to "", which would be
        # indistinguishable from the field default; fall back to the type name.
        message = str(exc) or type(exc).__name__
        return ExtractedText(
            path=file_path,
            text="",
            status="failed",
            error_message=message,
        )

    if text:
        # "surrogatepass" keeps hashing from raising UnicodeEncodeError on
        # lone surrogate code points; for well-formed text the bytes, and
        # therefore the digest, are the same as with strict UTF-8 encoding.
        encoded = text.encode("utf-8", errors="surrogatepass")
        content_hash = hashlib.sha256(encoded).hexdigest()
    else:
        content_hash = ""

    return ExtractedText(
        path=file_path,
        text=text,
        status="success",
        content_hash=content_hash,
    )