feat(documents): 支持文档上传与本地RAG入库
This commit is contained in:
89
apps/documents/services.py
Normal file
89
apps/documents/services.py
Normal file
@@ -0,0 +1,89 @@
|
||||
from pathlib import Path
|
||||
from zipfile import BadZipFile, ZipFile
|
||||
import re
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
from agent_core.rag.ingest import ingest_document
|
||||
|
||||
from .models import UploadedDocument
|
||||
|
||||
|
||||
def create_uploaded_document(scenario_id: str, uploaded_file) -> UploadedDocument:
    """Create an ``UploadedDocument`` record for a freshly uploaded file.

    The file type is derived from the upload's file name: the suffix is
    lower-cased and stored without its leading dot (``"Report.PDF"`` gives
    ``"pdf"``; a name with no suffix gives ``""``).

    Args:
        scenario_id: Identifier stored on the record as ``scenario_id``.
        uploaded_file: Upload object exposing ``name`` and ``size``; the
            object itself is stored in the record's ``file`` field.

    Returns:
        The object returned by ``UploadedDocument.objects.create``, with
        status ``UploadedDocument.STATUS_UPLOADED``.
    """
    suffix = Path(uploaded_file.name).suffix
    record_fields = {
        "scenario_id": scenario_id,
        "original_name": uploaded_file.name,
        "file": uploaded_file,
        "file_type": suffix.lower().lstrip("."),
        "size": uploaded_file.size,
        "status": UploadedDocument.STATUS_UPLOADED,
    }
    return UploadedDocument.objects.create(**record_fields)
|
||||
|
||||
|
||||
def extract_text(document: UploadedDocument) -> str:
    """Return the text content of *document*, chosen by its file type.

    ``document.file_type`` is compared case-insensitively and with any
    leading dots removed. ``pdf`` and ``docx`` go to their dedicated
    extractors; every other type is read as a plain text file.

    Args:
        document: Record whose ``file.path`` points at the stored file and
            whose ``file_type`` names the extension.

    Returns:
        The extracted text.
    """
    file_path = Path(document.file.path)
    kind = document.file_type.lower().lstrip(".")
    extractors = {
        "pdf": _extract_pdf_text,
        "docx": _extract_docx_text,
    }
    # Anything without a dedicated extractor is treated as plain text.
    extractor = extractors.get(kind, _read_text_file)
    return extractor(file_path)
|
||||
|
||||
|
||||
def _read_text_file(path: Path) -> str:
    """Read *path* as text, tolerating content that is not valid UTF-8.

    Decoding is attempted in this order:

    1. strict UTF-8;
    2. the default text encoding used by ``Path.read_text()`` when no
       encoding is given (platform/locale dependent);
    3. UTF-8 with undecodable bytes replaced by U+FFFD.

    Step 3 exists because on systems whose default encoding is already
    UTF-8, step 2 fails in exactly the same way as step 1; without a lossy
    last resort the ``UnicodeDecodeError`` would escape this function.

    Args:
        path: Location of the file to read.

    Returns:
        The decoded file content.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    try:
        return path.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        pass

    try:
        # Kept for files written in the platform's legacy encoding.
        return path.read_text()
    except UnicodeDecodeError:
        # Last resort: never fails on decoding, at the cost of replacement
        # characters where the bytes were not valid UTF-8.
        return path.read_text(encoding="utf-8", errors="replace")
|
||||
|
||||
|
||||
def _extract_pdf_text(path: Path) -> str:
    """Extract text from a PDF, one page after another separated by newlines.

    ``pypdf`` is imported lazily inside the ``try`` so that a missing
    package is handled like any other extraction failure. Pages for which
    ``extract_text()`` returns a falsy value contribute an empty string.

    Args:
        path: Location of the PDF file.

    Returns:
        The joined page texts, or the result of the raw-bytes fallback if
        anything in the ``pypdf`` path raises.
    """
    try:
        import pypdf

        pdf = pypdf.PdfReader(str(path))
        page_texts = [page.extract_text() or "" for page in pdf.pages]
        return "\n".join(page_texts)
    except Exception:
        # Best-effort by design: any failure degrades to the raw-bytes
        # fallback instead of propagating.
        return _read_binary_text_fallback(path)
|
||||
|
||||
|
||||
def _extract_docx_text(path: Path) -> str:
    """Extract the text of a .docx file, one output line per paragraph.

    The archive member ``word/document.xml`` is parsed and every non-empty
    ``w:t`` (text run) element is collected. Runs that belong to the same
    ``w:p`` (paragraph) are concatenated without a separator, and
    paragraphs are joined with ``"\\n"``. Joining individual runs with
    newlines would break sentences wherever a paragraph is stored as
    several runs.

    A ``w:t`` element with no enclosing ``w:p`` is emitted as its own line.
    Each run is attributed to its nearest enclosing paragraph, so text in
    a paragraph nested inside another one is not duplicated.

    Args:
        path: Location of the .docx file.

    Returns:
        The extracted text, or the result of the raw-bytes fallback when
        the file is not a valid zip archive, lacks ``word/document.xml``,
        or contains XML that cannot be parsed.
    """
    w_ns = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    paragraph_tag = f"{{{w_ns}}}p"
    text_tag = f"{{{w_ns}}}t"

    try:
        with ZipFile(path) as archive:
            document_xml = archive.read("word/document.xml")
        # NOTE(review): this XML originates from an uploaded file; the
        # Python xml docs advise caution with untrusted input. Confirm
        # that upload size limits are enforced upstream.
        root = ET.fromstring(document_xml)
    except (BadZipFile, KeyError, ET.ParseError):
        return _read_binary_text_fallback(path)

    # ElementTree nodes carry no parent pointer, so build a child -> parent
    # map once in order to find each run's enclosing paragraph.
    parent_of = {child: parent for parent in root.iter() for child in parent}

    # dict preserves insertion order, so paragraphs come out in the order
    # their first text run appears in the document.
    runs_by_paragraph = {}
    for node in root.iter(text_tag):
        if not node.text:
            continue
        owner = parent_of.get(node)
        while owner is not None and owner.tag != paragraph_tag:
            owner = parent_of.get(owner)
        # Orphan runs are keyed by themselves so they stay on their own line.
        key = node if owner is None else owner
        runs_by_paragraph.setdefault(key, []).append(node.text)

    return "\n".join("".join(parts) for parts in runs_by_paragraph.values())
|
||||
|
||||
|
||||
def _read_binary_text_fallback(path: Path) -> str:
    """Salvage whatever text can be decoded from the raw bytes of *path*.

    The bytes are decoded as UTF-8 with invalid sequences dropped. Each run
    of ASCII control characters (everything below 0x20 except tab, line
    feed and carriage return) is collapsed into a single space, and the
    result is stripped of leading and trailing whitespace.

    Args:
        path: Location of the file to read.

    Returns:
        The cleaned-up text; may be empty.
    """
    control_runs = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]+")
    decoded = path.read_bytes().decode("utf-8", errors="ignore")
    return control_runs.sub(" ", decoded).strip()
|
||||
|
||||
|
||||
def index_document(document: UploadedDocument) -> UploadedDocument:
    """Extract a document's text, ingest it, and record the outcome.

    The text from ``extract_text`` is passed to ``ingest_document`` with
    the document's id, scenario id and original file name;
    ``document.scenario_id`` is also used as the ``collection``. The
    document is then updated as follows:

    * ingestion reports success: status ``STATUS_INDEXED``, empty error
      message;
    * ingestion reports failure: status ``STATUS_FAILED``, error message
      taken from ``result.error``;
    * any exception during extraction or ingestion: status
      ``STATUS_FAILED``, error message ``str(exc)``.

    Only ``status``, ``error_message`` and ``updated_at`` are passed to
    ``save`` as ``update_fields``.

    Args:
        document: The record to index; it is modified in place.

    Returns:
        The same *document* instance after saving.
    """
    try:
        extracted = extract_text(document)
        outcome = ingest_document(
            document_id=document.id,
            scenario_id=document.scenario_id,
            source_file=document.original_name,
            text=extracted,
            collection=document.scenario_id,
        )
        if not outcome.success:
            new_status, message = UploadedDocument.STATUS_FAILED, outcome.error
        else:
            new_status, message = UploadedDocument.STATUS_INDEXED, ""
    except Exception as exc:
        # Boundary handler: a failure is recorded on the document rather
        # than propagated to the caller.
        new_status, message = UploadedDocument.STATUS_FAILED, str(exc)

    document.status = new_status
    document.error_message = message
    document.save(update_fields=["status", "error_message", "updated_at"])
    return document
|
||||
Reference in New Issue
Block a user