feat(documents): 支持文档上传与本地RAG入库

This commit is contained in:
2026-05-30 00:10:05 +08:00
parent 7a6c110103
commit 4a831ee2c5
13 changed files with 403 additions and 0 deletions

View File

@@ -0,0 +1,89 @@
from pathlib import Path
from zipfile import BadZipFile, ZipFile
import re
import xml.etree.ElementTree as ET
from agent_core.rag.ingest import ingest_document
from .models import UploadedDocument
def create_uploaded_document(scenario_id: str, uploaded_file) -> UploadedDocument:
    """Create an ``UploadedDocument`` record for a freshly uploaded file.

    The file type is taken from the upload's file name: the suffix is
    lower-cased and stored without its leading dot (an upload with no
    suffix gets an empty file type).

    Args:
        scenario_id: Identifier of the scenario the document belongs to.
        uploaded_file: Upload object exposing ``name`` and ``size``; it is
            stored as-is in the record's ``file`` field.

    Returns:
        The object returned by ``UploadedDocument.objects.create``, with its
        status set to ``UploadedDocument.STATUS_UPLOADED``.
    """
    original_name = uploaded_file.name
    suffix = Path(original_name).suffix
    fields = {
        "scenario_id": scenario_id,
        "original_name": original_name,
        "file": uploaded_file,
        "file_type": suffix.lower().lstrip("."),
        "size": uploaded_file.size,
        "status": UploadedDocument.STATUS_UPLOADED,
    }
    return UploadedDocument.objects.create(**fields)
def extract_text(document: UploadedDocument) -> str:
    """Return the plain text of an uploaded document.

    The extractor is chosen from ``document.file_type`` (case-insensitive,
    leading dots ignored): ``pdf`` and ``docx`` have dedicated extractors,
    and every other type is read as a plain text file.

    Args:
        document: Record whose ``file.path`` points at the stored upload.

    Returns:
        The extracted text.
    """
    source = Path(document.file.path)
    kind = document.file_type.lower().lstrip(".")
    extractors = {
        "pdf": _extract_pdf_text,
        "docx": _extract_docx_text,
    }
    # Anything that is not a known binary format is treated as plain text.
    extractor = extractors.get(kind, _read_text_file)
    return extractor(source)
def _read_text_file(path: Path) -> str:
    """Read *path* as text, tolerating files that are not valid UTF-8.

    Decoding is attempted in this order:

    1. UTF-8, dropping a leading byte-order mark if one is present.
    2. The platform's default text encoding.
    3. UTF-8 with undecodable bytes replaced by U+FFFD.

    The last step cannot raise ``UnicodeDecodeError``, so a file in an
    unexpected encoding yields lossy text instead of an exception.

    Args:
        path: Location of the file on disk.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    try:
        # "utf-8-sig" decodes plain UTF-8 identically but also strips a BOM,
        # which would otherwise leak into the extracted text as U+FEFF.
        return path.read_text(encoding="utf-8-sig")
    except UnicodeDecodeError:
        pass
    try:
        return path.read_text()
    except UnicodeDecodeError:
        # When the default encoding is itself UTF-8 (or equally strict) the
        # second attempt fails the same way; decode lossily as a last resort.
        return path.read_text(encoding="utf-8", errors="replace")
def _extract_pdf_text(path: Path) -> str:
    """Extract the text of a PDF, one page after another.

    Page texts are joined with newlines; a page for which ``extract_text``
    returns a falsy value contributes an empty string.  Extraction is
    best-effort: if anything goes wrong (including ``pypdf`` not being
    importable) the file is decoded through the raw-bytes fallback instead.

    Args:
        path: Location of the PDF on disk.

    Returns:
        The extracted text, or the fallback decoding of the raw bytes.
    """
    try:
        import pypdf

        pdf = pypdf.PdfReader(str(path))
        page_texts = []
        for page in pdf.pages:
            page_texts.append(page.extract_text() or "")
        return "\n".join(page_texts)
    except Exception:
        # Deliberately broad: any parser failure degrades to the fallback.
        return _read_binary_text_fallback(path)
def _extract_docx_text(path: Path) -> str:
    """Extract the text of a ``.docx`` file, one line per paragraph.

    The main document part (``word/document.xml``) is read from the archive
    and walked in document order.  Text runs (``w:t``) belonging to the same
    paragraph (``w:p``) are concatenated without a separator, because a
    single sentence is commonly split across several runs; paragraphs are
    then joined with newlines.  Paragraphs without text are omitted.

    If the file is not a valid zip archive, lacks the main document part, or
    that part is not well-formed XML, the raw-bytes fallback is used.

    Args:
        path: Location of the ``.docx`` file on disk.

    Returns:
        The extracted text, or the fallback decoding of the raw bytes.
    """
    try:
        with ZipFile(path) as archive:
            document_xml = archive.read("word/document.xml")
        root = ET.fromstring(document_xml)
    except (BadZipFile, KeyError, ET.ParseError):
        return _read_binary_text_fallback(path)

    w_ns = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    paragraph_tag = w_ns + "p"
    text_tag = w_ns + "t"

    paragraphs = []
    current = None
    # A single pass keeps every text node exactly once, even when paragraphs
    # are nested inside one another.
    for node in root.iter():
        if node.tag == paragraph_tag:
            current = []
            paragraphs.append(current)
        elif node.tag == text_tag and node.text:
            if current is None:
                # Text outside any paragraph still has to be kept.
                current = []
                paragraphs.append(current)
            current.append(node.text)

    lines = ("".join(runs) for runs in paragraphs)
    return "\n".join(line for line in lines if line)
def _read_binary_text_fallback(path: Path) -> str:
    """Decode a file's raw bytes into text as a last resort.

    Bytes that are not valid UTF-8 are dropped.  Each run of ASCII control
    characters other than tab, line feed and carriage return is collapsed
    into a single space, and surrounding whitespace is trimmed.

    Args:
        path: Location of the file on disk.

    Returns:
        The cleaned-up text; empty if nothing printable remains.
    """
    control_runs = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]+")
    decoded = path.read_bytes().decode("utf-8", errors="ignore")
    return control_runs.sub(" ", decoded).strip()
def index_document(document: UploadedDocument) -> UploadedDocument:
    """Extract a document's text, ingest it, and record the outcome.

    The text is passed to ``ingest_document`` with the document's scenario
    id used both as the scenario and as the collection name.  The
    document's ``status`` and ``error_message`` are then updated:

    * ingestion reported success -> ``STATUS_INDEXED`` with an empty message;
    * ingestion reported failure -> ``STATUS_FAILED`` with ``result.error``;
    * any exception was raised   -> ``STATUS_FAILED`` with ``str(exc)``.

    Exceptions raised during extraction or ingestion are not propagated;
    they are only reflected in the saved status.

    Args:
        document: The record to index.

    Returns:
        The same ``document`` instance after it has been saved.
    """
    try:
        extracted = extract_text(document)
        outcome = ingest_document(
            document_id=document.id,
            scenario_id=document.scenario_id,
            source_file=document.original_name,
            text=extracted,
            collection=document.scenario_id,
        )
        if outcome.success:
            new_status, new_error = UploadedDocument.STATUS_INDEXED, ""
        else:
            new_status, new_error = UploadedDocument.STATUS_FAILED, outcome.error
    except Exception as exc:
        # Boundary handler: every failure ends up on the record itself.
        new_status, new_error = UploadedDocument.STATUS_FAILED, str(exc)

    document.status = new_status
    document.error_message = new_error
    document.save(update_fields=["status", "error_message", "updated_at"])
    return document