Files
DEMO-AGENT/apps/documents/services.py

128 lines
4.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from pathlib import Path
import re
import xml.etree.ElementTree as ET
from zipfile import BadZipFile, ZipFile
from agent_core.rag.ingest import ingest_document
from .models import UploadedDocument
def create_uploaded_document(scenario_id: str, uploaded_file) -> UploadedDocument:
    """
    Persist the metadata record for a freshly uploaded file.

    Only the file/scenario relationship, the original name, the file type
    and the size are stored here. Indexing is triggered explicitly by the
    user later on, so the upload step stays decoupled from the RAG pipeline.
    """
    original_name = uploaded_file.name
    record_fields = {
        "scenario_id": scenario_id,
        "original_name": original_name,
        "file": uploaded_file,
        "file_type": _detect_extension(original_name),
        "size": uploaded_file.size,
        # New records always start in the "uploaded" state.
        "status": UploadedDocument.STATUS_UPLOADED,
    }
    return UploadedDocument.objects.create(**record_fields)
def extract_text(document: UploadedDocument) -> str:
    """
    Pick a text-extraction strategy based on the document's file type.

    The V1 goal is "demo-able and stable", therefore:
    - `.pdf` is handed to the PDF extractor
    - `.docx` is handed to the Word XML extractor
    - every other type (e.g. `.txt` / `.md`) is read directly as text
    """
    source_path = Path(document.file.path)
    # The stored type may or may not carry a leading dot; normalize it away.
    kind = document.file_type.lower().lstrip(".")
    extractors = {
        "pdf": _extract_pdf_text,
        "docx": _extract_docx_text,
    }
    extractor = extractors.get(kind, _read_text_file)
    return extractor(source_path)
def index_document(document: UploadedDocument) -> UploadedDocument:
    """
    Index a single document and write the outcome back onto it.

    Extraction and ingestion errors are deliberately not raised to the view:
    the view only needs to know the final status, while the error text is
    stored on the model so the page can show it and offer a retry.

    Args:
        document: The uploaded document to extract text from and ingest.

    Returns:
        The same ``document`` instance after its ``status`` and
        ``error_message`` fields have been updated and saved.

    Note:
        Exceptions raised by ``document.save()`` itself are not caught here.
    """
    try:
        text = extract_text(document)
        ingest_result = ingest_document(
            document_id=document.id,
            scenario_id=document.scenario_id,
            source_file=document.original_name,
            text=text,
            collection=document.scenario_id,
        )
        _apply_ingest_result(document, ingest_result.success, ingest_result.error)
    except Exception as exc:
        # An exception raised without a message stringifies to "", which
        # would leave a FAILED document with no hint of what went wrong.
        # Fall back to the exception class name in that case.
        message = str(exc) or type(exc).__name__
        _apply_ingest_result(document, success=False, error=message)
    document.save(update_fields=["status", "error_message", "updated_at"])
    return document
def _apply_ingest_result(document: UploadedDocument, success: bool, error: str = "") -> None:
    """Translate an ingestion outcome into the document's status fields (in memory only, no save)."""
    if not success:
        document.status = UploadedDocument.STATUS_FAILED
        document.error_message = error
        return
    # Success clears any error text left on the instance.
    document.status = UploadedDocument.STATUS_INDEXED
    document.error_message = ""
def _detect_extension(file_name: str) -> str:
    """Return the file's extension lower-cased and without its leading dot, ready for model storage."""
    suffix = Path(file_name).suffix
    lowered = suffix.lower()
    return lowered.lstrip(".")
def _read_text_file(path: Path) -> str:
    """
    Read a text file, preferring UTF-8 and degrading gracefully otherwise.

    Args:
        path: Location of the file to read.

    Returns:
        The decoded file content. If the file is not valid UTF-8 it is
        decoded with the platform default encoding, and bytes that still
        cannot be decoded are substituted with U+FFFD.
    """
    try:
        return path.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        # When the platform default encoding is also UTF-8, a plain retry
        # would raise the same error again. errors="replace" keeps this a
        # best-effort read instead of failing on undecodable bytes.
        return path.read_text(errors="replace")
def _extract_pdf_text(path: Path) -> str:
    """Extract PDF text with pypdf; on any failure use the tolerant binary fallback."""
    try:
        # Imported lazily inside the try so a missing or broken pypdf
        # install also ends up in the fallback path.
        import pypdf

        page_texts = []
        for page in pypdf.PdfReader(str(path)).pages:
            # extract_text() may yield a falsy value for a page; treat as "".
            page_texts.append(page.extract_text() or "")
        return "\n".join(page_texts)
    except Exception:
        return _read_binary_text_fallback(path)
def _extract_docx_text(path: Path) -> str:
    """
    Extract the visible text of a Word document, ignoring all styling.

    Text is grouped per paragraph (``w:p``): the runs of one paragraph are
    concatenated and paragraphs are separated by a newline, so a sentence
    split across several formatting runs stays on one line. Tabs and line
    breaks inside runs become ``\\t`` / ``\\n``.

    Args:
        path: Location of the ``.docx`` file.

    Returns:
        The extracted text with whitespace-only paragraphs dropped. If the
        file is not a valid zip, has no ``word/document.xml``, or that part
        is not well-formed XML, the result of the binary text fallback is
        returned instead.
    """
    try:
        with ZipFile(path) as archive:
            document_xml = archive.read("word/document.xml")
        root = ET.fromstring(document_xml)
    except (BadZipFile, KeyError, ET.ParseError):
        return _read_binary_text_fallback(path)

    ns = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    paragraph_tag = f"{ns}p"
    run_tag = f"{ns}r"
    text_tag = f"{ns}t"
    # Whitespace-producing run children and the character each stands for.
    run_whitespace = {f"{ns}tab": "\t", f"{ns}br": "\n", f"{ns}cr": "\n"}

    # ElementTree has no parent pointers. The map is needed because `w:tab`
    # also appears in paragraph properties as a tab-stop definition, which
    # must not be emitted as a tab character.
    parent_of = {child: parent for parent in root.iter() for child in parent}

    # Leading bucket collects any text found before the first paragraph.
    paragraphs = [[]]
    for node in root.iter():
        tag = node.tag
        if tag == paragraph_tag:
            paragraphs.append([])
        elif tag == text_tag:
            if node.text:
                paragraphs[-1].append(node.text)
        elif tag in run_whitespace:
            parent = parent_of.get(node)
            if parent is not None and parent.tag == run_tag:
                paragraphs[-1].append(run_whitespace[tag])

    lines = ("".join(parts) for parts in paragraphs)
    return "\n".join(line for line in lines if line.strip())
def _read_binary_text_fallback(path: Path) -> str:
    """
    Last-resort extraction used when structured parsing fails.

    The raw bytes are decoded as UTF-8 with undecodable bytes dropped, each
    run of control characters (other than tab, line feed and carriage
    return) is collapsed into one space, and surrounding whitespace is
    trimmed. Layout is not preserved; this is only meant to be good enough
    for V1 ingestion and demos.
    """
    decoded = path.read_bytes().decode("utf-8", errors="ignore")
    control_runs = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]+")
    return control_runs.sub(" ", decoded).strip()