refactor(rag): 梳理文档入库与检索服务结构
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
from pathlib import Path
|
||||
from zipfile import BadZipFile, ZipFile
|
||||
import re
|
||||
import xml.etree.ElementTree as ET
|
||||
from zipfile import BadZipFile, ZipFile
|
||||
|
||||
from agent_core.rag.ingest import ingest_document
|
||||
|
||||
@@ -9,7 +9,13 @@ from .models import UploadedDocument
|
||||
|
||||
|
||||
def create_uploaded_document(scenario_id: str, uploaded_file) -> UploadedDocument:
|
||||
extension = Path(uploaded_file.name).suffix.lower().lstrip(".")
|
||||
"""
|
||||
保存上传文件的元数据记录。
|
||||
|
||||
Documents 模块只记录文件与场景关系、原始名称、类型和大小,
|
||||
真正的入库动作由用户后续主动触发,避免上传阶段就耦合 RAG 流程。
|
||||
"""
|
||||
extension = _detect_extension(uploaded_file.name)
|
||||
return UploadedDocument.objects.create(
|
||||
scenario_id=scenario_id,
|
||||
original_name=uploaded_file.name,
|
||||
@@ -21,6 +27,14 @@ def create_uploaded_document(scenario_id: str, uploaded_file) -> UploadedDocumen
|
||||
|
||||
|
||||
def extract_text(document: UploadedDocument) -> str:
|
||||
"""
|
||||
根据文档类型选择合适的文本抽取策略。
|
||||
|
||||
V1 的目标是“可演示且稳定”,因此:
|
||||
- `.txt` / `.md` 直接按文本读取
|
||||
- `.pdf` 优先走 pypdf,失败时回退为二进制容错读取
|
||||
- `.docx` 优先解析 Word XML,失败时回退为二进制容错读取
|
||||
"""
|
||||
path = Path(document.file.path)
|
||||
extension = f".{document.file_type.lower().lstrip('.')}"
|
||||
if extension == ".pdf":
|
||||
@@ -30,7 +44,47 @@ def extract_text(document: UploadedDocument) -> str:
|
||||
return _read_text_file(path)
|
||||
|
||||
|
||||
def index_document(document: UploadedDocument) -> UploadedDocument:
    """
    Ingest a single uploaded document and write the outcome back to it.

    The text is obtained from ``extract_text`` and passed to
    ``ingest_document``, using the document's ``scenario_id`` both as the
    scenario and as the collection name.

    Business errors are deliberately not raised to the view layer: the view
    only needs to know the final status, and the error text is stored on the
    model so the page can display it and offer a retry.

    Args:
        document: The uploaded document to index.

    Returns:
        The same ``document`` instance, after ``status`` and
        ``error_message`` have been updated and saved.
    """
    try:
        text = extract_text(document)
        ingest_result = ingest_document(
            document_id=document.id,
            scenario_id=document.scenario_id,
            source_file=document.original_name,
            text=text,
            collection=document.scenario_id,
        )
        _apply_ingest_result(document, ingest_result.success, ingest_result.error)
    except Exception as exc:
        # Deliberate catch-all: any failure becomes a FAILED status instead of
        # an error response. Exceptions raised without a message stringify to
        # "", which would leave a failed document with no explanation, so fall
        # back to the exception class name in that case.
        _apply_ingest_result(
            document,
            success=False,
            error=str(exc) or type(exc).__name__,
        )
    # Persist only the columns named here.
    # NOTE(review): "updated_at" is presumably an auto-updated timestamp on
    # the model -- confirm against UploadedDocument.
    document.save(update_fields=["status", "error_message", "updated_at"])
    return document
|
||||
|
||||
|
||||
def _apply_ingest_result(document: UploadedDocument, success: bool, error: str = "") -> None:
    """
    Map an ingest outcome onto the document's status fields.

    A successful ingest sets ``status`` to ``STATUS_INDEXED`` and clears
    ``error_message``; anything else sets ``STATUS_FAILED`` and stores
    ``error``. The document is only mutated in memory here; saving is left
    to the caller.
    """
    if not success:
        document.status = UploadedDocument.STATUS_FAILED
        document.error_message = error
    else:
        document.status = UploadedDocument.STATUS_INDEXED
        document.error_message = ""
|
||||
|
||||
|
||||
def _detect_extension(file_name: str) -> str:
    """
    Return the extension of ``file_name`` in lower case without the leading dot.

    Only the final suffix is considered, and a name without a suffix yields
    an empty string. The dot-less form is what gets stored on the model.
    """
    suffix = Path(file_name).suffix
    lowered = suffix.lower()
    return lowered.lstrip(".")
|
||||
|
||||
|
||||
def _read_text_file(path: Path) -> str:
|
||||
"""优先按 UTF-8 读取;失败时回退到系统默认编码。"""
|
||||
try:
|
||||
return path.read_text(encoding="utf-8")
|
||||
except UnicodeDecodeError:
|
||||
@@ -38,6 +92,7 @@ def _read_text_file(path: Path) -> str:
|
||||
|
||||
|
||||
def _extract_pdf_text(path: Path) -> str:
|
||||
"""优先使用 pypdf 抽取 PDF 文本,失败时回退到容错方案。"""
|
||||
try:
|
||||
import pypdf
|
||||
|
||||
@@ -48,6 +103,7 @@ def _extract_pdf_text(path: Path) -> str:
|
||||
|
||||
|
||||
def _extract_docx_text(path: Path) -> str:
|
||||
"""提取 Word XML 中的可见文字内容,不追求保留样式。"""
|
||||
try:
|
||||
with ZipFile(path) as archive:
|
||||
document_xml = archive.read("word/document.xml")
|
||||
@@ -60,30 +116,12 @@ def _extract_docx_text(path: Path) -> str:
|
||||
|
||||
|
||||
def _read_binary_text_fallback(path: Path) -> str:
    """
    Last-resort extraction that keeps as much plain text as possible.

    Used when structured extraction fails. Layout is not preserved, but the
    result is sufficient for V1 ingestion and demos.

    The file is read as raw bytes and decoded as UTF-8 with undecodable
    bytes dropped. Each run of control characters (other than tab, line feed
    and carriage return) is replaced by a single space, and surrounding
    whitespace is trimmed.
    """
    decoded = path.read_bytes().decode("utf-8", errors="ignore")
    # The character class skips \x09 (tab), \x0a (LF) and \x0d (CR) on purpose.
    cleaned = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]+", " ", decoded)
    return cleaned.strip()
|
||||
|
||||
|
||||
def index_document(document: UploadedDocument) -> UploadedDocument:
    """
    Ingest one uploaded document and store the resulting status on it.

    The extracted text is passed to ``ingest_document`` with the document's
    ``scenario_id`` used as both scenario and collection. Any exception is
    caught and recorded as a failure rather than propagated, and the status
    fields are saved before the document is returned.
    """
    try:
        extracted = extract_text(document)
        outcome = ingest_document(
            document_id=document.id,
            scenario_id=document.scenario_id,
            source_file=document.original_name,
            text=extracted,
            collection=document.scenario_id,
        )
        if not outcome.success:
            document.status = UploadedDocument.STATUS_FAILED
            document.error_message = outcome.error
        else:
            document.status = UploadedDocument.STATUS_INDEXED
            document.error_message = ""
    except Exception as exc:
        # Catch-all: every failure ends up as a FAILED status with the
        # exception text stored on the model.
        document.status = UploadedDocument.STATUS_FAILED
        document.error_message = str(exc)
    document.save(update_fields=["status", "error_message", "updated_at"])
    return document
|
||||
|
||||
Reference in New Issue
Block a user