feat(documents): 支持文档上传与本地RAG入库

This commit is contained in:
2026-05-30 00:10:05 +08:00
parent 7a6c110103
commit 4a831ee2c5
13 changed files with 403 additions and 0 deletions

View File

@@ -0,0 +1 @@

10
apps/documents/admin.py Normal file
View File

@@ -0,0 +1,10 @@
from django.contrib import admin
from .models import UploadedDocument
@admin.register(UploadedDocument)
class UploadedDocumentAdmin(admin.ModelAdmin):
    """Admin changelist configuration for ``UploadedDocument`` rows."""

    # Columns rendered in the changelist table, left to right.
    list_display = (
        "id",
        "original_name",
        "scenario_id",
        "file_type",
        "status",
        "created_at",
    )

    # Fields offered as changelist filters.
    list_filter = (
        "status",
        "scenario_id",
        "file_type",
    )

    # Fields matched by the changelist search box.
    search_fields = (
        "original_name",
        "scenario_id",
    )

6
apps/documents/apps.py Normal file
View File

@@ -0,0 +1,6 @@
from django.apps import AppConfig
class DocumentsConfig(AppConfig):
    """Django application configuration for the ``apps.documents`` package."""

    # Dotted path of the application package.
    name = "apps.documents"

    # Primary-key field type used for models that do not declare one.
    default_auto_field = "django.db.models.BigAutoField"

27
apps/documents/forms.py Normal file
View File

@@ -0,0 +1,27 @@
from pathlib import Path
from django import forms
from apps.scenarios.services import ScenarioNotFound, get_scenario
# Lower-cased file suffixes (leading dot included) accepted by the upload form.
SUPPORTED_EXTENSIONS = {".txt", ".md", ".pdf", ".docx"}


class DocumentUploadForm(forms.Form):
    """Upload form pairing a file with the scenario it is uploaded for."""

    scenario_id = forms.CharField(label="场景")
    file = forms.FileField(label="文件")

    def clean_scenario_id(self):
        """Return the submitted scenario id once ``get_scenario`` accepts it.

        Raises:
            forms.ValidationError: when ``get_scenario`` raises
                ``ScenarioNotFound`` for the submitted id.
        """
        candidate = self.cleaned_data["scenario_id"]
        try:
            # Called only for its side effect of raising on an unknown id.
            get_scenario(candidate)
        except ScenarioNotFound as exc:
            raise forms.ValidationError("场景不存在") from exc
        return candidate

    def clean_file(self):
        """Return the uploaded file if its suffix is in ``SUPPORTED_EXTENSIONS``.

        The comparison is case-insensitive and looks only at the file name,
        not at the file's content.

        Raises:
            forms.ValidationError: when the suffix is not supported.
        """
        upload = self.cleaned_data["file"]
        suffix = Path(upload.name).suffix.lower()
        if suffix in SUPPORTED_EXTENSIONS:
            return upload
        raise forms.ValidationError("仅支持 .txt、.md、.pdf 和 .docx 文件")

View File

@@ -0,0 +1,42 @@
# Generated by Django 5.2.14 on 2026-05-29 13:41
from django.db import migrations, models
class Migration(migrations.Migration):
    """Initial migration: creates the ``UploadedDocument`` model."""

    # Marks this as the first migration of the app.
    initial = True

    # No other migrations have to run before this one.
    dependencies = []

    operations = [
        migrations.CreateModel(
            name="UploadedDocument",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                # Indexed so documents can be looked up by scenario.
                ("scenario_id", models.CharField(db_index=True, max_length=100)),
                ("original_name", models.CharField(max_length=255)),
                # Stored files are grouped into one directory per upload date.
                ("file", models.FileField(upload_to="documents/%Y%m%d/")),
                ("file_type", models.CharField(max_length=20)),
                ("size", models.PositiveIntegerField(default=0)),
                (
                    "status",
                    # New rows start in the "uploaded" state.
                    models.CharField(db_index=True, default="uploaded", max_length=20),
                ),
                ("error_message", models.TextField(blank=True)),
                ("created_at", models.DateTimeField(auto_now_add=True)),
                ("updated_at", models.DateTimeField(auto_now=True)),
            ],
            options={
                # Default queryset order: newest first.
                "ordering": ["-created_at"],
            },
        ),
    ]

View File

23
apps/documents/models.py Normal file
View File

@@ -0,0 +1,23 @@
from django.db import models
class UploadedDocument(models.Model):
    """An uploaded file together with the scenario it belongs to and its status."""

    # Values stored in ``status``.
    STATUS_UPLOADED = "uploaded"
    STATUS_INDEXED = "indexed"
    STATUS_FAILED = "failed"

    # Identifier of the owning scenario; indexed for per-scenario lookups.
    scenario_id = models.CharField(db_index=True, max_length=100)
    # File name as supplied by the uploader.
    original_name = models.CharField(max_length=255)
    # Stored file; uploads are grouped into one directory per date.
    file = models.FileField(upload_to="documents/%Y%m%d/")
    # Short type tag for the file (presumably the suffix -- confirm with callers).
    file_type = models.CharField(max_length=20)
    # Size of the upload (presumably in bytes -- confirm with callers).
    size = models.PositiveIntegerField(default=0)
    # One of the STATUS_* values above; new rows start as "uploaded".
    status = models.CharField(
        db_index=True,
        default=STATUS_UPLOADED,
        max_length=20,
    )
    # Free-text failure description; may be left blank.
    error_message = models.TextField(blank=True)
    # Set once on insert / refreshed on every save.
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    class Meta:
        # Default queryset order: newest first.
        ordering = ["-created_at"]

    def __str__(self) -> str:
        """Return the original file name as the display label."""
        return self.original_name

View File

@@ -0,0 +1,89 @@
from pathlib import Path
from zipfile import BadZipFile, ZipFile
import re
import xml.etree.ElementTree as ET
from agent_core.rag.ingest import ingest_document
from .models import UploadedDocument
def create_uploaded_document(scenario_id: str, uploaded_file) -> UploadedDocument:
    """Store ``uploaded_file`` as a new ``UploadedDocument`` row and return it.

    Args:
        scenario_id: Identifier of the scenario the file is uploaded for.
        uploaded_file: Upload object exposing ``name`` and ``size``.

    Returns:
        The created ``UploadedDocument`` with status ``STATUS_UPLOADED``.
    """
    suffix = Path(uploaded_file.name).suffix.lower()
    record_fields = {
        "scenario_id": scenario_id,
        "original_name": uploaded_file.name,
        "file": uploaded_file,
        # Stored without the leading dot, e.g. "pdf" rather than ".pdf".
        "file_type": suffix.lstrip("."),
        "size": uploaded_file.size,
        "status": UploadedDocument.STATUS_UPLOADED,
    }
    return UploadedDocument.objects.create(**record_fields)
def extract_text(document: UploadedDocument) -> str:
    """Return the text content of ``document``'s stored file.

    The extractor is chosen from ``document.file_type`` (case-insensitive,
    leading dot optional): ``pdf`` and ``docx`` have dedicated extractors,
    everything else is read as a plain-text file.
    """
    source = Path(document.file.path)
    kind = document.file_type.lower().lstrip(".")
    extractors = {
        "pdf": _extract_pdf_text,
        "docx": _extract_docx_text,
    }
    extractor = extractors.get(kind, _read_text_file)
    return extractor(source)
def _read_text_file(path: Path) -> str:
try:
return path.read_text(encoding="utf-8")
except UnicodeDecodeError:
return path.read_text()
def _extract_pdf_text(path: Path) -> str:
    """Extract text from a PDF with pypdf, one page per line group.

    Extraction is best-effort: if pypdf cannot be imported or fails on the
    file, the raw bytes are decoded via ``_read_binary_text_fallback``
    instead. The failure is logged so that this degraded path is visible
    rather than silent.

    Args:
        path: Location of the PDF file.

    Returns:
        The page texts joined with newlines, or the fallback decoding.
    """
    import logging

    try:
        # Imported inside the try so a missing pypdf also takes the fallback.
        import pypdf

        reader = pypdf.PdfReader(str(path))
        # extract_text() may yield nothing for a page; treat that as empty.
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    except Exception:
        logging.getLogger(__name__).warning(
            "pypdf extraction failed for %s; falling back to raw byte decoding",
            path,
            exc_info=True,
        )
        return _read_binary_text_fallback(path)
def _extract_docx_text(path: Path) -> str:
try:
with ZipFile(path) as archive:
document_xml = archive.read("word/document.xml")
root = ET.fromstring(document_xml)
namespace = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
texts = [node.text for node in root.findall(".//w:t", namespace) if node.text]
return "\n".join(texts)
except (BadZipFile, KeyError, ET.ParseError):
return _read_binary_text_fallback(path)
def _read_binary_text_fallback(path: Path) -> str:
data = path.read_bytes()
text = data.decode("utf-8", errors="ignore")
text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]+", " ", text)
return text.strip()
def index_document(document: UploadedDocument) -> UploadedDocument:
    """Extract ``document``'s text, ingest it, and record the outcome.

    The document's ``status`` and ``error_message`` are updated and saved
    whether indexing succeeds or fails; failures are not re-raised.

    Args:
        document: The uploaded document to index.

    Returns:
        The same ``document`` instance with its updated status persisted.
    """
    import logging

    try:
        text = extract_text(document)
        result = ingest_document(
            document_id=document.id,
            scenario_id=document.scenario_id,
            source_file=document.original_name,
            text=text,
            # The scenario id doubles as the collection name.
            collection=document.scenario_id,
        )
        if result.success:
            document.status = UploadedDocument.STATUS_INDEXED
            document.error_message = ""
        else:
            document.status = UploadedDocument.STATUS_FAILED
            # The result type is not defined in this module; guard against a
            # missing error so the save below never receives None.
            document.error_message = str(result.error or "")
    except Exception as exc:
        # Keep the traceback in the logs; only the message goes to the row.
        logging.getLogger(__name__).exception(
            "Indexing document %s failed", document.pk
        )
        document.status = UploadedDocument.STATUS_FAILED
        # Some exceptions stringify to "", which would leave a failed document
        # with no explanation; fall back to the exception class name.
        document.error_message = str(exc) or type(exc).__name__
    # Saved outside the try so the outcome is persisted on every path.
    document.save(update_fields=["status", "error_message", "updated_at"])
    return document

12
apps/documents/urls.py Normal file
View File

@@ -0,0 +1,12 @@
from django.urls import path
from . import views
# URL namespace for this app: routes are reversed as "documents:<name>".
app_name = "documents"

urlpatterns = [
    # Document list page.
    path("", views.document_list, name="list"),
    # Upload form (display and submission).
    path("upload/", views.upload, name="upload"),
    # Index the document with the given integer id.
    path("<int:document_id>/index/", views.index, name="index"),
]

35
apps/documents/views.py Normal file
View File

@@ -0,0 +1,35 @@
from django.shortcuts import get_object_or_404, redirect, render
from django.views.decorators.http import require_POST
from apps.scenarios.services import list_scenarios
from .forms import DocumentUploadForm
from .models import UploadedDocument
from .services import create_uploaded_document, index_document
def document_list(request):
    """Render the list page with all uploaded documents."""
    context = {"documents": UploadedDocument.objects.all()}
    return render(request, "documents/document_list.html", context)
def upload(request):
    """Show the upload form and, on a valid POST, store the uploaded file.

    A valid submission creates the document record and redirects to the list
    page. Any other request, including an invalid submission, renders the
    upload page with the form and the available scenarios.
    """
    if request.method != "POST":
        form = DocumentUploadForm()
    else:
        form = DocumentUploadForm(request.POST, request.FILES)
        if form.is_valid():
            cleaned = form.cleaned_data
            create_uploaded_document(cleaned["scenario_id"], cleaned["file"])
            return redirect("documents:list")

    context = {"form": form, "scenarios": list_scenarios()}
    return render(request, "documents/upload.html", context)
@require_POST
def index(request, document_id: int):
    """Index the document with ``document_id`` and return to the list page.

    Accepts POST only; responds with 404 when no such document exists.
    """
    index_document(get_object_or_404(UploadedDocument, pk=document_id))
    return redirect("documents:list")