From 4a831ee2c5bccbca34fce8aadc3224be975fc289 Mon Sep 17 00:00:00 2001 From: bruce Date: Sat, 30 May 2026 00:10:05 +0800 Subject: [PATCH] =?UTF-8?q?feat(documents):=20=E6=94=AF=E6=8C=81=E6=96=87?= =?UTF-8?q?=E6=A1=A3=E4=B8=8A=E4=BC=A0=E4=B8=8E=E6=9C=AC=E5=9C=B0RAG?= =?UTF-8?q?=E5=85=A5=E5=BA=93?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/documents/__init__.py | 1 + apps/documents/admin.py | 10 +++ apps/documents/apps.py | 6 ++ apps/documents/forms.py | 27 +++++++ apps/documents/migrations/0001_initial.py | 42 +++++++++++ apps/documents/migrations/__init__.py | 0 apps/documents/models.py | 23 ++++++ apps/documents/services.py | 89 +++++++++++++++++++++++ apps/documents/urls.py | 12 +++ apps/documents/views.py | 35 +++++++++ templates/documents/document_list.html | 50 +++++++++++++ templates/documents/upload.html | 26 +++++++ tests/test_documents.py | 82 +++++++++++++++++++++ 13 files changed, 403 insertions(+) create mode 100644 apps/documents/__init__.py create mode 100644 apps/documents/admin.py create mode 100644 apps/documents/apps.py create mode 100644 apps/documents/forms.py create mode 100644 apps/documents/migrations/0001_initial.py create mode 100644 apps/documents/migrations/__init__.py create mode 100644 apps/documents/models.py create mode 100644 apps/documents/services.py create mode 100644 apps/documents/urls.py create mode 100644 apps/documents/views.py create mode 100644 templates/documents/document_list.html create mode 100644 templates/documents/upload.html create mode 100644 tests/test_documents.py diff --git a/apps/documents/__init__.py b/apps/documents/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/apps/documents/__init__.py @@ -0,0 +1 @@ + diff --git a/apps/documents/admin.py b/apps/documents/admin.py new file mode 100644 index 0000000..d669f40 --- /dev/null +++ b/apps/documents/admin.py @@ -0,0 +1,10 @@ +from django.contrib import admin + +from .models import UploadedDocument + + +@admin.register(UploadedDocument) +class UploadedDocumentAdmin(admin.ModelAdmin): + list_display = ("id", "original_name", "scenario_id", "file_type", "status", "created_at") + list_filter = ("status", "scenario_id", "file_type") + search_fields = ("original_name", "scenario_id") diff --git a/apps/documents/apps.py b/apps/documents/apps.py new file mode 100644 index 0000000..243fc5d --- /dev/null +++ b/apps/documents/apps.py @@ -0,0 +1,6 @@ +from django.apps import AppConfig + + +class DocumentsConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "apps.documents" diff --git a/apps/documents/forms.py b/apps/documents/forms.py new file mode 100644 index 0000000..a0e90c6 --- /dev/null +++ b/apps/documents/forms.py @@ -0,0 +1,27 @@ +from pathlib import Path + +from django import forms + +from apps.scenarios.services import ScenarioNotFound, get_scenario + +SUPPORTED_EXTENSIONS = {".txt", ".md", ".pdf", ".docx"} + + +class DocumentUploadForm(forms.Form): + scenario_id = forms.CharField(label="场景") + file = forms.FileField(label="文件") + + def clean_scenario_id(self): + scenario_id = self.cleaned_data["scenario_id"] + try: + get_scenario(scenario_id) + except ScenarioNotFound as exc: + raise forms.ValidationError("场景不存在") from exc + return scenario_id + + def clean_file(self): + uploaded_file = self.cleaned_data["file"] + extension = Path(uploaded_file.name).suffix.lower() + if extension not in SUPPORTED_EXTENSIONS: + raise forms.ValidationError("仅支持 .txt、.md、.pdf 和 .docx 文件") + return uploaded_file diff --git a/apps/documents/migrations/0001_initial.py b/apps/documents/migrations/0001_initial.py new file mode 100644 index 0000000..c738384 --- /dev/null +++ b/apps/documents/migrations/0001_initial.py @@ -0,0 +1,42 @@ +# Generated by Django 5.2.14 on 2026-05-29 13:41 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [] + + operations = [ + migrations.CreateModel( + name="UploadedDocument", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("scenario_id", models.CharField(db_index=True, max_length=100)), + ("original_name", models.CharField(max_length=255)), + ("file", models.FileField(upload_to="documents/%Y%m%d/")), + ("file_type", models.CharField(max_length=20)), + ("size", models.PositiveIntegerField(default=0)), + ( + "status", + models.CharField(db_index=True, default="uploaded", max_length=20), + ), + ("error_message", models.TextField(blank=True)), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ], + options={ + "ordering": ["-created_at"], + }, + ), + ] diff --git a/apps/documents/migrations/__init__.py b/apps/documents/migrations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/apps/documents/models.py b/apps/documents/models.py new file mode 100644 index 0000000..fd2500e --- /dev/null +++ b/apps/documents/models.py @@ -0,0 +1,23 @@ +from django.db import models + + +class UploadedDocument(models.Model): + STATUS_UPLOADED = "uploaded" + STATUS_INDEXED = "indexed" + STATUS_FAILED = "failed" + + scenario_id = models.CharField(max_length=100, db_index=True) + original_name = models.CharField(max_length=255) + file = models.FileField(upload_to="documents/%Y%m%d/") + file_type = models.CharField(max_length=20) + size = models.PositiveIntegerField(default=0) + status = models.CharField(max_length=20, default=STATUS_UPLOADED, db_index=True) + error_message = models.TextField(blank=True) + created_at = models.DateTimeField(auto_now_add=True) + updated_at = models.DateTimeField(auto_now=True) + + class Meta: + ordering = ["-created_at"] + + def __str__(self) -> str: + return self.original_name diff --git a/apps/documents/services.py b/apps/documents/services.py new file mode 100644 index 0000000..68976d8 --- /dev/null +++ b/apps/documents/services.py @@ -0,0 +1,89 @@ +from pathlib import Path +from zipfile import BadZipFile, ZipFile +import re +import xml.etree.ElementTree as ET + +from agent_core.rag.ingest import ingest_document + +from .models import UploadedDocument + + +def create_uploaded_document(scenario_id: str, uploaded_file) -> UploadedDocument: + extension = Path(uploaded_file.name).suffix.lower().lstrip(".") + return UploadedDocument.objects.create( + scenario_id=scenario_id, + original_name=uploaded_file.name, + file=uploaded_file, + file_type=extension, + size=uploaded_file.size, + status=UploadedDocument.STATUS_UPLOADED, + ) + + +def extract_text(document: UploadedDocument) -> str: + path = Path(document.file.path) + extension = f".{document.file_type.lower().lstrip('.')}" + if extension == ".pdf": + return _extract_pdf_text(path) + if extension == ".docx": + return _extract_docx_text(path) + return _read_text_file(path) + + +def _read_text_file(path: Path) -> str: + try: + return path.read_text(encoding="utf-8") + except UnicodeDecodeError: + return path.read_text() + + +def _extract_pdf_text(path: Path) -> str: + try: + import pypdf + + reader = pypdf.PdfReader(str(path)) + return "\n".join(page.extract_text() or "" for page in reader.pages) + except Exception: + return _read_binary_text_fallback(path) + + +def _extract_docx_text(path: Path) -> str: + try: + with ZipFile(path) as archive: + document_xml = archive.read("word/document.xml") + root = ET.fromstring(document_xml) + namespace = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"} + texts = [node.text for node in root.findall(".//w:t", namespace) if node.text] + return "\n".join(texts) + except (BadZipFile, KeyError, ET.ParseError): + return _read_binary_text_fallback(path) + + +def _read_binary_text_fallback(path: Path) -> str: + data = path.read_bytes() + text = data.decode("utf-8", errors="ignore") + text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]+", " ", text) + return text.strip() + + +def index_document(document: UploadedDocument) -> UploadedDocument: + try: + text = extract_text(document) + result = ingest_document( + document_id=document.id, + scenario_id=document.scenario_id, + source_file=document.original_name, + text=text, + collection=document.scenario_id, + ) + if result.success: + document.status = UploadedDocument.STATUS_INDEXED + document.error_message = "" + else: + document.status = UploadedDocument.STATUS_FAILED + document.error_message = result.error + except Exception as exc: + document.status = UploadedDocument.STATUS_FAILED + document.error_message = str(exc) + document.save(update_fields=["status", "error_message", "updated_at"]) + return document diff --git a/apps/documents/urls.py b/apps/documents/urls.py new file mode 100644 index 0000000..bb69ace --- /dev/null +++ b/apps/documents/urls.py @@ -0,0 +1,12 @@ +from django.urls import path + +from . import views + + +app_name = "documents" + +urlpatterns = [ + path("", views.document_list, name="list"), + path("upload/", views.upload, name="upload"), + path("/index/", views.index, name="index"), +] diff --git a/apps/documents/views.py b/apps/documents/views.py new file mode 100644 index 0000000..56fda0a --- /dev/null +++ b/apps/documents/views.py @@ -0,0 +1,35 @@ +from django.shortcuts import get_object_or_404, redirect, render +from django.views.decorators.http import require_POST + +from apps.scenarios.services import list_scenarios + +from .forms import DocumentUploadForm +from .models import UploadedDocument +from .services import create_uploaded_document, index_document + + +def document_list(request): + documents = UploadedDocument.objects.all() + return render(request, "documents/document_list.html", {"documents": documents}) + + +def upload(request): + if request.method == "POST": + form = DocumentUploadForm(request.POST, request.FILES) + if form.is_valid(): + create_uploaded_document(form.cleaned_data["scenario_id"], form.cleaned_data["file"]) + return redirect("documents:list") + else: + form = DocumentUploadForm() + return render( + request, + "documents/upload.html", + {"form": form, "scenarios": list_scenarios()}, + ) + + +@require_POST +def index(request, document_id: int): + document = get_object_or_404(UploadedDocument, pk=document_id) + index_document(document) + return redirect("documents:list") diff --git a/templates/documents/document_list.html b/templates/documents/document_list.html new file mode 100644 index 0000000..9903645 --- /dev/null +++ b/templates/documents/document_list.html @@ -0,0 +1,50 @@ + + + + + 文件列表 + + +

文件列表

+ + + + + + + + + + + + + + {% for document in documents %} + + + + + + + + + {% empty %} + + {% endfor %} + +
文件名场景类型大小状态操作
{{ document.original_name }}{{ document.scenario_id }}{{ document.file_type }}{{ document.size }}{{ document.status }} + {% if document.status != "indexed" %} +
+ {% csrf_token %} + +
+ {% endif %} + {% if document.error_message %} +
{{ document.error_message }}
+ {% endif %} +
暂无文件。
+ + diff --git a/templates/documents/upload.html b/templates/documents/upload.html new file mode 100644 index 0000000..6285233 --- /dev/null +++ b/templates/documents/upload.html @@ -0,0 +1,26 @@ + + + + + 上传文件 + + +

上传文件

+ +
+ {% csrf_token %} +

+ + +

+

{{ form.file.label_tag }} {{ form.file }}

+ {{ form.errors }} +

支持 .txt、.md、.pdf 和 .docx 文件。

+ +
+ + diff --git a/tests/test_documents.py b/tests/test_documents.py new file mode 100644 index 0000000..7429540 --- /dev/null +++ b/tests/test_documents.py @@ -0,0 +1,82 @@ +from django.core.files.uploadedfile import SimpleUploadedFile +from django.urls import reverse + +from apps.documents.models import UploadedDocument +from apps.documents.services import extract_text + + +def test_upload_txt_document_creates_uploaded_record(client, db): + file = SimpleUploadedFile("rules.txt", "hello".encode("utf-8"), content_type="text/plain") + + response = client.post( + reverse("documents:upload"), + {"scenario_id": "knowledge_qa", "file": file}, + ) + + assert response.status_code == 302 + document = UploadedDocument.objects.get() + assert document.status == "uploaded" + assert document.file_type == "txt" + assert document.scenario_id == "knowledge_qa" + + +def test_upload_accepts_pdf_and_docx_documents(client, db): + for filename, payload in [ + ("policy.pdf", b"%PDF-1.4\nplain policy text"), + ("contract.docx", b"fake-docx-body"), + ]: + file = SimpleUploadedFile(filename, payload) + + response = client.post( + reverse("documents:upload"), + {"scenario_id": "knowledge_qa", "file": file}, + ) + + assert response.status_code == 302 + + assert set(UploadedDocument.objects.values_list("file_type", flat=True)) == {"pdf", "docx"} + + +def test_index_document_updates_status_to_indexed(client, db): + document = UploadedDocument.objects.create( + scenario_id="knowledge_qa", + original_name="rules.md", + file="knowledge_qa/rules.md", + file_type="md", + size=5, + status="uploaded", + ) + document.file.save("rules.md", SimpleUploadedFile("rules.md", b"# rule").file) + + response = client.post(reverse("documents:index", args=[document.id])) + + assert response.status_code == 302 + document.refresh_from_db() + assert document.status == "indexed" + assert document.error_message == "" + + +def test_extract_text_supports_pdf_and_docx_plain_text_fallback(db): + pdf_document = UploadedDocument.objects.create( + scenario_id="knowledge_qa", + original_name="policy.pdf", + file_type="pdf", + size=10, + status="uploaded", + ) + pdf_document.file.save("policy.pdf", SimpleUploadedFile("policy.pdf", b"%PDF-1.4\nSafety policy")) + + docx_document = UploadedDocument.objects.create( + scenario_id="knowledge_qa", + original_name="contract.docx", + file_type="docx", + size=10, + status="uploaded", + ) + docx_document.file.save( + "contract.docx", + SimpleUploadedFile("contract.docx", b"Contract clause review"), + ) + + assert "Safety policy" in extract_text(pdf_document) + assert "Contract clause review" in extract_text(docx_document)