feat(documents): 支持文档上传与本地RAG入库
This commit is contained in:
1
apps/documents/__init__.py
Normal file
1
apps/documents/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
|
||||||
10
apps/documents/admin.py
Normal file
10
apps/documents/admin.py
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
from django.contrib import admin
|
||||||
|
|
||||||
|
from .models import UploadedDocument
|
||||||
|
|
||||||
|
|
||||||
|
@admin.register(UploadedDocument)
class UploadedDocumentAdmin(admin.ModelAdmin):
    """Admin configuration for ``UploadedDocument`` records."""

    # Columns shown on the changelist page.
    list_display = (
        "id",
        "original_name",
        "scenario_id",
        "file_type",
        "status",
        "created_at",
    )
    # Fields offered as changelist filters.
    list_filter = (
        "status",
        "scenario_id",
        "file_type",
    )
    # Fields matched by the admin search box.
    search_fields = (
        "original_name",
        "scenario_id",
    )
|
||||||
6
apps/documents/apps.py
Normal file
6
apps/documents/apps.py
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
from django.apps import AppConfig
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentsConfig(AppConfig):
    """Application configuration for the ``apps.documents`` Django app."""

    # Dotted path under which the app is importable.
    name = "apps.documents"
    # Primary-key field type for models that do not declare one explicitly.
    default_auto_field = "django.db.models.BigAutoField"
|
||||||
27
apps/documents/forms.py
Normal file
27
apps/documents/forms.py
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from django import forms
|
||||||
|
|
||||||
|
from apps.scenarios.services import ScenarioNotFound, get_scenario
|
||||||
|
|
||||||
|
SUPPORTED_EXTENSIONS = {".txt", ".md", ".pdf", ".docx"}
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentUploadForm(forms.Form):
    """Validate an upload request: an existing scenario plus a supported file."""

    scenario_id = forms.CharField(label="场景")
    file = forms.FileField(label="文件")

    def clean_scenario_id(self):
        """Return the submitted scenario id once ``get_scenario`` accepts it.

        Raises:
            forms.ValidationError: ``get_scenario`` raised ``ScenarioNotFound``.
        """
        candidate = self.cleaned_data["scenario_id"]
        try:
            get_scenario(candidate)
        except ScenarioNotFound as exc:
            raise forms.ValidationError("场景不存在") from exc
        else:
            return candidate

    def clean_file(self):
        """Return the uploaded file if its extension is in ``SUPPORTED_EXTENSIONS``.

        Only the (case-insensitive) file-name suffix is checked.

        Raises:
            forms.ValidationError: The suffix is not a supported extension.
        """
        upload = self.cleaned_data["file"]
        suffix = Path(upload.name).suffix.lower()
        if suffix in SUPPORTED_EXTENSIONS:
            return upload
        raise forms.ValidationError("仅支持 .txt、.md、.pdf 和 .docx 文件")
|
||||||
42
apps/documents/migrations/0001_initial.py
Normal file
42
apps/documents/migrations/0001_initial.py
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
# Generated by Django 5.2.14 on 2026-05-29 13:41
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    """Initial migration: creates the ``UploadedDocument`` model."""

    initial = True

    dependencies = []

    operations = [
        migrations.CreateModel(
            name="UploadedDocument",
            fields=[
                (
                    "id",
                    models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID"),
                ),
                ("scenario_id", models.CharField(db_index=True, max_length=100)),
                ("original_name", models.CharField(max_length=255)),
                ("file", models.FileField(upload_to="documents/%Y%m%d/")),
                ("file_type", models.CharField(max_length=20)),
                ("size", models.PositiveIntegerField(default=0)),
                ("status", models.CharField(db_index=True, default="uploaded", max_length=20)),
                ("error_message", models.TextField(blank=True)),
                ("created_at", models.DateTimeField(auto_now_add=True)),
                ("updated_at", models.DateTimeField(auto_now=True)),
            ],
            # Newest documents first by default.
            options={"ordering": ["-created_at"]},
        ),
    ]
|
||||||
0
apps/documents/migrations/__init__.py
Normal file
0
apps/documents/migrations/__init__.py
Normal file
23
apps/documents/models.py
Normal file
23
apps/documents/models.py
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
from django.db import models
|
||||||
|
|
||||||
|
|
||||||
|
class UploadedDocument(models.Model):
    """An uploaded file tied to a scenario, together with its indexing state."""

    # Values stored in ``status``.
    STATUS_UPLOADED = "uploaded"
    STATUS_INDEXED = "indexed"
    STATUS_FAILED = "failed"

    # Identifier of the scenario the document belongs to.
    scenario_id = models.CharField(max_length=100, db_index=True)
    # File name as supplied by the uploader.
    original_name = models.CharField(max_length=255)
    # Stored file; uploads are placed in a per-day directory.
    file = models.FileField(upload_to="documents/%Y%m%d/")
    # File extension without the leading dot (e.g. "pdf").
    file_type = models.CharField(max_length=20)
    size = models.PositiveIntegerField(default=0)
    status = models.CharField(
        max_length=20,
        default=STATUS_UPLOADED,
        db_index=True,
    )
    # Empty unless an indexing attempt failed.
    error_message = models.TextField(blank=True)
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    class Meta:
        # Newest documents first.
        ordering = ["-created_at"]

    def __str__(self) -> str:
        """Return the original file name as the display label."""
        return self.original_name
|
||||||
89
apps/documents/services.py
Normal file
89
apps/documents/services.py
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
from zipfile import BadZipFile, ZipFile
|
||||||
|
import re
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
|
||||||
|
from agent_core.rag.ingest import ingest_document
|
||||||
|
|
||||||
|
from .models import UploadedDocument
|
||||||
|
|
||||||
|
|
||||||
|
def create_uploaded_document(scenario_id: str, uploaded_file) -> UploadedDocument:
    """Store an uploaded file as a new ``UploadedDocument`` in the uploaded state.

    Args:
        scenario_id: Identifier of the scenario the document belongs to.
        uploaded_file: Uploaded file object exposing ``name`` and ``size``.

    Returns:
        The newly created ``UploadedDocument``.
    """
    # "Report.PDF" -> "pdf": lower-cased suffix without the leading dot.
    file_type = Path(uploaded_file.name).suffix.lower().lstrip(".")
    attributes = {
        "scenario_id": scenario_id,
        "original_name": uploaded_file.name,
        "file": uploaded_file,
        "file_type": file_type,
        "size": uploaded_file.size,
        "status": UploadedDocument.STATUS_UPLOADED,
    }
    return UploadedDocument.objects.create(**attributes)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text(document: UploadedDocument) -> str:
    """Return the text content of a stored document.

    The extractor is chosen from ``document.file_type``: "pdf" and "docx"
    have dedicated extractors; every other type is read as a text file.
    """
    file_path = Path(document.file.path)
    # Normalise "PDF" / ".pdf" to "pdf" before choosing an extractor.
    kind = document.file_type.lower().lstrip(".")
    extractors = {
        "pdf": _extract_pdf_text,
        "docx": _extract_docx_text,
    }
    extractor = extractors.get(kind, _read_text_file)
    return extractor(file_path)
|
||||||
|
|
||||||
|
|
||||||
|
def _read_text_file(path: Path) -> str:
    """Read a text file, preferring UTF-8 and never failing on undecodable bytes.

    Decoding is attempted in three steps:

    1. strict UTF-8;
    2. the platform default encoding (the behaviour of ``Path.read_text()``
       without an explicit encoding);
    3. UTF-8 with undecodable bytes replaced by U+FFFD.

    Step 3 matters because the platform default is frequently UTF-8 as well,
    in which case step 2 fails with the same error as step 1.

    Args:
        path: Location of the file to read.

    Returns:
        The decoded file content.
    """
    try:
        return path.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        pass
    try:
        return path.read_text()
    except UnicodeDecodeError:
        # Last resort: keep the readable text and mark the bad bytes.
        return path.read_text(encoding="utf-8", errors="replace")
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_pdf_text(path: Path) -> str:
    """Extract text from a PDF, one page per line group.

    Uses ``pypdf`` when possible. Any failure (including ``pypdf`` not being
    importable or the file not being a parseable PDF) falls back to a
    best-effort read of the raw bytes as text.
    """
    try:
        # Imported lazily so a missing pypdf takes the fallback path too.
        from pypdf import PdfReader

        page_texts = []
        for page in PdfReader(str(path)).pages:
            # extract_text() may yield nothing for a page; treat that as "".
            page_texts.append(page.extract_text() or "")
        return "\n".join(page_texts)
    except Exception:
        # Deliberate best-effort: never fail extraction because of the parser.
        return _read_binary_text_fallback(path)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_docx_text(path: Path) -> str:
    """Extract the text of a .docx file, one line per non-empty paragraph.

    Reads ``word/document.xml`` from the archive. The text runs (``w:t``)
    inside a paragraph (``w:p``) are concatenated without a separator,
    because a single word or sentence is often split across several runs;
    newlines are inserted only between paragraphs.

    If the file is not a valid zip archive, lacks ``word/document.xml`` or
    contains malformed XML, falls back to a best-effort read of the raw
    bytes as text.

    Args:
        path: Location of the .docx file.

    Returns:
        The extracted text; paragraphs without text are omitted.
    """
    w_ns = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    paragraph_tag = f"{w_ns}p"
    text_tag = f"{w_ns}t"

    try:
        with ZipFile(path) as archive:
            document_xml = archive.read("word/document.xml")
        root = ET.fromstring(document_xml)
    except (BadZipFile, KeyError, ET.ParseError):
        return _read_binary_text_fallback(path)

    paragraphs = []
    runs = []
    # Walk in document order: every paragraph start closes the previous line.
    for node in root.iter():
        if node.tag == paragraph_tag:
            if runs:
                paragraphs.append("".join(runs))
                runs = []
        elif node.tag == text_tag and node.text:
            runs.append(node.text)
    if runs:
        paragraphs.append("".join(runs))
    return "\n".join(paragraphs)
|
||||||
|
|
||||||
|
|
||||||
|
def _read_binary_text_fallback(path: Path) -> str:
    """Best-effort text view of an arbitrary file.

    Decodes the raw bytes as UTF-8 (dropping undecodable bytes), replaces
    each run of control characters other than tab, newline and carriage
    return with a single space, and strips surrounding whitespace.
    """
    decoded = path.read_bytes().decode("utf-8", errors="ignore")
    cleaned = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]+", " ", decoded)
    return cleaned.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def index_document(document: UploadedDocument) -> UploadedDocument:
    """Extract a document's text, ingest it, and record the outcome.

    The document's ``status`` becomes indexed when ingestion reports success
    and failed otherwise; ``error_message`` holds the reported error or the
    text of any exception raised along the way. The outcome is always saved.

    Args:
        document: The stored document to index.

    Returns:
        The same ``document`` instance with updated ``status`` and
        ``error_message``.
    """
    try:
        text = extract_text(document)
        result = ingest_document(
            document_id=document.id,
            scenario_id=document.scenario_id,
            source_file=document.original_name,
            text=text,
            collection=document.scenario_id,
        )
        if result.success:
            document.status = UploadedDocument.STATUS_INDEXED
            document.error_message = ""
        else:
            document.status = UploadedDocument.STATUS_FAILED
            # error_message is a non-null text column: never store None.
            document.error_message = str(result.error or "")
    except Exception as exc:
        # Boundary handler: any failure is recorded on the document
        # instead of propagating to the caller.
        document.status = UploadedDocument.STATUS_FAILED
        # Some exceptions stringify to ""; fall back to the class name so a
        # failed document always carries a hint.
        document.error_message = str(exc) or exc.__class__.__name__
    document.save(update_fields=["status", "error_message", "updated_at"])
    return document
|
||||||
12
apps/documents/urls.py
Normal file
12
apps/documents/urls.py
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
from django.urls import path
|
||||||
|
|
||||||
|
from . import views
|
||||||
|
|
||||||
|
|
||||||
|
# URL namespace; routes are reversed as "documents:<name>".
app_name = "documents"

# (route, view, name) for every endpoint of the documents app.
_ROUTES = (
    ("", views.document_list, "list"),
    ("upload/", views.upload, "upload"),
    ("<int:document_id>/index/", views.index, "index"),
)

urlpatterns = [path(route, view, name=name) for route, view, name in _ROUTES]
|
||||||
35
apps/documents/views.py
Normal file
35
apps/documents/views.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
from django.shortcuts import get_object_or_404, redirect, render
|
||||||
|
from django.views.decorators.http import require_POST
|
||||||
|
|
||||||
|
from apps.scenarios.services import list_scenarios
|
||||||
|
|
||||||
|
from .forms import DocumentUploadForm
|
||||||
|
from .models import UploadedDocument
|
||||||
|
from .services import create_uploaded_document, index_document
|
||||||
|
|
||||||
|
|
||||||
|
def document_list(request):
    """Render the list page with all uploaded documents."""
    context = {"documents": UploadedDocument.objects.all()}
    return render(request, "documents/document_list.html", context)
|
||||||
|
|
||||||
|
|
||||||
|
def upload(request):
    """Show the upload form and, on a valid POST, store the uploaded document.

    A valid submission creates the document record and redirects to the
    document list. Any other request renders the upload page; for an invalid
    POST the bound form (with its errors) is rendered.
    """
    if request.method != "POST":
        form = DocumentUploadForm()
    else:
        form = DocumentUploadForm(request.POST, request.FILES)
        if form.is_valid():
            cleaned = form.cleaned_data
            create_uploaded_document(cleaned["scenario_id"], cleaned["file"])
            return redirect("documents:list")

    context = {"form": form, "scenarios": list_scenarios()}
    return render(request, "documents/upload.html", context)
|
||||||
|
|
||||||
|
|
||||||
|
@require_POST
def index(request, document_id: int):
    """Index the given document, then redirect to the document list.

    Responds with 404 when no document has the given primary key. The
    indexing outcome is not inspected here.
    """
    target = get_object_or_404(UploadedDocument, pk=document_id)
    index_document(target)
    return redirect("documents:list")
|
||||||
50
templates/documents/document_list.html
Normal file
50
templates/documents/document_list.html
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
<!doctype html>
<html lang="zh-CN">
<head>
  <meta charset="utf-8">
  <title>文件列表</title>
</head>
<body>
  {# Renders the `documents` context variable as a table. #}
  <h1>文件列表</h1>
  <nav>
    <a href="/">返回首页</a>
    <a href="{% url 'documents:upload' %}">上传文件</a>
  </nav>
  <table>
    <thead>
      <tr>
        <th scope="col">文件名</th>
        <th scope="col">场景</th>
        <th scope="col">类型</th>
        <th scope="col">大小</th>
        <th scope="col">状态</th>
        <th scope="col">操作</th>
      </tr>
    </thead>
    <tbody>
      {% for document in documents %}
        <tr>
          <td>{{ document.original_name }}</td>
          <td>{{ document.scenario_id }}</td>
          <td>{{ document.file_type }}</td>
          {# Human-readable size (e.g. "13 KB") instead of a bare number. #}
          <td>{{ document.size|filesizeformat }}</td>
          <td>{{ document.status }}</td>
          <td>
            {# Documents that are not indexed yet get a button that POSTs to the index route. #}
            {% if document.status != "indexed" %}
              <form action="{% url 'documents:index' document.id %}" method="post">
                {% csrf_token %}
                <button type="submit">入库</button>
              </form>
            {% endif %}
            {% if document.error_message %}
              <pre>{{ document.error_message }}</pre>
            {% endif %}
          </td>
        </tr>
      {% empty %}
        <tr><td colspan="6">暂无文件。</td></tr>
      {% endfor %}
    </tbody>
  </table>
</body>
</html>
|
||||||
26
templates/documents/upload.html
Normal file
26
templates/documents/upload.html
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
<!doctype html>
<html lang="zh-CN">
<head>
  <meta charset="utf-8">
  <title>上传文件</title>
</head>
<body>
  {# Upload form: a scenario picker built from `scenarios` plus the form's file field. #}
  <h1>上传文件</h1>
  <nav><a href="{% url 'documents:list' %}">返回文件列表</a></nav>
  <form method="post" enctype="multipart/form-data">
    {% csrf_token %}
    <p>
      <label for="id_scenario_id">场景</label>
      <select name="scenario_id" id="id_scenario_id">
        {% for scenario in scenarios %}
          {# Keep the submitted choice selected when the form is shown again with errors. #}
          <option value="{{ scenario.id }}"{% if form.scenario_id.value|stringformat:"s" == scenario.id|stringformat:"s" %} selected{% endif %}>{{ scenario.name }}</option>
        {% endfor %}
      </select>
    </p>
    <p>{{ form.file.label_tag }} {{ form.file }}</p>
    {{ form.errors }}
    <p>支持 .txt、.md、.pdf 和 .docx 文件。</p>
    <button type="submit">上传</button>
  </form>
</body>
</html>
|
||||||
82
tests/test_documents.py
Normal file
82
tests/test_documents.py
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
from django.core.files.uploadedfile import SimpleUploadedFile
|
||||||
|
from django.urls import reverse
|
||||||
|
|
||||||
|
from apps.documents.models import UploadedDocument
|
||||||
|
from apps.documents.services import extract_text
|
||||||
|
|
||||||
|
|
||||||
|
def test_upload_txt_document_creates_uploaded_record(client, db, settings, tmp_path):
    """Uploading a .txt file redirects and stores a record in the uploaded state."""
    # Keep stored uploads out of the configured media directory.
    settings.MEDIA_ROOT = str(tmp_path)
    file = SimpleUploadedFile("rules.txt", "hello".encode("utf-8"), content_type="text/plain")

    response = client.post(
        reverse("documents:upload"),
        {"scenario_id": "knowledge_qa", "file": file},
    )

    assert response.status_code == 302
    document = UploadedDocument.objects.get()
    assert document.status == "uploaded"
    assert document.file_type == "txt"
    assert document.scenario_id == "knowledge_qa"
|
||||||
|
|
||||||
|
|
||||||
|
def test_upload_accepts_pdf_and_docx_documents(client, db, settings, tmp_path):
    """Both .pdf and .docx uploads are accepted and recorded with their type."""
    # Keep stored uploads out of the configured media directory.
    settings.MEDIA_ROOT = str(tmp_path)
    for filename, payload in [
        ("policy.pdf", b"%PDF-1.4\nplain policy text"),
        ("contract.docx", b"fake-docx-body"),
    ]:
        file = SimpleUploadedFile(filename, payload)

        response = client.post(
            reverse("documents:upload"),
            {"scenario_id": "knowledge_qa", "file": file},
        )

        assert response.status_code == 302

    assert set(UploadedDocument.objects.values_list("file_type", flat=True)) == {"pdf", "docx"}
|
||||||
|
|
||||||
|
|
||||||
|
def test_index_document_updates_status_to_indexed(client, db, settings, tmp_path):
    """POSTing to the index route marks an uploaded document as indexed."""
    # Keep stored uploads out of the configured media directory.
    settings.MEDIA_ROOT = str(tmp_path)
    document = UploadedDocument.objects.create(
        scenario_id="knowledge_qa",
        original_name="rules.md",
        file="knowledge_qa/rules.md",
        file_type="md",
        size=5,
        status="uploaded",
    )
    # Write real content so text extraction has a file to read.
    document.file.save("rules.md", SimpleUploadedFile("rules.md", b"# rule").file)

    response = client.post(reverse("documents:index", args=[document.id]))

    assert response.status_code == 302
    document.refresh_from_db()
    assert document.status == "indexed"
    assert document.error_message == ""
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_text_supports_pdf_and_docx_plain_text_fallback(db, settings, tmp_path):
    """Files that are not real PDF/DOCX documents still yield their plain text."""
    # Keep stored uploads out of the configured media directory.
    settings.MEDIA_ROOT = str(tmp_path)
    pdf_document = UploadedDocument.objects.create(
        scenario_id="knowledge_qa",
        original_name="policy.pdf",
        file_type="pdf",
        size=10,
        status="uploaded",
    )
    pdf_document.file.save("policy.pdf", SimpleUploadedFile("policy.pdf", b"%PDF-1.4\nSafety policy"))

    docx_document = UploadedDocument.objects.create(
        scenario_id="knowledge_qa",
        original_name="contract.docx",
        file_type="docx",
        size=10,
        status="uploaded",
    )
    docx_document.file.save(
        "contract.docx",
        SimpleUploadedFile("contract.docx", b"Contract clause review"),
    )

    assert "Safety policy" in extract_text(pdf_document)
    assert "Contract clause review" in extract_text(docx_document)
|
||||||
Reference in New Issue
Block a user