From aa0a24fe5ac3c07b8744d0dcb9ab294a1a33e4f8 Mon Sep 17 00:00:00 2001 From: bruce Date: Thu, 4 Jun 2026 01:07:15 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=94=AF=E6=8C=81=E8=B5=84=E6=96=99?= =?UTF-8?q?=E5=8C=85=E5=A4=9A=E6=96=87=E4=BB=B6=E4=B8=8Ezip=E5=AF=BC?= =?UTF-8?q?=E5=85=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/documents/forms.py | 44 +++++++++++++++++-- apps/documents/services.py | 74 +++++++++++++++++++++++++++---- apps/documents/views.py | 10 +++-- templates/documents/upload.html | 13 +++--- tests/test_documents.py | 77 +++++++++++++++++++++++++++++++++ 5 files changed, 198 insertions(+), 20 deletions(-) diff --git a/apps/documents/forms.py b/apps/documents/forms.py index 92f7f33..0f351b7 100644 --- a/apps/documents/forms.py +++ b/apps/documents/forms.py @@ -5,14 +5,31 @@ from django import forms from apps.scenarios.services import ScenarioNotFound, get_scenario from apps.scenarios.services import list_scenarios -SUPPORTED_EXTENSIONS = {".txt", ".md", ".pdf", ".docx"} +SUPPORTED_EXTENSIONS = {".txt", ".md", ".pdf", ".docx", ".zip"} + + +class MultipleFileInput(forms.ClearableFileInput): + allow_multiple_selected = True + + +class MultipleFileField(forms.FileField): + widget = MultipleFileInput + + def clean(self, data, initial=None): + single_file_clean = super().clean + if not data: + return [] + if isinstance(data, (list, tuple)): + return [single_file_clean(item, initial) for item in data] + return [single_file_clean(data, initial)] class DocumentUploadForm(forms.Form): # 使用 ChoiceField 让表单自己维护场景选项, # 这样模板、校验和后续扩展都能围绕一个入口完成。 scenario_id = forms.ChoiceField(label="场景", choices=()) - file = forms.FileField(label="文件") + files = MultipleFileField(label="文件或资料包", required=False) + file = forms.FileField(label="兼容单文件上传", required=False) def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -31,7 +48,28 @@ class DocumentUploadForm(forms.Form): def clean_file(self): uploaded_file = self.cleaned_data["file"] + if not uploaded_file: + return uploaded_file extension = Path(uploaded_file.name).suffix.lower() if extension not in SUPPORTED_EXTENSIONS: - raise forms.ValidationError("仅支持 .txt、.md、.pdf 和 .docx 文件") + raise forms.ValidationError("仅支持 .txt、.md、.pdf、.docx 和 .zip 文件") return uploaded_file + + def clean_files(self): + uploaded_files = self.cleaned_data.get("files") or [] + for uploaded_file in uploaded_files: + extension = Path(uploaded_file.name).suffix.lower() + if extension not in SUPPORTED_EXTENSIONS: + raise forms.ValidationError("仅支持 .txt、.md、.pdf、.docx 和 .zip 文件") + return uploaded_files + + def clean(self): + cleaned_data = super().clean() + files = list(cleaned_data.get("files") or []) + file = cleaned_data.get("file") + if file: + files.append(file) + if not files: + raise forms.ValidationError("请至少上传一个文件或一个 zip 资料包。") + cleaned_data["uploaded_files"] = files + return cleaned_data diff --git a/apps/documents/services.py b/apps/documents/services.py index 7ff6dca..b69543a 100644 --- a/apps/documents/services.py +++ b/apps/documents/services.py @@ -1,15 +1,23 @@ from pathlib import Path +from io import BytesIO import re import xml.etree.ElementTree as ET from zipfile import BadZipFile, ZipFile from agent_core.rag.ingest import ingest_document from apps.chat.services import create_conversation_for_batch +from django.core.files.uploadedfile import SimpleUploadedFile from .models import SubmissionBatch, UploadedDocument -def create_uploaded_document(scenario_id: str, uploaded_file, batch: SubmissionBatch | None = None) -> UploadedDocument: +def create_uploaded_document( + scenario_id: str, + uploaded_file, + batch: SubmissionBatch | None = None, + *, + relative_path: str | None = None, +) -> UploadedDocument: """ 保存上传文件的元数据记录。 @@ -20,11 +28,11 @@ def create_uploaded_document(scenario_id: str, uploaded_file, batch: SubmissionB return UploadedDocument.objects.create( batch=batch, scenario_id=scenario_id, - original_name=uploaded_file.name, + original_name=Path(relative_path or uploaded_file.name).name, file=uploaded_file, file_type=extension, size=uploaded_file.size, - relative_path=uploaded_file.name, + relative_path=relative_path or uploaded_file.name, status=UploadedDocument.STATUS_UPLOADED, ) @@ -49,14 +57,22 @@ def import_submission_batch(scenario_id: str, uploaded_files: list) -> dict: chapter_summary = {} total_pages = 0 - for uploaded_file in uploaded_files: - document = create_uploaded_document(scenario_id, uploaded_file, batch=batch) + expanded_files = _expand_uploaded_files(uploaded_files) + for uploaded_item in expanded_files: + uploaded_file = uploaded_item["uploaded_file"] + relative_path = uploaded_item["relative_path"] + document = create_uploaded_document( + scenario_id, + uploaded_file, + batch=batch, + relative_path=relative_path, + ) text = extract_text(document) page_count = _estimate_page_count(text) document.page_count = page_count document.page_count_confidence = "estimated" - document.document_role = _detect_document_role(document.original_name) - document.chapter_code = _detect_chapter_code(document.original_name, text) + document.document_role = _detect_document_role(document.relative_path) + document.chapter_code = _detect_chapter_code(document.relative_path, text) document.chapter_match_status = "matched" if document.chapter_code else "unknown" document.needs_manual_review = not bool(document.chapter_code) document.save( @@ -74,7 +90,7 @@ def import_submission_batch(scenario_id: str, uploaded_files: list) -> dict: total_pages += page_count chapter_key = document.chapter_code or "UNCLASSIFIED" chapter_summary[chapter_key] = chapter_summary.get(chapter_key, 0) + 1 - candidates.extend(_extract_product_candidates(document.original_name, text)) + candidates.extend(_extract_product_candidates(document.relative_path, text)) product_name, warnings = _select_product_name(candidates) conversation = create_conversation_for_batch(batch.batch_id, product_name) @@ -197,6 +213,48 @@ def _estimate_page_count(text: str) -> int: return max(1, line_count) +def _expand_uploaded_files(uploaded_files: list) -> list[dict]: + expanded_files = [] + for uploaded_file in uploaded_files: + extension = Path(uploaded_file.name).suffix.lower() + if extension == ".zip": + expanded_files.extend(_extract_zip_entries(uploaded_file)) + continue + expanded_files.append( + { + "relative_path": uploaded_file.name, + "uploaded_file": uploaded_file, + } + ) + return expanded_files + + +def _extract_zip_entries(uploaded_file) -> list[dict]: + archive_bytes = uploaded_file.read() + uploaded_file.seek(0) + entries = [] + with ZipFile(BytesIO(archive_bytes)) as archive: + for info in archive.infolist(): + if info.is_dir(): + continue + relative_path = info.filename.replace("\\", "/") + extension = Path(relative_path).suffix.lower() + if extension not in {".txt", ".md", ".pdf", ".docx"}: + continue + file_data = archive.read(info.filename) + extracted_file = SimpleUploadedFile( + Path(relative_path).name, + file_data, + ) + entries.append( + { + "relative_path": relative_path, + "uploaded_file": extracted_file, + } + ) + return entries + + def _detect_document_role(file_name: str) -> str: normalized = file_name.lower() if "申请表" in file_name: diff --git a/apps/documents/views.py b/apps/documents/views.py index 83bbbc4..afa817d 100644 --- a/apps/documents/views.py +++ b/apps/documents/views.py @@ -1,4 +1,5 @@ from django.contrib import messages +from django.db.models import Q from django.shortcuts import get_object_or_404, redirect, render from django.views.decorators.http import require_POST @@ -14,7 +15,9 @@ def document_list(request): keyword = (request.GET.get("keyword") or "").strip() batches = SubmissionBatch.objects.all() if keyword: - batches = batches.filter(product_name__icontains=keyword) + batches = batches.filter( + Q(product_name__icontains=keyword) | Q(batch_id__icontains=keyword) + ) documents = UploadedDocument.objects.all() status_counts = { "pending": batches.filter(import_status=SubmissionBatch.STATUS_PENDING).count(), @@ -57,7 +60,7 @@ def upload(request): if form.is_valid(): result = import_submission_batch( form.cleaned_data["scenario_id"], - [form.cleaned_data["file"]], + form.cleaned_data["uploaded_files"], ) messages.success( request, @@ -73,8 +76,9 @@ def upload(request): "form": form, "scenarios": list_scenarios(), "upload_checks": [ - "文件格式支持 PDF、DOCX、MD、TXT", + "文件格式支持 PDF、DOCX、MD、TXT 与 ZIP 资料包", "业务资料与法规依据资料需分开归属", + "支持一次上传多份文件并归并到同一个资料包", "目录类文件会优先参与完整性校验", "上传完成后建议立即进入解析与入库流程", ], diff --git a/templates/documents/upload.html b/templates/documents/upload.html index acc87a8..586f9c5 100644 --- a/templates/documents/upload.html +++ b/templates/documents/upload.html @@ -14,7 +14,7 @@

资料导入向导

-

当前支持 `.txt`、`.md`、`.pdf` 和 `.docx`。上传成功后即可回到文件中心执行解析与入库。

+

当前支持多文件上传,以及 `.txt`、`.md`、`.pdf`、`.docx` 与 `.zip` 资料包。上传成功后会直接形成一个资料包并绑定会话。

@@ -27,18 +27,19 @@ {% endif %}
- {{ form.file.label_tag }} - {{ form.file }} - {% if form.file.errors %} -

{{ form.file.errors|join:" " }}

+ {{ form.files.label_tag }} + {{ form.files }} + {% if form.files.errors %} +

{{ form.files.errors|join:" " }}

{% endif %} +

可一次选择多份文件,或上传一个 zip 资料包。

{% if form.errors %}
{{ form.errors }}
{% endif %}
diff --git a/tests/test_documents.py b/tests/test_documents.py index 6f13b6b..d49022a 100644 --- a/tests/test_documents.py +++ b/tests/test_documents.py @@ -1,5 +1,7 @@ from django.core.files.uploadedfile import SimpleUploadedFile from django.urls import reverse +from io import BytesIO +from zipfile import ZipFile from apps.documents.forms import DocumentUploadForm from apps.documents.models import SubmissionBatch, UploadedDocument @@ -197,6 +199,34 @@ def test_document_list_supports_product_name_search(client, db): assert "呼吸道合胞病毒核酸检测试剂盒" not in content +def test_document_list_supports_batch_id_search(client, db): + SubmissionBatch.objects.create( + batch_id="SUB-20260604-001", + product_name="产品A", + workflow_type="registration", + conversation_id="conv-001", + file_count=2, + page_count=12, + import_status="completed", + ) + SubmissionBatch.objects.create( + batch_id="SUB-20260604-002", + product_name="产品B", + workflow_type="registration", + conversation_id="conv-002", + file_count=3, + page_count=20, + import_status="completed", + ) + + response = client.get(reverse("documents:list"), {"keyword": "SUB-20260604-002"}) + + content = response.content.decode("utf-8") + assert response.status_code == 200 + assert "SUB-20260604-002" in content + assert "SUB-20260604-001" not in content + + def test_import_submission_batch_marks_manual_review_when_product_names_conflict(db): files = [ SimpleUploadedFile( @@ -217,3 +247,50 @@ def test_import_submission_batch_marks_manual_review_when_product_names_conflict assert batch.import_status == "review_required" assert result["registration_overview_report"]["warnings"] assert "产品名称来源冲突" in result["registration_overview_report"]["warnings"][0] + + +def test_upload_multiple_files_creates_single_submission_batch_and_multiple_documents(client, db): + application = SimpleUploadedFile( + "注册申请表.txt", + "产品名称:新型冠状病毒 2019-nCoV 核酸检测试剂盒".encode("utf-8"), + content_type="text/plain", + ) + manual = SimpleUploadedFile( + "目标产品说明书.txt", + "产品名称:新型冠状病毒 2019-nCoV 核酸检测试剂盒".encode("utf-8"), + content_type="text/plain", + ) + + response = client.post( + reverse("documents:upload"), + {"scenario_id": "document_review", "files": [application, manual]}, + ) + + assert response.status_code == 302 + batch = SubmissionBatch.objects.get() + assert batch.file_count == 2 + assert UploadedDocument.objects.filter(batch=batch).count() == 2 + assert Conversation.objects.get().title == "新型冠状病毒 2019-nCoV 核酸检测试剂盒" + + +def test_import_submission_batch_supports_zip_package_and_preserves_relative_paths(db): + archive = BytesIO() + with ZipFile(archive, "w") as zip_file: + zip_file.writestr("CH1/注册申请表.txt", "产品名称:产品A") + zip_file.writestr("CH1/目标产品说明书.txt", "产品名称:产品A") + archive.seek(0) + package = SimpleUploadedFile( + "registration-package.zip", + archive.read(), + content_type="application/zip", + ) + + result = import_submission_batch("document_review", [package]) + + batch = SubmissionBatch.objects.get(batch_id=result["batch_id"]) + documents = list(UploadedDocument.objects.filter(batch=batch).order_by("relative_path")) + assert batch.file_count == 2 + assert [document.relative_path for document in documents] == [ + "CH1/注册申请表.txt", + "CH1/目标产品说明书.txt", + ]