feat: 支持资料包多文件与zip导入

This commit is contained in:
2026-06-04 01:07:15 +08:00
parent 2b40ddc487
commit aa0a24fe5a
5 changed files with 198 additions and 20 deletions

View File

@@ -5,14 +5,31 @@ from django import forms
from apps.scenarios.services import ScenarioNotFound, get_scenario from apps.scenarios.services import ScenarioNotFound, get_scenario
from apps.scenarios.services import list_scenarios from apps.scenarios.services import list_scenarios
SUPPORTED_EXTENSIONS = {".txt", ".md", ".pdf", ".docx"} SUPPORTED_EXTENSIONS = {".txt", ".md", ".pdf", ".docx", ".zip"}
class MultipleFileInput(forms.ClearableFileInput):
    """Clearable file input widget with multi-file selection switched on."""

    # Flag read by Django's file input widgets to allow several files
    # to be submitted through a single input element.
    allow_multiple_selected = True
class MultipleFileField(forms.FileField):
    """File field that accepts several uploads and always cleans to a list.

    Each submitted file is validated with the single-file ``FileField.clean``
    logic, so per-file checks keep working unchanged.
    """

    widget = MultipleFileInput

    def clean(self, data, initial=None):
        """Clean one or many uploaded files.

        Args:
            data: A single uploaded file, a list/tuple of uploaded files, or
                an empty value when nothing was submitted.
            initial: Initial value forwarded to the single-file cleaner.

        Returns:
            A list of cleaned files; an empty list when nothing was uploaded
            and the field is optional.

        Raises:
            forms.ValidationError: If nothing was uploaded and the field is
                required, or if any individual file fails validation.
        """
        if not data:
            # Returning [] unconditionally would make ``required=True`` a
            # no-op, so report the standard "required" error in that case.
            if self.required:
                raise forms.ValidationError(
                    self.error_messages["required"],
                    code="required",
                )
            return []
        # Bind the parent cleaner outside the comprehension: zero-argument
        # super() is not reliably usable inside a comprehension scope.
        single_file_clean = super().clean
        items = data if isinstance(data, (list, tuple)) else [data]
        return [single_file_clean(item, initial) for item in items]
class DocumentUploadForm(forms.Form): class DocumentUploadForm(forms.Form):
# 使用 ChoiceField 让表单自己维护场景选项, # 使用 ChoiceField 让表单自己维护场景选项,
# 这样模板、校验和后续扩展都能围绕一个入口完成。 # 这样模板、校验和后续扩展都能围绕一个入口完成。
scenario_id = forms.ChoiceField(label="场景", choices=()) scenario_id = forms.ChoiceField(label="场景", choices=())
file = forms.FileField(label="文件") files = MultipleFileField(label="文件或资料包", required=False)
file = forms.FileField(label="兼容单文件上传", required=False)
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
@@ -31,7 +48,28 @@ class DocumentUploadForm(forms.Form):
def clean_file(self):
    """Validate the extension of the legacy single-file upload field.

    Returns:
        The uploaded file unchanged, or the empty value as-is when the
        optional field was left blank.

    Raises:
        forms.ValidationError: If the file suffix is not listed in
            ``SUPPORTED_EXTENSIONS``.
    """
    uploaded_file = self.cleaned_data["file"]
    if not uploaded_file:
        # The field is optional; nothing to validate when it is empty.
        return uploaded_file
    extension = Path(uploaded_file.name).suffix.lower()
    if extension not in SUPPORTED_EXTENSIONS:
        # Keep the wording identical to the multi-file field's message.
        raise forms.ValidationError("仅支持 .txt、.md、.pdf、.docx 和 .zip 文件")
    return uploaded_file
def clean_files(self):
    """Check the extension of every file submitted through ``files``.

    Returns:
        The list of cleaned files (an empty list when none were uploaded).

    Raises:
        forms.ValidationError: On the first file whose suffix is not listed
            in ``SUPPORTED_EXTENSIONS``.
    """
    candidates = self.cleaned_data.get("files") or []
    for candidate in candidates:
        suffix = Path(candidate.name).suffix.lower()
        if suffix in SUPPORTED_EXTENSIONS:
            continue
        raise forms.ValidationError("仅支持 .txt、.md、.pdf、.docx 和 .zip 文件")
    return candidates
def clean(self):
    """Merge both upload fields into a single ``uploaded_files`` entry.

    Files from the multi-file field come first, followed by the legacy
    single file when one was supplied.

    Returns:
        The cleaned data dict with the extra ``uploaded_files`` list.

    Raises:
        forms.ValidationError: If neither field contains a file.
    """
    cleaned_data = super().clean()
    legacy_file = cleaned_data.get("file")
    merged = [*(cleaned_data.get("files") or [])]
    if legacy_file:
        merged.append(legacy_file)
    if not merged:
        raise forms.ValidationError("请至少上传一个文件或一个 zip 资料包。")
    cleaned_data["uploaded_files"] = merged
    return cleaned_data

View File

@@ -1,15 +1,23 @@
from pathlib import Path from pathlib import Path
from io import BytesIO
import re import re
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
from zipfile import BadZipFile, ZipFile from zipfile import BadZipFile, ZipFile
from agent_core.rag.ingest import ingest_document from agent_core.rag.ingest import ingest_document
from apps.chat.services import create_conversation_for_batch from apps.chat.services import create_conversation_for_batch
from django.core.files.uploadedfile import SimpleUploadedFile
from .models import SubmissionBatch, UploadedDocument from .models import SubmissionBatch, UploadedDocument
def create_uploaded_document(scenario_id: str, uploaded_file, batch: SubmissionBatch | None = None) -> UploadedDocument: def create_uploaded_document(
scenario_id: str,
uploaded_file,
batch: SubmissionBatch | None = None,
*,
relative_path: str | None = None,
) -> UploadedDocument:
""" """
保存上传文件的元数据记录。 保存上传文件的元数据记录。
@@ -20,11 +28,11 @@ def create_uploaded_document(scenario_id: str, uploaded_file, batch: SubmissionB
return UploadedDocument.objects.create( return UploadedDocument.objects.create(
batch=batch, batch=batch,
scenario_id=scenario_id, scenario_id=scenario_id,
original_name=uploaded_file.name, original_name=Path(relative_path or uploaded_file.name).name,
file=uploaded_file, file=uploaded_file,
file_type=extension, file_type=extension,
size=uploaded_file.size, size=uploaded_file.size,
relative_path=uploaded_file.name, relative_path=relative_path or uploaded_file.name,
status=UploadedDocument.STATUS_UPLOADED, status=UploadedDocument.STATUS_UPLOADED,
) )
@@ -49,14 +57,22 @@ def import_submission_batch(scenario_id: str, uploaded_files: list) -> dict:
chapter_summary = {} chapter_summary = {}
total_pages = 0 total_pages = 0
for uploaded_file in uploaded_files: expanded_files = _expand_uploaded_files(uploaded_files)
document = create_uploaded_document(scenario_id, uploaded_file, batch=batch) for uploaded_item in expanded_files:
uploaded_file = uploaded_item["uploaded_file"]
relative_path = uploaded_item["relative_path"]
document = create_uploaded_document(
scenario_id,
uploaded_file,
batch=batch,
relative_path=relative_path,
)
text = extract_text(document) text = extract_text(document)
page_count = _estimate_page_count(text) page_count = _estimate_page_count(text)
document.page_count = page_count document.page_count = page_count
document.page_count_confidence = "estimated" document.page_count_confidence = "estimated"
document.document_role = _detect_document_role(document.original_name) document.document_role = _detect_document_role(document.relative_path)
document.chapter_code = _detect_chapter_code(document.original_name, text) document.chapter_code = _detect_chapter_code(document.relative_path, text)
document.chapter_match_status = "matched" if document.chapter_code else "unknown" document.chapter_match_status = "matched" if document.chapter_code else "unknown"
document.needs_manual_review = not bool(document.chapter_code) document.needs_manual_review = not bool(document.chapter_code)
document.save( document.save(
@@ -74,7 +90,7 @@ def import_submission_batch(scenario_id: str, uploaded_files: list) -> dict:
total_pages += page_count total_pages += page_count
chapter_key = document.chapter_code or "UNCLASSIFIED" chapter_key = document.chapter_code or "UNCLASSIFIED"
chapter_summary[chapter_key] = chapter_summary.get(chapter_key, 0) + 1 chapter_summary[chapter_key] = chapter_summary.get(chapter_key, 0) + 1
candidates.extend(_extract_product_candidates(document.original_name, text)) candidates.extend(_extract_product_candidates(document.relative_path, text))
product_name, warnings = _select_product_name(candidates) product_name, warnings = _select_product_name(candidates)
conversation = create_conversation_for_batch(batch.batch_id, product_name) conversation = create_conversation_for_batch(batch.batch_id, product_name)
@@ -197,6 +213,48 @@ def _estimate_page_count(text: str) -> int:
return max(1, line_count) return max(1, line_count)
def _expand_uploaded_files(uploaded_files: list) -> list[dict]:
expanded_files = []
for uploaded_file in uploaded_files:
extension = Path(uploaded_file.name).suffix.lower()
if extension == ".zip":
expanded_files.extend(_extract_zip_entries(uploaded_file))
continue
expanded_files.append(
{
"relative_path": uploaded_file.name,
"uploaded_file": uploaded_file,
}
)
return expanded_files
def _extract_zip_entries(uploaded_file) -> list[dict]:
archive_bytes = uploaded_file.read()
uploaded_file.seek(0)
entries = []
with ZipFile(BytesIO(archive_bytes)) as archive:
for info in archive.infolist():
if info.is_dir():
continue
relative_path = info.filename.replace("\\", "/")
extension = Path(relative_path).suffix.lower()
if extension not in {".txt", ".md", ".pdf", ".docx"}:
continue
file_data = archive.read(info.filename)
extracted_file = SimpleUploadedFile(
Path(relative_path).name,
file_data,
)
entries.append(
{
"relative_path": relative_path,
"uploaded_file": extracted_file,
}
)
return entries
def _detect_document_role(file_name: str) -> str: def _detect_document_role(file_name: str) -> str:
normalized = file_name.lower() normalized = file_name.lower()
if "申请表" in file_name: if "申请表" in file_name:

View File

@@ -1,4 +1,5 @@
from django.contrib import messages from django.contrib import messages
from django.db.models import Q
from django.shortcuts import get_object_or_404, redirect, render from django.shortcuts import get_object_or_404, redirect, render
from django.views.decorators.http import require_POST from django.views.decorators.http import require_POST
@@ -14,7 +15,9 @@ def document_list(request):
keyword = (request.GET.get("keyword") or "").strip() keyword = (request.GET.get("keyword") or "").strip()
batches = SubmissionBatch.objects.all() batches = SubmissionBatch.objects.all()
if keyword: if keyword:
batches = batches.filter(product_name__icontains=keyword) batches = batches.filter(
Q(product_name__icontains=keyword) | Q(batch_id__icontains=keyword)
)
documents = UploadedDocument.objects.all() documents = UploadedDocument.objects.all()
status_counts = { status_counts = {
"pending": batches.filter(import_status=SubmissionBatch.STATUS_PENDING).count(), "pending": batches.filter(import_status=SubmissionBatch.STATUS_PENDING).count(),
@@ -57,7 +60,7 @@ def upload(request):
if form.is_valid(): if form.is_valid():
result = import_submission_batch( result = import_submission_batch(
form.cleaned_data["scenario_id"], form.cleaned_data["scenario_id"],
[form.cleaned_data["file"]], form.cleaned_data["uploaded_files"],
) )
messages.success( messages.success(
request, request,
@@ -73,8 +76,9 @@ def upload(request):
"form": form, "form": form,
"scenarios": list_scenarios(), "scenarios": list_scenarios(),
"upload_checks": [ "upload_checks": [
"文件格式支持 PDF、DOCX、MD、TXT", "文件格式支持 PDF、DOCX、MD、TXT 与 ZIP 资料包",
"业务资料与法规依据资料需分开归属", "业务资料与法规依据资料需分开归属",
"支持一次上传多份文件并归并到同一个资料包",
"目录类文件会优先参与完整性校验", "目录类文件会优先参与完整性校验",
"上传完成后建议立即进入解析与入库流程", "上传完成后建议立即进入解析与入库流程",
], ],

View File

@@ -14,7 +14,7 @@
<div class="section-heading"> <div class="section-heading">
<div> <div>
<h2 class="section-title">资料导入向导</h2> <h2 class="section-title">资料导入向导</h2>
<p class="section-copy">当前支持 `.txt`、`.md`、`.pdf`、`.docx`。上传成功后即可回到文件中心执行解析与入库</p> <p class="section-copy">当前支持多文件上传,以及 `.txt`、`.md`、`.pdf`、`.docx` 与 `.zip` 资料包。上传成功后会直接形成一个资料包并绑定会话</p>
</div> </div>
</div> </div>
<form method="post" enctype="multipart/form-data" class="stack"> <form method="post" enctype="multipart/form-data" class="stack">
@@ -27,18 +27,19 @@
{% endif %} {% endif %}
</div> </div>
<div> <div>
{{ form.file.label_tag }} {{ form.files.label_tag }}
{{ form.file }} {{ form.files }}
{% if form.file.errors %} {% if form.files.errors %}
<p class="notice notice-error">{{ form.file.errors|join:" " }}</p> <p class="notice notice-error">{{ form.files.errors|join:" " }}</p>
{% endif %} {% endif %}
<p class="help-text">可一次选择多份文件,或上传一个 zip 资料包。</p>
</div> </div>
{% if form.errors %} {% if form.errors %}
<div class="notice notice-error">{{ form.errors }}</div> <div class="notice notice-error">{{ form.errors }}</div>
{% endif %} {% endif %}
<div class="button-row"> <div class="button-row">
<button type="submit">确认导入</button> <button type="submit">确认导入</button>
<a class="button" href="{% url 'documents:list' %}">返回文件中心</a> <a class="button" href="{% url 'documents:list' %}">返回资料包</a>
</div> </div>
</form> </form>
</article> </article>

View File

@@ -1,5 +1,7 @@
from django.core.files.uploadedfile import SimpleUploadedFile from django.core.files.uploadedfile import SimpleUploadedFile
from django.urls import reverse from django.urls import reverse
from io import BytesIO
from zipfile import ZipFile
from apps.documents.forms import DocumentUploadForm from apps.documents.forms import DocumentUploadForm
from apps.documents.models import SubmissionBatch, UploadedDocument from apps.documents.models import SubmissionBatch, UploadedDocument
@@ -197,6 +199,34 @@ def test_document_list_supports_product_name_search(client, db):
assert "呼吸道合胞病毒核酸检测试剂盒" not in content assert "呼吸道合胞病毒核酸检测试剂盒" not in content
def test_document_list_supports_batch_id_search(client, db):
    """The list page keyword filter should match on the batch id."""
    fixtures = (
        ("SUB-20260604-001", "产品A", "conv-001", 2, 12),
        ("SUB-20260604-002", "产品B", "conv-002", 3, 20),
    )
    for batch_id, product_name, conversation_id, file_count, page_count in fixtures:
        SubmissionBatch.objects.create(
            batch_id=batch_id,
            product_name=product_name,
            workflow_type="registration",
            conversation_id=conversation_id,
            file_count=file_count,
            page_count=page_count,
            import_status="completed",
        )

    list_url = reverse("documents:list")
    response = client.get(list_url, {"keyword": "SUB-20260604-002"})
    page = response.content.decode("utf-8")

    assert response.status_code == 200
    # Only the batch whose id matches the keyword may be rendered.
    assert "SUB-20260604-002" in page
    assert "SUB-20260604-001" not in page
def test_import_submission_batch_marks_manual_review_when_product_names_conflict(db): def test_import_submission_batch_marks_manual_review_when_product_names_conflict(db):
files = [ files = [
SimpleUploadedFile( SimpleUploadedFile(
@@ -217,3 +247,50 @@ def test_import_submission_batch_marks_manual_review_when_product_names_conflict
assert batch.import_status == "review_required" assert batch.import_status == "review_required"
assert result["registration_overview_report"]["warnings"] assert result["registration_overview_report"]["warnings"]
assert "产品名称来源冲突" in result["registration_overview_report"]["warnings"][0] assert "产品名称来源冲突" in result["registration_overview_report"]["warnings"][0]
def test_upload_multiple_files_creates_single_submission_batch_and_multiple_documents(client, db):
    """Posting two files at once yields one batch holding two documents."""
    payload = "产品名称:新型冠状病毒 2019-nCoV 核酸检测试剂盒".encode("utf-8")
    application, manual = (
        SimpleUploadedFile(file_name, payload, content_type="text/plain")
        for file_name in ("注册申请表.txt", "目标产品说明书.txt")
    )

    form_data = {"scenario_id": "document_review", "files": [application, manual]}
    response = client.post(reverse("documents:upload"), form_data)

    assert response.status_code == 302
    # ``get()`` without filters doubles as an "exactly one row" assertion.
    batch = SubmissionBatch.objects.get()
    assert batch.file_count == 2
    assert UploadedDocument.objects.filter(batch=batch).count() == 2
    assert Conversation.objects.get().title == "新型冠状病毒 2019-nCoV 核酸检测试剂盒"
def test_import_submission_batch_supports_zip_package_and_preserves_relative_paths(db):
    """A zip upload is expanded and each document keeps its in-archive path."""
    buffer = BytesIO()
    with ZipFile(buffer, "w") as zip_file:
        for member in ("CH1/注册申请表.txt", "CH1/目标产品说明书.txt"):
            zip_file.writestr(member, "产品名称产品A")
    package = SimpleUploadedFile(
        "registration-package.zip",
        buffer.getvalue(),
        content_type="application/zip",
    )

    result = import_submission_batch("document_review", [package])

    batch = SubmissionBatch.objects.get(batch_id=result["batch_id"])
    ordered_documents = UploadedDocument.objects.filter(batch=batch).order_by("relative_path")
    stored_paths = [document.relative_path for document in list(ordered_documents)]
    assert batch.file_count == 2
    assert stored_paths == [
        "CH1/注册申请表.txt",
        "CH1/目标产品说明书.txt",
    ]