feat: 支持资料包多文件与zip导入
This commit is contained in:
@@ -5,14 +5,31 @@ from django import forms
|
||||
from apps.scenarios.services import ScenarioNotFound, get_scenario
|
||||
from apps.scenarios.services import list_scenarios
|
||||
|
||||
SUPPORTED_EXTENSIONS = {".txt", ".md", ".pdf", ".docx"}
|
||||
SUPPORTED_EXTENSIONS = {".txt", ".md", ".pdf", ".docx", ".zip"}
|
||||
|
||||
|
||||
class MultipleFileInput(forms.ClearableFileInput):
    """Clearable file input that permits selecting several files at once."""

    # Opt-in flag read by Django's file input widget; see the Django
    # "uploading multiple files" documentation for its exact effect.
    allow_multiple_selected = True
|
||||
|
||||
|
||||
class MultipleFileField(forms.FileField):
    """File field that cleans every uploaded file and returns them as a list."""

    widget = MultipleFileInput

    def clean(self, data, initial=None):
        """Run the stock ``FileField`` validation on each uploaded file.

        Args:
            data: A single uploaded file, a list/tuple of uploaded files, or
                an empty value when nothing was submitted.
            initial: Initial value forwarded unchanged to ``FileField.clean``.

        Returns:
            A list of cleaned files. The list is empty when nothing was
            uploaded and the field is optional (or an initial value exists).

        Raises:
            forms.ValidationError: If a file fails ``FileField`` validation,
                or if the field is required and nothing was uploaded.
        """
        # Bind the parent implementation once: zero-argument ``super()`` is
        # not usable inside the list comprehension below.
        validate_one = super().clean

        if not data:
            # Returning [] unconditionally would bypass the ``required``
            # check entirely, so enforce it here before short-circuiting.
            if self.required and not initial:
                raise forms.ValidationError(
                    self.error_messages["required"],
                    code="required",
                )
            return []

        items = data if isinstance(data, (list, tuple)) else [data]
        return [validate_one(item, initial) for item in items]
|
||||
|
||||
|
||||
class DocumentUploadForm(forms.Form):
|
||||
# 使用 ChoiceField 让表单自己维护场景选项,
|
||||
# 这样模板、校验和后续扩展都能围绕一个入口完成。
|
||||
scenario_id = forms.ChoiceField(label="场景", choices=())
|
||||
file = forms.FileField(label="文件")
|
||||
files = MultipleFileField(label="文件或资料包", required=False)
|
||||
file = forms.FileField(label="兼容单文件上传", required=False)
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
@@ -31,7 +48,28 @@ class DocumentUploadForm(forms.Form):
|
||||
|
||||
def clean_file(self):
    """Validate the backward-compatible single-file upload field.

    Returns:
        The uploaded file unchanged, or the empty value as-is when no file
        was submitted through this field.

    Raises:
        forms.ValidationError: If the file extension is not listed in
            ``SUPPORTED_EXTENSIONS``.
    """
    uploaded_file = self.cleaned_data["file"]
    if not uploaded_file:
        # The field is optional; nothing to validate.
        return uploaded_file

    extension = Path(uploaded_file.name).suffix.lower()
    if extension not in SUPPORTED_EXTENSIONS:
        # Only the message that mentions .zip is kept: the older message
        # omitted it and made this statement unreachable.
        raise forms.ValidationError("仅支持 .txt、.md、.pdf、.docx 和 .zip 文件")
    return uploaded_file
|
||||
|
||||
def clean_files(self):
    """Check the extension of every file submitted through ``files``.

    Returns:
        The cleaned list of uploaded files (an empty list when none were
        submitted).

    Raises:
        forms.ValidationError: On the first file whose extension is not in
            ``SUPPORTED_EXTENSIONS``.
    """
    submitted = self.cleaned_data.get("files") or []
    for candidate in submitted:
        suffix = Path(candidate.name).suffix.lower()
        if suffix in SUPPORTED_EXTENSIONS:
            continue
        raise forms.ValidationError("仅支持 .txt、.md、.pdf、.docx 和 .zip 文件")
    return submitted
|
||||
|
||||
def clean(self):
    """Merge both upload fields into ``cleaned_data["uploaded_files"]``.

    Files from ``files`` come first, followed by the single ``file`` upload
    when one was provided.

    Raises:
        forms.ValidationError: If neither field supplied any file.
    """
    cleaned_data = super().clean()

    combined = [*(cleaned_data.get("files") or [])]
    single_upload = cleaned_data.get("file")
    if single_upload:
        combined.append(single_upload)

    if not combined:
        raise forms.ValidationError("请至少上传一个文件或一个 zip 资料包。")

    cleaned_data["uploaded_files"] = combined
    return cleaned_data
|
||||
|
||||
@@ -1,15 +1,23 @@
|
||||
from pathlib import Path
|
||||
from io import BytesIO
|
||||
import re
|
||||
import xml.etree.ElementTree as ET
|
||||
from zipfile import BadZipFile, ZipFile
|
||||
|
||||
from agent_core.rag.ingest import ingest_document
|
||||
from apps.chat.services import create_conversation_for_batch
|
||||
from django.core.files.uploadedfile import SimpleUploadedFile
|
||||
|
||||
from .models import SubmissionBatch, UploadedDocument
|
||||
|
||||
|
||||
def create_uploaded_document(scenario_id: str, uploaded_file, batch: SubmissionBatch | None = None) -> UploadedDocument:
|
||||
def create_uploaded_document(
|
||||
scenario_id: str,
|
||||
uploaded_file,
|
||||
batch: SubmissionBatch | None = None,
|
||||
*,
|
||||
relative_path: str | None = None,
|
||||
) -> UploadedDocument:
|
||||
"""
|
||||
保存上传文件的元数据记录。
|
||||
|
||||
@@ -20,11 +28,11 @@ def create_uploaded_document(scenario_id: str, uploaded_file, batch: SubmissionB
|
||||
return UploadedDocument.objects.create(
|
||||
batch=batch,
|
||||
scenario_id=scenario_id,
|
||||
original_name=uploaded_file.name,
|
||||
original_name=Path(relative_path or uploaded_file.name).name,
|
||||
file=uploaded_file,
|
||||
file_type=extension,
|
||||
size=uploaded_file.size,
|
||||
relative_path=uploaded_file.name,
|
||||
relative_path=relative_path or uploaded_file.name,
|
||||
status=UploadedDocument.STATUS_UPLOADED,
|
||||
)
|
||||
|
||||
@@ -49,14 +57,22 @@ def import_submission_batch(scenario_id: str, uploaded_files: list) -> dict:
|
||||
chapter_summary = {}
|
||||
total_pages = 0
|
||||
|
||||
for uploaded_file in uploaded_files:
|
||||
document = create_uploaded_document(scenario_id, uploaded_file, batch=batch)
|
||||
expanded_files = _expand_uploaded_files(uploaded_files)
|
||||
for uploaded_item in expanded_files:
|
||||
uploaded_file = uploaded_item["uploaded_file"]
|
||||
relative_path = uploaded_item["relative_path"]
|
||||
document = create_uploaded_document(
|
||||
scenario_id,
|
||||
uploaded_file,
|
||||
batch=batch,
|
||||
relative_path=relative_path,
|
||||
)
|
||||
text = extract_text(document)
|
||||
page_count = _estimate_page_count(text)
|
||||
document.page_count = page_count
|
||||
document.page_count_confidence = "estimated"
|
||||
document.document_role = _detect_document_role(document.original_name)
|
||||
document.chapter_code = _detect_chapter_code(document.original_name, text)
|
||||
document.document_role = _detect_document_role(document.relative_path)
|
||||
document.chapter_code = _detect_chapter_code(document.relative_path, text)
|
||||
document.chapter_match_status = "matched" if document.chapter_code else "unknown"
|
||||
document.needs_manual_review = not bool(document.chapter_code)
|
||||
document.save(
|
||||
@@ -74,7 +90,7 @@ def import_submission_batch(scenario_id: str, uploaded_files: list) -> dict:
|
||||
total_pages += page_count
|
||||
chapter_key = document.chapter_code or "UNCLASSIFIED"
|
||||
chapter_summary[chapter_key] = chapter_summary.get(chapter_key, 0) + 1
|
||||
candidates.extend(_extract_product_candidates(document.original_name, text))
|
||||
candidates.extend(_extract_product_candidates(document.relative_path, text))
|
||||
|
||||
product_name, warnings = _select_product_name(candidates)
|
||||
conversation = create_conversation_for_batch(batch.batch_id, product_name)
|
||||
@@ -197,6 +213,48 @@ def _estimate_page_count(text: str) -> int:
|
||||
return max(1, line_count)
|
||||
|
||||
|
||||
def _expand_uploaded_files(uploaded_files: list) -> list[dict]:
|
||||
expanded_files = []
|
||||
for uploaded_file in uploaded_files:
|
||||
extension = Path(uploaded_file.name).suffix.lower()
|
||||
if extension == ".zip":
|
||||
expanded_files.extend(_extract_zip_entries(uploaded_file))
|
||||
continue
|
||||
expanded_files.append(
|
||||
{
|
||||
"relative_path": uploaded_file.name,
|
||||
"uploaded_file": uploaded_file,
|
||||
}
|
||||
)
|
||||
return expanded_files
|
||||
|
||||
|
||||
def _extract_zip_entries(uploaded_file) -> list[dict]:
|
||||
archive_bytes = uploaded_file.read()
|
||||
uploaded_file.seek(0)
|
||||
entries = []
|
||||
with ZipFile(BytesIO(archive_bytes)) as archive:
|
||||
for info in archive.infolist():
|
||||
if info.is_dir():
|
||||
continue
|
||||
relative_path = info.filename.replace("\\", "/")
|
||||
extension = Path(relative_path).suffix.lower()
|
||||
if extension not in {".txt", ".md", ".pdf", ".docx"}:
|
||||
continue
|
||||
file_data = archive.read(info.filename)
|
||||
extracted_file = SimpleUploadedFile(
|
||||
Path(relative_path).name,
|
||||
file_data,
|
||||
)
|
||||
entries.append(
|
||||
{
|
||||
"relative_path": relative_path,
|
||||
"uploaded_file": extracted_file,
|
||||
}
|
||||
)
|
||||
return entries
|
||||
|
||||
|
||||
def _detect_document_role(file_name: str) -> str:
|
||||
normalized = file_name.lower()
|
||||
if "申请表" in file_name:
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
from django.contrib import messages
|
||||
from django.db.models import Q
|
||||
from django.shortcuts import get_object_or_404, redirect, render
|
||||
from django.views.decorators.http import require_POST
|
||||
|
||||
@@ -14,7 +15,9 @@ def document_list(request):
|
||||
keyword = (request.GET.get("keyword") or "").strip()
|
||||
batches = SubmissionBatch.objects.all()
|
||||
if keyword:
|
||||
batches = batches.filter(product_name__icontains=keyword)
|
||||
batches = batches.filter(
|
||||
Q(product_name__icontains=keyword) | Q(batch_id__icontains=keyword)
|
||||
)
|
||||
documents = UploadedDocument.objects.all()
|
||||
status_counts = {
|
||||
"pending": batches.filter(import_status=SubmissionBatch.STATUS_PENDING).count(),
|
||||
@@ -57,7 +60,7 @@ def upload(request):
|
||||
if form.is_valid():
|
||||
result = import_submission_batch(
|
||||
form.cleaned_data["scenario_id"],
|
||||
[form.cleaned_data["file"]],
|
||||
form.cleaned_data["uploaded_files"],
|
||||
)
|
||||
messages.success(
|
||||
request,
|
||||
@@ -73,8 +76,9 @@ def upload(request):
|
||||
"form": form,
|
||||
"scenarios": list_scenarios(),
|
||||
"upload_checks": [
|
||||
"文件格式支持 PDF、DOCX、MD、TXT",
|
||||
"文件格式支持 PDF、DOCX、MD、TXT 与 ZIP 资料包",
|
||||
"业务资料与法规依据资料需分开归属",
|
||||
"支持一次上传多份文件并归并到同一个资料包",
|
||||
"目录类文件会优先参与完整性校验",
|
||||
"上传完成后建议立即进入解析与入库流程",
|
||||
],
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
<div class="section-heading">
|
||||
<div>
|
||||
<h2 class="section-title">资料导入向导</h2>
|
||||
<p class="section-copy">当前支持 `.txt`、`.md`、`.pdf` 和 `.docx`。上传成功后即可回到文件中心执行解析与入库。</p>
|
||||
<p class="section-copy">当前支持多文件上传,以及 `.txt`、`.md`、`.pdf`、`.docx` 与 `.zip` 资料包。上传成功后会直接形成一个资料包并绑定会话。</p>
|
||||
</div>
|
||||
</div>
|
||||
<form method="post" enctype="multipart/form-data" class="stack">
|
||||
@@ -27,18 +27,19 @@
|
||||
{% endif %}
|
||||
</div>
|
||||
<div>
|
||||
{{ form.file.label_tag }}
|
||||
{{ form.file }}
|
||||
{% if form.file.errors %}
|
||||
<p class="notice notice-error">{{ form.file.errors|join:" " }}</p>
|
||||
{{ form.files.label_tag }}
|
||||
{{ form.files }}
|
||||
{% if form.files.errors %}
|
||||
<p class="notice notice-error">{{ form.files.errors|join:" " }}</p>
|
||||
{% endif %}
|
||||
<p class="help-text">可一次选择多份文件,或上传一个 zip 资料包。</p>
|
||||
</div>
|
||||
{% if form.errors %}
|
||||
<div class="notice notice-error">{{ form.errors }}</div>
|
||||
{% endif %}
|
||||
<div class="button-row">
|
||||
<button type="submit">确认导入</button>
|
||||
<a class="button" href="{% url 'documents:list' %}">返回文件中心</a>
|
||||
<a class="button" href="{% url 'documents:list' %}">返回资料包</a>
|
||||
</div>
|
||||
</form>
|
||||
</article>
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
from django.core.files.uploadedfile import SimpleUploadedFile
|
||||
from django.urls import reverse
|
||||
from io import BytesIO
|
||||
from zipfile import ZipFile
|
||||
|
||||
from apps.documents.forms import DocumentUploadForm
|
||||
from apps.documents.models import SubmissionBatch, UploadedDocument
|
||||
@@ -197,6 +199,34 @@ def test_document_list_supports_product_name_search(client, db):
|
||||
assert "呼吸道合胞病毒核酸检测试剂盒" not in content
|
||||
|
||||
|
||||
def test_document_list_supports_batch_id_search(client, db):
    """The list view's ``keyword`` filter also matches on ``batch_id``."""
    seed_rows = (
        ("SUB-20260604-001", "产品A", "conv-001", 2, 12),
        ("SUB-20260604-002", "产品B", "conv-002", 3, 20),
    )
    for batch_id, product_name, conversation_id, file_count, page_count in seed_rows:
        SubmissionBatch.objects.create(
            batch_id=batch_id,
            product_name=product_name,
            workflow_type="registration",
            conversation_id=conversation_id,
            file_count=file_count,
            page_count=page_count,
            import_status="completed",
        )

    list_url = reverse("documents:list")
    response = client.get(list_url, {"keyword": "SUB-20260604-002"})

    page = response.content.decode("utf-8")
    assert response.status_code == 200
    assert "SUB-20260604-002" in page
    assert "SUB-20260604-001" not in page
|
||||
|
||||
|
||||
def test_import_submission_batch_marks_manual_review_when_product_names_conflict(db):
|
||||
files = [
|
||||
SimpleUploadedFile(
|
||||
@@ -217,3 +247,50 @@ def test_import_submission_batch_marks_manual_review_when_product_names_conflict
|
||||
assert batch.import_status == "review_required"
|
||||
assert result["registration_overview_report"]["warnings"]
|
||||
assert "产品名称来源冲突" in result["registration_overview_report"]["warnings"][0]
|
||||
|
||||
|
||||
def test_upload_multiple_files_creates_single_submission_batch_and_multiple_documents(client, db):
    """Posting two files through ``files`` yields one batch with two documents."""
    shared_body = "产品名称:新型冠状病毒 2019-nCoV 核酸检测试剂盒".encode("utf-8")
    uploads = [
        SimpleUploadedFile(file_name, shared_body, content_type="text/plain")
        for file_name in ("注册申请表.txt", "目标产品说明书.txt")
    ]

    payload = {"scenario_id": "document_review", "files": uploads}
    response = client.post(reverse("documents:upload"), payload)

    assert response.status_code == 302
    batch = SubmissionBatch.objects.get()
    assert batch.file_count == 2
    assert UploadedDocument.objects.filter(batch=batch).count() == 2
    assert Conversation.objects.get().title == "新型冠状病毒 2019-nCoV 核酸检测试剂盒"
|
||||
|
||||
|
||||
def test_import_submission_batch_supports_zip_package_and_preserves_relative_paths(db):
    """A zip package is expanded and each document keeps its in-archive path."""
    member_names = ["CH1/注册申请表.txt", "CH1/目标产品说明书.txt"]

    buffer = BytesIO()
    with ZipFile(buffer, "w") as bundle:
        for member_name in member_names:
            bundle.writestr(member_name, "产品名称:产品A")
    package = SimpleUploadedFile(
        "registration-package.zip",
        buffer.getvalue(),
        content_type="application/zip",
    )

    result = import_submission_batch("document_review", [package])

    batch = SubmissionBatch.objects.get(batch_id=result["batch_id"])
    stored = UploadedDocument.objects.filter(batch=batch).order_by("relative_path")
    stored_paths = [document.relative_path for document in stored]
    assert batch.file_count == 2
    assert stored_paths == [
        "CH1/注册申请表.txt",
        "CH1/目标产品说明书.txt",
    ]
|
||||
|
||||
Reference in New Issue
Block a user