feat: 支持资料包多文件与zip导入

2026-06-04 01:07:15 +08:00
parent 2b40ddc487
commit aa0a24fe5a
5 changed files with 198 additions and 20 deletions
--- a/apps/documents/services.py
+++ b/apps/documents/services.py
@@ -1,15 +1,23 @@
 from pathlib import Path
+from io import BytesIO
 import re
 import xml.etree.ElementTree as ET
 from zipfile import BadZipFile, ZipFile

 from agent_core.rag.ingest import ingest_document
 from apps.chat.services import create_conversation_for_batch
+from django.core.files.uploadedfile import SimpleUploadedFile

 from .models import SubmissionBatch, UploadedDocument


-def create_uploaded_document(scenario_id: str, uploaded_file, batch: SubmissionBatch | None = None) -> UploadedDocument:
+def create_uploaded_document(
+    scenario_id: str,
+    uploaded_file,
+    batch: SubmissionBatch | None = None,
+    *,
+    relative_path: str | None = None,
+) -> UploadedDocument:
    """
    保存上传文件的元数据记录。

@@ -20,11 +28,11 @@ def create_uploaded_document(scenario_id: str, uploaded_file, batch: SubmissionB
    return UploadedDocument.objects.create(
        batch=batch,
        scenario_id=scenario_id,
-        original_name=uploaded_file.name,
+        original_name=Path(relative_path or uploaded_file.name).name,
        file=uploaded_file,
        file_type=extension,
        size=uploaded_file.size,
-        relative_path=uploaded_file.name,
+        relative_path=relative_path or uploaded_file.name,
        status=UploadedDocument.STATUS_UPLOADED,
    )

@@ -49,14 +57,22 @@ def import_submission_batch(scenario_id: str, uploaded_files: list) -> dict:
    chapter_summary = {}
    total_pages = 0

-    for uploaded_file in uploaded_files:
-        document = create_uploaded_document(scenario_id, uploaded_file, batch=batch)
+    expanded_files = _expand_uploaded_files(uploaded_files)
+    for uploaded_item in expanded_files:
+        uploaded_file = uploaded_item["uploaded_file"]
+        relative_path = uploaded_item["relative_path"]
+        document = create_uploaded_document(
+            scenario_id,
+            uploaded_file,
+            batch=batch,
+            relative_path=relative_path,
+        )
        text = extract_text(document)
        page_count = _estimate_page_count(text)
        document.page_count = page_count
        document.page_count_confidence = "estimated"
-        document.document_role = _detect_document_role(document.original_name)
-        document.chapter_code = _detect_chapter_code(document.original_name, text)
+        document.document_role = _detect_document_role(document.relative_path)
+        document.chapter_code = _detect_chapter_code(document.relative_path, text)
        document.chapter_match_status = "matched" if document.chapter_code else "unknown"
        document.needs_manual_review = not bool(document.chapter_code)
        document.save(
@@ -74,7 +90,7 @@ def import_submission_batch(scenario_id: str, uploaded_files: list) -> dict:
        total_pages += page_count
        chapter_key = document.chapter_code or "UNCLASSIFIED"
        chapter_summary[chapter_key] = chapter_summary.get(chapter_key, 0) + 1
-        candidates.extend(_extract_product_candidates(document.original_name, text))
+        candidates.extend(_extract_product_candidates(document.relative_path, text))

    product_name, warnings = _select_product_name(candidates)
    conversation = create_conversation_for_batch(batch.batch_id, product_name)
@@ -197,6 +213,48 @@ def _estimate_page_count(text: str) -> int:
    return max(1, line_count)


+def _expand_uploaded_files(uploaded_files: list) -> list[dict]:
+    """Flatten uploads into ``{"relative_path", "uploaded_file"}`` dicts.
+
+    A ``.zip`` upload (suffix matched case-insensitively) is replaced by the
+    entries returned from ``_extract_zip_entries``; any other file is passed
+    through with its own name as the relative path. Input order is kept.
+    """
+    flattened: list[dict] = []
+    for upload in uploaded_files:
+        if Path(upload.name).suffix.lower() != ".zip":
+            flattened.append({"relative_path": upload.name, "uploaded_file": upload})
+        else:
+            flattened += _extract_zip_entries(upload)
+    return flattened
+
+
+def _extract_zip_entries(uploaded_file) -> list[dict]:
+    """Expand a zip upload into ``{"relative_path", "uploaded_file"}`` dicts.
+
+    Keeps only .txt/.md/.pdf/.docx members; skips directories and macOS
+    metadata (``__MACOSX/``, ``._*``). Stored paths are normalised to be
+    relative: backslashes become "/", empty, "." and ".." parts are dropped.
+    """
+    archive_bytes = uploaded_file.read()
+    uploaded_file.seek(0)  # rewind so the original upload stays readable
+    entries = []
+    with ZipFile(BytesIO(archive_bytes)) as archive:
+        for info in archive.infolist():
+            raw_parts = info.filename.replace("\\", "/").split("/")
+            parts = [part for part in raw_parts if part not in ("", ".", "..")]
+            if info.is_dir() or not parts:
+                continue
+            if "__MACOSX" in parts or parts[-1].startswith("._"):
+                continue  # AppleDouble resource forks are not real documents
+            if Path(parts[-1]).suffix.lower() not in {".txt", ".md", ".pdf", ".docx"}:
+                continue
+            # Read by ZipInfo so duplicate member names resolve to this entry.
+            extracted_file = SimpleUploadedFile(parts[-1], archive.read(info))
+            entries.append({"relative_path": "/".join(parts), "uploaded_file": extracted_file})
+    return entries
+
+
 def _detect_document_role(file_name: str) -> str:
    normalized = file_name.lower()
    if "申请表" in file_name: