feat: 支持资料包多文件与zip导入

This commit is contained in:
2026-06-04 01:07:15 +08:00
parent 2b40ddc487
commit aa0a24fe5a
5 changed files with 198 additions and 20 deletions

View File

@@ -1,15 +1,23 @@
from pathlib import Path
from io import BytesIO
import re
import xml.etree.ElementTree as ET
from zipfile import BadZipFile, ZipFile
from agent_core.rag.ingest import ingest_document
from apps.chat.services import create_conversation_for_batch
from django.core.files.uploadedfile import SimpleUploadedFile
from .models import SubmissionBatch, UploadedDocument
def create_uploaded_document(scenario_id: str, uploaded_file, batch: SubmissionBatch | None = None) -> UploadedDocument:
def create_uploaded_document(
scenario_id: str,
uploaded_file,
batch: SubmissionBatch | None = None,
*,
relative_path: str | None = None,
) -> UploadedDocument:
"""
保存上传文件的元数据记录。
@@ -20,11 +28,11 @@ def create_uploaded_document(scenario_id: str, uploaded_file, batch: SubmissionB
return UploadedDocument.objects.create(
batch=batch,
scenario_id=scenario_id,
original_name=uploaded_file.name,
original_name=Path(relative_path or uploaded_file.name).name,
file=uploaded_file,
file_type=extension,
size=uploaded_file.size,
relative_path=uploaded_file.name,
relative_path=relative_path or uploaded_file.name,
status=UploadedDocument.STATUS_UPLOADED,
)
@@ -49,14 +57,22 @@ def import_submission_batch(scenario_id: str, uploaded_files: list) -> dict:
chapter_summary = {}
total_pages = 0
for uploaded_file in uploaded_files:
document = create_uploaded_document(scenario_id, uploaded_file, batch=batch)
expanded_files = _expand_uploaded_files(uploaded_files)
for uploaded_item in expanded_files:
uploaded_file = uploaded_item["uploaded_file"]
relative_path = uploaded_item["relative_path"]
document = create_uploaded_document(
scenario_id,
uploaded_file,
batch=batch,
relative_path=relative_path,
)
text = extract_text(document)
page_count = _estimate_page_count(text)
document.page_count = page_count
document.page_count_confidence = "estimated"
document.document_role = _detect_document_role(document.original_name)
document.chapter_code = _detect_chapter_code(document.original_name, text)
document.document_role = _detect_document_role(document.relative_path)
document.chapter_code = _detect_chapter_code(document.relative_path, text)
document.chapter_match_status = "matched" if document.chapter_code else "unknown"
document.needs_manual_review = not bool(document.chapter_code)
document.save(
@@ -74,7 +90,7 @@ def import_submission_batch(scenario_id: str, uploaded_files: list) -> dict:
total_pages += page_count
chapter_key = document.chapter_code or "UNCLASSIFIED"
chapter_summary[chapter_key] = chapter_summary.get(chapter_key, 0) + 1
candidates.extend(_extract_product_candidates(document.original_name, text))
candidates.extend(_extract_product_candidates(document.relative_path, text))
product_name, warnings = _select_product_name(candidates)
conversation = create_conversation_for_batch(batch.batch_id, product_name)
@@ -197,6 +213,48 @@ def _estimate_page_count(text: str) -> int:
return max(1, line_count)
def _expand_uploaded_files(uploaded_files: list) -> list[dict]:
expanded_files = []
for uploaded_file in uploaded_files:
extension = Path(uploaded_file.name).suffix.lower()
if extension == ".zip":
expanded_files.extend(_extract_zip_entries(uploaded_file))
continue
expanded_files.append(
{
"relative_path": uploaded_file.name,
"uploaded_file": uploaded_file,
}
)
return expanded_files
def _decode_zip_member_name(info) -> str:
    """Return the name of a zip member, repairing legacy name encodings.

    ``zipfile`` decodes a member name as cp437 unless the entry carries the
    UTF-8 flag (general purpose bit 11, ``0x800``). Archives produced by
    tools that store raw UTF-8 or GBK bytes without that flag therefore come
    out garbled, which breaks every filename-based check downstream.

    Args:
        info: The ``ZipInfo`` of the member.

    Returns:
        The repaired name, or ``info.filename`` unchanged when the entry is
        flagged UTF-8 or no candidate encoding fits.
    """
    if info.flag_bits & 0x800:
        return info.filename
    try:
        raw_name = info.filename.encode("cp437")
    except UnicodeEncodeError:
        # Not a cp437 round-trip (e.g. a custom metadata encoding was used).
        return info.filename
    # NOTE(review): GBK is assumed to be the most likely legacy encoding for
    # this deployment — confirm against the archives users actually upload.
    for encoding in ("utf-8", "gbk"):
        try:
            return raw_name.decode(encoding)
        except UnicodeDecodeError:
            continue
    return info.filename


def _extract_zip_entries(uploaded_file) -> list[dict]:
    """Read the supported documents out of an uploaded zip archive.

    The archive is loaded fully into memory and the upload is rewound
    afterwards so that it can still be read by the caller. Directories,
    macOS metadata entries (``__MACOSX/`` folders and ``._*`` AppleDouble
    files) and members whose extension is not ``.txt``, ``.md``, ``.pdf`` or
    ``.docx`` are skipped.

    Args:
        uploaded_file: The uploaded archive; must support ``read`` and
            ``seek``.

    Returns:
        One dict per extracted member with the keys ``"relative_path"``
        (path inside the archive, ``/``-separated) and ``"uploaded_file"``
        (a ``SimpleUploadedFile`` named after the member's base name).

    Raises:
        zipfile.BadZipFile: If the upload is not a valid zip archive.
    """
    archive_bytes = uploaded_file.read()
    uploaded_file.seek(0)

    supported_extensions = {".txt", ".md", ".pdf", ".docx"}
    entries = []
    with ZipFile(BytesIO(archive_bytes)) as archive:
        for info in archive.infolist():
            if info.is_dir():
                continue
            # Repair the name before normalising separators: a GBK trail
            # byte can be 0x5C, which looks like a backslash while the name
            # is still mis-decoded as cp437.
            relative_path = _decode_zip_member_name(info).replace("\\", "/")
            member = Path(relative_path)
            # Resource-fork entries carry a real extension but no document.
            if "__MACOSX" in member.parts or member.name.startswith("._"):
                continue
            if member.suffix.lower() not in supported_extensions:
                continue
            # Read via the ZipInfo so duplicate member names resolve to the
            # entry being iterated rather than to the last one of that name.
            extracted_file = SimpleUploadedFile(member.name, archive.read(info))
            entries.append(
                {
                    "relative_path": relative_path,
                    "uploaded_file": extracted_file,
                }
            )
    return entries
def _detect_document_role(file_name: str) -> str:
normalized = file_name.lower()
if "申请表" in file_name: