feat: 支持资料包多文件与zip导入
This commit is contained in:
@@ -1,15 +1,23 @@
|
||||
from pathlib import Path
|
||||
from io import BytesIO
|
||||
import re
|
||||
import xml.etree.ElementTree as ET
|
||||
from zipfile import BadZipFile, ZipFile
|
||||
|
||||
from agent_core.rag.ingest import ingest_document
|
||||
from apps.chat.services import create_conversation_for_batch
|
||||
from django.core.files.uploadedfile import SimpleUploadedFile
|
||||
|
||||
from .models import SubmissionBatch, UploadedDocument
|
||||
|
||||
|
||||
def create_uploaded_document(scenario_id: str, uploaded_file, batch: SubmissionBatch | None = None) -> UploadedDocument:
|
||||
def create_uploaded_document(
|
||||
scenario_id: str,
|
||||
uploaded_file,
|
||||
batch: SubmissionBatch | None = None,
|
||||
*,
|
||||
relative_path: str | None = None,
|
||||
) -> UploadedDocument:
|
||||
"""
|
||||
保存上传文件的元数据记录。
|
||||
|
||||
@@ -20,11 +28,11 @@ def create_uploaded_document(scenario_id: str, uploaded_file, batch: SubmissionB
|
||||
return UploadedDocument.objects.create(
|
||||
batch=batch,
|
||||
scenario_id=scenario_id,
|
||||
original_name=uploaded_file.name,
|
||||
original_name=Path(relative_path or uploaded_file.name).name,
|
||||
file=uploaded_file,
|
||||
file_type=extension,
|
||||
size=uploaded_file.size,
|
||||
relative_path=uploaded_file.name,
|
||||
relative_path=relative_path or uploaded_file.name,
|
||||
status=UploadedDocument.STATUS_UPLOADED,
|
||||
)
|
||||
|
||||
@@ -49,14 +57,22 @@ def import_submission_batch(scenario_id: str, uploaded_files: list) -> dict:
|
||||
chapter_summary = {}
|
||||
total_pages = 0
|
||||
|
||||
for uploaded_file in uploaded_files:
|
||||
document = create_uploaded_document(scenario_id, uploaded_file, batch=batch)
|
||||
expanded_files = _expand_uploaded_files(uploaded_files)
|
||||
for uploaded_item in expanded_files:
|
||||
uploaded_file = uploaded_item["uploaded_file"]
|
||||
relative_path = uploaded_item["relative_path"]
|
||||
document = create_uploaded_document(
|
||||
scenario_id,
|
||||
uploaded_file,
|
||||
batch=batch,
|
||||
relative_path=relative_path,
|
||||
)
|
||||
text = extract_text(document)
|
||||
page_count = _estimate_page_count(text)
|
||||
document.page_count = page_count
|
||||
document.page_count_confidence = "estimated"
|
||||
document.document_role = _detect_document_role(document.original_name)
|
||||
document.chapter_code = _detect_chapter_code(document.original_name, text)
|
||||
document.document_role = _detect_document_role(document.relative_path)
|
||||
document.chapter_code = _detect_chapter_code(document.relative_path, text)
|
||||
document.chapter_match_status = "matched" if document.chapter_code else "unknown"
|
||||
document.needs_manual_review = not bool(document.chapter_code)
|
||||
document.save(
|
||||
@@ -74,7 +90,7 @@ def import_submission_batch(scenario_id: str, uploaded_files: list) -> dict:
|
||||
total_pages += page_count
|
||||
chapter_key = document.chapter_code or "UNCLASSIFIED"
|
||||
chapter_summary[chapter_key] = chapter_summary.get(chapter_key, 0) + 1
|
||||
candidates.extend(_extract_product_candidates(document.original_name, text))
|
||||
candidates.extend(_extract_product_candidates(document.relative_path, text))
|
||||
|
||||
product_name, warnings = _select_product_name(candidates)
|
||||
conversation = create_conversation_for_batch(batch.batch_id, product_name)
|
||||
@@ -197,6 +213,48 @@ def _estimate_page_count(text: str) -> int:
|
||||
return max(1, line_count)
|
||||
|
||||
|
||||
def _expand_uploaded_files(uploaded_files: list) -> list[dict]:
|
||||
expanded_files = []
|
||||
for uploaded_file in uploaded_files:
|
||||
extension = Path(uploaded_file.name).suffix.lower()
|
||||
if extension == ".zip":
|
||||
expanded_files.extend(_extract_zip_entries(uploaded_file))
|
||||
continue
|
||||
expanded_files.append(
|
||||
{
|
||||
"relative_path": uploaded_file.name,
|
||||
"uploaded_file": uploaded_file,
|
||||
}
|
||||
)
|
||||
return expanded_files
|
||||
|
||||
|
||||
def _extract_zip_entries(uploaded_file) -> list[dict]:
    """Unpack the supported documents contained in an uploaded ZIP archive.

    The whole upload is read into memory and then rewound, so the caller can
    still read the original file afterwards. Directory entries, entries whose
    extension is not ``.txt``/``.md``/``.pdf``/``.docx`` (case-insensitive),
    and macOS archive metadata (``__MACOSX/`` folders, ``._*`` files) are
    skipped.

    Args:
        uploaded_file: A readable, seekable file object holding ZIP data.

    Returns:
        A list of dicts with the keys ``"relative_path"`` (the sanitised,
        ``/``-separated path of the entry inside the archive) and
        ``"uploaded_file"`` (a ``SimpleUploadedFile`` named after the entry's
        base name and holding its decompressed bytes).

    Raises:
        zipfile.BadZipFile: If the upload is not a valid ZIP archive.
    """
    supported_extensions = {".txt", ".md", ".pdf", ".docx"}

    archive_bytes = uploaded_file.read()
    # Rewind so the original upload stays readable for whoever handles it next.
    uploaded_file.seek(0)

    entries = []
    with ZipFile(BytesIO(archive_bytes)) as archive:
        for info in archive.infolist():
            if info.is_dir():
                continue

            # Entry names come from an untrusted archive: normalise Windows
            # separators and drop empty, "." and ".." components so the stored
            # relative path can never be absolute or climb out of its root.
            parts = [
                part
                for part in info.filename.replace("\\", "/").split("/")
                if part not in ("", ".", "..")
            ]
            if not parts:
                continue
            file_name = parts[-1]

            # macOS archivers add "__MACOSX/" folders and "._name" AppleDouble
            # files; they reuse the real extension but contain only metadata.
            if "__MACOSX" in parts or file_name.startswith("._"):
                continue
            if Path(file_name).suffix.lower() not in supported_extensions:
                continue

            # Read through the ZipInfo (not the name) so that duplicate names
            # inside one archive each yield their own payload.
            file_data = archive.read(info)
            entries.append(
                {
                    "relative_path": "/".join(parts),
                    "uploaded_file": SimpleUploadedFile(file_name, file_data),
                }
            )
    return entries
|
||||
|
||||
|
||||
def _detect_document_role(file_name: str) -> str:
|
||||
normalized = file_name.lower()
|
||||
if "申请表" in file_name:
|
||||
|
||||
Reference in New Issue
Block a user