From 9bca08001f2727f328993ef7537bd40d21db18e6 Mon Sep 17 00:00:00 2001
From: bruce
Date: Thu, 4 Jun 2026 03:35:34 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20=E8=A1=A5=E9=BD=90DOCX=E7=B2=BE?=
 =?UTF-8?q?=E7=A1=AE=E9=A1=B5=E6=95=B0=E8=AF=86=E5=88=AB=E4=B8=8E=E5=BE=85?=
 =?UTF-8?q?=E5=A4=8D=E6=A0=B8=E7=AD=96=E7=95=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 apps/documents/services.py | 64 +++++++++++++++++++++++++++++++++--
 tests/test_documents.py    | 68 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 128 insertions(+), 4 deletions(-)

diff --git a/apps/documents/services.py b/apps/documents/services.py
index 5795d7e..a769c48 100644
--- a/apps/documents/services.py
+++ b/apps/documents/services.py
@@ -316,6 +316,28 @@ def _estimate_page_count(text: str) -> int:
     return max(1, line_count)
 
 
+def _resolve_page_count(document: UploadedDocument, text: str) -> tuple[int, str]:
+    """
+    按文件类型返回页数与可信度。
+
+    - PDF:优先统计真实页数
+    - DOCX:优先读取 Word 页数元数据
+    - 其他类型:退回估算
+    """
+    extension = document.file_type.lower()
+    if extension == "pdf":
+        page_count = _extract_pdf_page_count(Path(document.file.path))
+        if page_count > 0:
+            return page_count, "exact"
+        return _estimate_page_count(text), "estimated"
+    if extension == "docx":
+        page_count = _extract_docx_page_count(Path(document.file.path))
+        if page_count > 0:
+            return page_count, "exact"
+        return _estimate_page_count(text), "estimated"
+    return _estimate_page_count(text), "estimated"
+
+
 def _expand_uploaded_files(uploaded_files: list) -> list[dict]:
     expanded_files = []
     warnings = []
@@ -368,13 +390,18 @@ def _ingest_files_into_batch(
             relative_path=relative_path,
         )
         text = extract_text(document)
-        page_count = _estimate_page_count(text)
+        page_count, page_count_confidence = _resolve_page_count(document, text)
         document.page_count = page_count
-        document.page_count_confidence = "estimated"
+        document.page_count_confidence = page_count_confidence
         document.document_role = _detect_document_role(document.relative_path)
         document.chapter_code = _detect_chapter_code(document.relative_path, text)
         document.chapter_match_status = "matched" if document.chapter_code else "unknown"
-        document.needs_manual_review = not bool(document.chapter_code)
+        document.needs_manual_review = (
+            not bool(document.chapter_code)
+            or (document.file_type.lower() == "docx" and page_count_confidence != "exact")
+        )
+        if document.file_type.lower() == "docx" and page_count_confidence != "exact":
+            warnings.append(f"DOCX 页数无法精确统计:{document.relative_path}")
         document.save(
             update_fields=[
                 "page_count",
@@ -622,6 +649,17 @@ def _extract_pdf_text(path: Path) -> str:
         return _read_binary_text_fallback(path)
 
 
+def _extract_pdf_page_count(path: Path) -> int:
+    """优先使用 pypdf 统计 PDF 真实页数。"""
+    try:
+        import pypdf
+
+        reader = pypdf.PdfReader(str(path))
+        return len(reader.pages)
+    except Exception:
+        return 0
+
+
 def _extract_docx_text(path: Path) -> str:
     """提取 Word XML 中的可见文字内容,不追求保留样式。"""
     try:
@@ -635,6 +673,26 @@ def _extract_docx_text(path: Path) -> str:
         return _read_binary_text_fallback(path)
 
 
+def _extract_docx_page_count(path: Path) -> int:
+    """
+    从 Word 扩展属性中提取真实页数。
+
+    常见 docx 会在 `docProps/app.xml` 中写入 `<Pages>`。
+    若缺失该元数据,则由上层回退为估算并进入待复核。
+    """
+    try:
+        with ZipFile(path) as archive:
+            app_xml = archive.read("docProps/app.xml")
+        root = ET.fromstring(app_xml)
+        namespace = {"ep": "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties"}
+        pages_node = root.find(".//ep:Pages", namespace)
+        if pages_node is None or not (pages_node.text or "").strip():
+            return 0
+        return int((pages_node.text or "").strip())
+    except (BadZipFile, KeyError, ET.ParseError, ValueError):
+        return 0
+
+
 def _read_binary_text_fallback(path: Path) -> str:
     """
     当结构化抽取失败时,退回到“尽可能保留纯文本”的保底方案。
diff --git a/tests/test_documents.py b/tests/test_documents.py
index ba5a27d..83c8be2 100644
--- a/tests/test_documents.py
+++ b/tests/test_documents.py
@@ -4,6 +4,7 @@ from io import BytesIO
 from pathlib import Path
 import sys
 import types
+import zipfile
 from zipfile import ZipFile
 
 from apps.documents.forms import DocumentUploadForm
@@ -418,7 +419,72 @@ def test_import_submission_batch_records_warnings_for_unsupported_zip_entries(db
     assert batch.file_count == 1
     assert batch.exception_count == 1
     assert "跳过不支持的文件" in warnings[0]
-    assert "CH1/忽略图片.png" in warnings[0]
+
+
+def test_import_submission_batch_uses_exact_docx_page_count_from_metadata(db):
+    archive = BytesIO()
+    with zipfile.ZipFile(archive, "w") as docx_file:
+        docx_file.writestr(
+            "word/document.xml",
+            """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+            <w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+              <w:body>
+                <w:p><w:r><w:t>产品名称:新型冠状病毒 2019-nCoV 核酸检测试剂盒</w:t></w:r></w:p>
+              </w:body>
+            </w:document>""",
+        )
+        docx_file.writestr(
+            "docProps/app.xml",
+            """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+            <Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties">
+              <Pages>7</Pages>
+            </Properties>""",
+        )
+    archive.seek(0)
+    file = SimpleUploadedFile(
+        "CH1-目标产品说明书.docx",
+        archive.read(),
+        content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    )
+
+    result = import_submission_batch("document_review", [file])
+
+    batch = SubmissionBatch.objects.get(batch_id=result["batch_id"])
+    document = UploadedDocument.objects.get(batch=batch)
+    assert batch.page_count == 7
+    assert document.page_count == 7
+    assert document.page_count_confidence == "exact"
+    assert batch.import_status == "completed"
+
+
+def test_import_submission_batch_marks_review_when_docx_page_count_cannot_be_precisely_detected(db):
+    archive = BytesIO()
+    with zipfile.ZipFile(archive, "w") as docx_file:
+        docx_file.writestr(
+            "word/document.xml",
+            """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+            <w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+              <w:body>
+                <w:p><w:r><w:t>产品名称:新型冠状病毒 2019-nCoV 核酸检测试剂盒</w:t></w:r></w:p>
+              </w:body>
+            </w:document>""",
+        )
+    archive.seek(0)
+    file = SimpleUploadedFile(
+        "CH1-目标产品说明书.docx",
+        archive.read(),
+        content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    )
+
+    result = import_submission_batch("document_review", [file])
+
+    batch = SubmissionBatch.objects.get(batch_id=result["batch_id"])
+    document = UploadedDocument.objects.get(batch=batch)
+    warnings = result["registration_overview_report"]["warnings"]
+    assert document.page_count_confidence == "estimated"
+    assert document.needs_manual_review is True
+    assert batch.import_status == "review_required"
+    assert any("DOCX 页数无法精确统计" in warning for warning in warnings)
 
 
 def test_import_submission_batch_marks_failed_when_zip_has_no_supported_files(db):