feat: 补齐DOCX精确页数识别与待复核策略

2026-06-04 03:35:34 +08:00
parent e9cf964a3f
commit 9bca08001f
2 changed files with 128 additions and 4 deletions
--- a/apps/documents/services.py
+++ b/apps/documents/services.py
@@ -316,6 +316,28 @@ def _estimate_page_count(text: str) -> int:
    return max(1, line_count)


+def _resolve_page_count(document: UploadedDocument, text: str) -> tuple[int, str]:
+    """
+    Return the page count and a confidence label ("exact" or "estimated") by file type.
+
+    - PDF: prefer the real page count; use the text estimate if the count is not > 0
+    - DOCX: prefer the Word page-count metadata; same fallback to the text estimate
+    - Other types: always fall back to the text estimate
+    """
+    extension = document.file_type.lower()
+    if extension == "pdf":
+        page_count = _extract_pdf_page_count(Path(document.file.path))
+        if page_count > 0:  # the helper returns 0 on failure, so 0 means "unknown"
+            return page_count, "exact"
+        return _estimate_page_count(text), "estimated"
+    if extension == "docx":
+        page_count = _extract_docx_page_count(Path(document.file.path))
+        if page_count > 0:  # NOTE(review): <Pages> is written by the producing app and may be stale; confirm "exact" is acceptable
+            return page_count, "exact"
+        return _estimate_page_count(text), "estimated"
+    return _estimate_page_count(text), "estimated"
+
+
 def _expand_uploaded_files(uploaded_files: list) -> list[dict]:
    expanded_files = []
    warnings = []
@@ -368,13 +390,18 @@ def _ingest_files_into_batch(
            relative_path=relative_path,
        )
        text = extract_text(document)
-        page_count = _estimate_page_count(text)
+        page_count, page_count_confidence = _resolve_page_count(document, text)
        document.page_count = page_count
-        document.page_count_confidence = "estimated"
+        document.page_count_confidence = page_count_confidence
        document.document_role = _detect_document_role(document.relative_path)
        document.chapter_code = _detect_chapter_code(document.relative_path, text)
        document.chapter_match_status = "matched" if document.chapter_code else "unknown"
-        document.needs_manual_review = not bool(document.chapter_code)
+        document.needs_manual_review = (
+            not bool(document.chapter_code)
+            or (document.file_type.lower() == "docx" and page_count_confidence != "exact")
+        )
+        if document.file_type.lower() == "docx" and page_count_confidence != "exact":
+            warnings.append(f"DOCX 页数无法精确统计：{document.relative_path}")
        document.save(
            update_fields=[
                "page_count",
@@ -622,6 +649,17 @@ def _extract_pdf_text(path: Path) -> str:
        return _read_binary_text_fallback(path)


+def _extract_pdf_page_count(path: Path) -> int:
+    """Count the real pages of a PDF with pypdf; return 0 if pypdf is unavailable or reading fails."""
+    try:
+        import pypdf  # imported lazily so a missing pypdf is handled by the except below
+
+        reader = pypdf.PdfReader(str(path))
+        return len(reader.pages)
+    except Exception:  # deliberately broad: any failure is reported as 0 ("unknown") to the caller
+        return 0
+
+
 def _extract_docx_text(path: Path) -> str:
    """提取 Word XML 中的可见文字内容，不追求保留样式。"""
    try:
@@ -635,6 +673,26 @@ def _extract_docx_text(path: Path) -> str:
        return _read_binary_text_fallback(path)


+def _extract_docx_page_count(path: Path) -> int:
+    """
+    Extract the page count from the Word extended properties.
+
+    Typical docx files record `<Pages>` in `docProps/app.xml`.
+    Returns 0 when that metadata is missing or unparsable; the caller is expected to fall back to an estimate.
+    """
+    try:
+        with ZipFile(path) as archive:
+            app_xml = archive.read("docProps/app.xml")  # raises KeyError when the entry is absent
+        root = ET.fromstring(app_xml)
+        namespace = {"ep": "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties"}
+        pages_node = root.find(".//ep:Pages", namespace)
+        if pages_node is None or not (pages_node.text or "").strip():
+            return 0
+        return int((pages_node.text or "").strip())  # a non-numeric value raises ValueError, handled below
+    except (BadZipFile, KeyError, ET.ParseError, ValueError):  # NOTE(review): OSError (e.g. missing file) is not caught, unlike the PDF helper; confirm intended
+        return 0
+
+
 def _read_binary_text_fallback(path: Path) -> str:
    """
    当结构化抽取失败时，退回到“尽可能保留纯文本”的保底方案。
--- a/tests/test_documents.py
+++ b/tests/test_documents.py
@@ -4,6 +4,7 @@ from io import BytesIO
 from pathlib import Path
 import sys
 import types
+import zipfile
 from zipfile import ZipFile

 from apps.documents.forms import DocumentUploadForm
@@ -418,7 +419,72 @@ def test_import_submission_batch_records_warnings_for_unsupported_zip_entries(db
    assert batch.file_count == 1
    assert batch.exception_count == 1
    assert "跳过不支持的文件" in warnings[0]
-    assert "CH1/忽略图片.png" in warnings[0]
+
+
+def test_import_submission_batch_uses_exact_docx_page_count_from_metadata(db):
+    archive = BytesIO()  # in-memory buffer that will hold a minimal hand-built .docx (zip) archive
+    with zipfile.ZipFile(archive, "w") as docx_file:
+        docx_file.writestr(
+            "word/document.xml",
+            """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+            <w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+              <w:body>
+                <w:p><w:r><w:t>产品名称：新型冠状病毒 2019-nCoV 核酸检测试剂盒</w:t></w:r></w:p>
+              </w:body>
+            </w:document>""",
+        )
+        docx_file.writestr(
+            "docProps/app.xml",
+            """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+            <Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties">
+              <Pages>7</Pages>
+            </Properties>""",
+        )
+    archive.seek(0)  # rewind so archive.read() returns the whole zip payload
+    file = SimpleUploadedFile(
+        "CH1-目标产品说明书.docx",
+        archive.read(),
+        content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    )
+
+    result = import_submission_batch("document_review", [file])
+
+    batch = SubmissionBatch.objects.get(batch_id=result["batch_id"])
+    document = UploadedDocument.objects.get(batch=batch)
+    assert batch.page_count == 7  # matches <Pages>7</Pages> written to docProps/app.xml above
+    assert document.page_count == 7
+    assert document.page_count_confidence == "exact"
+    assert batch.import_status == "completed"
+
+
+def test_import_submission_batch_marks_review_when_docx_page_count_cannot_be_precisely_detected(db):
+    archive = BytesIO()  # in-memory buffer that will hold a minimal hand-built .docx (zip) archive
+    with zipfile.ZipFile(archive, "w") as docx_file:  # no docProps/app.xml is written, so there is no <Pages> metadata
+        docx_file.writestr(
+            "word/document.xml",
+            """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+            <w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+              <w:body>
+                <w:p><w:r><w:t>产品名称：新型冠状病毒 2019-nCoV 核酸检测试剂盒</w:t></w:r></w:p>
+              </w:body>
+            </w:document>""",
+        )
+    archive.seek(0)  # rewind so archive.read() returns the whole zip payload
+    file = SimpleUploadedFile(
+        "CH1-目标产品说明书.docx",
+        archive.read(),
+        content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    )
+
+    result = import_submission_batch("document_review", [file])
+
+    batch = SubmissionBatch.objects.get(batch_id=result["batch_id"])
+    document = UploadedDocument.objects.get(batch=batch)
+    warnings = result["registration_overview_report"]["warnings"]
+    assert document.page_count_confidence == "estimated"  # fallback label when no exact DOCX count is available
+    assert document.needs_manual_review is True
+    assert batch.import_status == "review_required"
+    assert any("DOCX 页数无法精确统计" in warning for warning in warnings)


 def test_import_submission_batch_marks_failed_when_zip_has_no_supported_files(db):