feat: 补齐DOCX精确页数识别与待复核策略

This commit is contained in:
2026-06-04 03:35:34 +08:00
parent e9cf964a3f
commit 9bca08001f
2 changed files with 128 additions and 4 deletions

View File

@@ -316,6 +316,28 @@ def _estimate_page_count(text: str) -> int:
return max(1, line_count) return max(1, line_count)
def _resolve_page_count(document: UploadedDocument, text: str) -> tuple[int, str]:
    """
    Return the page count for a document together with a confidence label.

    - "pdf":  asks `_extract_pdf_page_count` for the count.
    - "docx": asks `_extract_docx_page_count` for the count.
    - any other file type, or an extractor result that is not positive:
      falls back to `_estimate_page_count(text)`.

    Returns:
        `(page_count, "exact")` when an extractor produced a positive count,
        otherwise `(page_count, "estimated")`.
    """
    exact_counters = {
        "pdf": _extract_pdf_page_count,
        "docx": _extract_docx_page_count,
    }
    count_exactly = exact_counters.get(document.file_type.lower())
    if count_exactly is not None:
        exact_pages = count_exactly(Path(document.file.path))
        if exact_pages > 0:
            return exact_pages, "exact"
    # Unknown file type, or the extractor could not produce a usable count.
    return _estimate_page_count(text), "estimated"
def _expand_uploaded_files(uploaded_files: list) -> list[dict]: def _expand_uploaded_files(uploaded_files: list) -> list[dict]:
expanded_files = [] expanded_files = []
warnings = [] warnings = []
@@ -368,13 +390,18 @@ def _ingest_files_into_batch(
relative_path=relative_path, relative_path=relative_path,
) )
text = extract_text(document) text = extract_text(document)
page_count = _estimate_page_count(text) page_count, page_count_confidence = _resolve_page_count(document, text)
document.page_count = page_count document.page_count = page_count
document.page_count_confidence = "estimated" document.page_count_confidence = page_count_confidence
document.document_role = _detect_document_role(document.relative_path) document.document_role = _detect_document_role(document.relative_path)
document.chapter_code = _detect_chapter_code(document.relative_path, text) document.chapter_code = _detect_chapter_code(document.relative_path, text)
document.chapter_match_status = "matched" if document.chapter_code else "unknown" document.chapter_match_status = "matched" if document.chapter_code else "unknown"
document.needs_manual_review = not bool(document.chapter_code) document.needs_manual_review = (
not bool(document.chapter_code)
or (document.file_type.lower() == "docx" and page_count_confidence != "exact")
)
if document.file_type.lower() == "docx" and page_count_confidence != "exact":
warnings.append(f"DOCX 页数无法精确统计:{document.relative_path}")
document.save( document.save(
update_fields=[ update_fields=[
"page_count", "page_count",
@@ -622,6 +649,17 @@ def _extract_pdf_text(path: Path) -> str:
return _read_binary_text_fallback(path) return _read_binary_text_fallback(path)
def _extract_pdf_page_count(path: Path) -> int:
    """
    Count the pages of a PDF using pypdf.

    Returns:
        The number of pages reported by `pypdf.PdfReader`, or 0 when the
        count cannot be obtained for any reason (pypdf cannot be imported,
        the file cannot be opened, the file cannot be parsed, ...).
    """
    try:
        # Imported here so that a failing import is handled like any other
        # failure below instead of breaking the module at load time.
        import pypdf

        page_total = len(pypdf.PdfReader(str(path)).pages)
    except Exception:
        # Deliberately best effort: 0 signals "no exact count available".
        page_total = 0
    return page_total
def _extract_docx_text(path: Path) -> str: def _extract_docx_text(path: Path) -> str:
"""提取 Word XML 中的可见文字内容,不追求保留样式。""" """提取 Word XML 中的可见文字内容,不追求保留样式。"""
try: try:
@@ -635,6 +673,26 @@ def _extract_docx_text(path: Path) -> str:
return _read_binary_text_fallback(path) return _read_binary_text_fallback(path)
def _extract_docx_page_count(path: Path) -> int:
    """
    Read the page count stored in a Word document's extended properties.

    A typical docx carries a `<Pages>` element in `docProps/app.xml`. This
    function only reads that metadata; it does not lay out the document.

    Args:
        path: Location of the .docx file on disk.

    Returns:
        The integer found in `<Pages>`, or 0 when it cannot be obtained:
        the file is missing or unreadable, is not a valid zip archive, has
        no `docProps/app.xml`, that member cannot be decoded or parsed, or
        `<Pages>` is absent, blank or not an integer. Returning 0 lets the
        caller fall back to an estimate instead of failing.
    """
    namespace = {"ep": "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties"}
    try:
        with ZipFile(path) as archive:
            app_xml = archive.read("docProps/app.xml")
        pages_node = ET.fromstring(app_xml).find(".//ep:Pages", namespace)
        if pages_node is None:
            return 0
        pages_text = (pages_node.text or "").strip()
        return int(pages_text) if pages_text else 0
    except (BadZipFile, KeyError, ET.ParseError, ValueError, OSError, RuntimeError):
        # BadZipFile: not a zip archive; KeyError: no docProps/app.xml;
        # ParseError / ValueError: malformed XML or non-integer <Pages>;
        # OSError: file missing or unreadable; RuntimeError: encrypted member
        # or unsupported compression method (NotImplementedError subclasses it).
        return 0
def _read_binary_text_fallback(path: Path) -> str: def _read_binary_text_fallback(path: Path) -> str:
""" """
当结构化抽取失败时,退回到“尽可能保留纯文本”的保底方案。 当结构化抽取失败时,退回到“尽可能保留纯文本”的保底方案。

View File

@@ -4,6 +4,7 @@ from io import BytesIO
from pathlib import Path from pathlib import Path
import sys import sys
import types import types
import zipfile
from zipfile import ZipFile from zipfile import ZipFile
from apps.documents.forms import DocumentUploadForm from apps.documents.forms import DocumentUploadForm
@@ -418,7 +419,72 @@ def test_import_submission_batch_records_warnings_for_unsupported_zip_entries(db
assert batch.file_count == 1 assert batch.file_count == 1
assert batch.exception_count == 1 assert batch.exception_count == 1
assert "跳过不支持的文件" in warnings[0] assert "跳过不支持的文件" in warnings[0]
assert "CH1/忽略图片.png" in warnings[0]
def test_import_submission_batch_uses_exact_docx_page_count_from_metadata(db):
    """A DOCX whose docProps/app.xml declares <Pages>7</Pages> is imported with an exact count of 7."""
    document_xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>产品名称:新型冠状病毒 2019-nCoV 核酸检测试剂盒</w:t></w:r></w:p>
</w:body>
</w:document>"""
    app_xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties">
<Pages>7</Pages>
</Properties>"""

    # Build a minimal in-memory docx: body text plus the extended-properties part.
    buffer = BytesIO()
    with zipfile.ZipFile(buffer, "w") as package:
        package.writestr("word/document.xml", document_xml)
        package.writestr("docProps/app.xml", app_xml)
    upload = SimpleUploadedFile(
        "CH1-目标产品说明书.docx",
        buffer.getvalue(),
        content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )

    outcome = import_submission_batch("document_review", [upload])

    batch = SubmissionBatch.objects.get(batch_id=outcome["batch_id"])
    document = UploadedDocument.objects.get(batch=batch)
    assert batch.page_count == 7
    assert document.page_count == 7
    assert document.page_count_confidence == "exact"
    assert batch.import_status == "completed"
def test_import_submission_batch_marks_review_when_docx_page_count_cannot_be_precisely_detected(db):
    """A DOCX without docProps/app.xml gets an estimated count, is flagged for review, and emits a warning."""
    document_xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>产品名称:新型冠状病毒 2019-nCoV 核酸检测试剂盒</w:t></w:r></w:p>
</w:body>
</w:document>"""

    # Only the body part is written; the page-count metadata part is left out on purpose.
    buffer = BytesIO()
    with zipfile.ZipFile(buffer, "w") as package:
        package.writestr("word/document.xml", document_xml)
    upload = SimpleUploadedFile(
        "CH1-目标产品说明书.docx",
        buffer.getvalue(),
        content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )

    outcome = import_submission_batch("document_review", [upload])

    batch = SubmissionBatch.objects.get(batch_id=outcome["batch_id"])
    document = UploadedDocument.objects.get(batch=batch)
    reported_warnings = outcome["registration_overview_report"]["warnings"]
    assert document.page_count_confidence == "estimated"
    assert document.needs_manual_review is True
    assert batch.import_status == "review_required"
    assert any("DOCX 页数无法精确统计" in message for message in reported_warnings)
def test_import_submission_batch_marks_failed_when_zip_has_no_supported_files(db): def test_import_submission_batch_marks_failed_when_zip_has_no_supported_files(db):