feat: 补齐DOCX精确页数识别与待复核策略
This commit is contained in:
@@ -316,6 +316,28 @@ def _estimate_page_count(text: str) -> int:
|
|||||||
return max(1, line_count)
|
return max(1, line_count)
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_page_count(document: UploadedDocument, text: str) -> tuple[int, str]:
    """
    Work out a document's page count together with how trustworthy it is.

    - PDF: prefer the real page count read from the file.
    - DOCX: prefer the page count stored in the Word metadata.
    - Anything else: fall back to an estimate derived from the text.

    Returns:
        A ``(page_count, confidence)`` pair where ``confidence`` is
        ``"exact"`` when the count was read from the file itself and
        ``"estimated"`` when it was derived from ``text``.
    """
    kind = document.file_type.lower()

    # Choose the format-specific counter, if this file type has one.
    if kind == "pdf":
        read_exact_pages = _extract_pdf_page_count
    elif kind == "docx":
        read_exact_pages = _extract_docx_page_count
    else:
        read_exact_pages = None

    if read_exact_pages is not None:
        exact_pages = read_exact_pages(Path(document.file.path))
        # The extractors report 0 when they cannot determine the count.
        if exact_pages > 0:
            return exact_pages, "exact"

    return _estimate_page_count(text), "estimated"
|
||||||
|
|
||||||
|
|
||||||
def _expand_uploaded_files(uploaded_files: list) -> list[dict]:
|
def _expand_uploaded_files(uploaded_files: list) -> list[dict]:
|
||||||
expanded_files = []
|
expanded_files = []
|
||||||
warnings = []
|
warnings = []
|
||||||
@@ -368,13 +390,18 @@ def _ingest_files_into_batch(
|
|||||||
relative_path=relative_path,
|
relative_path=relative_path,
|
||||||
)
|
)
|
||||||
text = extract_text(document)
|
text = extract_text(document)
|
||||||
page_count = _estimate_page_count(text)
|
page_count, page_count_confidence = _resolve_page_count(document, text)
|
||||||
document.page_count = page_count
|
document.page_count = page_count
|
||||||
document.page_count_confidence = "estimated"
|
document.page_count_confidence = page_count_confidence
|
||||||
document.document_role = _detect_document_role(document.relative_path)
|
document.document_role = _detect_document_role(document.relative_path)
|
||||||
document.chapter_code = _detect_chapter_code(document.relative_path, text)
|
document.chapter_code = _detect_chapter_code(document.relative_path, text)
|
||||||
document.chapter_match_status = "matched" if document.chapter_code else "unknown"
|
document.chapter_match_status = "matched" if document.chapter_code else "unknown"
|
||||||
document.needs_manual_review = not bool(document.chapter_code)
|
document.needs_manual_review = (
|
||||||
|
not bool(document.chapter_code)
|
||||||
|
or (document.file_type.lower() == "docx" and page_count_confidence != "exact")
|
||||||
|
)
|
||||||
|
if document.file_type.lower() == "docx" and page_count_confidence != "exact":
|
||||||
|
warnings.append(f"DOCX 页数无法精确统计:{document.relative_path}")
|
||||||
document.save(
|
document.save(
|
||||||
update_fields=[
|
update_fields=[
|
||||||
"page_count",
|
"page_count",
|
||||||
@@ -622,6 +649,17 @@ def _extract_pdf_text(path: Path) -> str:
|
|||||||
return _read_binary_text_fallback(path)
|
return _read_binary_text_fallback(path)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_pdf_page_count(path: Path) -> int:
    """
    Count the real pages of a PDF using pypdf.

    pypdf is imported lazily inside the function. This is a best-effort
    helper: any failure (the import, opening the file, or parsing it)
    is swallowed and reported as 0.

    Returns:
        The number of pages pypdf reports, or 0 when it could not be read.
    """
    try:
        import pypdf

        total_pages = len(pypdf.PdfReader(str(path)).pages)
    except Exception:
        # Deliberately broad: every failure means "page count unknown".
        total_pages = 0
    return total_pages
|
||||||
|
|
||||||
|
|
||||||
def _extract_docx_text(path: Path) -> str:
|
def _extract_docx_text(path: Path) -> str:
|
||||||
"""提取 Word XML 中的可见文字内容,不追求保留样式。"""
|
"""提取 Word XML 中的可见文字内容,不追求保留样式。"""
|
||||||
try:
|
try:
|
||||||
@@ -635,6 +673,26 @@ def _extract_docx_text(path: Path) -> str:
|
|||||||
return _read_binary_text_fallback(path)
|
return _read_binary_text_fallback(path)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_docx_page_count(path: Path) -> int:
    """
    Read the real page count from a Word document's extended properties.

    A typical docx stores the count in the `<Pages>` element of
    `docProps/app.xml`. When that metadata cannot be read, this function
    returns 0 so the caller can fall back to an estimate and flag the
    document for manual review.

    Args:
        path: Location of the docx file on disk.

    Returns:
        The positive page count recorded in the metadata, or 0 when it
        cannot be determined: the file is missing or unreadable, is not a
        valid zip archive, has no `docProps/app.xml`, contains malformed
        XML, or its `<Pages>` value is absent, blank, non-integer, or not
        positive.
    """
    try:
        with ZipFile(path) as archive:
            # Raises KeyError when the archive has no app.xml entry.
            app_xml = archive.read("docProps/app.xml")
        root = ET.fromstring(app_xml)
        namespace = {"ep": "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties"}
        pages_node = root.find(".//ep:Pages", namespace)
        if pages_node is None:
            return 0
        pages_text = (pages_node.text or "").strip()
        if not pages_text:
            return 0
        # Clamp so zero or negative metadata reads as "unknown" (0).
        return max(0, int(pages_text))
    except (OSError, BadZipFile, KeyError, ET.ParseError, ValueError):
        # OSError covers a missing or unreadable file, so an I/O problem
        # degrades to the estimate path instead of aborting the import.
        return 0
|
||||||
|
|
||||||
|
|
||||||
def _read_binary_text_fallback(path: Path) -> str:
|
def _read_binary_text_fallback(path: Path) -> str:
|
||||||
"""
|
"""
|
||||||
当结构化抽取失败时,退回到“尽可能保留纯文本”的保底方案。
|
当结构化抽取失败时,退回到“尽可能保留纯文本”的保底方案。
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ from io import BytesIO
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import sys
|
import sys
|
||||||
import types
|
import types
|
||||||
|
import zipfile
|
||||||
from zipfile import ZipFile
|
from zipfile import ZipFile
|
||||||
|
|
||||||
from apps.documents.forms import DocumentUploadForm
|
from apps.documents.forms import DocumentUploadForm
|
||||||
@@ -418,7 +419,72 @@ def test_import_submission_batch_records_warnings_for_unsupported_zip_entries(db
|
|||||||
assert batch.file_count == 1
|
assert batch.file_count == 1
|
||||||
assert batch.exception_count == 1
|
assert batch.exception_count == 1
|
||||||
assert "跳过不支持的文件" in warnings[0]
|
assert "跳过不支持的文件" in warnings[0]
|
||||||
assert "CH1/忽略图片.png" in warnings[0]
|
|
||||||
|
|
||||||
|
def test_import_submission_batch_uses_exact_docx_page_count_from_metadata(db):
    """A DOCX whose docProps/app.xml declares <Pages> is imported with that exact page count."""
    document_xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>产品名称:新型冠状病毒 2019-nCoV 核酸检测试剂盒</w:t></w:r></w:p>
</w:body>
</w:document>"""
    app_xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties">
<Pages>7</Pages>
</Properties>"""

    # Build a minimal in-memory docx: body text plus the page-count metadata.
    buffer = BytesIO()
    with zipfile.ZipFile(buffer, "w") as package:
        package.writestr("word/document.xml", document_xml)
        package.writestr("docProps/app.xml", app_xml)

    upload = SimpleUploadedFile(
        "CH1-目标产品说明书.docx",
        buffer.getvalue(),
        content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )

    result = import_submission_batch("document_review", [upload])

    batch = SubmissionBatch.objects.get(batch_id=result["batch_id"])
    document = UploadedDocument.objects.get(batch=batch)
    assert batch.page_count == 7
    assert document.page_count == 7
    assert document.page_count_confidence == "exact"
    assert batch.import_status == "completed"
|
||||||
|
|
||||||
|
|
||||||
|
def test_import_submission_batch_marks_review_when_docx_page_count_cannot_be_precisely_detected(db):
    """A DOCX without docProps/app.xml gets an estimated page count and is flagged for manual review."""
    document_xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>产品名称:新型冠状病毒 2019-nCoV 核酸检测试剂盒</w:t></w:r></w:p>
</w:body>
</w:document>"""

    # Build a docx that carries body text only, with no page-count metadata.
    buffer = BytesIO()
    with zipfile.ZipFile(buffer, "w") as package:
        package.writestr("word/document.xml", document_xml)

    upload = SimpleUploadedFile(
        "CH1-目标产品说明书.docx",
        buffer.getvalue(),
        content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )

    result = import_submission_batch("document_review", [upload])

    batch = SubmissionBatch.objects.get(batch_id=result["batch_id"])
    document = UploadedDocument.objects.get(batch=batch)
    report_warnings = result["registration_overview_report"]["warnings"]
    assert document.page_count_confidence == "estimated"
    assert document.needs_manual_review is True
    assert batch.import_status == "review_required"
    assert any("DOCX 页数无法精确统计" in message for message in report_warnings)
|
||||||
|
|
||||||
|
|
||||||
def test_import_submission_batch_marks_failed_when_zip_has_no_supported_files(db):
|
def test_import_submission_batch_marks_failed_when_zip_has_no_supported_files(db):
|
||||||
|
|||||||
Reference in New Issue
Block a user