feat: 补齐DOCX精确页数识别与待复核策略

This commit is contained in:
2026-06-04 03:35:34 +08:00
parent e9cf964a3f
commit 9bca08001f
2 changed files with 128 additions and 4 deletions

View File

@@ -316,6 +316,28 @@ def _estimate_page_count(text: str) -> int:
return max(1, line_count)
def _resolve_page_count(document: UploadedDocument, text: str) -> tuple[int, str]:
    """
    Return the page count for a document together with a confidence label.

    - PDF: try the real page count first.
    - DOCX: try the page-count metadata stored by Word first.
    - Any other type: fall back to an estimate derived from the text.

    The label is "exact" when the type-specific extractor yields a positive
    count; in every other case the estimate is returned with "estimated".
    """
    # One extractor per supported extension; unknown extensions have none.
    extractors = {
        "pdf": _extract_pdf_page_count,
        "docx": _extract_docx_page_count,
    }
    extractor = extractors.get(document.file_type.lower())
    if extractor is not None:
        exact_pages = extractor(Path(document.file.path))
        # Extractors signal "could not determine" with a non-positive value.
        if exact_pages > 0:
            return exact_pages, "exact"
    return _estimate_page_count(text), "estimated"
def _expand_uploaded_files(uploaded_files: list) -> list[dict]:
expanded_files = []
warnings = []
@@ -368,13 +390,18 @@ def _ingest_files_into_batch(
relative_path=relative_path,
)
text = extract_text(document)
page_count = _estimate_page_count(text)
page_count, page_count_confidence = _resolve_page_count(document, text)
document.page_count = page_count
document.page_count_confidence = "estimated"
document.page_count_confidence = page_count_confidence
document.document_role = _detect_document_role(document.relative_path)
document.chapter_code = _detect_chapter_code(document.relative_path, text)
document.chapter_match_status = "matched" if document.chapter_code else "unknown"
document.needs_manual_review = not bool(document.chapter_code)
document.needs_manual_review = (
not bool(document.chapter_code)
or (document.file_type.lower() == "docx" and page_count_confidence != "exact")
)
if document.file_type.lower() == "docx" and page_count_confidence != "exact":
warnings.append(f"DOCX 页数无法精确统计:{document.relative_path}")
document.save(
update_fields=[
"page_count",
@@ -622,6 +649,17 @@ def _extract_pdf_text(path: Path) -> str:
return _read_binary_text_fallback(path)
def _extract_pdf_page_count(path: Path) -> int:
    """
    Count the pages of a PDF with pypdf.

    Best-effort: any failure (pypdf not importable, unreadable or malformed
    file, ...) is swallowed and reported as 0.
    """
    try:
        from pypdf import PdfReader

        pdf_pages = PdfReader(str(path)).pages
        total = len(pdf_pages)
    except Exception:
        # Deliberate catch-all: 0 means "page count unavailable".
        total = 0
    return total
def _extract_docx_text(path: Path) -> str:
"""提取 Word XML 中的可见文字内容,不追求保留样式。"""
try:
@@ -635,6 +673,26 @@ def _extract_docx_text(path: Path) -> str:
return _read_binary_text_fallback(path)
def _extract_docx_page_count(path: Path) -> int:
    """
    Read the page count recorded in a DOCX file's extended properties.

    DOCX producers commonly write a `<Pages>` element into
    `docProps/app.xml`. This function only reads that stored value; it does
    not lay out the document, so the number is whatever the producing
    application last saved.

    Args:
        path: Location of the `.docx` archive on disk.

    Returns:
        The integer found in `<Pages>`, or 0 when the file cannot be opened
        or read, is not a valid ZIP archive, has no `docProps/app.xml`, the
        XML is malformed, or `<Pages>` is missing, empty, or not an integer.
        A result of 0 lets the caller fall back to an estimate and flag the
        document for manual review.
    """
    namespace = {"ep": "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties"}
    try:
        with ZipFile(path) as archive:
            app_xml = archive.read("docProps/app.xml")
        root = ET.fromstring(app_xml)
        pages_node = root.find(".//ep:Pages", namespace)
        if pages_node is None:
            return 0
        raw_pages = (pages_node.text or "").strip()
        return int(raw_pages) if raw_pages else 0
    except (OSError, BadZipFile, KeyError, ET.ParseError, ValueError):
        # OSError covers a missing or unreadable file, so a bad upload
        # degrades to "unknown" (0) instead of aborting the whole ingestion,
        # matching the PDF extractor's best-effort contract.
        return 0
def _read_binary_text_fallback(path: Path) -> str:
"""
当结构化抽取失败时,退回到“尽可能保留纯文本”的保底方案。