fix(file-summary): 补强 Office 页数统计

This commit is contained in:
2026-06-06 17:57:08 +08:00
parent 54c37edf19
commit 460d418921
2 changed files with 309 additions and 21 deletions

View File

@@ -1,4 +1,6 @@
import pytest
import shutil
from zipfile import ZipFile
from docx import Document
from openpyxl import Workbook
from pptx import Presentation
@@ -31,6 +33,89 @@ def test_count_document_pages_for_office_formats(tmp_path):
assert count_document_pages(pptx_path).page_count == 1
def test_count_docx_pages_from_extended_properties(tmp_path):
    """Check that a ``<Pages>`` value in ``docProps/app.xml`` becomes the page count.

    The test saves an empty DOCX, rebuilds the archive so that
    ``docProps/app.xml`` contains ``<Pages>7</Pages>``, and expects
    ``count_document_pages`` to report status ``"success"`` with 7 pages.
    """
    docx_path = tmp_path / "with-pages.docx"
    Document().save(docx_path)

    properties_xml = (
        '<?xml version="1.0" encoding="UTF-8"?>'
        '<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties">'
        "<Pages>7</Pages>"
        "</Properties>"
    )

    # A zip member cannot be overwritten in place, so copy every member except
    # any existing docProps/app.xml into a new archive and then add the
    # replacement properties part.
    patched_copy = tmp_path / "rewritten.docx"
    with ZipFile(docx_path) as original_zip, ZipFile(patched_copy, "w") as patched_zip:
        for member in original_zip.infolist():
            if member.filename == "docProps/app.xml":
                continue
            patched_zip.writestr(member, original_zip.read(member.filename))
        patched_zip.writestr("docProps/app.xml", properties_xml)

    # Both archives are closed here; put the rebuilt file where the original was.
    shutil.move(patched_copy, docx_path)

    outcome = count_document_pages(docx_path)

    assert outcome.status == "success"
    assert outcome.page_count == 7
def test_count_docx_pages_uses_word_com_fallback(monkeypatch, tmp_path):
    """Check the DOCX count when the extended-properties helper yields nothing.

    ``_count_docx_pages_from_extended_properties`` is stubbed to return
    ``None`` and ``_count_word_pages_with_com`` is stubbed to return 22; the
    test expects ``count_document_pages`` to report status ``"success"`` with
    22 pages.
    """
    docx_path = tmp_path / "without-pages.docx"
    Document().save(docx_path)

    def no_extended_page_count(path):
        return None

    def word_com_page_count(path):
        return 22

    extended_properties_target = (
        "review_agent.file_summary.services.page_count._count_docx_pages_from_extended_properties"
    )
    word_com_target = (
        "review_agent.file_summary.services.page_count._count_word_pages_with_com"
    )
    monkeypatch.setattr(extended_properties_target, no_extended_page_count)
    monkeypatch.setattr(word_com_target, word_com_page_count)

    outcome = count_document_pages(docx_path)

    assert outcome.status == "success"
    assert outcome.page_count == 22
def test_count_doc_pages_uses_word_com_fallback(monkeypatch, tmp_path):
    """Check the ``.doc`` count with ``_count_word_pages_with_com`` stubbed.

    The file on disk holds placeholder bytes only. With the helper stubbed to
    return 5, the test expects ``count_document_pages`` to report status
    ``"success"`` with 5 pages.
    """
    doc_path = tmp_path / "legacy.doc"
    doc_path.write_bytes(b"legacy-doc-placeholder")

    def word_com_page_count(path):
        return 5

    word_com_target = (
        "review_agent.file_summary.services.page_count._count_word_pages_with_com"
    )
    monkeypatch.setattr(word_com_target, word_com_page_count)

    outcome = count_document_pages(doc_path)

    assert outcome.status == "success"
    assert outcome.page_count == 5
def test_count_ppt_pages_uses_powerpoint_com_fallback(monkeypatch, tmp_path):
    """Check the ``.ppt`` count with ``_count_powerpoint_slides_with_com`` stubbed.

    The file on disk holds placeholder bytes only. With the helper stubbed to
    return 9, the test expects ``count_document_pages`` to report status
    ``"success"`` with a page count of 9.
    """
    ppt_path = tmp_path / "legacy.ppt"
    ppt_path.write_bytes(b"legacy-ppt-placeholder")

    def powerpoint_com_slide_count(path):
        return 9

    powerpoint_com_target = (
        "review_agent.file_summary.services.page_count._count_powerpoint_slides_with_com"
    )
    monkeypatch.setattr(powerpoint_com_target, powerpoint_com_slide_count)

    outcome = count_document_pages(ppt_path)

    assert outcome.status == "success"
    assert outcome.page_count == 9
def test_count_excel_pages_uses_excel_com_fallback(monkeypatch, tmp_path):
    """Check the ``.xls`` count with ``_count_excel_sheets_with_com`` stubbed.

    The file on disk holds placeholder bytes only. With the helper stubbed to
    return 3, the test expects ``count_document_pages`` to report status
    ``"success"`` with a page count of 3.
    """
    xls_path = tmp_path / "legacy.xls"
    xls_path.write_bytes(b"legacy-xls-placeholder")

    def excel_com_sheet_count(path):
        return 3

    excel_com_target = (
        "review_agent.file_summary.services.page_count._count_excel_sheets_with_com"
    )
    monkeypatch.setattr(excel_com_target, excel_com_sheet_count)

    outcome = count_document_pages(xls_path)

    assert outcome.status == "success"
    assert outcome.page_count == 3
def test_document_page_count_skill_marks_unsupported_and_success(tmp_path, django_user_model):
xlsx_path = tmp_path / "a.xlsx"
workbook = Workbook()