Files
DEMO-AGENT/tests/test_file_summary_page_count.py

181 lines
5.9 KiB
Python

import pytest
import shutil
from zipfile import ZipFile
from docx import Document
from openpyxl import Workbook
from pptx import Presentation
from review_agent.file_summary.services.page_count import count_document_pages
from review_agent.file_summary.skills.document_page_count import DocumentPageCountSkill
from review_agent.file_summary.skills.base import WorkflowContext
from review_agent.models import Conversation, FileSummaryBatch, FileSummaryItem
# Apply pytest-django's ``django_db`` marker to every test in this module so
# that tests touching the ORM are allowed to access the test database.
pytestmark = pytest.mark.django_db
def test_count_document_pages_for_office_formats(tmp_path):
    """Count pages for freshly generated .docx, .xlsx and .pptx files.

    The workbook gets one extra sheet and is expected to report 2, the
    presentation gets one slide and is expected to report 1, and the empty
    Word document only has to come back as "success" or "uncertain".
    """
    word_file = tmp_path / "a.docx"
    excel_file = tmp_path / "a.xlsx"
    slides_file = tmp_path / "a.pptx"

    # Build one sample file per modern Office format.
    Document().save(word_file)

    book = Workbook()
    book.create_sheet("第二页")
    book.save(excel_file)

    deck = Presentation()
    slide_layout = deck.slide_layouts[6]
    deck.slides.add_slide(slide_layout)
    deck.save(slides_file)

    word_result = count_document_pages(word_file)
    assert word_result.status in {"success", "uncertain"}

    excel_result = count_document_pages(excel_file)
    assert excel_result.page_count == 2

    slides_result = count_document_pages(slides_file)
    assert slides_result.page_count == 1
def test_count_docx_pages_from_extended_properties(tmp_path):
    """Expect the ``<Pages>`` value of ``docProps/app.xml`` as the page count.

    A new .docx is repackaged so that its ``docProps/app.xml`` entry declares
    seven pages; ``count_document_pages`` must then report success with 7.
    """
    docx_path = tmp_path / "with-pages.docx"
    Document().save(docx_path)

    app_xml = "".join(
        (
            '<?xml version="1.0" encoding="UTF-8"?>',
            '<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties">',
            "<Pages>7</Pages>",
            "</Properties>",
        )
    )

    # Copy every archive member except the extended-properties part, then
    # append our own version of that part.
    patched = tmp_path / "rewritten.docx"
    with ZipFile(docx_path) as original:
        with ZipFile(patched, "w") as output:
            for member in original.infolist():
                if member.filename == "docProps/app.xml":
                    continue
                output.writestr(member, original.read(member.filename))
            output.writestr("docProps/app.xml", app_xml)

    # Both archives are closed here, so the patched file can replace the source.
    shutil.move(patched, docx_path)

    outcome = count_document_pages(docx_path)
    assert outcome.status == "success"
    assert outcome.page_count == 7
def test_count_docx_pages_uses_word_com_fallback(monkeypatch, tmp_path):
    """Expect the Word COM count when extended properties give no page count.

    The extended-properties reader is stubbed to return ``None`` and the Word
    COM counter is stubbed to return 22; the result must be success with 22.
    """
    source_file = tmp_path / "without-pages.docx"
    Document().save(source_file)

    stubs = (
        (
            "review_agent.file_summary.services.page_count._count_docx_pages_from_extended_properties",
            lambda path: None,
        ),
        (
            "review_agent.file_summary.services.page_count._count_word_pages_with_com",
            lambda path: 22,
        ),
    )
    for target, replacement in stubs:
        monkeypatch.setattr(target, replacement)

    outcome = count_document_pages(source_file)
    assert outcome.status == "success"
    assert outcome.page_count == 22
def test_count_doc_pages_uses_word_com_fallback(monkeypatch, tmp_path):
    """Expect the Word COM count for a legacy .doc file.

    The file only holds placeholder bytes, so the COM eligibility check is
    stubbed to ``True`` and the Word COM counter is stubbed to return 5.
    """
    legacy_file = tmp_path / "legacy.doc"
    legacy_file.write_bytes(b"legacy-doc-placeholder")

    stubs = (
        (
            "review_agent.file_summary.services.page_count._can_try_com_fallback",
            lambda path, ext: True,
        ),
        (
            "review_agent.file_summary.services.page_count._count_word_pages_with_com",
            lambda path: 5,
        ),
    )
    for target, replacement in stubs:
        monkeypatch.setattr(target, replacement)

    outcome = count_document_pages(legacy_file)
    assert outcome.status == "success"
    assert outcome.page_count == 5
def test_count_ppt_pages_uses_powerpoint_com_fallback(monkeypatch, tmp_path):
    """Expect the PowerPoint COM slide count for a legacy .ppt file.

    The file only holds placeholder bytes, so the COM eligibility check is
    stubbed to ``True`` and the PowerPoint COM counter is stubbed to return 9.
    """
    legacy_file = tmp_path / "legacy.ppt"
    legacy_file.write_bytes(b"legacy-ppt-placeholder")

    stubs = (
        (
            "review_agent.file_summary.services.page_count._can_try_com_fallback",
            lambda path, ext: True,
        ),
        (
            "review_agent.file_summary.services.page_count._count_powerpoint_slides_with_com",
            lambda path: 9,
        ),
    )
    for target, replacement in stubs:
        monkeypatch.setattr(target, replacement)

    outcome = count_document_pages(legacy_file)
    assert outcome.status == "success"
    assert outcome.page_count == 9
def test_count_excel_pages_uses_excel_com_fallback(monkeypatch, tmp_path):
    """Expect the Excel COM sheet count for a legacy .xls file.

    The file only holds placeholder bytes, so the COM eligibility check is
    stubbed to ``True`` and the Excel COM counter is stubbed to return 3.
    """
    legacy_file = tmp_path / "legacy.xls"
    legacy_file.write_bytes(b"legacy-xls-placeholder")

    stubs = (
        (
            "review_agent.file_summary.services.page_count._can_try_com_fallback",
            lambda path, ext: True,
        ),
        (
            "review_agent.file_summary.services.page_count._count_excel_sheets_with_com",
            lambda path: 3,
        ),
    )
    for target, replacement in stubs:
        monkeypatch.setattr(target, replacement)

    outcome = count_document_pages(legacy_file)
    assert outcome.status == "success"
    assert outcome.page_count == 3
def test_invalid_xlsx_does_not_start_excel_com(monkeypatch, tmp_path):
    """Expect "uncertain" for a bogus .xlsx without touching Excel COM.

    The Excel COM counter is replaced by a stub that raises, so the test
    fails if ``count_document_pages`` calls it for the invalid file.
    """
    broken_file = tmp_path / "broken.xlsx"
    broken_file.write_bytes(b"not a real workbook")

    def excel_com_must_not_run(path):
        # Reaching this stub means the COM path was taken for an invalid file.
        raise AssertionError("Excel COM should not start for invalid xlsx signatures")

    monkeypatch.setattr(
        "review_agent.file_summary.services.page_count._count_excel_sheets_with_com",
        excel_com_must_not_run,
    )

    outcome = count_document_pages(broken_file)
    assert outcome.status == "uncertain"
def test_document_page_count_skill_marks_unsupported_and_success(tmp_path, django_user_model):
    """Run the page-count skill over a batch with one .xlsx and one .txt item.

    The skill result must report success; afterwards the workbook item is
    expected to have the SUCCESS statistics status and the text item the
    UNSUPPORTED one.
    """
    spreadsheet = tmp_path / "a.xlsx"
    Workbook().save(spreadsheet)
    text_file = tmp_path / "a.txt"
    text_file.write_text("x", encoding="utf-8")

    # Database fixtures: user -> conversation -> batch -> two items.
    owner = django_user_model.objects.create_user(username="owner", password="pass")
    chat = Conversation.objects.create(user=owner, title="会话")
    batch = FileSummaryBatch.objects.create(conversation=chat, user=owner, batch_no="FS-P")

    def add_item(index, name, kind, relative, stored_at):
        # Create one batch item pointing at a file on disk.
        return FileSummaryItem.objects.create(
            batch=batch,
            file_index=index,
            file_name=name,
            file_type=kind,
            relative_path=relative,
            storage_path=str(stored_at),
        )

    xlsx_item = add_item(1, "a.xlsx", "xlsx", "a.xlsx", spreadsheet)
    txt_item = add_item(2, "a.txt", "txt", "a.txt", text_file)

    skill = DocumentPageCountSkill()
    context = WorkflowContext(batch=batch)
    result = skill.run(context)

    # Reload both rows so the assertions see what the skill persisted.
    for item in (xlsx_item, txt_item):
        item.refresh_from_db()

    statuses = FileSummaryItem.StatisticsStatus
    assert result.success is True
    assert xlsx_item.statistics_status == statuses.SUCCESS
    assert txt_item.statistics_status == statuses.UNSUPPORTED