"""Tests for ``count_document_pages`` and ``DocumentPageCountSkill``."""

import shutil
from zipfile import ZipFile

import pytest
from docx import Document
from openpyxl import Workbook
from pptx import Presentation

from review_agent.file_summary.services.page_count import count_document_pages
from review_agent.file_summary.skills.base import WorkflowContext
from review_agent.file_summary.skills.document_page_count import DocumentPageCountSkill
from review_agent.models import Conversation, FileSummaryBatch, FileSummaryItem

# Applied to every test in this module so the ORM-backed test at the bottom
# can touch the database.
pytestmark = pytest.mark.django_db


def test_count_document_pages_for_office_formats(tmp_path):
    """Count pages for freshly generated .docx, .xlsx and .pptx files."""
    docx_path = tmp_path / "a.docx"
    Document().save(docx_path)

    # The workbook's initial sheet plus the one created here gives the two
    # "pages" asserted below.
    xlsx_path = tmp_path / "a.xlsx"
    workbook = Workbook()
    workbook.create_sheet("第二页")
    workbook.save(xlsx_path)

    # A single slide added to an otherwise empty presentation.
    pptx_path = tmp_path / "a.pptx"
    presentation = Presentation()
    presentation.slides.add_slide(presentation.slide_layouts[6])
    presentation.save(pptx_path)

    # Only the status is checked for the generated .docx; either outcome is
    # accepted, and no specific page count is asserted.
    assert count_document_pages(docx_path).status in {"success", "uncertain"}
    assert count_document_pages(xlsx_path).page_count == 2
    assert count_document_pages(pptx_path).page_count == 1


def test_count_docx_pages_from_extended_properties(tmp_path):
    """Read the page count from a ``docProps/app.xml`` part declaring 7 pages."""
    docx_path = tmp_path / "with-pages.docx"
    Document().save(docx_path)

    # Well-formed extended-properties part. The previous literal had lost all
    # of its XML markup and collapsed to the bare text "7", which is not a
    # valid app.xml document.
    app_xml = (
        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
        '<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties">'
        "<Pages>7</Pages>"
        "</Properties>"
    )

    # Rebuild the package entry by entry, dropping any existing app.xml so the
    # archive ends up with exactly one copy: the one written here.
    rewritten = tmp_path / "rewritten.docx"
    with ZipFile(docx_path) as source, ZipFile(rewritten, "w") as target:
        for entry in source.infolist():
            if entry.filename != "docProps/app.xml":
                target.writestr(entry, source.read(entry.filename))
        target.writestr("docProps/app.xml", app_xml)
    shutil.move(rewritten, docx_path)

    result = count_document_pages(docx_path)

    assert result.status == "success"
    assert result.page_count == 7


def test_count_docx_pages_uses_word_com_fallback(monkeypatch, tmp_path):
    """Use the Word COM count when extended properties yield no page count."""
    docx_path = tmp_path / "without-pages.docx"
    Document().save(docx_path)

    # NOTE(review): unlike the legacy-format tests below, this test does not
    # patch `_can_try_com_fallback` -- confirm the fallback is reachable for a
    # real .docx in every environment this suite runs in.
    monkeypatch.setattr(
        "review_agent.file_summary.services.page_count._count_docx_pages_from_extended_properties",
        lambda path: None,
    )
    monkeypatch.setattr(
        "review_agent.file_summary.services.page_count._count_word_pages_with_com",
        lambda path: 22,
    )

    result = count_document_pages(docx_path)

    assert result.status == "success"
    assert result.page_count == 22


def test_count_doc_pages_uses_word_com_fallback(monkeypatch, tmp_path):
    """Count a legacy .doc file through the (stubbed) Word COM helper."""
    # Placeholder bytes only: both the fallback gate and the COM helper are
    # stubbed, so the file never has to be a real Word document.
    doc_path = tmp_path / "legacy.doc"
    doc_path.write_bytes(b"legacy-doc-placeholder")

    monkeypatch.setattr(
        "review_agent.file_summary.services.page_count._can_try_com_fallback",
        lambda path, ext: True,
    )
    monkeypatch.setattr(
        "review_agent.file_summary.services.page_count._count_word_pages_with_com",
        lambda path: 5,
    )

    result = count_document_pages(doc_path)

    assert result.status == "success"
    assert result.page_count == 5


def test_count_ppt_pages_uses_powerpoint_com_fallback(monkeypatch, tmp_path):
    """Count a legacy .ppt file through the (stubbed) PowerPoint COM helper."""
    ppt_path = tmp_path / "legacy.ppt"
    ppt_path.write_bytes(b"legacy-ppt-placeholder")

    monkeypatch.setattr(
        "review_agent.file_summary.services.page_count._can_try_com_fallback",
        lambda path, ext: True,
    )
    monkeypatch.setattr(
        "review_agent.file_summary.services.page_count._count_powerpoint_slides_with_com",
        lambda path: 9,
    )

    result = count_document_pages(ppt_path)

    assert result.status == "success"
    assert result.page_count == 9


def test_count_excel_pages_uses_excel_com_fallback(monkeypatch, tmp_path):
    """Count a legacy .xls file through the (stubbed) Excel COM helper."""
    xls_path = tmp_path / "legacy.xls"
    xls_path.write_bytes(b"legacy-xls-placeholder")

    monkeypatch.setattr(
        "review_agent.file_summary.services.page_count._can_try_com_fallback",
        lambda path, ext: True,
    )
    monkeypatch.setattr(
        "review_agent.file_summary.services.page_count._count_excel_sheets_with_com",
        lambda path: 3,
    )

    result = count_document_pages(xls_path)

    assert result.status == "success"
    assert result.page_count == 3


def test_invalid_xlsx_does_not_start_excel_com(monkeypatch, tmp_path):
    """Report "uncertain" for a corrupt .xlsx without invoking Excel COM."""
    xlsx_path = tmp_path / "broken.xlsx"
    xlsx_path.write_bytes(b"not a real workbook")

    def fail_if_called(path):
        # Any call to the COM helper fails the test immediately.
        raise AssertionError("Excel COM should not start for invalid xlsx signatures")

    monkeypatch.setattr(
        "review_agent.file_summary.services.page_count._count_excel_sheets_with_com",
        fail_if_called,
    )

    result = count_document_pages(xlsx_path)

    assert result.status == "uncertain"


def test_document_page_count_skill_marks_unsupported_and_success(tmp_path, django_user_model):
    """Run the skill on a batch holding one .xlsx and one .txt item."""
    xlsx_path = tmp_path / "a.xlsx"
    workbook = Workbook()
    workbook.save(xlsx_path)

    txt_path = tmp_path / "a.txt"
    txt_path.write_text("x", encoding="utf-8")

    user = django_user_model.objects.create_user(username="owner", password="pass")
    conversation = Conversation.objects.create(user=user, title="会话")
    batch = FileSummaryBatch.objects.create(conversation=conversation, user=user, batch_no="FS-P")
    xlsx_item = FileSummaryItem.objects.create(
        batch=batch,
        file_index=1,
        file_name="a.xlsx",
        file_type="xlsx",
        relative_path="a.xlsx",
        storage_path=str(xlsx_path),
    )
    txt_item = FileSummaryItem.objects.create(
        batch=batch,
        file_index=2,
        file_name="a.txt",
        file_type="txt",
        relative_path="a.txt",
        storage_path=str(txt_path),
    )

    result = DocumentPageCountSkill().run(WorkflowContext(batch=batch))

    # Reload both rows so the assertions see the statuses stored in the
    # database rather than the stale in-memory instances created above.
    xlsx_item.refresh_from_db()
    txt_item.refresh_from_db()

    assert result.success is True
    assert xlsx_item.statistics_status == FileSummaryItem.StatisticsStatus.SUCCESS
    assert txt_item.statistics_status == FileSummaryItem.StatisticsStatus.UNSUPPORTED