diff --git a/review_agent/file_summary/services/page_count.py b/review_agent/file_summary/services/page_count.py index 6b405a5..4f1e63a 100644 --- a/review_agent/file_summary/services/page_count.py +++ b/review_agent/file_summary/services/page_count.py @@ -4,7 +4,7 @@ import logging from dataclasses import dataclass from pathlib import Path from xml.etree import ElementTree -from zipfile import ZipFile +from zipfile import ZipFile, is_zipfile SUPPORTED_EXTENSIONS = {"pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx"} @@ -33,32 +33,38 @@ def count_document_pages(path: str | Path) -> PageCountResult: pages = _count_docx_pages_from_extended_properties(file_path) if pages: return PageCountResult(status="success", page_count=pages) - pages = _count_word_pages_with_com(file_path) + pages = _count_word_pages_with_com(file_path) if _can_try_com_fallback(file_path, ext) else None if pages: return PageCountResult(status="success", page_count=pages) return PageCountResult(status="uncertain") if ext == "xlsx": - pages = _count_xlsx_sheets(file_path) or _count_excel_sheets_with_com(file_path) + pages = _count_xlsx_sheets(file_path) or ( + _count_excel_sheets_with_com(file_path) if _can_try_com_fallback(file_path, ext) else None + ) if pages: return PageCountResult(status="success", page_count=pages) return PageCountResult(status="uncertain") if ext == "xls": - pages = _count_xls_sheets(file_path) or _count_excel_sheets_with_com(file_path) + pages = _count_xls_sheets(file_path) or ( + _count_excel_sheets_with_com(file_path) if _can_try_com_fallback(file_path, ext) else None + ) if pages: return PageCountResult(status="success", page_count=pages) return PageCountResult(status="uncertain") if ext == "pptx": - pages = _count_pptx_slides(file_path) or _count_powerpoint_slides_with_com(file_path) + pages = _count_pptx_slides(file_path) or ( + _count_powerpoint_slides_with_com(file_path) if _can_try_com_fallback(file_path, ext) else None + ) if pages: return PageCountResult(status="success", page_count=pages) return PageCountResult(status="uncertain") if ext == "doc": - pages = _count_word_pages_with_com(file_path) + pages = _count_word_pages_with_com(file_path) if _can_try_com_fallback(file_path, ext) else None if pages: return PageCountResult(status="success", page_count=pages) return _ole_uncertain_or_failed(file_path) if ext == "ppt": - pages = _count_powerpoint_slides_with_com(file_path) + pages = _count_powerpoint_slides_with_com(file_path) if _can_try_com_fallback(file_path, ext) else None if pages: return PageCountResult(status="success", page_count=pages) return _ole_uncertain_or_failed(file_path) @@ -143,6 +149,20 @@ def _ole_uncertain_or_failed(path: Path) -> PageCountResult: return PageCountResult(status="uncertain") +def _can_try_com_fallback(path: Path, ext: str) -> bool: + if ext in {"docx", "xlsx", "pptx"}: + return is_zipfile(path) + if ext in {"doc", "xls", "ppt"}: + try: + import olefile + + return olefile.isOleFile(str(path)) + except Exception as exc: + logger.warning("OLE signature check failed", extra={"path": str(path), "error": str(exc)}) + return False + return False + + def _count_word_pages_with_com(path: Path) -> int | None: try: import pythoncom diff --git a/tests/test_file_summary_page_count.py b/tests/test_file_summary_page_count.py index 3a7b4cd..1ce353e 100644 --- a/tests/test_file_summary_page_count.py +++ b/tests/test_file_summary_page_count.py @@ -77,6 +77,10 @@ def test_count_docx_pages_uses_word_com_fallback(monkeypatch, tmp_path): def test_count_doc_pages_uses_word_com_fallback(monkeypatch, tmp_path): doc_path = tmp_path / "legacy.doc" doc_path.write_bytes(b"legacy-doc-placeholder") + monkeypatch.setattr( + "review_agent.file_summary.services.page_count._can_try_com_fallback", + lambda path, ext: True, + ) monkeypatch.setattr( "review_agent.file_summary.services.page_count._count_word_pages_with_com", lambda path: 5, @@ -91,6 +95,10 @@ def test_count_doc_pages_uses_word_com_fallback(monkeypatch, tmp_path): def test_count_ppt_pages_uses_powerpoint_com_fallback(monkeypatch, tmp_path): ppt_path = tmp_path / "legacy.ppt" ppt_path.write_bytes(b"legacy-ppt-placeholder") + monkeypatch.setattr( + "review_agent.file_summary.services.page_count._can_try_com_fallback", + lambda path, ext: True, + ) monkeypatch.setattr( "review_agent.file_summary.services.page_count._count_powerpoint_slides_with_com", lambda path: 9, @@ -105,6 +113,10 @@ def test_count_ppt_pages_uses_powerpoint_com_fallback(monkeypatch, tmp_path): def test_count_excel_pages_uses_excel_com_fallback(monkeypatch, tmp_path): xls_path = tmp_path / "legacy.xls" xls_path.write_bytes(b"legacy-xls-placeholder") + monkeypatch.setattr( + "review_agent.file_summary.services.page_count._can_try_com_fallback", + lambda path, ext: True, + ) monkeypatch.setattr( "review_agent.file_summary.services.page_count._count_excel_sheets_with_com", lambda path: 3, @@ -116,6 +128,23 @@ def test_count_excel_pages_uses_excel_com_fallback(monkeypatch, tmp_path): assert result.page_count == 3 +def test_invalid_xlsx_does_not_start_excel_com(monkeypatch, tmp_path): + xlsx_path = tmp_path / "broken.xlsx" + xlsx_path.write_bytes(b"not a real workbook") + + def fail_if_called(path): + raise AssertionError("Excel COM should not start for invalid xlsx signatures") + + monkeypatch.setattr( + "review_agent.file_summary.services.page_count._count_excel_sheets_with_com", + fail_if_called, + ) + + result = count_document_pages(xlsx_path) + + assert result.status == "uncertain" + + def test_document_page_count_skill_marks_unsupported_and_success(tmp_path, django_user_model): xlsx_path = tmp_path / "a.xlsx" workbook = Workbook()