fix(file-summary): 避免无效 Office 文件触发 COM 统计

This commit is contained in:
2026-06-06 19:44:42 +08:00
parent 460d418921
commit c78ff3a1fd
2 changed files with 56 additions and 7 deletions

View File

@@ -4,7 +4,7 @@ import logging
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from xml.etree import ElementTree from xml.etree import ElementTree
from zipfile import ZipFile from zipfile import ZipFile, is_zipfile
SUPPORTED_EXTENSIONS = {"pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx"} SUPPORTED_EXTENSIONS = {"pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx"}
@@ -33,32 +33,38 @@ def count_document_pages(path: str | Path) -> PageCountResult:
pages = _count_docx_pages_from_extended_properties(file_path) pages = _count_docx_pages_from_extended_properties(file_path)
if pages: if pages:
return PageCountResult(status="success", page_count=pages) return PageCountResult(status="success", page_count=pages)
pages = _count_word_pages_with_com(file_path) pages = _count_word_pages_with_com(file_path) if _can_try_com_fallback(file_path, ext) else None
if pages: if pages:
return PageCountResult(status="success", page_count=pages) return PageCountResult(status="success", page_count=pages)
return PageCountResult(status="uncertain") return PageCountResult(status="uncertain")
if ext == "xlsx": if ext == "xlsx":
pages = _count_xlsx_sheets(file_path) or _count_excel_sheets_with_com(file_path) pages = _count_xlsx_sheets(file_path) or (
_count_excel_sheets_with_com(file_path) if _can_try_com_fallback(file_path, ext) else None
)
if pages: if pages:
return PageCountResult(status="success", page_count=pages) return PageCountResult(status="success", page_count=pages)
return PageCountResult(status="uncertain") return PageCountResult(status="uncertain")
if ext == "xls": if ext == "xls":
pages = _count_xls_sheets(file_path) or _count_excel_sheets_with_com(file_path) pages = _count_xls_sheets(file_path) or (
_count_excel_sheets_with_com(file_path) if _can_try_com_fallback(file_path, ext) else None
)
if pages: if pages:
return PageCountResult(status="success", page_count=pages) return PageCountResult(status="success", page_count=pages)
return PageCountResult(status="uncertain") return PageCountResult(status="uncertain")
if ext == "pptx": if ext == "pptx":
pages = _count_pptx_slides(file_path) or _count_powerpoint_slides_with_com(file_path) pages = _count_pptx_slides(file_path) or (
_count_powerpoint_slides_with_com(file_path) if _can_try_com_fallback(file_path, ext) else None
)
if pages: if pages:
return PageCountResult(status="success", page_count=pages) return PageCountResult(status="success", page_count=pages)
return PageCountResult(status="uncertain") return PageCountResult(status="uncertain")
if ext == "doc": if ext == "doc":
pages = _count_word_pages_with_com(file_path) pages = _count_word_pages_with_com(file_path) if _can_try_com_fallback(file_path, ext) else None
if pages: if pages:
return PageCountResult(status="success", page_count=pages) return PageCountResult(status="success", page_count=pages)
return _ole_uncertain_or_failed(file_path) return _ole_uncertain_or_failed(file_path)
if ext == "ppt": if ext == "ppt":
pages = _count_powerpoint_slides_with_com(file_path) pages = _count_powerpoint_slides_with_com(file_path) if _can_try_com_fallback(file_path, ext) else None
if pages: if pages:
return PageCountResult(status="success", page_count=pages) return PageCountResult(status="success", page_count=pages)
return _ole_uncertain_or_failed(file_path) return _ole_uncertain_or_failed(file_path)
@@ -143,6 +149,20 @@ def _ole_uncertain_or_failed(path: Path) -> PageCountResult:
return PageCountResult(status="uncertain") return PageCountResult(status="uncertain")
def _can_try_com_fallback(path: Path, ext: str) -> bool:
if ext in {"docx", "xlsx", "pptx"}:
return is_zipfile(path)
if ext in {"doc", "xls", "ppt"}:
try:
import olefile
return olefile.isOleFile(str(path))
except Exception as exc:
logger.warning("OLE signature check failed", extra={"path": str(path), "error": str(exc)})
return False
return False
def _count_word_pages_with_com(path: Path) -> int | None: def _count_word_pages_with_com(path: Path) -> int | None:
try: try:
import pythoncom import pythoncom

View File

@@ -77,6 +77,10 @@ def test_count_docx_pages_uses_word_com_fallback(monkeypatch, tmp_path):
def test_count_doc_pages_uses_word_com_fallback(monkeypatch, tmp_path): def test_count_doc_pages_uses_word_com_fallback(monkeypatch, tmp_path):
doc_path = tmp_path / "legacy.doc" doc_path = tmp_path / "legacy.doc"
doc_path.write_bytes(b"legacy-doc-placeholder") doc_path.write_bytes(b"legacy-doc-placeholder")
monkeypatch.setattr(
"review_agent.file_summary.services.page_count._can_try_com_fallback",
lambda path, ext: True,
)
monkeypatch.setattr( monkeypatch.setattr(
"review_agent.file_summary.services.page_count._count_word_pages_with_com", "review_agent.file_summary.services.page_count._count_word_pages_with_com",
lambda path: 5, lambda path: 5,
@@ -91,6 +95,10 @@ def test_count_doc_pages_uses_word_com_fallback(monkeypatch, tmp_path):
def test_count_ppt_pages_uses_powerpoint_com_fallback(monkeypatch, tmp_path): def test_count_ppt_pages_uses_powerpoint_com_fallback(monkeypatch, tmp_path):
ppt_path = tmp_path / "legacy.ppt" ppt_path = tmp_path / "legacy.ppt"
ppt_path.write_bytes(b"legacy-ppt-placeholder") ppt_path.write_bytes(b"legacy-ppt-placeholder")
monkeypatch.setattr(
"review_agent.file_summary.services.page_count._can_try_com_fallback",
lambda path, ext: True,
)
monkeypatch.setattr( monkeypatch.setattr(
"review_agent.file_summary.services.page_count._count_powerpoint_slides_with_com", "review_agent.file_summary.services.page_count._count_powerpoint_slides_with_com",
lambda path: 9, lambda path: 9,
@@ -105,6 +113,10 @@ def test_count_ppt_pages_uses_powerpoint_com_fallback(monkeypatch, tmp_path):
def test_count_excel_pages_uses_excel_com_fallback(monkeypatch, tmp_path): def test_count_excel_pages_uses_excel_com_fallback(monkeypatch, tmp_path):
xls_path = tmp_path / "legacy.xls" xls_path = tmp_path / "legacy.xls"
xls_path.write_bytes(b"legacy-xls-placeholder") xls_path.write_bytes(b"legacy-xls-placeholder")
monkeypatch.setattr(
"review_agent.file_summary.services.page_count._can_try_com_fallback",
lambda path, ext: True,
)
monkeypatch.setattr( monkeypatch.setattr(
"review_agent.file_summary.services.page_count._count_excel_sheets_with_com", "review_agent.file_summary.services.page_count._count_excel_sheets_with_com",
lambda path: 3, lambda path: 3,
@@ -116,6 +128,23 @@ def test_count_excel_pages_uses_excel_com_fallback(monkeypatch, tmp_path):
assert result.page_count == 3 assert result.page_count == 3
def test_invalid_xlsx_does_not_start_excel_com(monkeypatch, tmp_path):
xlsx_path = tmp_path / "broken.xlsx"
xlsx_path.write_bytes(b"not a real workbook")
def fail_if_called(path):
raise AssertionError("Excel COM should not start for invalid xlsx signatures")
monkeypatch.setattr(
"review_agent.file_summary.services.page_count._count_excel_sheets_with_com",
fail_if_called,
)
result = count_document_pages(xlsx_path)
assert result.status == "uncertain"
def test_document_page_count_skill_marks_unsupported_and_success(tmp_path, django_user_model): def test_document_page_count_skill_marks_unsupported_and_success(tmp_path, django_user_model):
xlsx_path = tmp_path / "a.xlsx" xlsx_path = tmp_path / "a.xlsx"
workbook = Workbook() workbook = Workbook()