fix(file-summary): 避免无效 Office 文件触发 COM 统计

This commit is contained in:
2026-06-06 19:44:42 +08:00
parent 460d418921
commit c78ff3a1fd
2 changed files with 56 additions and 7 deletions

View File

@@ -4,7 +4,7 @@ import logging
from dataclasses import dataclass
from pathlib import Path
from xml.etree import ElementTree
from zipfile import ZipFile
from zipfile import ZipFile, is_zipfile
SUPPORTED_EXTENSIONS = {"pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx"}
@@ -33,32 +33,38 @@ def count_document_pages(path: str | Path) -> PageCountResult:
pages = _count_docx_pages_from_extended_properties(file_path)
if pages:
return PageCountResult(status="success", page_count=pages)
pages = _count_word_pages_with_com(file_path)
pages = _count_word_pages_with_com(file_path) if _can_try_com_fallback(file_path, ext) else None
if pages:
return PageCountResult(status="success", page_count=pages)
return PageCountResult(status="uncertain")
if ext == "xlsx":
pages = _count_xlsx_sheets(file_path) or _count_excel_sheets_with_com(file_path)
pages = _count_xlsx_sheets(file_path) or (
_count_excel_sheets_with_com(file_path) if _can_try_com_fallback(file_path, ext) else None
)
if pages:
return PageCountResult(status="success", page_count=pages)
return PageCountResult(status="uncertain")
if ext == "xls":
pages = _count_xls_sheets(file_path) or _count_excel_sheets_with_com(file_path)
pages = _count_xls_sheets(file_path) or (
_count_excel_sheets_with_com(file_path) if _can_try_com_fallback(file_path, ext) else None
)
if pages:
return PageCountResult(status="success", page_count=pages)
return PageCountResult(status="uncertain")
if ext == "pptx":
pages = _count_pptx_slides(file_path) or _count_powerpoint_slides_with_com(file_path)
pages = _count_pptx_slides(file_path) or (
_count_powerpoint_slides_with_com(file_path) if _can_try_com_fallback(file_path, ext) else None
)
if pages:
return PageCountResult(status="success", page_count=pages)
return PageCountResult(status="uncertain")
if ext == "doc":
pages = _count_word_pages_with_com(file_path)
pages = _count_word_pages_with_com(file_path) if _can_try_com_fallback(file_path, ext) else None
if pages:
return PageCountResult(status="success", page_count=pages)
return _ole_uncertain_or_failed(file_path)
if ext == "ppt":
pages = _count_powerpoint_slides_with_com(file_path)
pages = _count_powerpoint_slides_with_com(file_path) if _can_try_com_fallback(file_path, ext) else None
if pages:
return PageCountResult(status="success", page_count=pages)
return _ole_uncertain_or_failed(file_path)
@@ -143,6 +149,20 @@ def _ole_uncertain_or_failed(path: Path) -> PageCountResult:
return PageCountResult(status="uncertain")
def _can_try_com_fallback(path: Path, ext: str) -> bool:
    """Return whether *path* has the container signature expected for *ext*.

    Used as a cheap pre-check before handing a file to an Office COM
    automation fallback, so that files which are not valid Office
    containers never reach COM.

    Args:
        path: Location of the document on disk.
        ext: File extension without the leading dot. The comparison is
            against lowercase literals, so callers are expected to pass a
            lowercase value.

    Returns:
        True when the file looks like a ZIP package (``docx``/``xlsx``/
        ``pptx``) or an OLE compound file (``doc``/``xls``/``ppt``).
        False for any other extension, or when the signature check itself
        fails (for example because the file cannot be read).
    """
    if ext in {"docx", "xlsx", "pptx"}:
        # OOXML documents are ZIP packages.
        return is_zipfile(path)
    if ext not in {"doc", "xls", "ppt"}:
        return False

    # Signature that opens every OLE compound file (legacy Office formats).
    ole_magic = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
    try:
        try:
            import olefile
        except ImportError:
            # A missing optional dependency must not be reported as a bad
            # file: that would disable the COM fallback for every legacy
            # document. Compare the header bytes directly instead.
            with open(path, "rb") as handle:
                return handle.read(len(ole_magic)) == ole_magic
        return olefile.isOleFile(str(path))
    except Exception as exc:
        # Best-effort check: any failure (unreadable file, parser error)
        # means "do not try COM" rather than crashing the page count.
        logger.warning("OLE signature check failed", extra={"path": str(path), "error": str(exc)})
        return False
def _count_word_pages_with_com(path: Path) -> int | None:
try:
import pythoncom