fix(file-summary): 避免无效 Office 文件触发 COM 统计
This commit is contained in:
@@ -4,7 +4,7 @@ import logging
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from xml.etree import ElementTree
|
from xml.etree import ElementTree
|
||||||
from zipfile import ZipFile
|
from zipfile import ZipFile, is_zipfile
|
||||||
|
|
||||||
|
|
||||||
SUPPORTED_EXTENSIONS = {"pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx"}
|
SUPPORTED_EXTENSIONS = {"pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx"}
|
||||||
@@ -33,32 +33,38 @@ def count_document_pages(path: str | Path) -> PageCountResult:
|
|||||||
pages = _count_docx_pages_from_extended_properties(file_path)
|
pages = _count_docx_pages_from_extended_properties(file_path)
|
||||||
if pages:
|
if pages:
|
||||||
return PageCountResult(status="success", page_count=pages)
|
return PageCountResult(status="success", page_count=pages)
|
||||||
pages = _count_word_pages_with_com(file_path)
|
pages = _count_word_pages_with_com(file_path) if _can_try_com_fallback(file_path, ext) else None
|
||||||
if pages:
|
if pages:
|
||||||
return PageCountResult(status="success", page_count=pages)
|
return PageCountResult(status="success", page_count=pages)
|
||||||
return PageCountResult(status="uncertain")
|
return PageCountResult(status="uncertain")
|
||||||
if ext == "xlsx":
|
if ext == "xlsx":
|
||||||
pages = _count_xlsx_sheets(file_path) or _count_excel_sheets_with_com(file_path)
|
pages = _count_xlsx_sheets(file_path) or (
|
||||||
|
_count_excel_sheets_with_com(file_path) if _can_try_com_fallback(file_path, ext) else None
|
||||||
|
)
|
||||||
if pages:
|
if pages:
|
||||||
return PageCountResult(status="success", page_count=pages)
|
return PageCountResult(status="success", page_count=pages)
|
||||||
return PageCountResult(status="uncertain")
|
return PageCountResult(status="uncertain")
|
||||||
if ext == "xls":
|
if ext == "xls":
|
||||||
pages = _count_xls_sheets(file_path) or _count_excel_sheets_with_com(file_path)
|
pages = _count_xls_sheets(file_path) or (
|
||||||
|
_count_excel_sheets_with_com(file_path) if _can_try_com_fallback(file_path, ext) else None
|
||||||
|
)
|
||||||
if pages:
|
if pages:
|
||||||
return PageCountResult(status="success", page_count=pages)
|
return PageCountResult(status="success", page_count=pages)
|
||||||
return PageCountResult(status="uncertain")
|
return PageCountResult(status="uncertain")
|
||||||
if ext == "pptx":
|
if ext == "pptx":
|
||||||
pages = _count_pptx_slides(file_path) or _count_powerpoint_slides_with_com(file_path)
|
pages = _count_pptx_slides(file_path) or (
|
||||||
|
_count_powerpoint_slides_with_com(file_path) if _can_try_com_fallback(file_path, ext) else None
|
||||||
|
)
|
||||||
if pages:
|
if pages:
|
||||||
return PageCountResult(status="success", page_count=pages)
|
return PageCountResult(status="success", page_count=pages)
|
||||||
return PageCountResult(status="uncertain")
|
return PageCountResult(status="uncertain")
|
||||||
if ext == "doc":
|
if ext == "doc":
|
||||||
pages = _count_word_pages_with_com(file_path)
|
pages = _count_word_pages_with_com(file_path) if _can_try_com_fallback(file_path, ext) else None
|
||||||
if pages:
|
if pages:
|
||||||
return PageCountResult(status="success", page_count=pages)
|
return PageCountResult(status="success", page_count=pages)
|
||||||
return _ole_uncertain_or_failed(file_path)
|
return _ole_uncertain_or_failed(file_path)
|
||||||
if ext == "ppt":
|
if ext == "ppt":
|
||||||
pages = _count_powerpoint_slides_with_com(file_path)
|
pages = _count_powerpoint_slides_with_com(file_path) if _can_try_com_fallback(file_path, ext) else None
|
||||||
if pages:
|
if pages:
|
||||||
return PageCountResult(status="success", page_count=pages)
|
return PageCountResult(status="success", page_count=pages)
|
||||||
return _ole_uncertain_or_failed(file_path)
|
return _ole_uncertain_or_failed(file_path)
|
||||||
@@ -143,6 +149,20 @@ def _ole_uncertain_or_failed(path: Path) -> PageCountResult:
|
|||||||
return PageCountResult(status="uncertain")
|
return PageCountResult(status="uncertain")
|
||||||
|
|
||||||
|
|
||||||
|
def _can_try_com_fallback(path: Path, ext: str) -> bool:
|
||||||
|
if ext in {"docx", "xlsx", "pptx"}:
|
||||||
|
return is_zipfile(path)
|
||||||
|
if ext in {"doc", "xls", "ppt"}:
|
||||||
|
try:
|
||||||
|
import olefile
|
||||||
|
|
||||||
|
return olefile.isOleFile(str(path))
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning("OLE signature check failed", extra={"path": str(path), "error": str(exc)})
|
||||||
|
return False
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
def _count_word_pages_with_com(path: Path) -> int | None:
|
def _count_word_pages_with_com(path: Path) -> int | None:
|
||||||
try:
|
try:
|
||||||
import pythoncom
|
import pythoncom
|
||||||
|
|||||||
@@ -77,6 +77,10 @@ def test_count_docx_pages_uses_word_com_fallback(monkeypatch, tmp_path):
|
|||||||
def test_count_doc_pages_uses_word_com_fallback(monkeypatch, tmp_path):
|
def test_count_doc_pages_uses_word_com_fallback(monkeypatch, tmp_path):
|
||||||
doc_path = tmp_path / "legacy.doc"
|
doc_path = tmp_path / "legacy.doc"
|
||||||
doc_path.write_bytes(b"legacy-doc-placeholder")
|
doc_path.write_bytes(b"legacy-doc-placeholder")
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"review_agent.file_summary.services.page_count._can_try_com_fallback",
|
||||||
|
lambda path, ext: True,
|
||||||
|
)
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
"review_agent.file_summary.services.page_count._count_word_pages_with_com",
|
"review_agent.file_summary.services.page_count._count_word_pages_with_com",
|
||||||
lambda path: 5,
|
lambda path: 5,
|
||||||
@@ -91,6 +95,10 @@ def test_count_doc_pages_uses_word_com_fallback(monkeypatch, tmp_path):
|
|||||||
def test_count_ppt_pages_uses_powerpoint_com_fallback(monkeypatch, tmp_path):
|
def test_count_ppt_pages_uses_powerpoint_com_fallback(monkeypatch, tmp_path):
|
||||||
ppt_path = tmp_path / "legacy.ppt"
|
ppt_path = tmp_path / "legacy.ppt"
|
||||||
ppt_path.write_bytes(b"legacy-ppt-placeholder")
|
ppt_path.write_bytes(b"legacy-ppt-placeholder")
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"review_agent.file_summary.services.page_count._can_try_com_fallback",
|
||||||
|
lambda path, ext: True,
|
||||||
|
)
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
"review_agent.file_summary.services.page_count._count_powerpoint_slides_with_com",
|
"review_agent.file_summary.services.page_count._count_powerpoint_slides_with_com",
|
||||||
lambda path: 9,
|
lambda path: 9,
|
||||||
@@ -105,6 +113,10 @@ def test_count_ppt_pages_uses_powerpoint_com_fallback(monkeypatch, tmp_path):
|
|||||||
def test_count_excel_pages_uses_excel_com_fallback(monkeypatch, tmp_path):
|
def test_count_excel_pages_uses_excel_com_fallback(monkeypatch, tmp_path):
|
||||||
xls_path = tmp_path / "legacy.xls"
|
xls_path = tmp_path / "legacy.xls"
|
||||||
xls_path.write_bytes(b"legacy-xls-placeholder")
|
xls_path.write_bytes(b"legacy-xls-placeholder")
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"review_agent.file_summary.services.page_count._can_try_com_fallback",
|
||||||
|
lambda path, ext: True,
|
||||||
|
)
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
"review_agent.file_summary.services.page_count._count_excel_sheets_with_com",
|
"review_agent.file_summary.services.page_count._count_excel_sheets_with_com",
|
||||||
lambda path: 3,
|
lambda path: 3,
|
||||||
@@ -116,6 +128,23 @@ def test_count_excel_pages_uses_excel_com_fallback(monkeypatch, tmp_path):
|
|||||||
assert result.page_count == 3
|
assert result.page_count == 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_invalid_xlsx_does_not_start_excel_com(monkeypatch, tmp_path):
|
||||||
|
xlsx_path = tmp_path / "broken.xlsx"
|
||||||
|
xlsx_path.write_bytes(b"not a real workbook")
|
||||||
|
|
||||||
|
def fail_if_called(path):
|
||||||
|
raise AssertionError("Excel COM should not start for invalid xlsx signatures")
|
||||||
|
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"review_agent.file_summary.services.page_count._count_excel_sheets_with_com",
|
||||||
|
fail_if_called,
|
||||||
|
)
|
||||||
|
|
||||||
|
result = count_document_pages(xlsx_path)
|
||||||
|
|
||||||
|
assert result.status == "uncertain"
|
||||||
|
|
||||||
|
|
||||||
def test_document_page_count_skill_marks_unsupported_and_success(tmp_path, django_user_model):
|
def test_document_page_count_skill_marks_unsupported_and_success(tmp_path, django_user_model):
|
||||||
xlsx_path = tmp_path / "a.xlsx"
|
xlsx_path = tmp_path / "a.xlsx"
|
||||||
workbook = Workbook()
|
workbook = Workbook()
|
||||||
|
|||||||
Reference in New Issue
Block a user