Files
DEMO-AGENT/review_agent/file_summary/services/page_count.py

283 lines
10 KiB
Python

from __future__ import annotations
import logging
from dataclasses import dataclass
from pathlib import Path
from xml.etree import ElementTree
from zipfile import ZipFile, is_zipfile
# Extensions (lower-case, without the leading dot) that count_document_pages
# accepts; any other suffix is reported with status "unsupported".
SUPPORTED_EXTENSIONS = {
    "pdf",
    # Word
    "doc",
    "docx",
    # Excel
    "xls",
    "xlsx",
    # PowerPoint
    "ppt",
    "pptx",
}

# Logger shared by every helper in this module.
logger = logging.getLogger("review_agent.file_summary.page_count")
@dataclass(frozen=True)
class PageCountResult:
    """Immutable outcome of one page-count attempt.

    Instances are frozen, so a result cannot be modified after it has been
    created.
    """

    # Outcome label. This module produces "success", "unsupported",
    # "uncertain" and "failed".
    status: str
    # Counted pages, sheets or slides; this module only fills it in for
    # "success" results.
    page_count: int | None = None
    # Human-readable failure detail; this module only fills it in for
    # "failed" results.
    error_message: str = ""
def count_document_pages(path: str | Path) -> PageCountResult:
    """Count the pages (or sheets / slides) of a document, chosen by extension.

    Strategy per extension:

    * ``pdf``: page count reported by ``pypdf``.
    * ``docx``: the ``Pages`` value stored in the extended properties, then
      Word automation as a fallback.
    * ``xlsx`` / ``xls``: number of sheets from a Python reader, then Excel
      automation as a fallback.
    * ``pptx``: number of slides from ``python-pptx``, then PowerPoint
      automation as a fallback.
    * ``doc`` / ``ppt``: Office automation only; when that yields nothing the
      OLE signature check decides between "uncertain" and "failed".

    Office automation is attempted only when ``_can_try_com_fallback``
    accepts the file.

    Args:
        path: Location of the document.

    Returns:
        A ``PageCountResult`` whose status is "unsupported" for unknown
        extensions, "success" with ``page_count`` set when a positive count
        was obtained, "uncertain" when no count could be determined, or
        "failed" (with ``error_message``) when an exception escaped one of
        the steps.
    """
    document = Path(path)
    extension = document.suffix.lower().lstrip(".")
    if extension not in SUPPORTED_EXTENSIONS:
        return PageCountResult(status="unsupported")

    try:
        if extension == "pdf":
            from pypdf import PdfReader

            reader = PdfReader(str(document))
            return PageCountResult(status="success", page_count=len(reader.pages))

        # Pure-Python readers, tried first for the formats that have one.
        native_readers = {
            "docx": _count_docx_pages_from_extended_properties,
            "xlsx": _count_xlsx_sheets,
            "xls": _count_xls_sheets,
            "pptx": _count_pptx_slides,
        }
        # Office automation counterparts, used only when the native reader
        # produced nothing (or does not exist for the format).
        com_readers = {
            "docx": _count_word_pages_with_com,
            "doc": _count_word_pages_with_com,
            "xlsx": _count_excel_sheets_with_com,
            "xls": _count_excel_sheets_with_com,
            "pptx": _count_powerpoint_slides_with_com,
            "ppt": _count_powerpoint_slides_with_com,
        }

        count = None
        native_reader = native_readers.get(extension)
        if native_reader is not None:
            count = native_reader(document)
        # The signature check runs only after the native reader came up empty.
        if not count and _can_try_com_fallback(document, extension):
            count = com_readers[extension](document)

        if count:
            return PageCountResult(status="success", page_count=count)
        if extension in ("doc", "ppt"):
            # Legacy binary formats: tell "valid but uncountable" apart from
            # "not an OLE file at all".
            return _ole_uncertain_or_failed(document)
        return PageCountResult(status="uncertain")
    except Exception as exc:
        return PageCountResult(status="failed", error_message=str(exc))
def _count_docx_pages_from_extended_properties(path: Path) -> int | None:
    """Read the page count recorded in a DOCX file's ``docProps/app.xml``.

    Args:
        path: Location of the DOCX (ZIP) package.

    Returns:
        The positive integer held in the ``Pages`` element, or ``None`` when
        the archive cannot be read, has no ``docProps/app.xml`` entry, the
        XML does not parse, the element is missing or empty, or its value is
        not a positive integer. Read and parse failures are logged as
        warnings.
    """
    try:
        with ZipFile(path) as package:
            entry = None
            for candidate in package.infolist():
                if candidate.filename == "docProps/app.xml":
                    # Keep overwriting so the last duplicate entry wins.
                    entry = candidate
            if entry is None:
                return None
            raw_bytes = package.read(entry)
            xml_text = raw_bytes.decode("utf-8", errors="replace")
    except Exception as exc:
        logger.warning("DOCX extended properties read failed", extra={"path": str(path), "error": str(exc)})
        return None

    try:
        properties = ElementTree.fromstring(xml_text)
    except ElementTree.ParseError as exc:
        logger.warning("DOCX extended properties parse failed", extra={"path": str(path), "error": str(exc)})
        return None

    pages_tag = "{http://schemas.openxmlformats.org/officeDocument/2006/extended-properties}Pages"
    pages_element = properties.find(pages_tag)
    if pages_element is not None and pages_element.text:
        return _positive_int(pages_element.text)
    return None
def _count_xlsx_sheets(path: Path) -> int | None:
    """Return the number of sheets in an XLSX workbook, read with openpyxl.

    Args:
        path: Location of the workbook.

    Returns:
        The sheet count when it is positive; ``None`` when openpyxl is not
        importable, the workbook cannot be opened, or it reports no sheets.
        Failures are logged as warnings.
    """
    try:
        from openpyxl import load_workbook

        book = load_workbook(str(path), read_only=True, data_only=True)
        try:
            sheet_total = len(book.sheetnames)
        finally:
            # Always release the file handle, even if reading the names failed.
            book.close()
        return _positive_int(sheet_total)
    except Exception as exc:
        logger.warning("XLSX sheet count failed", extra={"path": str(path), "error": str(exc)})
        return None
def _count_xls_sheets(path: Path) -> int | None:
    """Return the number of sheets in a legacy XLS workbook, read with xlrd.

    Args:
        path: Location of the workbook.

    Returns:
        The sheet count when it is positive; ``None`` when xlrd is not
        importable, the workbook cannot be opened, or it reports no sheets.
        Failures are logged as warnings.
    """
    try:
        import xlrd

        book = xlrd.open_workbook(str(path), on_demand=True)
        try:
            sheet_total = book.nsheets
        finally:
            # Always hand xlrd's resources back, even if the read failed.
            book.release_resources()
        return _positive_int(sheet_total)
    except Exception as exc:
        logger.warning("XLS sheet count failed", extra={"path": str(path), "error": str(exc)})
        return None
def _count_pptx_slides(path: Path) -> int | None:
    """Return the number of slides in a PPTX deck, read with python-pptx.

    Args:
        path: Location of the presentation.

    Returns:
        The slide count when it is positive; ``None`` when python-pptx is
        not importable, the deck cannot be opened, or it has no slides.
        Failures are logged as warnings.
    """
    try:
        from pptx import Presentation

        deck = Presentation(str(path))
        slide_total = len(deck.slides)
        return _positive_int(slide_total)
    except Exception as exc:
        logger.warning("PPTX slide count failed", extra={"path": str(path), "error": str(exc)})
        return None
def _ole_uncertain_or_failed(path: Path) -> PageCountResult:
    """Classify a legacy Office file whose page count could not be obtained.

    Args:
        path: Location of the file to inspect.

    Returns:
        An "uncertain" result when olefile recognises the file as OLE, or
        when the check itself raises (which is logged as a warning); a
        "failed" result carrying a "not a valid OLE file" message (in
        Chinese) when the file is not OLE.
    """
    try:
        import olefile

        looks_like_ole = olefile.isOleFile(str(path))
    except Exception as exc:
        # The check could not run, so the file cannot be condemned.
        logger.warning("OLE validation failed", extra={"path": str(path), "error": str(exc)})
        return PageCountResult(status="uncertain")

    if looks_like_ole:
        return PageCountResult(status="uncertain")
    return PageCountResult(status="failed", error_message="不是有效的 OLE 文件。")
def _can_try_com_fallback(path: Path, ext: str) -> bool:
    """Decide whether the file's container format matches its extension.

    Callers use the answer to decide whether an Office automation fallback
    should be attempted for the file.

    Args:
        path: Location of the file.
        ext: Lower-case extension without the leading dot.

    Returns:
        For ``docx`` / ``xlsx`` / ``pptx``: whether the file is a ZIP
        archive. For ``doc`` / ``xls`` / ``ppt``: whether olefile recognises
        it as an OLE file (``False`` if that check raises, which is logged
        as a warning). ``False`` for every other extension.
    """
    zip_based = ("docx", "xlsx", "pptx")
    ole_based = ("doc", "xls", "ppt")

    if ext in zip_based:
        return is_zipfile(path)
    if ext not in ole_based:
        return False

    try:
        import olefile

        return olefile.isOleFile(str(path))
    except Exception as exc:
        logger.warning("OLE signature check failed", extra={"path": str(path), "error": str(exc)})
        return False
def _count_word_pages_with_com(path: Path) -> int | None:
    """Ask a Word instance, driven through COM, for a document's page count.

    Args:
        path: Location of the Word document.

    Returns:
        The positive page count, or ``None`` when ``pythoncom`` /
        ``win32com`` cannot be imported (logged at info level), when any
        automation step raises (logged as a warning), or when the reported
        count is not positive.
    """
    try:
        import pythoncom
        import win32com.client
    except Exception as exc:
        logger.info("Word COM page count unavailable", extra={"path": str(path), "error": str(exc)})
        return None

    app = None
    doc = None
    pythoncom.CoInitialize()
    try:
        app = win32com.client.DispatchEx("Word.Application")
        app.Visible = False
        app.DisplayAlerts = 0  # 0 is wdAlertsNone in Word's object model
        doc = app.Documents.Open(
            str(path.resolve()),
            ReadOnly=True,
            AddToRecentFiles=False,
            ConfirmConversions=False,
        )
        doc.Repaginate()
        page_total = doc.ComputeStatistics(2)  # 2 is wdStatisticPages
        return _positive_int(page_total)
    except Exception as exc:
        logger.warning("Word COM page count failed", extra={"path": str(path), "error": str(exc)})
        return None
    finally:
        # Tear down in reverse order of creation; a failure in one step must
        # not prevent the following ones from running.
        if doc is not None:
            try:
                doc.Close(False)  # close without saving
            except Exception as exc:
                logger.debug("Word document close failed", extra={"path": str(path), "error": str(exc)})
        if app is not None:
            try:
                app.Quit()
            except Exception as exc:
                logger.debug("Word application quit failed", extra={"path": str(path), "error": str(exc)})
        pythoncom.CoUninitialize()
def _count_powerpoint_slides_with_com(path: Path) -> int | None:
    """Ask a PowerPoint instance, driven through COM, for a deck's slide count.

    Args:
        path: Location of the presentation.

    Returns:
        The positive slide count, or ``None`` when ``pythoncom`` /
        ``win32com`` cannot be imported (logged at info level), when any
        automation step raises (logged as a warning), or when the reported
        count is not positive.
    """
    try:
        import pythoncom
        import win32com.client
    except Exception as exc:
        logger.info("PowerPoint COM slide count unavailable", extra={"path": str(path), "error": str(exc)})
        return None

    app = None
    deck = None
    pythoncom.CoInitialize()
    try:
        app = win32com.client.DispatchEx("PowerPoint.Application")
        deck = app.Presentations.Open(
            str(path.resolve()),
            ReadOnly=True,
            Untitled=False,
            WithWindow=False,
        )
        slide_total = deck.Slides.Count
        return _positive_int(slide_total)
    except Exception as exc:
        logger.warning("PowerPoint COM slide count failed", extra={"path": str(path), "error": str(exc)})
        return None
    finally:
        # Tear down in reverse order of creation; a failure in one step must
        # not prevent the following ones from running.
        if deck is not None:
            try:
                deck.Close()
            except Exception as exc:
                logger.debug("PowerPoint presentation close failed", extra={"path": str(path), "error": str(exc)})
        if app is not None:
            try:
                app.Quit()
            except Exception as exc:
                logger.debug("PowerPoint application quit failed", extra={"path": str(path), "error": str(exc)})
        pythoncom.CoUninitialize()
def _count_excel_sheets_with_com(path: Path) -> int | None:
    """Ask an Excel instance, driven through COM, for a workbook's worksheet count.

    Args:
        path: Location of the workbook.

    Returns:
        The positive ``Worksheets.Count`` value, or ``None`` when
        ``pythoncom`` / ``win32com`` cannot be imported (logged at info
        level), when any automation step raises (logged as a warning), or
        when the reported count is not positive.
    """
    try:
        import pythoncom
        import win32com.client
    except Exception as exc:
        logger.info("Excel COM sheet count unavailable", extra={"path": str(path), "error": str(exc)})
        return None

    app = None
    book = None
    pythoncom.CoInitialize()
    try:
        app = win32com.client.DispatchEx("Excel.Application")
        app.Visible = False
        app.DisplayAlerts = False
        book = app.Workbooks.Open(str(path.resolve()), ReadOnly=True)
        sheet_total = book.Worksheets.Count
        return _positive_int(sheet_total)
    except Exception as exc:
        logger.warning("Excel COM sheet count failed", extra={"path": str(path), "error": str(exc)})
        return None
    finally:
        # Tear down in reverse order of creation; a failure in one step must
        # not prevent the following ones from running.
        if book is not None:
            try:
                book.Close(False)  # close without saving
            except Exception as exc:
                logger.debug("Excel workbook close failed", extra={"path": str(path), "error": str(exc)})
        if app is not None:
            try:
                app.Quit()
            except Exception as exc:
                logger.debug("Excel application quit failed", extra={"path": str(path), "error": str(exc)})
        pythoncom.CoUninitialize()
def _positive_int(value: object) -> int | None:
    """Coerce *value* to a strictly positive integer.

    Args:
        value: Anything ``int()`` may accept, such as a numeric string read
            from XML or a number returned by an Office automation call.

    Returns:
        The converted integer when conversion succeeds and the result is
        greater than zero; ``None`` for zero, negative numbers, and anything
        ``int()`` cannot convert.
    """
    try:
        number = int(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError is what int() raises for infinite floats; without it
        # this helper would raise instead of reporting "no usable count".
        return None
    return number if number > 0 else None