"""Count pages, sheets, or slides in PDF and Microsoft Office documents.

The public entry point is :func:`count_document_pages`.  For spreadsheets the
reported "page count" is the number of sheets and for presentations it is the
number of slides.  Parsing libraries (``pypdf``, ``openpyxl``, ``xlrd``,
``python-pptx``, ``olefile``) and the Windows COM bridge (``pywin32``) are
imported lazily inside the functions that use them, so a missing optional
dependency only affects the formats that need it.
"""

from __future__ import annotations

import logging
from dataclasses import dataclass
from pathlib import Path
from xml.etree import ElementTree
from zipfile import ZipFile, is_zipfile

SUPPORTED_EXTENSIONS = {"pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx"}

logger = logging.getLogger("review_agent.file_summary.page_count")

# XML namespace of the elements inside ``docProps/app.xml``.
_EXTENDED_PROPERTIES_NS = (
    "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties"
)
# Word ``WdStatistic.wdStatisticPages``.
_WD_STATISTIC_PAGES = 2
# Office ``MsoAutomationSecurity.msoAutomationSecurityForceDisable``.
_MSO_AUTOMATION_SECURITY_FORCE_DISABLE = 3


@dataclass(frozen=True)
class PageCountResult:
    """Outcome of one page-count attempt.

    The statuses produced by this module are ``"success"``, ``"uncertain"``,
    ``"unsupported"`` and ``"failed"``.
    """

    # One of the status strings listed in the class docstring.
    status: str
    # Pages / sheets / slides; only set when ``status == "success"``.
    page_count: int | None = None
    # Human-readable reason; only set when ``status == "failed"``.
    error_message: str = ""


def count_document_pages(path: str | Path) -> PageCountResult:
    """Return the page (or sheet / slide) count of the document at *path*.

    The format is chosen from the file extension alone.

    Args:
        path: Location of the document.

    Returns:
        A :class:`PageCountResult` whose status is

        * ``"unsupported"`` when the extension is not in
          ``SUPPORTED_EXTENSIONS`` (the file is not touched),
        * ``"success"`` with ``page_count`` set when a count was obtained,
        * ``"uncertain"`` when the file was accepted but no counter produced
          a positive number,
        * ``"failed"`` when the path is not an existing regular file, when a
          ``.doc`` / ``.ppt`` file is not an OLE container, or when an
          unexpected exception occurred.

    This function does not raise; every exception is converted into a
    ``"failed"`` result.
    """
    file_path = Path(path)
    ext = file_path.suffix.lower().lstrip(".")
    if ext not in SUPPORTED_EXTENSIONS:
        return PageCountResult(status="unsupported")

    try:
        # Without this guard a missing file is reported as "uncertain" for
        # every Office format, because each helper swallows the OSError.
        if not file_path.is_file():
            return PageCountResult(
                status="failed",
                error_message="文件不存在或不是常规文件。",
            )

        if ext == "pdf":
            from pypdf import PdfReader

            return PageCountResult(
                status="success",
                page_count=len(PdfReader(str(file_path)).pages),
            )

        if ext == "docx":
            pages = _count_docx_pages_from_extended_properties(file_path)
            if pages:
                return PageCountResult(status="success", page_count=pages)
            pages = (
                _count_word_pages_with_com(file_path)
                if _can_try_com_fallback(file_path, ext)
                else None
            )
            if pages:
                return PageCountResult(status="success", page_count=pages)
            return PageCountResult(status="uncertain")

        if ext == "xlsx":
            # ``or`` keeps the COM fallback lazy: Excel is only started when
            # the library-based count produced nothing.
            pages = _count_xlsx_sheets(file_path) or (
                _count_excel_sheets_with_com(file_path)
                if _can_try_com_fallback(file_path, ext)
                else None
            )
            if pages:
                return PageCountResult(status="success", page_count=pages)
            return PageCountResult(status="uncertain")

        if ext == "xls":
            pages = _count_xls_sheets(file_path) or (
                _count_excel_sheets_with_com(file_path)
                if _can_try_com_fallback(file_path, ext)
                else None
            )
            if pages:
                return PageCountResult(status="success", page_count=pages)
            return PageCountResult(status="uncertain")

        if ext == "pptx":
            pages = _count_pptx_slides(file_path) or (
                _count_powerpoint_slides_with_com(file_path)
                if _can_try_com_fallback(file_path, ext)
                else None
            )
            if pages:
                return PageCountResult(status="success", page_count=pages)
            return PageCountResult(status="uncertain")

        if ext == "doc":
            pages = (
                _count_word_pages_with_com(file_path)
                if _can_try_com_fallback(file_path, ext)
                else None
            )
            if pages:
                return PageCountResult(status="success", page_count=pages)
            return _ole_uncertain_or_failed(file_path)

        if ext == "ppt":
            pages = (
                _count_powerpoint_slides_with_com(file_path)
                if _can_try_com_fallback(file_path, ext)
                else None
            )
            if pages:
                return PageCountResult(status="success", page_count=pages)
            return _ole_uncertain_or_failed(file_path)
    except Exception as exc:
        logger.warning(
            "Page count failed",
            extra={"path": str(file_path), "error": str(exc)},
        )
        # Some exceptions stringify to ""; fall back to the type name so a
        # "failed" result always carries a reason.
        return PageCountResult(
            status="failed",
            error_message=str(exc) or type(exc).__name__,
        )

    # Safety net in case an extension is added to SUPPORTED_EXTENSIONS without
    # a matching branch above.
    return PageCountResult(status="uncertain")


def _parse_xml_bytes(raw: bytes) -> ElementTree.Element:
    """Parse *raw* as XML, honouring its own BOM / encoding declaration.

    If parsing the bytes fails, retry on a lossy UTF-8 decoding of the same
    bytes, so payloads that contain stray invalid UTF-8 bytes are still
    accepted.

    Raises:
        xml.etree.ElementTree.ParseError: If the fallback parse fails too.
    """
    try:
        return ElementTree.fromstring(raw)
    except (ElementTree.ParseError, LookupError, ValueError):
        # LookupError / ValueError cover encodings the parser cannot handle.
        return ElementTree.fromstring(raw.decode("utf-8", errors="replace"))


def _count_docx_pages_from_extended_properties(path: Path) -> int | None:
    """Read the ``Pages`` element from ``docProps/app.xml`` inside a DOCX.

    Returns:
        The positive page count stored in the package, or ``None`` when the
        archive cannot be read, the part or element is missing, the XML is
        malformed, or the value is not a positive integer.
    """
    try:
        with ZipFile(path) as archive:
            app_entries = [
                item
                for item in archive.infolist()
                if item.filename == "docProps/app.xml"
            ]
            if not app_entries:
                return None
            # With duplicate entries, use the last one in the archive listing.
            raw = archive.read(app_entries[-1])
    except Exception as exc:
        logger.warning(
            "DOCX extended properties read failed",
            extra={"path": str(path), "error": str(exc)},
        )
        return None

    try:
        root = _parse_xml_bytes(raw)
    except ElementTree.ParseError as exc:
        logger.warning(
            "DOCX extended properties parse failed",
            extra={"path": str(path), "error": str(exc)},
        )
        return None

    pages_node = root.find(f"{{{_EXTENDED_PROPERTIES_NS}}}Pages")
    if pages_node is None or not pages_node.text:
        return None
    return _positive_int(pages_node.text)


def _count_xlsx_sheets(path: Path) -> int | None:
    """Return the number of sheet names openpyxl reports, or ``None`` on error."""
    try:
        from openpyxl import load_workbook

        workbook = load_workbook(str(path), read_only=True, data_only=True)
        try:
            return _positive_int(len(workbook.sheetnames))
        finally:
            workbook.close()
    except Exception as exc:
        logger.warning(
            "XLSX sheet count failed",
            extra={"path": str(path), "error": str(exc)},
        )
        return None


def _count_xls_sheets(path: Path) -> int | None:
    """Return the number of sheets xlrd reports, or ``None`` on error."""
    try:
        import xlrd

        workbook = xlrd.open_workbook(str(path), on_demand=True)
        try:
            return _positive_int(workbook.nsheets)
        finally:
            workbook.release_resources()
    except Exception as exc:
        logger.warning(
            "XLS sheet count failed",
            extra={"path": str(path), "error": str(exc)},
        )
        return None


def _count_pptx_slides(path: Path) -> int | None:
    """Return the number of slides python-pptx reports, or ``None`` on error."""
    try:
        from pptx import Presentation

        return _positive_int(len(Presentation(str(path)).slides))
    except Exception as exc:
        logger.warning(
            "PPTX slide count failed",
            extra={"path": str(path), "error": str(exc)},
        )
        return None


def _ole_uncertain_or_failed(path: Path) -> PageCountResult:
    """Classify a legacy Office file for which no count could be obtained.

    Returns ``"uncertain"`` when the file is an OLE container, or when the
    check itself could not run (for example ``olefile`` is not installed).
    Returns ``"failed"`` when the file is not an OLE container.
    """
    try:
        import olefile

        if olefile.isOleFile(str(path)):
            return PageCountResult(status="uncertain")
        return PageCountResult(status="failed", error_message="不是有效的 OLE 文件。")
    except Exception as exc:
        logger.warning(
            "OLE validation failed",
            extra={"path": str(path), "error": str(exc)},
        )
        return PageCountResult(status="uncertain")


def _can_try_com_fallback(path: Path, ext: str) -> bool:
    """Return True when *path* has the container signature expected for *ext*.

    OOXML extensions must be ZIP archives and legacy extensions must be OLE
    files.  Used to avoid starting an Office application for a file that is
    not of the claimed container type.
    """
    if ext in {"docx", "xlsx", "pptx"}:
        return is_zipfile(path)
    if ext in {"doc", "xls", "ppt"}:
        try:
            import olefile

            return olefile.isOleFile(str(path))
        except Exception as exc:
            logger.warning(
                "OLE signature check failed",
                extra={"path": str(path), "error": str(exc)},
            )
            return False
    return False


def _force_disable_macros(application, path: Path) -> None:
    """Best-effort: tell an Office COM application not to run document macros.

    Microsoft documents the default ``AutomationSecurity`` of an application
    started through automation as "low" (macros enabled), which is unsafe for
    documents of unknown origin.  A failure to set the property is logged and
    otherwise ignored so the page count can still proceed.
    """
    try:
        application.AutomationSecurity = _MSO_AUTOMATION_SECURITY_FORCE_DISABLE
    except Exception as exc:
        logger.warning(
            "Office COM macro lockdown failed",
            extra={"path": str(path), "error": str(exc)},
        )


def _count_word_pages_with_com(path: Path) -> int | None:
    """Count pages by opening the document in Word through COM.

    Returns ``None`` when pywin32 cannot be imported or when any COM call
    fails.  The document and the Word instance are closed in all cases.
    """
    try:
        import pythoncom
        import win32com.client
    except Exception as exc:
        logger.info(
            "Word COM page count unavailable",
            extra={"path": str(path), "error": str(exc)},
        )
        return None

    word = None
    document = None
    pythoncom.CoInitialize()
    try:
        word = win32com.client.DispatchEx("Word.Application")
        word.Visible = False
        word.DisplayAlerts = 0
        _force_disable_macros(word, path)
        document = word.Documents.Open(
            str(path.resolve()),
            ReadOnly=True,
            AddToRecentFiles=False,
            ConfirmConversions=False,
        )
        document.Repaginate()
        return _positive_int(document.ComputeStatistics(_WD_STATISTIC_PAGES))
    except Exception as exc:
        logger.warning(
            "Word COM page count failed",
            extra={"path": str(path), "error": str(exc)},
        )
        return None
    finally:
        # Each cleanup step is isolated so that one failing does not skip
        # the remaining ones.
        try:
            if document is not None:
                document.Close(False)
        except Exception as exc:
            logger.debug(
                "Word document close failed",
                extra={"path": str(path), "error": str(exc)},
            )
        try:
            if word is not None:
                word.Quit()
        except Exception as exc:
            logger.debug(
                "Word application quit failed",
                extra={"path": str(path), "error": str(exc)},
            )
        pythoncom.CoUninitialize()


def _count_powerpoint_slides_with_com(path: Path) -> int | None:
    """Count slides by opening the presentation in PowerPoint through COM.

    Returns ``None`` when pywin32 cannot be imported or when any COM call
    fails.  The presentation and the PowerPoint instance are closed in all
    cases.
    """
    try:
        import pythoncom
        import win32com.client
    except Exception as exc:
        logger.info(
            "PowerPoint COM slide count unavailable",
            extra={"path": str(path), "error": str(exc)},
        )
        return None

    powerpoint = None
    presentation = None
    pythoncom.CoInitialize()
    try:
        powerpoint = win32com.client.DispatchEx("PowerPoint.Application")
        _force_disable_macros(powerpoint, path)
        presentation = powerpoint.Presentations.Open(
            str(path.resolve()),
            ReadOnly=True,
            Untitled=False,
            WithWindow=False,
        )
        return _positive_int(presentation.Slides.Count)
    except Exception as exc:
        logger.warning(
            "PowerPoint COM slide count failed",
            extra={"path": str(path), "error": str(exc)},
        )
        return None
    finally:
        try:
            if presentation is not None:
                presentation.Close()
        except Exception as exc:
            logger.debug(
                "PowerPoint presentation close failed",
                extra={"path": str(path), "error": str(exc)},
            )
        try:
            if powerpoint is not None:
                powerpoint.Quit()
        except Exception as exc:
            logger.debug(
                "PowerPoint application quit failed",
                extra={"path": str(path), "error": str(exc)},
            )
        pythoncom.CoUninitialize()


def _count_excel_sheets_with_com(path: Path) -> int | None:
    """Count worksheets by opening the workbook in Excel through COM.

    Returns ``None`` when pywin32 cannot be imported or when any COM call
    fails.  The workbook and the Excel instance are closed in all cases.
    """
    try:
        import pythoncom
        import win32com.client
    except Exception as exc:
        logger.info(
            "Excel COM sheet count unavailable",
            extra={"path": str(path), "error": str(exc)},
        )
        return None

    excel = None
    workbook = None
    pythoncom.CoInitialize()
    try:
        excel = win32com.client.DispatchEx("Excel.Application")
        excel.Visible = False
        excel.DisplayAlerts = False
        _force_disable_macros(excel, path)
        workbook = excel.Workbooks.Open(str(path.resolve()), ReadOnly=True)
        return _positive_int(workbook.Worksheets.Count)
    except Exception as exc:
        logger.warning(
            "Excel COM sheet count failed",
            extra={"path": str(path), "error": str(exc)},
        )
        return None
    finally:
        try:
            if workbook is not None:
                workbook.Close(False)
        except Exception as exc:
            logger.debug(
                "Excel workbook close failed",
                extra={"path": str(path), "error": str(exc)},
            )
        try:
            if excel is not None:
                excel.Quit()
        except Exception as exc:
            logger.debug(
                "Excel application quit failed",
                extra={"path": str(path), "error": str(exc)},
            )
        pythoncom.CoUninitialize()


def _positive_int(value) -> int | None:
    """Convert *value* to an ``int`` and return it only if it is > 0.

    Returns ``None`` when the conversion fails (wrong type, non-numeric text,
    NaN, infinity) or when the result is zero or negative.
    """
    try:
        number = int(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int(float("inf")).
        return None
    return number if number > 0 else None