feat(file-summary): 实现文件处理技能链路

This commit is contained in:
2026-06-06 01:20:26 +08:00
parent 51e7c0c007
commit 18d045d487
19 changed files with 604 additions and 9 deletions

View File

@@ -0,0 +1,59 @@
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
# Lower-case file extensions (without the leading dot) that
# count_document_pages accepts; any other extension is reported with
# status "unsupported".
SUPPORTED_EXTENSIONS = {"pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx"}
@dataclass(frozen=True)
class PageCountResult:
    """Immutable outcome of one page-counting attempt.

    Instances are frozen, so fields cannot be reassigned after construction.
    """

    # Outcome label. count_document_pages in this module produces
    # "success", "uncertain", "unsupported" and "failed".
    status: str

    # Number of pages (or sheets / slides); None when no count was obtained.
    page_count: int | None = None

    # Human-readable failure detail; empty string when there is none.
    error_message: str = ""
def _read_docx_cached_pages(file_path: Path) -> int | None:
    """Return the page count cached in a .docx's ``docProps/app.xml``.

    The value is whatever the producing application last stored in the
    ``<Pages>`` element; it is metadata, not a fresh layout computation, so
    it may be stale or missing. Returns ``None`` when the part or the element
    is absent, or when the stored value is not a positive integer.
    """
    import re
    import zipfile

    # app.xml is normally tiny; cap the read so a hostile archive member
    # cannot force a huge decompression.
    read_limit = 1 << 20
    try:
        with zipfile.ZipFile(file_path) as archive:
            with archive.open("docProps/app.xml") as member:
                payload = member.read(read_limit)
    except KeyError:
        # ZipFile.open raises KeyError when the member does not exist.
        return None

    # Bounded digit count keeps int() cheap regardless of file contents.
    found = re.search(rb"<(?:\w+:)?Pages>\s*(\d{1,9})\s*</(?:\w+:)?Pages>", payload)
    if found is None:
        return None
    pages = int(found.group(1))
    return pages if pages > 0 else None


def count_document_pages(path: str | Path) -> PageCountResult:
    """Count the pages (or page-like units) of an office document.

    The format is chosen from the file extension (case-insensitive):

    * ``pdf``  - number of pages, via pypdf.
    * ``docx`` - cached page count from the document metadata, when present;
      otherwise ``"uncertain"``.
    * ``xlsx`` / ``xls`` - number of worksheets, via openpyxl / xlrd.
    * ``pptx`` - number of slides, via python-pptx.
    * ``doc`` / ``ppt`` - only checked for being an OLE container; a valid
      file yields ``"uncertain"`` because no count is extracted.

    Args:
        path: Location of the document on disk.

    Returns:
        A ``PageCountResult`` whose ``status`` is one of ``"success"``,
        ``"uncertain"``, ``"unsupported"`` (extension not in
        ``SUPPORTED_EXTENSIONS``) or ``"failed"`` (any exception while
        reading, with the detail in ``error_message``).

    This function does not raise for unreadable input; every ``Exception``
    raised while parsing is converted into a ``"failed"`` result.
    """
    file_path = Path(path)
    ext = file_path.suffix.lower().lstrip(".")
    if ext not in SUPPORTED_EXTENSIONS:
        return PageCountResult(status="unsupported")

    try:
        # Parser libraries are imported inside each branch, so a missing
        # optional dependency is reported as "failed" for that format only.
        if ext == "pdf":
            from pypdf import PdfReader

            return PageCountResult(status="success", page_count=len(PdfReader(str(file_path)).pages))

        if ext == "docx":
            from docx import Document

            # Opening with python-docx first means an unreadable package
            # raises here and is reported as "failed".
            properties = Document(str(file_path)).core_properties
            # python-docx core properties do not expose a page count, so the
            # getattr probe is normally None; the count actually lives in the
            # extended properties part (docProps/app.xml).
            pages = getattr(properties, "pages", None) or _read_docx_cached_pages(file_path)
            if pages:
                return PageCountResult(status="success", page_count=pages)
            return PageCountResult(status="uncertain")

        if ext == "xlsx":
            from openpyxl import load_workbook

            workbook = load_workbook(str(file_path), read_only=True, data_only=True)
            try:
                sheet_count = len(workbook.sheetnames)
            finally:
                # Read-only workbooks keep the file open until closed.
                workbook.close()
            return PageCountResult(status="success", page_count=sheet_count)

        if ext == "xls":
            import xlrd

            workbook = xlrd.open_workbook(str(file_path), on_demand=True)
            try:
                sheet_count = workbook.nsheets
            finally:
                # on_demand mode holds resources until explicitly released.
                workbook.release_resources()
            return PageCountResult(status="success", page_count=sheet_count)

        if ext == "pptx":
            from pptx import Presentation

            return PageCountResult(status="success", page_count=len(Presentation(str(file_path)).slides))

        if ext in {"doc", "ppt"}:
            import olefile

            if olefile.isOleFile(str(file_path)):
                return PageCountResult(status="uncertain")
            return PageCountResult(status="failed", error_message="不是有效的 OLE 文件。")
    except Exception as exc:
        # Deliberate best-effort boundary: callers get a result object, never
        # an exception. Fall back to the class name so the message is never
        # empty for a failure.
        return PageCountResult(status="failed", error_message=str(exc) or type(exc).__name__)

    # Defensive fallback for an extension that is listed as supported but has
    # no branch above.
    return PageCountResult(status="uncertain")