feat(file-summary): 实现文件处理技能链路

2026-06-06 01:20:26 +08:00
parent 51e7c0c007
commit 18d045d487
19 changed files with 604 additions and 9 deletions
--- a/review_agent/file_summary/services/__init__.py
+++ b/review_agent/file_summary/services/__init__.py
@@ -0,0 +1 @@
+
--- a/review_agent/file_summary/services/archive.py
+++ b/review_agent/file_summary/services/archive.py
@@ -0,0 +1,77 @@
+from __future__ import annotations
+
+import subprocess
+from pathlib import Path
+from zipfile import ZipFile
+
+import py7zr
+
+
+# Archive suffixes (lower-case, without the leading dot) that extract_archive
+# accepts; any other suffix makes it return an empty list.
+ARCHIVE_EXTENSIONS = {"zip", "7z", "rar"}
+
+
+def _ensure_inside_target(path: Path, target_dir: Path) -> None:
+    """Reject any path that resolves outside ``target_dir``.
+
+    Both paths are resolved before comparison, so ``..`` segments and
+    symlinks are collapsed first.
+
+    Args:
+        path: Candidate location to validate.
+        target_dir: Directory the candidate must stay within.
+
+    Raises:
+        ValueError: If ``path`` is neither ``target_dir`` itself nor one of
+            its descendants.
+    """
+    root = target_dir.resolve()
+    candidate = path.resolve()
+    if candidate == root or root in candidate.parents:
+        return
+    raise ValueError("解压路径必须位于批次工作目录内。")
+
+
+def _safe_member_path(target_dir: Path, member_name: str) -> Path:
+    """Map an archive member name to its destination under ``target_dir``.
+
+    Args:
+        target_dir: Directory the archive is being extracted into.
+        member_name: Member path as recorded inside the archive.
+
+    Returns:
+        ``target_dir`` joined with ``member_name``.
+
+    Raises:
+        ValueError: Propagated from ``_ensure_inside_target`` when the joined
+            path would land outside ``target_dir``.
+    """
+    candidate = target_dir.joinpath(member_name)
+    _ensure_inside_target(candidate, target_dir)
+    return candidate
+
+
+def extract_archive(archive_path: str | Path, target_dir: str | Path) -> list[Path]:
+    """Extract a zip, 7z or rar archive into ``target_dir``.
+
+    The format is chosen from the archive's file suffix (case-insensitive).
+
+    Args:
+        archive_path: Location of the archive file.
+        target_dir: Directory to extract into.
+
+    Returns:
+        The list produced by the format-specific extractor, or an empty list
+        when the suffix is not in ``ARCHIVE_EXTENSIONS``.
+    """
+    source = Path(archive_path)
+    destination = Path(target_dir)
+    # The directory is created before the suffix check, so it exists even when
+    # the file turns out not to be a supported archive.
+    destination.mkdir(parents=True, exist_ok=True)
+
+    suffix = source.suffix.lower().lstrip(".")
+    if suffix not in ARCHIVE_EXTENSIONS:
+        return []
+
+    if suffix == "zip":
+        extractor = _extract_zip
+    elif suffix == "7z":
+        extractor = _extract_7z
+    else:
+        # Any remaining accepted suffix is handled by the rar extractor.
+        extractor = _extract_rar
+    return extractor(source, destination)
+
+
+def _extract_zip(archive_path: Path, target_dir: Path) -> list[Path]:
+    """Extract every member of a zip archive into ``target_dir``.
+
+    Each member path is validated with ``_safe_member_path`` before anything
+    is written for it, so a member that would escape ``target_dir`` aborts the
+    extraction with ``ValueError``. Members written before the offending one
+    remain on disk.
+
+    Args:
+        archive_path: Location of the zip file.
+        target_dir: Directory to extract into.
+
+    Returns:
+        Destination paths of the extracted files, in archive order. Directory
+        entries are created but not included.
+
+    Raises:
+        ValueError: If a member path resolves outside ``target_dir``.
+    """
+    # Imported locally so this function does not depend on the module's
+    # top-level import block.
+    import shutil
+
+    extracted: list[Path] = []
+    with ZipFile(archive_path) as archive:
+        for member in archive.infolist():
+            destination = _safe_member_path(target_dir, member.filename)
+            if member.is_dir():
+                destination.mkdir(parents=True, exist_ok=True)
+                continue
+            destination.parent.mkdir(parents=True, exist_ok=True)
+            with archive.open(member) as source, destination.open("wb") as target:
+                # Stream in chunks instead of reading the whole member into
+                # memory; large members would otherwise be held entirely in RAM.
+                shutil.copyfileobj(source, target)
+            extracted.append(destination)
+    return extracted
+
+
+def _extract_7z(archive_path: Path, target_dir: Path) -> list[Path]:
+    """Extract a 7z archive into ``target_dir`` using py7zr.
+
+    Args:
+        archive_path: Location of the 7z file.
+        target_dir: Directory to extract into.
+
+    Returns:
+        Paths of the archive members that exist as regular files under
+        ``target_dir`` after extraction.
+
+    Raises:
+        ValueError: If any member name resolves outside ``target_dir``.
+    """
+    with py7zr.SevenZipFile(archive_path, mode="r") as archive:
+        member_names = archive.getnames()
+        # Validate every member name up front so nothing is written when the
+        # archive contains an escaping path.
+        for member_name in member_names:
+            _safe_member_path(target_dir, member_name)
+        archive.extractall(path=target_dir)
+
+    extracted: list[Path] = []
+    for member_name in member_names:
+        candidate = target_dir / member_name
+        if candidate.is_file():
+            extracted.append(candidate)
+    return extracted
+
+
+def _extract_rar(archive_path: Path, target_dir: Path) -> list[Path]:
+    """Extract a rar archive by invoking the external ``7z`` executable.
+
+    Args:
+        archive_path: Location of the rar file.
+        target_dir: Directory to extract into.
+
+    Returns:
+        Every regular file found under ``target_dir`` once ``7z`` has
+        finished. The listing comes from a recursive scan of the directory,
+        not from the archive's member list.
+
+    Raises:
+        RuntimeError: If ``7z`` exits with a non-zero status; the message is
+            its stderr, else its stdout, else a fixed fallback text.
+        ValueError: If a file found afterwards resolves outside
+            ``target_dir``.
+    """
+    command = ["7z", "x", f"-o{target_dir}", str(archive_path), "-y"]
+    completed = subprocess.run(command, check=False, capture_output=True, text=True)
+    if completed.returncode != 0:
+        raise RuntimeError(completed.stderr or completed.stdout or "rar 解压失败")
+
+    found: list[Path] = []
+    for entry in target_dir.rglob("*"):
+        if entry.is_file():
+            found.append(entry)
+    # This containment check runs after extraction has already happened.
+    for entry in found:
+        _ensure_inside_target(entry, target_dir)
+    return found
--- a/review_agent/file_summary/services/inventory.py
+++ b/review_agent/file_summary/services/inventory.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from review_agent.models import FileSummaryBatch, FileSummaryItem
+
+
+# Document suffixes (lower-case, without the leading dot) that count as
+# "supported" when scan_files_to_items tallies a batch. The same set is also
+# defined in services/page_count.py.
+SUPPORTED_EXTENSIONS = {"pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx"}
+
+
+def _directory_level(relative_path: Path) -> str:
+    """Return the directory portion of ``relative_path`` joined with ``/``.
+
+    Args:
+        relative_path: Path of a file relative to its scan root.
+
+    Returns:
+        All path components except the last, joined by ``/``; an empty string
+        when the path has no parent component.
+    """
+    ancestors = relative_path.parts[:-1]
+    return "/".join(ancestors) if ancestors else ""
+
+
+def scan_files_to_items(*, batch: FileSummaryBatch, roots: list[Path]) -> list[FileSummaryItem]:
+    """Create one ``FileSummaryItem`` per file found under ``roots``.
+
+    A root that is itself a file is taken as-is, relative to its parent
+    directory. A directory root is walked recursively in sorted order,
+    skipping files whose name starts with ``.`` and files of size zero.
+
+    Args:
+        batch: Batch the items belong to; its ``total_files``,
+            ``supported_files`` and ``unsupported_files`` fields are updated
+            and saved.
+        roots: Files and/or directories to scan.
+
+    Returns:
+        The created items, numbered from 1 via ``file_index`` in discovery
+        order.
+    """
+    candidates: list[tuple[Path, Path]] = []
+    for entry in roots:
+        base = Path(entry)
+        if base.is_file():
+            # Single-file roots bypass the hidden/empty filter below.
+            candidates.append((base.parent, base))
+            continue
+        discovered = sorted(child for child in base.rglob("*") if child.is_file())
+        candidates.extend(
+            (base, child)
+            for child in discovered
+            if not (child.name.startswith(".") or child.stat().st_size == 0)
+        )
+
+    created: list[FileSummaryItem] = []
+    for position, (base, file_path) in enumerate(candidates, start=1):
+        relative_posix = file_path.relative_to(base).as_posix()
+        extension = file_path.suffix.lower().lstrip(".")
+        created.append(
+            FileSummaryItem.objects.create(
+                batch=batch,
+                file_index=position,
+                directory_level=_directory_level(Path(relative_posix)),
+                file_name=file_path.name,
+                file_type=extension,
+                relative_path=relative_posix,
+                storage_path=str(file_path),
+                statistics_status=FileSummaryItem.StatisticsStatus.SKIPPED,
+            )
+        )
+
+    supported_count = len([item for item in created if item.file_type in SUPPORTED_EXTENSIONS])
+    batch.total_files = len(created)
+    batch.supported_files = supported_count
+    batch.unsupported_files = len(created) - supported_count
+    batch.save(update_fields=["total_files", "supported_files", "unsupported_files"])
+    return created
--- a/review_agent/file_summary/services/page_count.py
+++ b/review_agent/file_summary/services/page_count.py
@@ -0,0 +1,59 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+
+
+# Document suffixes (lower-case, without the leading dot) that
+# count_document_pages attempts to inspect; anything else is reported as
+# "unsupported". The same set is also defined in services/inventory.py.
+SUPPORTED_EXTENSIONS = {"pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx"}
+
+
+@dataclass(frozen=True)
+class PageCountResult:
+    """Immutable outcome of one page-count attempt on a document."""
+
+    # Outcome label; count_document_pages produces "success", "uncertain",
+    # "unsupported" and "failed".
+    status: str
+    # Number of pages (or sheets/slides) when one was determined, else None.
+    page_count: int | None = None
+    # Failure detail; stays empty unless the attempt failed.
+    error_message: str = ""
+
+
+def count_document_pages(path: str | Path) -> PageCountResult:
+    """Determine the page count of a document from its file suffix.
+
+    The count means different things per format: PDF pages, DOCX page
+    metadata, number of worksheets for xls/xlsx, number of slides for pptx.
+    Legacy ``doc``/``ppt`` files are only checked for being OLE containers.
+
+    Args:
+        path: Location of the document.
+
+    Returns:
+        A ``PageCountResult`` whose status is ``"success"`` with a count,
+        ``"uncertain"`` when the file is readable but no count is available,
+        ``"unsupported"`` for suffixes outside ``SUPPORTED_EXTENSIONS``, or
+        ``"failed"`` with the exception text when reading raised. Exceptions
+        from the parsers (including a missing parser package) are never
+        propagated.
+    """
+    file_path = Path(path)
+    ext = file_path.suffix.lower().lstrip(".")
+    if ext not in SUPPORTED_EXTENSIONS:
+        return PageCountResult(status="unsupported")
+
+    # Parser libraries are imported lazily inside the try block so that a
+    # missing optional dependency is reported as a "failed" result.
+    try:
+        if ext == "pdf":
+            from pypdf import PdfReader
+
+            return PageCountResult(status="success", page_count=len(PdfReader(str(file_path)).pages))
+        if ext == "docx":
+            from docx import Document
+
+            properties = Document(str(file_path)).core_properties
+            # NOTE(review): confirm that python-docx core properties expose a
+            # ``pages`` attribute; if they do not, this always yields "uncertain".
+            pages = getattr(properties, "pages", None)
+            if pages:
+                return PageCountResult(status="success", page_count=pages)
+            return PageCountResult(status="uncertain")
+        if ext == "xlsx":
+            from openpyxl import load_workbook
+
+            workbook = load_workbook(str(file_path), read_only=True, data_only=True)
+            try:
+                return PageCountResult(status="success", page_count=len(workbook.sheetnames))
+            finally:
+                # Read-only workbooks keep the file open until closed explicitly.
+                workbook.close()
+        if ext == "xls":
+            import xlrd
+
+            workbook = xlrd.open_workbook(str(file_path), on_demand=True)
+            try:
+                return PageCountResult(status="success", page_count=workbook.nsheets)
+            finally:
+                # on_demand mode holds resources until they are released.
+                workbook.release_resources()
+        if ext == "pptx":
+            from pptx import Presentation
+
+            return PageCountResult(status="success", page_count=len(Presentation(str(file_path)).slides))
+        if ext in {"doc", "ppt"}:
+            import olefile
+
+            if olefile.isOleFile(str(file_path)):
+                return PageCountResult(status="uncertain")
+            return PageCountResult(status="failed", error_message="不是有效的 OLE 文件。")
+    except Exception as exc:
+        return PageCountResult(status="failed", error_message=str(exc))
+
+    # Fallback for a supported suffix that no branch above handles.
+    return PageCountResult(status="uncertain")
--- a/review_agent/file_summary/services/product_detect.py
+++ b/review_agent/file_summary/services/product_detect.py
@@ -0,0 +1,31 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from review_agent.models import FileSummaryBatch
+
+
+def detect_product_name(batch: FileSummaryBatch) -> str:
+    """Infer a product name for ``batch`` from its items and persist it.
+
+    Items are visited in ``file_index`` order. The first item that yields a
+    name wins: an item located in a subdirectory contributes its top-level
+    directory name; an item at the root contributes its file stem when the
+    stem contains one of the keywords "产品", "试剂盒" or "说明书".
+
+    When a name is found it is saved on the batch, and the conversation title
+    is replaced with ``"<name>-文件汇总"`` if it still starts with "新对话".
+
+    Args:
+        batch: Batch whose items are inspected and whose ``product_name`` is
+            updated.
+
+    Returns:
+        The detected product name, or an empty string when none was found
+        (in which case nothing is saved).
+    """
+    keywords = ("产品", "试剂盒", "说明书")
+    detected = ""
+    for item in batch.items.order_by("file_index"):
+        path_parts = Path(item.relative_path).parts
+        if len(path_parts) > 1:
+            detected = path_parts[0]
+        else:
+            stem = Path(item.file_name).stem
+            if any(keyword in stem for keyword in keywords):
+                detected = stem
+        if detected:
+            break
+
+    if not detected:
+        return ""
+
+    batch.product_name = detected
+    batch.save(update_fields=["product_name"])
+    if batch.conversation.title.startswith("新对话"):
+        batch.conversation.title = f"{detected}-文件汇总"
+        batch.conversation.save(update_fields=["title", "updated_at"])
+    return detected