feat(file-summary): 实现文件处理技能链路

2026-06-06 01:20:26 +08:00
parent 51e7c0c007
commit 18d045d487
19 changed files with 604 additions and 9 deletions
--- a/review_agent/file_summary/paths.py
+++ b/review_agent/file_summary/paths.py
@@ -0,0 +1,12 @@
 from __future__ import annotations
 from pathlib import Path
 from django.conf import settings
 def resolve_storage_path(storage_path: str) -> Path:
    """Turn a stored attachment path into a filesystem path.

    Absolute paths are returned unchanged; relative paths are anchored at
    ``settings.MEDIA_ROOT``.
    """
    candidate = Path(storage_path)
    return candidate if candidate.is_absolute() else Path(settings.MEDIA_ROOT) / candidate
--- a/review_agent/file_summary/services/init.py
+++ b/review_agent/file_summary/services/init.py
@@ -0,0 +1 @@
--- a/review_agent/file_summary/services/archive.py
+++ b/review_agent/file_summary/services/archive.py
@@ -0,0 +1,77 @@
 from __future__ import annotations
 import subprocess
 from pathlib import Path
 from zipfile import ZipFile
 import py7zr
 ARCHIVE_EXTENSIONS = {"zip", "7z", "rar"}
 def _ensure_inside_target(path: Path, target_dir: Path) -> None:
    target = target_dir.resolve()
    resolved = path.resolve()
    if target != resolved and target not in resolved.parents:
        raise ValueError("解压路径必须位于批次工作目录内。")
 def _safe_member_path(target_dir: Path, member_name: str) -> Path:
    """Map an archive member name to its destination under ``target_dir``.

    The joined path is validated with ``_ensure_inside_target``, so a
    ``ValueError`` propagates when the member would land outside the target
    directory.
    """
    member_destination = target_dir.joinpath(member_name)
    _ensure_inside_target(member_destination, target_dir)
    return member_destination
 def extract_archive(archive_path: str | Path, target_dir: str | Path) -> list[Path]:
    """Extract ``archive_path`` into ``target_dir`` and return the extracted files.

    The target directory is created (including parents) before the extension
    is inspected. Files whose extension is not in ``ARCHIVE_EXTENSIONS``
    yield an empty list; otherwise the format-specific extractor is used.
    """
    archive_file = Path(archive_path)
    destination_root = Path(target_dir)
    destination_root.mkdir(parents=True, exist_ok=True)

    extension = archive_file.suffix.lower().lstrip(".")
    if extension not in ARCHIVE_EXTENSIONS:
        return []

    # Any supported extension without a dedicated entry falls through to the rar extractor.
    extractors = {"zip": _extract_zip, "7z": _extract_7z}
    extractor = extractors.get(extension, _extract_rar)
    return extractor(archive_file, destination_root)
 def _extract_zip(archive_path: Path, target_dir: Path) -> list[Path]:
    """Extract a zip archive member by member into ``target_dir``.

    Every member name is validated with ``_safe_member_path`` before anything
    is written for it, so a path-traversal entry raises ``ValueError``.
    Directory members are created as directories; file members are streamed
    to disk in chunks rather than being loaded into memory in one piece.

    Returns:
        The destination paths of the extracted file members, in archive order.
    """
    import shutil  # local import: the module header does not import shutil

    extracted: list[Path] = []
    with ZipFile(archive_path) as archive:
        for member in archive.infolist():
            destination = _safe_member_path(target_dir, member.filename)
            if member.is_dir():
                destination.mkdir(parents=True, exist_ok=True)
                continue
            destination.parent.mkdir(parents=True, exist_ok=True)
            with archive.open(member) as source, destination.open("wb") as target:
                # Chunked copy keeps memory bounded for large members.
                shutil.copyfileobj(source, target)
            extracted.append(destination)
    return extracted
 def _extract_7z(archive_path: Path, target_dir: Path) -> list[Path]:
    """Extract a 7z archive into ``target_dir``.

    All member names are validated with ``_safe_member_path`` before the
    archive is extracted in one call. The result lists the member paths that
    exist as regular files afterwards.
    """
    with py7zr.SevenZipFile(archive_path, mode="r") as archive:
        member_names = archive.getnames()
        # Validate every member up front; a ValueError aborts before extraction.
        for member_name in member_names:
            _safe_member_path(target_dir, member_name)
        archive.extractall(path=target_dir)

    extracted_files = []
    for member_name in member_names:
        candidate = target_dir / member_name
        if candidate.is_file():
            extracted_files.append(candidate)
    return extracted_files
 def _extract_rar(archive_path: Path, target_dir: Path) -> list[Path]:
    result = subprocess.run(
        ["7z", "x", f"-o{target_dir}", str(archive_path), "-y"],
        check=False,
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        raise RuntimeError(result.stderr or result.stdout or "rar 解压失败")
    extracted = [path for path in target_dir.rglob("*") if path.is_file()]
    for path in extracted:
        _ensure_inside_target(path, target_dir)
    return extracted
--- a/review_agent/file_summary/services/inventory.py
+++ b/review_agent/file_summary/services/inventory.py
@@ -0,0 +1,49 @@
 from __future__ import annotations
 from pathlib import Path
 from review_agent.models import FileSummaryBatch, FileSummaryItem
 SUPPORTED_EXTENSIONS = {"pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx"}
 def _directory_level(relative_path: Path) -> str:
    if len(relative_path.parts) <= 1:
        return ""
    return "/".join(relative_path.parts[:-1])
 def scan_files_to_items(*, batch: FileSummaryBatch, roots: list[Path]) -> list[FileSummaryItem]:
    """Create one ``FileSummaryItem`` per file found under ``roots``.

    A root that is itself a file is recorded relative to its parent
    directory. Directory roots are walked recursively in sorted order,
    skipping files whose name starts with ``.`` and files of size zero.
    Items are numbered from 1 in discovery order and start in the
    ``SKIPPED`` statistics status. The batch's total, supported and
    unsupported file counters are updated and saved.

    Returns:
        The created items, in discovery order.
    """
    discovered: list[tuple[Path, Path]] = []
    for raw_root in roots:
        base = Path(raw_root)
        if base.is_file():
            discovered.append((base.parent, base))
            continue
        regular_files = sorted(entry for entry in base.rglob("*") if entry.is_file())
        discovered.extend(
            (base, entry)
            for entry in regular_files
            if not (entry.name.startswith(".") or entry.stat().st_size == 0)
        )

    created: list[FileSummaryItem] = []
    for position, (base, file_path) in enumerate(discovered, start=1):
        relative = file_path.relative_to(base).as_posix()
        extension = file_path.suffix.lower().lstrip(".")
        created.append(
            FileSummaryItem.objects.create(
                batch=batch,
                file_index=position,
                directory_level=_directory_level(Path(relative)),
                file_name=file_path.name,
                file_type=extension,
                relative_path=relative,
                storage_path=str(file_path),
                statistics_status=FileSummaryItem.StatisticsStatus.SKIPPED,
            )
        )

    supported_total = len([item for item in created if item.file_type in SUPPORTED_EXTENSIONS])
    batch.total_files = len(created)
    batch.supported_files = supported_total
    batch.unsupported_files = len(created) - supported_total
    batch.save(update_fields=["total_files", "supported_files", "unsupported_files"])
    return created
--- a/review_agent/file_summary/services/page_count.py
+++ b/review_agent/file_summary/services/page_count.py
@@ -0,0 +1,59 @@
 from __future__ import annotations
 from dataclasses import dataclass
 from pathlib import Path
 SUPPORTED_EXTENSIONS = {"pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx"}
@dataclass(frozen=True)
 class PageCountResult:
    """Immutable outcome of one page-count attempt for a single document."""
    # Outcome label; this module produces "success", "uncertain", "unsupported" and "failed".
    status: str
    # Counted pages (sheets for spreadsheets, slides for presentations); None when no count was obtained.
    page_count: int | None = None
    # Error text for a "failed" result; empty for every other status.
    error_message: str = ""
 def count_document_pages(path: str | Path) -> PageCountResult:
    """Count the pages of a document, choosing the method by file extension.

    * ``pdf``  - number of pages reported by ``pypdf``.
    * ``docx`` - the ``pages`` value of the core properties when present and
      non-zero, otherwise ``uncertain``.
    * ``xlsx`` / ``xls`` - number of worksheets.
    * ``pptx`` - number of slides.
    * ``doc`` / ``ppt`` - ``uncertain`` for a valid OLE file, ``failed`` otherwise.

    Parser libraries are imported lazily inside each branch. Any exception
    raised while reading (including a missing library) is turned into a
    ``failed`` result carrying the exception text.

    Returns:
        A ``PageCountResult``; ``unsupported`` when the extension is not in
        ``SUPPORTED_EXTENSIONS``.
    """
    file_path = Path(path)
    ext = file_path.suffix.lower().lstrip(".")
    if ext not in SUPPORTED_EXTENSIONS:
        return PageCountResult(status="unsupported")
    try:
        if ext == "pdf":
            from pypdf import PdfReader
            return PageCountResult(status="success", page_count=len(PdfReader(str(file_path)).pages))
        if ext == "docx":
            from docx import Document
            properties = Document(str(file_path)).core_properties
            # NOTE(review): confirm the core properties object actually exposes "pages";
            # when it does not, getattr yields None and the result is "uncertain".
            pages = getattr(properties, "pages", None)
            if pages:
                return PageCountResult(status="success", page_count=pages)
            return PageCountResult(status="uncertain")
        if ext == "xlsx":
            from openpyxl import load_workbook
            workbook = load_workbook(str(file_path), read_only=True, data_only=True)
            try:
                sheet_total = len(workbook.sheetnames)
            finally:
                # Read-only workbooks keep the file open until closed explicitly.
                workbook.close()
            return PageCountResult(status="success", page_count=sheet_total)
        if ext == "xls":
            import xlrd
            workbook = xlrd.open_workbook(str(file_path), on_demand=True)
            try:
                sheet_total = workbook.nsheets
            finally:
                # on_demand workbooks hold resources until released.
                workbook.release_resources()
            return PageCountResult(status="success", page_count=sheet_total)
        if ext == "pptx":
            from pptx import Presentation
            return PageCountResult(status="success", page_count=len(Presentation(str(file_path)).slides))
        if ext in {"doc", "ppt"}:
            import olefile
            if olefile.isOleFile(str(file_path)):
                return PageCountResult(status="uncertain")
            return PageCountResult(status="failed", error_message="不是有效的 OLE 文件。")
    except Exception as exc:
        return PageCountResult(status="failed", error_message=str(exc))
    # Fallback for a supported extension that has no dedicated branch above.
    return PageCountResult(status="uncertain")
--- a/review_agent/file_summary/services/product_detect.py
+++ b/review_agent/file_summary/services/product_detect.py
@@ -0,0 +1,31 @@
 from __future__ import annotations
 from pathlib import Path
 from review_agent.models import FileSummaryBatch
 def detect_product_name(batch: FileSummaryBatch) -> str:
    """Derive a product name for ``batch`` from its inventoried files.

    Items are visited in ``file_index`` order. The first item that lives in
    a sub-directory supplies its top-level directory name; an item without a
    directory supplies its file stem when the stem contains one of the
    product keywords. The first match wins.

    When a name is found it is saved on the batch, and a conversation whose
    title still starts with "新对话" is retitled after the product.

    Returns:
        The detected product name, or an empty string when nothing matched
        (in which case nothing is saved).
    """
    keywords = ("产品", "试剂盒", "说明书")
    detected = ""
    for item in batch.items.order_by("file_index"):
        path_parts = Path(item.relative_path).parts
        if len(path_parts) > 1:
            detected = path_parts[0]
        else:
            stem = Path(item.file_name).stem
            if any(keyword in stem for keyword in keywords):
                detected = stem
        if detected:
            break

    if not detected:
        return ""

    batch.product_name = detected
    batch.save(update_fields=["product_name"])

    conversation = batch.conversation
    if conversation.title.startswith("新对话"):
        conversation.title = f"{detected}-文件汇总"
        conversation.save(update_fields=["title", "updated_at"])
    return detected
--- a/review_agent/file_summary/skills/init.py
+++ b/review_agent/file_summary/skills/init.py
@@ -0,0 +1 @@
--- a/review_agent/file_summary/skills/archive_extract.py
+++ b/review_agent/file_summary/skills/archive_extract.py
@@ -0,0 +1,26 @@
 from __future__ import annotations
 from pathlib import Path
 from review_agent.models import FileSummaryBatchAttachment
 from ..paths import resolve_storage_path
 from ..services.archive import ARCHIVE_EXTENSIONS, extract_archive
 from .base import BaseSkill, SkillResult, WorkflowContext
 class ArchiveExtractSkill(BaseSkill):
    """Workflow skill that unpacks the batch's archive attachments into its work directory."""

    name = "archive_extract"

    def run(self, context: WorkflowContext) -> SkillResult:
        """Extract every archive attachment bound to the batch.

        Attachments whose extension is not in ``ARCHIVE_EXTENSIONS`` are
        ignored. When the batch has no work directory nothing is extracted.

        Returns:
            A successful ``SkillResult`` whose ``data["extracted_count"]`` is
            the number of files reported by the extractors.
        """
        work_dir = context.batch.work_dir
        if not work_dir:
            # Test the raw value: Path("") becomes Path("."), which is always
            # truthy, so checking the Path would extract into the current directory.
            return SkillResult(success=True, data={"extracted_count": 0})
        target_dir = Path(work_dir)
        extracted_count = 0
        for binding in FileSummaryBatchAttachment.objects.filter(batch=context.batch):
            path = resolve_storage_path(binding.attachment.storage_path)
            if path.suffix.lower().lstrip(".") not in ARCHIVE_EXTENSIONS:
                continue
            extracted_count += len(extract_archive(path, target_dir))
        return SkillResult(success=True, data={"extracted_count": extracted_count})
--- a/review_agent/file_summary/skills/base.py
+++ b/review_agent/file_summary/skills/base.py
@@ -0,0 +1,24 @@
 from __future__ import annotations
 from dataclasses import dataclass, field
 from review_agent.models import FileSummaryBatch
@dataclass(frozen=True)
 class WorkflowContext:
    """Read-only bundle of state handed to a skill's ``run`` method."""
    # The batch the skill operates on.
    batch: FileSummaryBatch
@dataclass
 class SkillResult:
    """Outcome returned by a skill's ``run`` method."""
    # Whether the skill completed successfully.
    success: bool
    # Skill-specific payload (for example counters); a fresh empty dict per instance by default.
    data: dict = field(default_factory=dict)
    # Optional human-readable description, e.g. the reason for a failure.
    message: str = ""
 class BaseSkill:
    """Base class for workflow skills; subclasses set ``name`` and override ``run``."""
    # Identifier of the skill; empty on the base class.
    name = ""
    def run(self, context: WorkflowContext) -> SkillResult:
        """Execute the skill for ``context``; the base implementation always raises."""
        raise NotImplementedError
--- a/review_agent/file_summary/skills/document_page_count.py
+++ b/review_agent/file_summary/skills/document_page_count.py
@@ -0,0 +1,64 @@
 from __future__ import annotations
 from review_agent.models import FileSummaryItem
 from ..services.page_count import SUPPORTED_EXTENSIONS, count_document_pages
 from .base import BaseSkill, SkillResult, WorkflowContext
 class DocumentPageCountSkill(BaseSkill):
    """Workflow skill that counts pages for every inventoried file of a batch."""

    name = "document_page_count"

    def run(self, context: WorkflowContext) -> SkillResult:
        """Count pages per item, persist each outcome and update the batch totals.

        Items whose type is not in ``SUPPORTED_EXTENSIONS`` are marked
        unsupported without being opened. Every other item is counted, with
        up to three attempts while the result is "failed"; ``retry_count``
        records how many extra attempts were made.

        Returns:
            A successful ``SkillResult``; per-file failures are recorded on
            the items and in the batch counters rather than failing the skill.
        """
        batch = context.batch
        statuses = FileSummaryItem.StatisticsStatus
        totals = {
            "success_files": 0,
            "failed_files": 0,
            "unsupported_files": 0,
            "uncertain_files": 0,
            "total_pages": 0,
        }

        for item in batch.items.order_by("file_index"):
            if item.file_type not in SUPPORTED_EXTENSIONS:
                item.statistics_status = statuses.UNSUPPORTED
                totals["unsupported_files"] += 1
                item.save(update_fields=["statistics_status", "updated_at"])
                continue

            # First attempt, then at most two retries while the outcome is "failed".
            attempt = 1
            result = count_document_pages(item.storage_path)
            while result.status == "failed" and attempt < 3:
                attempt += 1
                result = count_document_pages(item.storage_path)

            item.retry_count = attempt - 1
            item.statistics_status = result.status
            item.page_count = result.page_count
            item.error_message = result.error_message
            item.save(
                update_fields=[
                    "statistics_status",
                    "page_count",
                    "retry_count",
                    "error_message",
                    "updated_at",
                ]
            )

            if result.status == statuses.SUCCESS:
                totals["success_files"] += 1
                totals["total_pages"] += result.page_count or 0
            elif result.status == statuses.UNCERTAIN:
                totals["uncertain_files"] += 1
            elif result.status == statuses.UNSUPPORTED:
                totals["unsupported_files"] += 1
            else:
                totals["failed_files"] += 1

        for field_name, value in totals.items():
            setattr(batch, field_name, value)
        batch.save(update_fields=list(totals))
        return SkillResult(success=True)
--- a/review_agent/file_summary/skills/file_inventory.py
+++ b/review_agent/file_summary/skills/file_inventory.py
@@ -0,0 +1,21 @@
 from __future__ import annotations
 from pathlib import Path
 from review_agent.models import FileSummaryBatchAttachment
 from ..paths import resolve_storage_path
 from ..services.inventory import scan_files_to_items
 from .base import BaseSkill, SkillResult, WorkflowContext
 class FileInventorySkill(BaseSkill):
    """Workflow skill that records the batch's files as inventory items."""

    name = "file_inventory"

    def run(self, context: WorkflowContext) -> SkillResult:
        """Scan the storage paths of the batch's attachments into items.

        Returns:
            A successful ``SkillResult`` whose ``data["total_files"]`` is the
            number of items created.
        """
        # NOTE(review): only attachment paths are used as scan roots; batch.work_dir
        # is not among them. Confirm whether the work directory should be scanned too.
        bindings = FileSummaryBatchAttachment.objects.filter(batch=context.batch)
        scan_roots = []
        for binding in bindings:
            scan_roots.append(resolve_storage_path(binding.attachment.storage_path))
        created_items = scan_files_to_items(batch=context.batch, roots=scan_roots)
        return SkillResult(success=True, data={"total_files": len(created_items)})
--- a/review_agent/file_summary/skills/product_detect.py
+++ b/review_agent/file_summary/skills/product_detect.py
@@ -0,0 +1,12 @@
 from __future__ import annotations
 from ..services.product_detect import detect_product_name
 from .base import BaseSkill, SkillResult, WorkflowContext
 class ProductDetectSkill(BaseSkill):
    """Workflow skill that detects the product name of a batch."""

    name = "product_detect"

    def run(self, context: WorkflowContext) -> SkillResult:
        """Run product detection and report the name (possibly empty) in ``data``."""
        detected = detect_product_name(context.batch)
        payload = {"product_name": detected}
        return SkillResult(success=True, data=payload)
--- a/review_agent/file_summary/skills/registry.py
+++ b/review_agent/file_summary/skills/registry.py
@@ -0,0 +1,22 @@
 from __future__ import annotations
 from .base import BaseSkill, SkillResult, WorkflowContext
 class SkillRegistry:
    """Name-indexed collection of workflow skills."""

    def __init__(self):
        self._skills: dict[str, BaseSkill] = {}

    def register(self, skill: BaseSkill) -> None:
        """Store ``skill`` under its name, replacing any earlier entry.

        Raises:
            ValueError: If the skill's ``name`` is empty.
        """
        skill_name = skill.name
        if not skill_name:
            raise ValueError("Skill 必须声明 name。")
        self._skills[skill_name] = skill

    def get(self, name: str) -> BaseSkill:
        """Return the skill registered as ``name``.

        Raises:
            KeyError: If no skill is registered under that name.
        """
        try:
            skill = self._skills[name]
        except KeyError as missing:
            raise KeyError(f"Skill 未注册：{name}") from missing
        return skill

    def execute(self, name: str, context: WorkflowContext) -> SkillResult:
        """Look up the skill called ``name`` and run it with ``context``."""
        skill = self.get(name)
        return skill.run(context)
--- a/review_agent/file_summary/workflow.py
+++ b/review_agent/file_summary/workflow.py
@@ -16,19 +16,34 @@ from review_agent.models import (
 )
 from .events import record_event
 from .skills.archive_extract import ArchiveExtractSkill
 from .skills.base import WorkflowContext
 from .skills.document_page_count import DocumentPageCountSkill
 from .skills.file_inventory import FileInventorySkill
 from .skills.product_detect import ProductDetectSkill
 from .skills.registry import SkillRegistry
 NODE_DEFINITIONS = [
-    ("upload", "附件固化"),
+    ("upload", "附件固化", ""),
-    ("extract", "压缩包解压"),
+    ("extract", "压缩包解压", "archive_extract"),
-    ("inventory", "文件扫描"),
+    ("inventory", "文件扫描", "file_inventory"),
-    ("page_count", "页数统计"),
+    ("page_count", "页数统计", "document_page_count"),
-    ("product_detect", "产品识别"),
+    ("product_detect", "产品识别", "product_detect"),
-    ("report", "报告输出"),
+    ("report", "报告输出", ""),
-    ("complete", "完成"),
+    ("complete", "完成", ""),
 ]
 def default_skill_registry() -> SkillRegistry:
    """Build a registry pre-loaded with the four built-in file-summary skills."""
    registry = SkillRegistry()
    builtin_skills = (
        ArchiveExtractSkill,
        FileInventorySkill,
        DocumentPageCountSkill,
        ProductDetectSkill,
    )
    for skill_cls in builtin_skills:
        registry.register(skill_cls())
    return registry
 def build_batch_no() -> str:
    """Compose a batch number of the form ``FS-<local timestamp>-<6 hex chars>``."""
    timestamp = timezone.localtime().strftime("%Y%m%d%H%M%S")
    random_suffix = uuid4().hex[:6]
    return f"FS-{timestamp}-{random_suffix}"
@@ -61,7 +76,7 @@ def create_file_summary_batch(
        attachment.upload_status = FileAttachment.UploadStatus.BOUND
        attachment.save(update_fields=["upload_status"])
-    for code, name in NODE_DEFINITIONS:
+    for code, name, _skill_name in NODE_DEFINITIONS:
        WorkflowNodeRun.objects.create(batch=batch, node_code=code, node_name=name)
    record_event(batch, "workflow_created", {"batch_id": batch.pk, "batch_no": batch.batch_no})
@@ -69,8 +84,9 @@ def create_file_summary_batch(
 class WorkflowExecutor:
-    def __init__(self, batch: FileSummaryBatch):
+    def __init__(self, batch: FileSummaryBatch, registry: SkillRegistry | None = None):
        self.batch = batch
        self.registry = registry or default_skill_registry()
    def run(self) -> None:
        self.batch.status = FileSummaryBatch.Status.RUNNING
@@ -107,6 +123,15 @@ class WorkflowExecutor:
            {"node_code": node.node_code, "status": node.status, "progress": node.progress},
        )
        skill_name = next(
            (skill for code, _name, skill in NODE_DEFINITIONS if code == node.node_code),
            "",
        )
        if skill_name:
            result = self.registry.execute(skill_name, WorkflowContext(batch=self.batch))
            if not result.success:
                raise RuntimeError(result.message or f"{node.node_name}执行失败")
        node.status = WorkflowNodeRun.Status.SUCCESS
        node.progress = 100
        node.finished_at = timezone.now()
--- a/tests/test_file_summary_archive.py
+++ b/tests/test_file_summary_archive.py
@@ -0,0 +1,25 @@
 from zipfile import ZipFile
 import pytest
 from review_agent.file_summary.services.archive import extract_archive
 def test_extract_zip_preserves_safe_paths(tmp_path):
    """A zip member inside a sub-directory is extracted to the same relative location."""
    source_zip = tmp_path / "safe.zip"
    with ZipFile(source_zip, "w") as zip_file:
        zip_file.writestr("dir/a.txt", "content")
    output_dir = tmp_path / "out"
    expected_file = output_dir / "dir" / "a.txt"

    result = extract_archive(source_zip, output_dir)

    assert result == [expected_file]
    assert expected_file.read_text(encoding="utf-8") == "content"
 def test_extract_zip_rejects_path_traversal(tmp_path):
    """A zip member that climbs out of the target directory raises ``ValueError``."""
    malicious_zip = tmp_path / "evil.zip"
    with ZipFile(malicious_zip, "w") as zip_file:
        zip_file.writestr("../evil.txt", "bad")
    output_dir = tmp_path / "out"

    with pytest.raises(ValueError):
        extract_archive(malicious_zip, output_dir)
--- a/tests/test_file_summary_inventory.py
+++ b/tests/test_file_summary_inventory.py
@@ -0,0 +1,24 @@
 from pathlib import Path
 import pytest
 from review_agent.file_summary.services.inventory import scan_files_to_items
 from review_agent.models import Conversation, FileSummaryBatch, FileSummaryItem
 pytestmark = pytest.mark.django_db
 def test_scan_files_to_items_preserves_relative_paths(tmp_path, django_user_model):
    """Scanning a directory root yields items with root-relative POSIX paths."""
    work_root = tmp_path / "work"
    nested_dir = work_root / "a"
    nested_dir.mkdir(parents=True)
    (nested_dir / "one.pdf").write_bytes(b"pdf")
    (work_root / "two.txt").write_text("x", encoding="utf-8")
    owner = django_user_model.objects.create_user(username="owner", password="pass")
    conversation = Conversation.objects.create(user=owner, title="会话")
    batch = FileSummaryBatch.objects.create(conversation=conversation, user=owner, batch_no="FS-I")

    scanned = scan_files_to_items(batch=batch, roots=[work_root])

    relative_paths = [item.relative_path for item in scanned]
    assert relative_paths == ["a/one.pdf", "two.txt"]
    assert FileSummaryItem.objects.filter(batch=batch).count() == 2
    assert scanned[0].statistics_status == FileSummaryItem.StatisticsStatus.SKIPPED
--- a/tests/test_file_summary_page_count.py
+++ b/tests/test_file_summary_page_count.py
@@ -0,0 +1,66 @@
 import pytest
 from docx import Document
 from openpyxl import Workbook
 from pptx import Presentation
 from review_agent.file_summary.services.page_count import count_document_pages
 from review_agent.file_summary.skills.document_page_count import DocumentPageCountSkill
 from review_agent.file_summary.skills.base import WorkflowContext
 from review_agent.models import Conversation, FileSummaryBatch, FileSummaryItem
 pytestmark = pytest.mark.django_db
 def test_count_document_pages_for_office_formats(tmp_path):
    """Freshly generated docx, xlsx and pptx files are counted as expected."""
    docx_path = tmp_path / "a.docx"
    xlsx_path = tmp_path / "a.xlsx"
    pptx_path = tmp_path / "a.pptx"

    Document().save(docx_path)

    # Default sheet plus one extra sheet gives two sheets.
    two_sheet_book = Workbook()
    two_sheet_book.create_sheet("第二页")
    two_sheet_book.save(xlsx_path)

    single_slide_deck = Presentation()
    single_slide_deck.slides.add_slide(single_slide_deck.slide_layouts[6])
    single_slide_deck.save(pptx_path)

    docx_result = count_document_pages(docx_path)
    assert docx_result.status in {"success", "uncertain"}
    xlsx_result = count_document_pages(xlsx_path)
    assert xlsx_result.page_count == 2
    pptx_result = count_document_pages(pptx_path)
    assert pptx_result.page_count == 1
 def test_document_page_count_skill_marks_unsupported_and_success(tmp_path, django_user_model):
    """The skill marks an xlsx item as success and a txt item as unsupported."""
    xlsx_path = tmp_path / "a.xlsx"
    Workbook().save(xlsx_path)
    txt_path = tmp_path / "a.txt"
    txt_path.write_text("x", encoding="utf-8")

    owner = django_user_model.objects.create_user(username="owner", password="pass")
    conversation = Conversation.objects.create(user=owner, title="会话")
    batch = FileSummaryBatch.objects.create(conversation=conversation, user=owner, batch_no="FS-P")

    sources = (("xlsx", xlsx_path), ("txt", txt_path))
    items_by_type = {
        file_type: FileSummaryItem.objects.create(
            batch=batch,
            file_index=position,
            file_name=source.name,
            file_type=file_type,
            relative_path=source.name,
            storage_path=str(source),
        )
        for position, (file_type, source) in enumerate(sources, start=1)
    }

    outcome = DocumentPageCountSkill().run(WorkflowContext(batch=batch))

    for item in items_by_type.values():
        item.refresh_from_db()
    assert outcome.success is True
    assert items_by_type["xlsx"].statistics_status == FileSummaryItem.StatisticsStatus.SUCCESS
    assert items_by_type["txt"].statistics_status == FileSummaryItem.StatisticsStatus.UNSUPPORTED
--- a/tests/test_file_summary_product_detect.py
+++ b/tests/test_file_summary_product_detect.py
@@ -0,0 +1,29 @@
 import pytest
 from review_agent.file_summary.services.product_detect import detect_product_name
 from review_agent.models import Conversation, FileSummaryBatch, FileSummaryItem
 pytestmark = pytest.mark.django_db
 def test_detect_product_name_from_top_level_directory(django_user_model):
    """The top-level directory becomes the product name and retitles a new conversation."""
    expected_name = "甲型试剂盒"
    owner = django_user_model.objects.create_user(username="owner", password="pass")
    conversation = Conversation.objects.create(user=owner, title="新对话 06-06")
    batch = FileSummaryBatch.objects.create(conversation=conversation, user=owner, batch_no="FS-D")
    item_fields = {
        "batch": batch,
        "file_index": 1,
        "file_name": "说明书.docx",
        "file_type": "docx",
        "relative_path": "甲型试剂盒/说明书.docx",
        "storage_path": "x",
    }
    FileSummaryItem.objects.create(**item_fields)

    detected = detect_product_name(batch)

    batch.refresh_from_db()
    conversation.refresh_from_db()
    assert detected == expected_name
    assert batch.product_name == expected_name
    assert conversation.title == "甲型试剂盒-文件汇总"
--- a/tests/test_file_summary_skills.py
+++ b/tests/test_file_summary_skills.py
@@ -0,0 +1,27 @@
 import pytest
 from review_agent.file_summary.skills.base import BaseSkill, SkillResult, WorkflowContext
 from review_agent.file_summary.skills.registry import SkillRegistry
 class EchoSkill(BaseSkill):
    """Minimal skill for the registry test; reports the batch id it was given."""

    name = "echo"

    def run(self, context):
        payload = {"batch_id": context.batch.id}
        return SkillResult(success=True, data=payload)
@pytest.mark.django_db
def test_skill_registry_executes_registered_skill(django_user_model):
    """A registered skill is found by name and run with the supplied context."""
    from review_agent.models import Conversation, FileSummaryBatch

    owner = django_user_model.objects.create_user(username="owner", password="pass")
    conversation = Conversation.objects.create(user=owner, title="会话")
    batch = FileSummaryBatch.objects.create(conversation=conversation, user=owner, batch_no="FS-X")
    skills = SkillRegistry()
    skills.register(EchoSkill())

    outcome = skills.execute("echo", WorkflowContext(batch=batch))

    assert outcome.success is True
    assert outcome.data == {"batch_id": batch.id}