feat(file-summary): 实现文件处理技能链路
This commit is contained in:
12
review_agent/file_summary/paths.py
Normal file
12
review_agent/file_summary/paths.py
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from django.conf import settings
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_storage_path(storage_path: str) -> Path:
|
||||||
|
path = Path(storage_path)
|
||||||
|
if path.is_absolute():
|
||||||
|
return path
|
||||||
|
return Path(settings.MEDIA_ROOT) / path
|
||||||
1
review_agent/file_summary/services/__init__.py
Normal file
1
review_agent/file_summary/services/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
|
||||||
77
review_agent/file_summary/services/archive.py
Normal file
77
review_agent/file_summary/services/archive.py
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
from zipfile import ZipFile
|
||||||
|
|
||||||
|
import py7zr
|
||||||
|
|
||||||
|
|
||||||
|
ARCHIVE_EXTENSIONS = {"zip", "7z", "rar"}
|
||||||
|
|
||||||
|
|
||||||
|
def _ensure_inside_target(path: Path, target_dir: Path) -> None:
|
||||||
|
target = target_dir.resolve()
|
||||||
|
resolved = path.resolve()
|
||||||
|
if target != resolved and target not in resolved.parents:
|
||||||
|
raise ValueError("解压路径必须位于批次工作目录内。")
|
||||||
|
|
||||||
|
|
||||||
|
def _safe_member_path(target_dir: Path, member_name: str) -> Path:
|
||||||
|
destination = target_dir / member_name
|
||||||
|
_ensure_inside_target(destination, target_dir)
|
||||||
|
return destination
|
||||||
|
|
||||||
|
|
||||||
|
def extract_archive(archive_path: str | Path, target_dir: str | Path) -> list[Path]:
|
||||||
|
archive_path = Path(archive_path)
|
||||||
|
target_dir = Path(target_dir)
|
||||||
|
target_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
ext = archive_path.suffix.lower().lstrip(".")
|
||||||
|
if ext not in ARCHIVE_EXTENSIONS:
|
||||||
|
return []
|
||||||
|
|
||||||
|
if ext == "zip":
|
||||||
|
return _extract_zip(archive_path, target_dir)
|
||||||
|
if ext == "7z":
|
||||||
|
return _extract_7z(archive_path, target_dir)
|
||||||
|
return _extract_rar(archive_path, target_dir)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_zip(archive_path: Path, target_dir: Path) -> list[Path]:
|
||||||
|
extracted: list[Path] = []
|
||||||
|
with ZipFile(archive_path) as archive:
|
||||||
|
for member in archive.infolist():
|
||||||
|
destination = _safe_member_path(target_dir, member.filename)
|
||||||
|
if member.is_dir():
|
||||||
|
destination.mkdir(parents=True, exist_ok=True)
|
||||||
|
continue
|
||||||
|
destination.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
with archive.open(member) as source, destination.open("wb") as target:
|
||||||
|
target.write(source.read())
|
||||||
|
extracted.append(destination)
|
||||||
|
return extracted
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_7z(archive_path: Path, target_dir: Path) -> list[Path]:
|
||||||
|
with py7zr.SevenZipFile(archive_path, mode="r") as archive:
|
||||||
|
names = archive.getnames()
|
||||||
|
for name in names:
|
||||||
|
_safe_member_path(target_dir, name)
|
||||||
|
archive.extractall(path=target_dir)
|
||||||
|
return [target_dir / name for name in names if (target_dir / name).is_file()]
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_rar(archive_path: Path, target_dir: Path) -> list[Path]:
|
||||||
|
result = subprocess.run(
|
||||||
|
["7z", "x", f"-o{target_dir}", str(archive_path), "-y"],
|
||||||
|
check=False,
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
)
|
||||||
|
if result.returncode != 0:
|
||||||
|
raise RuntimeError(result.stderr or result.stdout or "rar 解压失败")
|
||||||
|
extracted = [path for path in target_dir.rglob("*") if path.is_file()]
|
||||||
|
for path in extracted:
|
||||||
|
_ensure_inside_target(path, target_dir)
|
||||||
|
return extracted
|
||||||
49
review_agent/file_summary/services/inventory.py
Normal file
49
review_agent/file_summary/services/inventory.py
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from review_agent.models import FileSummaryBatch, FileSummaryItem
|
||||||
|
|
||||||
|
|
||||||
|
SUPPORTED_EXTENSIONS = {"pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx"}
|
||||||
|
|
||||||
|
|
||||||
|
def _directory_level(relative_path: Path) -> str:
|
||||||
|
if len(relative_path.parts) <= 1:
|
||||||
|
return ""
|
||||||
|
return "/".join(relative_path.parts[:-1])
|
||||||
|
|
||||||
|
|
||||||
|
def scan_files_to_items(*, batch: FileSummaryBatch, roots: list[Path]) -> list[FileSummaryItem]:
|
||||||
|
files: list[tuple[Path, Path]] = []
|
||||||
|
for root in roots:
|
||||||
|
root = Path(root)
|
||||||
|
if root.is_file():
|
||||||
|
files.append((root.parent, root))
|
||||||
|
continue
|
||||||
|
for path in sorted(item for item in root.rglob("*") if item.is_file()):
|
||||||
|
if path.name.startswith(".") or path.stat().st_size == 0:
|
||||||
|
continue
|
||||||
|
files.append((root, path))
|
||||||
|
|
||||||
|
created: list[FileSummaryItem] = []
|
||||||
|
for index, (root, path) in enumerate(files, start=1):
|
||||||
|
relative = path.relative_to(root).as_posix()
|
||||||
|
file_type = path.suffix.lower().lstrip(".")
|
||||||
|
item = FileSummaryItem.objects.create(
|
||||||
|
batch=batch,
|
||||||
|
file_index=index,
|
||||||
|
directory_level=_directory_level(Path(relative)),
|
||||||
|
file_name=path.name,
|
||||||
|
file_type=file_type,
|
||||||
|
relative_path=relative,
|
||||||
|
storage_path=str(path),
|
||||||
|
statistics_status=FileSummaryItem.StatisticsStatus.SKIPPED,
|
||||||
|
)
|
||||||
|
created.append(item)
|
||||||
|
|
||||||
|
batch.total_files = len(created)
|
||||||
|
batch.supported_files = sum(1 for item in created if item.file_type in SUPPORTED_EXTENSIONS)
|
||||||
|
batch.unsupported_files = len(created) - batch.supported_files
|
||||||
|
batch.save(update_fields=["total_files", "supported_files", "unsupported_files"])
|
||||||
|
return created
|
||||||
59
review_agent/file_summary/services/page_count.py
Normal file
59
review_agent/file_summary/services/page_count.py
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
SUPPORTED_EXTENSIONS = {"pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx"}
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class PageCountResult:
|
||||||
|
status: str
|
||||||
|
page_count: int | None = None
|
||||||
|
error_message: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
def count_document_pages(path: str | Path) -> PageCountResult:
|
||||||
|
file_path = Path(path)
|
||||||
|
ext = file_path.suffix.lower().lstrip(".")
|
||||||
|
if ext not in SUPPORTED_EXTENSIONS:
|
||||||
|
return PageCountResult(status="unsupported")
|
||||||
|
|
||||||
|
try:
|
||||||
|
if ext == "pdf":
|
||||||
|
from pypdf import PdfReader
|
||||||
|
|
||||||
|
return PageCountResult(status="success", page_count=len(PdfReader(str(file_path)).pages))
|
||||||
|
if ext == "docx":
|
||||||
|
from docx import Document
|
||||||
|
|
||||||
|
properties = Document(str(file_path)).core_properties
|
||||||
|
pages = getattr(properties, "pages", None)
|
||||||
|
if pages:
|
||||||
|
return PageCountResult(status="success", page_count=pages)
|
||||||
|
return PageCountResult(status="uncertain")
|
||||||
|
if ext == "xlsx":
|
||||||
|
from openpyxl import load_workbook
|
||||||
|
|
||||||
|
workbook = load_workbook(str(file_path), read_only=True, data_only=True)
|
||||||
|
return PageCountResult(status="success", page_count=len(workbook.sheetnames))
|
||||||
|
if ext == "xls":
|
||||||
|
import xlrd
|
||||||
|
|
||||||
|
workbook = xlrd.open_workbook(str(file_path), on_demand=True)
|
||||||
|
return PageCountResult(status="success", page_count=workbook.nsheets)
|
||||||
|
if ext == "pptx":
|
||||||
|
from pptx import Presentation
|
||||||
|
|
||||||
|
return PageCountResult(status="success", page_count=len(Presentation(str(file_path)).slides))
|
||||||
|
if ext in {"doc", "ppt"}:
|
||||||
|
import olefile
|
||||||
|
|
||||||
|
if olefile.isOleFile(str(file_path)):
|
||||||
|
return PageCountResult(status="uncertain")
|
||||||
|
return PageCountResult(status="failed", error_message="不是有效的 OLE 文件。")
|
||||||
|
except Exception as exc:
|
||||||
|
return PageCountResult(status="failed", error_message=str(exc))
|
||||||
|
|
||||||
|
return PageCountResult(status="uncertain")
|
||||||
31
review_agent/file_summary/services/product_detect.py
Normal file
31
review_agent/file_summary/services/product_detect.py
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from review_agent.models import FileSummaryBatch
|
||||||
|
|
||||||
|
|
||||||
|
def detect_product_name(batch: FileSummaryBatch) -> str:
|
||||||
|
product_name = ""
|
||||||
|
for item in batch.items.order_by("file_index"):
|
||||||
|
parts = Path(item.relative_path).parts
|
||||||
|
if len(parts) > 1:
|
||||||
|
product_name = parts[0]
|
||||||
|
break
|
||||||
|
name = Path(item.file_name).stem
|
||||||
|
for keyword in ("产品", "试剂盒", "说明书"):
|
||||||
|
if keyword in name:
|
||||||
|
product_name = name
|
||||||
|
break
|
||||||
|
if product_name:
|
||||||
|
break
|
||||||
|
|
||||||
|
if not product_name:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
batch.product_name = product_name
|
||||||
|
batch.save(update_fields=["product_name"])
|
||||||
|
if batch.conversation.title.startswith("新对话"):
|
||||||
|
batch.conversation.title = f"{product_name}-文件汇总"
|
||||||
|
batch.conversation.save(update_fields=["title", "updated_at"])
|
||||||
|
return product_name
|
||||||
1
review_agent/file_summary/skills/__init__.py
Normal file
1
review_agent/file_summary/skills/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
|
||||||
26
review_agent/file_summary/skills/archive_extract.py
Normal file
26
review_agent/file_summary/skills/archive_extract.py
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from review_agent.models import FileSummaryBatchAttachment
|
||||||
|
|
||||||
|
from ..paths import resolve_storage_path
|
||||||
|
from ..services.archive import ARCHIVE_EXTENSIONS, extract_archive
|
||||||
|
from .base import BaseSkill, SkillResult, WorkflowContext
|
||||||
|
|
||||||
|
|
||||||
|
class ArchiveExtractSkill(BaseSkill):
|
||||||
|
name = "archive_extract"
|
||||||
|
|
||||||
|
def run(self, context: WorkflowContext) -> SkillResult:
|
||||||
|
extracted_count = 0
|
||||||
|
target_dir = Path(context.batch.work_dir or "")
|
||||||
|
if not target_dir:
|
||||||
|
return SkillResult(success=True, data={"extracted_count": 0})
|
||||||
|
|
||||||
|
for binding in FileSummaryBatchAttachment.objects.filter(batch=context.batch):
|
||||||
|
path = resolve_storage_path(binding.attachment.storage_path)
|
||||||
|
if path.suffix.lower().lstrip(".") not in ARCHIVE_EXTENSIONS:
|
||||||
|
continue
|
||||||
|
extracted_count += len(extract_archive(path, target_dir))
|
||||||
|
return SkillResult(success=True, data={"extracted_count": extracted_count})
|
||||||
24
review_agent/file_summary/skills/base.py
Normal file
24
review_agent/file_summary/skills/base.py
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
from review_agent.models import FileSummaryBatch
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class WorkflowContext:
|
||||||
|
batch: FileSummaryBatch
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class SkillResult:
|
||||||
|
success: bool
|
||||||
|
data: dict = field(default_factory=dict)
|
||||||
|
message: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
class BaseSkill:
|
||||||
|
name = ""
|
||||||
|
|
||||||
|
def run(self, context: WorkflowContext) -> SkillResult:
|
||||||
|
raise NotImplementedError
|
||||||
64
review_agent/file_summary/skills/document_page_count.py
Normal file
64
review_agent/file_summary/skills/document_page_count.py
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from review_agent.models import FileSummaryItem
|
||||||
|
|
||||||
|
from ..services.page_count import SUPPORTED_EXTENSIONS, count_document_pages
|
||||||
|
from .base import BaseSkill, SkillResult, WorkflowContext
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentPageCountSkill(BaseSkill):
|
||||||
|
name = "document_page_count"
|
||||||
|
|
||||||
|
def run(self, context: WorkflowContext) -> SkillResult:
|
||||||
|
success_files = failed_files = unsupported_files = uncertain_files = total_pages = 0
|
||||||
|
for item in context.batch.items.order_by("file_index"):
|
||||||
|
if item.file_type not in SUPPORTED_EXTENSIONS:
|
||||||
|
item.statistics_status = FileSummaryItem.StatisticsStatus.UNSUPPORTED
|
||||||
|
unsupported_files += 1
|
||||||
|
item.save(update_fields=["statistics_status", "updated_at"])
|
||||||
|
continue
|
||||||
|
|
||||||
|
result = None
|
||||||
|
for attempt in range(1, 4):
|
||||||
|
result = count_document_pages(item.storage_path)
|
||||||
|
item.retry_count = attempt - 1
|
||||||
|
if result.status != "failed":
|
||||||
|
break
|
||||||
|
item.statistics_status = result.status
|
||||||
|
item.page_count = result.page_count
|
||||||
|
item.error_message = result.error_message
|
||||||
|
item.save(
|
||||||
|
update_fields=[
|
||||||
|
"statistics_status",
|
||||||
|
"page_count",
|
||||||
|
"retry_count",
|
||||||
|
"error_message",
|
||||||
|
"updated_at",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
if result.status == FileSummaryItem.StatisticsStatus.SUCCESS:
|
||||||
|
success_files += 1
|
||||||
|
total_pages += result.page_count or 0
|
||||||
|
elif result.status == FileSummaryItem.StatisticsStatus.UNCERTAIN:
|
||||||
|
uncertain_files += 1
|
||||||
|
elif result.status == FileSummaryItem.StatisticsStatus.UNSUPPORTED:
|
||||||
|
unsupported_files += 1
|
||||||
|
else:
|
||||||
|
failed_files += 1
|
||||||
|
|
||||||
|
context.batch.success_files = success_files
|
||||||
|
context.batch.failed_files = failed_files
|
||||||
|
context.batch.unsupported_files = unsupported_files
|
||||||
|
context.batch.uncertain_files = uncertain_files
|
||||||
|
context.batch.total_pages = total_pages
|
||||||
|
context.batch.save(
|
||||||
|
update_fields=[
|
||||||
|
"success_files",
|
||||||
|
"failed_files",
|
||||||
|
"unsupported_files",
|
||||||
|
"uncertain_files",
|
||||||
|
"total_pages",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
return SkillResult(success=True)
|
||||||
21
review_agent/file_summary/skills/file_inventory.py
Normal file
21
review_agent/file_summary/skills/file_inventory.py
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from review_agent.models import FileSummaryBatchAttachment
|
||||||
|
|
||||||
|
from ..paths import resolve_storage_path
|
||||||
|
from ..services.inventory import scan_files_to_items
|
||||||
|
from .base import BaseSkill, SkillResult, WorkflowContext
|
||||||
|
|
||||||
|
|
||||||
|
class FileInventorySkill(BaseSkill):
|
||||||
|
name = "file_inventory"
|
||||||
|
|
||||||
|
def run(self, context: WorkflowContext) -> SkillResult:
|
||||||
|
roots = [
|
||||||
|
resolve_storage_path(binding.attachment.storage_path)
|
||||||
|
for binding in FileSummaryBatchAttachment.objects.filter(batch=context.batch)
|
||||||
|
]
|
||||||
|
items = scan_files_to_items(batch=context.batch, roots=roots)
|
||||||
|
return SkillResult(success=True, data={"total_files": len(items)})
|
||||||
12
review_agent/file_summary/skills/product_detect.py
Normal file
12
review_agent/file_summary/skills/product_detect.py
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from ..services.product_detect import detect_product_name
|
||||||
|
from .base import BaseSkill, SkillResult, WorkflowContext
|
||||||
|
|
||||||
|
|
||||||
|
class ProductDetectSkill(BaseSkill):
|
||||||
|
name = "product_detect"
|
||||||
|
|
||||||
|
def run(self, context: WorkflowContext) -> SkillResult:
|
||||||
|
product_name = detect_product_name(context.batch)
|
||||||
|
return SkillResult(success=True, data={"product_name": product_name})
|
||||||
22
review_agent/file_summary/skills/registry.py
Normal file
22
review_agent/file_summary/skills/registry.py
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from .base import BaseSkill, SkillResult, WorkflowContext
|
||||||
|
|
||||||
|
|
||||||
|
class SkillRegistry:
|
||||||
|
def __init__(self):
|
||||||
|
self._skills: dict[str, BaseSkill] = {}
|
||||||
|
|
||||||
|
def register(self, skill: BaseSkill) -> None:
|
||||||
|
if not skill.name:
|
||||||
|
raise ValueError("Skill 必须声明 name。")
|
||||||
|
self._skills[skill.name] = skill
|
||||||
|
|
||||||
|
def get(self, name: str) -> BaseSkill:
|
||||||
|
try:
|
||||||
|
return self._skills[name]
|
||||||
|
except KeyError as exc:
|
||||||
|
raise KeyError(f"Skill 未注册:{name}") from exc
|
||||||
|
|
||||||
|
def execute(self, name: str, context: WorkflowContext) -> SkillResult:
|
||||||
|
return self.get(name).run(context)
|
||||||
@@ -16,19 +16,34 @@ from review_agent.models import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
from .events import record_event
|
from .events import record_event
|
||||||
|
from .skills.archive_extract import ArchiveExtractSkill
|
||||||
|
from .skills.base import WorkflowContext
|
||||||
|
from .skills.document_page_count import DocumentPageCountSkill
|
||||||
|
from .skills.file_inventory import FileInventorySkill
|
||||||
|
from .skills.product_detect import ProductDetectSkill
|
||||||
|
from .skills.registry import SkillRegistry
|
||||||
|
|
||||||
|
|
||||||
NODE_DEFINITIONS = [
|
NODE_DEFINITIONS = [
|
||||||
("upload", "附件固化"),
|
("upload", "附件固化", ""),
|
||||||
("extract", "压缩包解压"),
|
("extract", "压缩包解压", "archive_extract"),
|
||||||
("inventory", "文件扫描"),
|
("inventory", "文件扫描", "file_inventory"),
|
||||||
("page_count", "页数统计"),
|
("page_count", "页数统计", "document_page_count"),
|
||||||
("product_detect", "产品识别"),
|
("product_detect", "产品识别", "product_detect"),
|
||||||
("report", "报告输出"),
|
("report", "报告输出", ""),
|
||||||
("complete", "完成"),
|
("complete", "完成", ""),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def default_skill_registry() -> SkillRegistry:
|
||||||
|
registry = SkillRegistry()
|
||||||
|
registry.register(ArchiveExtractSkill())
|
||||||
|
registry.register(FileInventorySkill())
|
||||||
|
registry.register(DocumentPageCountSkill())
|
||||||
|
registry.register(ProductDetectSkill())
|
||||||
|
return registry
|
||||||
|
|
||||||
|
|
||||||
def build_batch_no() -> str:
|
def build_batch_no() -> str:
|
||||||
return f"FS-{timezone.localtime().strftime('%Y%m%d%H%M%S')}-{uuid4().hex[:6]}"
|
return f"FS-{timezone.localtime().strftime('%Y%m%d%H%M%S')}-{uuid4().hex[:6]}"
|
||||||
|
|
||||||
@@ -61,7 +76,7 @@ def create_file_summary_batch(
|
|||||||
attachment.upload_status = FileAttachment.UploadStatus.BOUND
|
attachment.upload_status = FileAttachment.UploadStatus.BOUND
|
||||||
attachment.save(update_fields=["upload_status"])
|
attachment.save(update_fields=["upload_status"])
|
||||||
|
|
||||||
for code, name in NODE_DEFINITIONS:
|
for code, name, _skill_name in NODE_DEFINITIONS:
|
||||||
WorkflowNodeRun.objects.create(batch=batch, node_code=code, node_name=name)
|
WorkflowNodeRun.objects.create(batch=batch, node_code=code, node_name=name)
|
||||||
|
|
||||||
record_event(batch, "workflow_created", {"batch_id": batch.pk, "batch_no": batch.batch_no})
|
record_event(batch, "workflow_created", {"batch_id": batch.pk, "batch_no": batch.batch_no})
|
||||||
@@ -69,8 +84,9 @@ def create_file_summary_batch(
|
|||||||
|
|
||||||
|
|
||||||
class WorkflowExecutor:
|
class WorkflowExecutor:
|
||||||
def __init__(self, batch: FileSummaryBatch):
|
def __init__(self, batch: FileSummaryBatch, registry: SkillRegistry | None = None):
|
||||||
self.batch = batch
|
self.batch = batch
|
||||||
|
self.registry = registry or default_skill_registry()
|
||||||
|
|
||||||
def run(self) -> None:
|
def run(self) -> None:
|
||||||
self.batch.status = FileSummaryBatch.Status.RUNNING
|
self.batch.status = FileSummaryBatch.Status.RUNNING
|
||||||
@@ -107,6 +123,15 @@ class WorkflowExecutor:
|
|||||||
{"node_code": node.node_code, "status": node.status, "progress": node.progress},
|
{"node_code": node.node_code, "status": node.status, "progress": node.progress},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
skill_name = next(
|
||||||
|
(skill for code, _name, skill in NODE_DEFINITIONS if code == node.node_code),
|
||||||
|
"",
|
||||||
|
)
|
||||||
|
if skill_name:
|
||||||
|
result = self.registry.execute(skill_name, WorkflowContext(batch=self.batch))
|
||||||
|
if not result.success:
|
||||||
|
raise RuntimeError(result.message or f"{node.node_name}执行失败")
|
||||||
|
|
||||||
node.status = WorkflowNodeRun.Status.SUCCESS
|
node.status = WorkflowNodeRun.Status.SUCCESS
|
||||||
node.progress = 100
|
node.progress = 100
|
||||||
node.finished_at = timezone.now()
|
node.finished_at = timezone.now()
|
||||||
|
|||||||
25
tests/test_file_summary_archive.py
Normal file
25
tests/test_file_summary_archive.py
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
from zipfile import ZipFile
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from review_agent.file_summary.services.archive import extract_archive
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_zip_preserves_safe_paths(tmp_path):
|
||||||
|
archive_path = tmp_path / "safe.zip"
|
||||||
|
with ZipFile(archive_path, "w") as archive:
|
||||||
|
archive.writestr("dir/a.txt", "content")
|
||||||
|
|
||||||
|
target = tmp_path / "out"
|
||||||
|
extracted = extract_archive(archive_path, target)
|
||||||
|
|
||||||
|
assert extracted == [target / "dir" / "a.txt"]
|
||||||
|
assert (target / "dir" / "a.txt").read_text(encoding="utf-8") == "content"
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_zip_rejects_path_traversal(tmp_path):
|
||||||
|
archive_path = tmp_path / "evil.zip"
|
||||||
|
with ZipFile(archive_path, "w") as archive:
|
||||||
|
archive.writestr("../evil.txt", "bad")
|
||||||
|
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
extract_archive(archive_path, tmp_path / "out")
|
||||||
24
tests/test_file_summary_inventory.py
Normal file
24
tests/test_file_summary_inventory.py
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from review_agent.file_summary.services.inventory import scan_files_to_items
|
||||||
|
from review_agent.models import Conversation, FileSummaryBatch, FileSummaryItem
|
||||||
|
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.django_db
|
||||||
|
|
||||||
|
|
||||||
|
def test_scan_files_to_items_preserves_relative_paths(tmp_path, django_user_model):
|
||||||
|
root = tmp_path / "work"
|
||||||
|
(root / "a").mkdir(parents=True)
|
||||||
|
(root / "a" / "one.pdf").write_bytes(b"pdf")
|
||||||
|
(root / "two.txt").write_text("x", encoding="utf-8")
|
||||||
|
user = django_user_model.objects.create_user(username="owner", password="pass")
|
||||||
|
conversation = Conversation.objects.create(user=user, title="会话")
|
||||||
|
batch = FileSummaryBatch.objects.create(conversation=conversation, user=user, batch_no="FS-I")
|
||||||
|
|
||||||
|
items = scan_files_to_items(batch=batch, roots=[root])
|
||||||
|
|
||||||
|
assert [item.relative_path for item in items] == ["a/one.pdf", "two.txt"]
|
||||||
|
assert FileSummaryItem.objects.filter(batch=batch).count() == 2
|
||||||
|
assert items[0].statistics_status == FileSummaryItem.StatisticsStatus.SKIPPED
|
||||||
66
tests/test_file_summary_page_count.py
Normal file
66
tests/test_file_summary_page_count.py
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
import pytest
|
||||||
|
from docx import Document
|
||||||
|
from openpyxl import Workbook
|
||||||
|
from pptx import Presentation
|
||||||
|
|
||||||
|
from review_agent.file_summary.services.page_count import count_document_pages
|
||||||
|
from review_agent.file_summary.skills.document_page_count import DocumentPageCountSkill
|
||||||
|
from review_agent.file_summary.skills.base import WorkflowContext
|
||||||
|
from review_agent.models import Conversation, FileSummaryBatch, FileSummaryItem
|
||||||
|
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.django_db
|
||||||
|
|
||||||
|
|
||||||
|
def test_count_document_pages_for_office_formats(tmp_path):
|
||||||
|
docx_path = tmp_path / "a.docx"
|
||||||
|
Document().save(docx_path)
|
||||||
|
|
||||||
|
xlsx_path = tmp_path / "a.xlsx"
|
||||||
|
workbook = Workbook()
|
||||||
|
workbook.create_sheet("第二页")
|
||||||
|
workbook.save(xlsx_path)
|
||||||
|
|
||||||
|
pptx_path = tmp_path / "a.pptx"
|
||||||
|
presentation = Presentation()
|
||||||
|
presentation.slides.add_slide(presentation.slide_layouts[6])
|
||||||
|
presentation.save(pptx_path)
|
||||||
|
|
||||||
|
assert count_document_pages(docx_path).status in {"success", "uncertain"}
|
||||||
|
assert count_document_pages(xlsx_path).page_count == 2
|
||||||
|
assert count_document_pages(pptx_path).page_count == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_document_page_count_skill_marks_unsupported_and_success(tmp_path, django_user_model):
|
||||||
|
xlsx_path = tmp_path / "a.xlsx"
|
||||||
|
workbook = Workbook()
|
||||||
|
workbook.save(xlsx_path)
|
||||||
|
txt_path = tmp_path / "a.txt"
|
||||||
|
txt_path.write_text("x", encoding="utf-8")
|
||||||
|
user = django_user_model.objects.create_user(username="owner", password="pass")
|
||||||
|
conversation = Conversation.objects.create(user=user, title="会话")
|
||||||
|
batch = FileSummaryBatch.objects.create(conversation=conversation, user=user, batch_no="FS-P")
|
||||||
|
xlsx_item = FileSummaryItem.objects.create(
|
||||||
|
batch=batch,
|
||||||
|
file_index=1,
|
||||||
|
file_name="a.xlsx",
|
||||||
|
file_type="xlsx",
|
||||||
|
relative_path="a.xlsx",
|
||||||
|
storage_path=str(xlsx_path),
|
||||||
|
)
|
||||||
|
txt_item = FileSummaryItem.objects.create(
|
||||||
|
batch=batch,
|
||||||
|
file_index=2,
|
||||||
|
file_name="a.txt",
|
||||||
|
file_type="txt",
|
||||||
|
relative_path="a.txt",
|
||||||
|
storage_path=str(txt_path),
|
||||||
|
)
|
||||||
|
|
||||||
|
result = DocumentPageCountSkill().run(WorkflowContext(batch=batch))
|
||||||
|
|
||||||
|
xlsx_item.refresh_from_db()
|
||||||
|
txt_item.refresh_from_db()
|
||||||
|
assert result.success is True
|
||||||
|
assert xlsx_item.statistics_status == FileSummaryItem.StatisticsStatus.SUCCESS
|
||||||
|
assert txt_item.statistics_status == FileSummaryItem.StatisticsStatus.UNSUPPORTED
|
||||||
29
tests/test_file_summary_product_detect.py
Normal file
29
tests/test_file_summary_product_detect.py
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
import pytest
|
||||||
|
|
||||||
|
from review_agent.file_summary.services.product_detect import detect_product_name
|
||||||
|
from review_agent.models import Conversation, FileSummaryBatch, FileSummaryItem
|
||||||
|
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.django_db
|
||||||
|
|
||||||
|
|
||||||
|
def test_detect_product_name_from_top_level_directory(django_user_model):
|
||||||
|
user = django_user_model.objects.create_user(username="owner", password="pass")
|
||||||
|
conversation = Conversation.objects.create(user=user, title="新对话 06-06")
|
||||||
|
batch = FileSummaryBatch.objects.create(conversation=conversation, user=user, batch_no="FS-D")
|
||||||
|
FileSummaryItem.objects.create(
|
||||||
|
batch=batch,
|
||||||
|
file_index=1,
|
||||||
|
file_name="说明书.docx",
|
||||||
|
file_type="docx",
|
||||||
|
relative_path="甲型试剂盒/说明书.docx",
|
||||||
|
storage_path="x",
|
||||||
|
)
|
||||||
|
|
||||||
|
product_name = detect_product_name(batch)
|
||||||
|
|
||||||
|
batch.refresh_from_db()
|
||||||
|
conversation.refresh_from_db()
|
||||||
|
assert product_name == "甲型试剂盒"
|
||||||
|
assert batch.product_name == "甲型试剂盒"
|
||||||
|
assert conversation.title == "甲型试剂盒-文件汇总"
|
||||||
27
tests/test_file_summary_skills.py
Normal file
27
tests/test_file_summary_skills.py
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
import pytest
|
||||||
|
|
||||||
|
from review_agent.file_summary.skills.base import BaseSkill, SkillResult, WorkflowContext
|
||||||
|
from review_agent.file_summary.skills.registry import SkillRegistry
|
||||||
|
|
||||||
|
|
||||||
|
class EchoSkill(BaseSkill):
|
||||||
|
name = "echo"
|
||||||
|
|
||||||
|
def run(self, context):
|
||||||
|
return SkillResult(success=True, data={"batch_id": context.batch.id})
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.django_db
|
||||||
|
def test_skill_registry_executes_registered_skill(django_user_model):
|
||||||
|
from review_agent.models import Conversation, FileSummaryBatch
|
||||||
|
|
||||||
|
user = django_user_model.objects.create_user(username="owner", password="pass")
|
||||||
|
conversation = Conversation.objects.create(user=user, title="会话")
|
||||||
|
batch = FileSummaryBatch.objects.create(conversation=conversation, user=user, batch_no="FS-X")
|
||||||
|
registry = SkillRegistry()
|
||||||
|
registry.register(EchoSkill())
|
||||||
|
|
||||||
|
result = registry.execute("echo", WorkflowContext(batch=batch))
|
||||||
|
|
||||||
|
assert result.success is True
|
||||||
|
assert result.data == {"batch_id": batch.id}
|
||||||
Reference in New Issue
Block a user