feat(file-summary): 实现文件处理技能链路
This commit is contained in:
1
review_agent/file_summary/services/__init__.py
Normal file
1
review_agent/file_summary/services/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
|
||||
77
review_agent/file_summary/services/archive.py
Normal file
77
review_agent/file_summary/services/archive.py
Normal file
@@ -0,0 +1,77 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from zipfile import ZipFile
|
||||
|
||||
import py7zr
|
||||
|
||||
|
||||
ARCHIVE_EXTENSIONS = {"zip", "7z", "rar"}
|
||||
|
||||
|
||||
def _ensure_inside_target(path: Path, target_dir: Path) -> None:
    """Raise ``ValueError`` unless *path* is *target_dir* or lies beneath it.

    Both paths are resolved before the comparison, so the check is made on
    their absolute, normalized forms.

    Raises:
        ValueError: if the resolved *path* is neither the resolved
            *target_dir* nor one of its descendants.
    """
    root = target_dir.resolve()
    candidate = path.resolve()
    # Accept the directory itself as well as anything nested inside it.
    if candidate == root or root in candidate.parents:
        return
    raise ValueError("解压路径必须位于批次工作目录内。")
|
||||
|
||||
|
||||
def _safe_member_path(target_dir: Path, member_name: str) -> Path:
    """Return the destination of *member_name* under *target_dir*.

    The joined path is passed to ``_ensure_inside_target``, which raises
    ``ValueError`` when it resolves outside *target_dir*. The returned path
    is the joined path as-is, not its resolved form.
    """
    candidate = target_dir.joinpath(member_name)
    _ensure_inside_target(candidate, target_dir)
    return candidate
|
||||
|
||||
|
||||
def extract_archive(archive_path: str | Path, target_dir: str | Path) -> list[Path]:
    """Extract *archive_path* into *target_dir* and return the extracted files.

    The archive type is chosen from the lower-cased file extension. An
    extension that is not in ``ARCHIVE_EXTENSIONS`` yields an empty list.

    Args:
        archive_path: Location of the archive file.
        target_dir: Directory to extract into; created if it does not exist.

    Returns:
        The list produced by the format-specific extractor, or ``[]`` when
        the extension is not a recognized archive type.
    """
    source = Path(archive_path)
    destination = Path(target_dir)
    # The directory is created before the extension check, so it exists even
    # when nothing ends up being extracted.
    destination.mkdir(parents=True, exist_ok=True)

    kind = source.suffix.lower().lstrip(".")
    if kind not in ARCHIVE_EXTENSIONS:
        return []

    if kind == "zip":
        extractor = _extract_zip
    elif kind == "7z":
        extractor = _extract_7z
    else:
        # Every remaining recognized extension is handled by the rar extractor.
        extractor = _extract_rar
    return extractor(source, destination)
|
||||
|
||||
|
||||
def _extract_zip(archive_path: Path, target_dir: Path) -> list[Path]:
    """Extract every member of a zip archive into *target_dir*.

    Each member name is validated with ``_safe_member_path`` before anything
    is written for it, so a member that would land outside *target_dir*
    aborts the extraction with ``ValueError``. Directory members are created
    but not reported.

    Args:
        archive_path: The zip file to read.
        target_dir: Directory that receives the extracted content.

    Returns:
        Paths of the regular files written, in archive order.

    Raises:
        ValueError: if a member path resolves outside *target_dir*.
    """
    import shutil  # function-local so the block does not depend on a file-level import

    extracted: list[Path] = []
    with ZipFile(archive_path) as archive:
        for member in archive.infolist():
            destination = _safe_member_path(target_dir, member.filename)
            if member.is_dir():
                destination.mkdir(parents=True, exist_ok=True)
                continue
            destination.parent.mkdir(parents=True, exist_ok=True)
            with archive.open(member) as source, destination.open("wb") as target:
                # Copy in chunks so a large member is never held in memory
                # all at once.
                shutil.copyfileobj(source, target)
            extracted.append(destination)
    return extracted
|
||||
|
||||
|
||||
def _extract_7z(archive_path: Path, target_dir: Path) -> list[Path]:
    """Extract a 7z archive into *target_dir* and return the extracted files.

    All member names are checked with ``_safe_member_path`` before
    ``extractall`` runs, so a member that would resolve outside *target_dir*
    raises ``ValueError`` before extraction starts.

    Returns:
        The member paths under *target_dir* that exist as regular files
        after extraction, in the order reported by the archive.
    """
    with py7zr.SevenZipFile(archive_path, mode="r") as seven_zip:
        member_names = seven_zip.getnames()
        # Validate every name up front; extraction happens in one call below.
        for member_name in member_names:
            _safe_member_path(target_dir, member_name)
        seven_zip.extractall(path=target_dir)

    extracted = []
    for member_name in member_names:
        candidate = target_dir / member_name
        # Directory members and names that were not materialized are skipped.
        if candidate.is_file():
            extracted.append(candidate)
    return extracted
|
||||
|
||||
|
||||
def _extract_rar(archive_path: Path, target_dir: Path) -> list[Path]:
    """Extract an archive by running the external ``7z`` command.

    The command output is captured as text. After a successful run, every
    regular file found anywhere under *target_dir* is collected and checked
    with ``_ensure_inside_target``. Because the directory is scanned as a
    whole, files that were already present before extraction are included
    in the result.

    Returns:
        All regular files found recursively under *target_dir*.

    Raises:
        RuntimeError: if ``7z`` exits with a non-zero status; the message is
            its stderr, else its stdout, else a fixed fallback text.
        ValueError: if a collected file resolves outside *target_dir*.
    """
    command = ["7z", "x", f"-o{target_dir}", str(archive_path), "-y"]
    completed = subprocess.run(command, check=False, capture_output=True, text=True)
    if completed.returncode != 0:
        raise RuntimeError(completed.stderr or completed.stdout or "rar 解压失败")

    extracted = []
    for entry in target_dir.rglob("*"):
        if entry.is_file():
            extracted.append(entry)
    # Validation runs only after the full listing has been collected.
    for entry in extracted:
        _ensure_inside_target(entry, target_dir)
    return extracted
|
||||
49
review_agent/file_summary/services/inventory.py
Normal file
49
review_agent/file_summary/services/inventory.py
Normal file
@@ -0,0 +1,49 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from review_agent.models import FileSummaryBatch, FileSummaryItem
|
||||
|
||||
|
||||
SUPPORTED_EXTENSIONS = {"pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx"}
|
||||
|
||||
|
||||
def _directory_level(relative_path: Path) -> str:
    """Return the directory portion of *relative_path*, joined with ``/``.

    A path with at most one component has no directory portion and gives
    an empty string.
    """
    # Dropping the last component leaves an empty sequence for paths with
    # zero or one part, and joining an empty sequence produces "".
    parent_parts = relative_path.parts[:-1]
    return "/".join(parent_parts)
|
||||
|
||||
|
||||
def scan_files_to_items(*, batch: FileSummaryBatch, roots: list[Path]) -> list[FileSummaryItem]:
    """Create one ``FileSummaryItem`` per discovered file and update *batch* totals.

    Each root is either a single file or a directory that is walked
    recursively. Files found by walking a directory are visited in sorted
    order, and those whose name starts with ``.`` or whose size is zero are
    left out. A root that is itself a file is recorded without those checks.

    Args:
        batch: The batch the created items belong to; its ``total_files``,
            ``supported_files`` and ``unsupported_files`` fields are
            overwritten and saved.
        roots: Files and/or directories to scan, processed in the given order.

    Returns:
        The created items, numbered from 1 in discovery order.
    """
    discovered: list[tuple[Path, Path]] = []
    for entry in roots:
        base = Path(entry)
        if base.is_file():
            # A file root is recorded relative to its own parent directory.
            discovered.append((base.parent, base))
            continue
        for candidate in sorted(found for found in base.rglob("*") if found.is_file()):
            hidden = candidate.name.startswith(".")
            # The size is only queried for files that are not hidden.
            if not hidden and candidate.stat().st_size != 0:
                discovered.append((base, candidate))

    items: list[FileSummaryItem] = []
    for position, (base, file_path) in enumerate(discovered, start=1):
        relative_posix = file_path.relative_to(base).as_posix()
        items.append(
            FileSummaryItem.objects.create(
                batch=batch,
                file_index=position,
                directory_level=_directory_level(Path(relative_posix)),
                file_name=file_path.name,
                file_type=file_path.suffix.lower().lstrip("."),
                relative_path=relative_posix,
                storage_path=str(file_path),
                statistics_status=FileSummaryItem.StatisticsStatus.SKIPPED,
            )
        )

    supported_count = sum(1 for item in items if item.file_type in SUPPORTED_EXTENSIONS)
    batch.total_files = len(items)
    batch.supported_files = supported_count
    batch.unsupported_files = len(items) - supported_count
    batch.save(update_fields=["total_files", "supported_files", "unsupported_files"])
    return items
|
||||
59
review_agent/file_summary/services/page_count.py
Normal file
59
review_agent/file_summary/services/page_count.py
Normal file
@@ -0,0 +1,59 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
SUPPORTED_EXTENSIONS = {"pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx"}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class PageCountResult:
    """Immutable outcome of one page-count attempt."""

    # Outcome label; the counting function in this module produces
    # "success", "uncertain", "unsupported" or "failed".
    status: str
    # The count that was determined, or None when no count is available.
    page_count: int | None = None
    # Error text for a failed attempt; empty otherwise.
    error_message: str = ""
|
||||
|
||||
|
||||
def count_document_pages(path: str | Path) -> PageCountResult:
    """Determine a page count for a document based on its file extension.

    What is counted depends on the format: pages for ``pdf``, the stored
    ``pages`` property for ``docx``, sheets for ``xlsx``/``xls`` and slides
    for ``pptx``. For ``doc`` and ``ppt`` only the OLE container is checked
    and no count is produced.

    Args:
        path: Location of the document.

    Returns:
        A ``PageCountResult`` whose ``status`` is:

        * ``"unsupported"`` - the extension is not in ``SUPPORTED_EXTENSIONS``;
        * ``"success"`` - a count was obtained (``page_count`` is set);
        * ``"uncertain"`` - the file was accepted but no count is available;
        * ``"failed"`` - reading raised an exception (``error_message`` holds
          its text) or a ``doc``/``ppt`` file is not an OLE container.
    """
    file_path = Path(path)
    ext = file_path.suffix.lower().lstrip(".")
    if ext not in SUPPORTED_EXTENSIONS:
        return PageCountResult(status="unsupported")

    # Parser libraries are imported inside each branch, so a missing library
    # surfaces as a "failed" result through the handler below rather than
    # breaking import of this module.
    try:
        if ext == "pdf":
            from pypdf import PdfReader

            return PageCountResult(status="success", page_count=len(PdfReader(str(file_path)).pages))
        if ext == "docx":
            from docx import Document

            properties = Document(str(file_path)).core_properties
            # NOTE(review): confirm that the installed python-docx exposes a
            # ``pages`` attribute on core properties; if it does not, this
            # branch always ends in "uncertain".
            pages = getattr(properties, "pages", None)
            if pages:
                return PageCountResult(status="success", page_count=pages)
            return PageCountResult(status="uncertain")
        if ext == "xlsx":
            from openpyxl import load_workbook

            workbook = load_workbook(str(file_path), read_only=True, data_only=True)
            try:
                sheet_count = len(workbook.sheetnames)
            finally:
                # A read-only workbook keeps its source file open until it is
                # closed explicitly.
                workbook.close()
            return PageCountResult(status="success", page_count=sheet_count)
        if ext == "xls":
            import xlrd

            workbook = xlrd.open_workbook(str(file_path), on_demand=True)
            try:
                sheet_count = workbook.nsheets
            finally:
                # With on_demand=True the workbook holds resources until they
                # are released explicitly.
                workbook.release_resources()
            return PageCountResult(status="success", page_count=sheet_count)
        if ext == "pptx":
            from pptx import Presentation

            return PageCountResult(status="success", page_count=len(Presentation(str(file_path)).slides))
        if ext in {"doc", "ppt"}:
            import olefile

            # Only the container format is verified for these types; no
            # count is extracted.
            if olefile.isOleFile(str(file_path)):
                return PageCountResult(status="uncertain")
            return PageCountResult(status="failed", error_message="不是有效的 OLE 文件。")
    except Exception as exc:
        # Deliberately broad: any reader error becomes a "failed" result
        # instead of propagating to the caller.
        return PageCountResult(status="failed", error_message=str(exc))

    # Fallback for a supported extension that no branch above handled.
    return PageCountResult(status="uncertain")
|
||||
31
review_agent/file_summary/services/product_detect.py
Normal file
31
review_agent/file_summary/services/product_detect.py
Normal file
@@ -0,0 +1,31 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from review_agent.models import FileSummaryBatch
|
||||
|
||||
|
||||
def detect_product_name(batch: FileSummaryBatch) -> str:
    """Derive a product name from the batch's items and store it on the batch.

    Items are examined in ``file_index`` order and the first match wins:

    * an item whose relative path has more than one component supplies its
      first path component;
    * otherwise an item whose file-name stem contains one of the keywords
      supplies that stem.

    When a name is found it is saved to ``batch.product_name``. If the
    conversation title still starts with ``"新对话"``, the title is replaced
    with one built from the product name and saved as well.

    Returns:
        The detected product name, or ``""`` when no item matched (in which
        case nothing is saved).
    """
    keywords = ("产品", "试剂盒", "说明书")

    detected = ""
    for item in batch.items.order_by("file_index"):
        path_parts = Path(item.relative_path).parts
        if len(path_parts) > 1:
            # A nested file: its top-level directory is used as the name.
            detected = path_parts[0]
            break
        stem = Path(item.file_name).stem
        if any(keyword in stem for keyword in keywords):
            detected = stem
            break

    if not detected:
        return ""

    batch.product_name = detected
    batch.save(update_fields=["product_name"])

    conversation = batch.conversation
    # Only a title that still starts with the placeholder text is replaced.
    if conversation.title.startswith("新对话"):
        conversation.title = f"{detected}-文件汇总"
        conversation.save(update_fields=["title", "updated_at"])
    return detected
|
||||
Reference in New Issue
Block a user