fix(application-form-fill): 优先路由填表提示并支持rar预览

This commit is contained in:
2026-06-07 20:14:23 +08:00
parent 82c33e513f
commit ac5cf8bf7e
5 changed files with 145 additions and 2 deletions

View File

@@ -14,6 +14,11 @@ FORM_FILL_TRIGGER_KEYWORDS = [
"填到申报模板", "填到申报模板",
"自动填表", "自动填表",
"生成表格", "生成表格",
"申报文件模板",
"申报文件填表",
"产品关键信息",
"字段来源追溯清单",
"注册证 word",
] ]
FORM_FILL_NODE_DEFINITIONS = [ FORM_FILL_NODE_DEFINITIONS = [

View File

@@ -2,16 +2,18 @@ from __future__ import annotations
import csv import csv
import logging import logging
from tempfile import TemporaryDirectory
from dataclasses import asdict, dataclass, field from dataclasses import asdict, dataclass, field
from pathlib import Path from pathlib import Path
from django.conf import settings from django.conf import settings
from review_agent.models import FileAttachment from review_agent.models import FileAttachment
from review_agent.file_summary.services.archive import ARCHIVE_EXTENSIONS, extract_archive
TEXT_EXTENSIONS = {"txt", "md", "csv", "json", "log"} TEXT_EXTENSIONS = {"txt", "md", "csv", "json", "log"}
SUPPORTED_EXTENSIONS = TEXT_EXTENSIONS | {"pdf", "docx", "xlsx", "pptx"} SUPPORTED_EXTENSIONS = TEXT_EXTENSIONS | {"pdf", "docx", "xlsx", "pptx"} | ARCHIVE_EXTENSIONS
MAX_PREVIEW_CHARS = 3000 MAX_PREVIEW_CHARS = 3000
MAX_ROWS_PER_SHEET = 20 MAX_ROWS_PER_SHEET = 20
@@ -72,6 +74,8 @@ def read_attachment_details(attachment: FileAttachment) -> AttachmentReadResult:
sections = _read_pptx(file_path) sections = _read_pptx(file_path)
elif file_type == "csv": elif file_type == "csv":
sections = _read_csv(file_path) sections = _read_csv(file_path)
elif file_type in ARCHIVE_EXTENSIONS:
sections = _read_archive(file_path)
else: else:
sections = _read_text(file_path) sections = _read_text(file_path)
except Exception as exc: except Exception as exc:
@@ -208,6 +212,44 @@ def _read_pptx(path: Path) -> list[dict[str, object]]:
return sections return sections
def _read_archive(path: Path) -> list[dict[str, object]]:
    """Extract an archive into a scratch directory and preview its members.

    Every extracted member whose extension is previewable is read through
    ``_read_supported_file``; each resulting section is copied and its
    ``name`` is prefixed with the member's file name. Members with an
    unsupported extension, and archives nested inside the archive, yield a
    single placeholder section instead of being read.

    Args:
        path: Location of the archive on disk.

    Returns:
        The preview sections for all members, in the order returned by
        ``extract_archive``. If nothing was extracted, a single ``archive``
        section carrying an explanatory message is returned.
    """
    with TemporaryDirectory(prefix="attachment-reader-") as scratch_dir:
        # presumably a sequence of extracted file paths — confirm against
        # extract_archive's contract.
        members = extract_archive(path, Path(scratch_dir))
        if not members:
            return [{"type": "archive", "name": path.name, "text": "压缩包未解出任何可读取文件。"}]

        collected: list[dict[str, object]] = []
        for member in members:
            ext = member.suffix.lower().lstrip(".")
            previewable = ext in SUPPORTED_EXTENSIONS and ext not in ARCHIVE_EXTENSIONS
            if not previewable:
                collected.append(
                    {
                        "type": "file",
                        "name": member.name,
                        "text": f"暂不支持预览压缩包内的 .{ext or 'unknown'} 文件。",
                    }
                )
                continue
            # Members have to be read while the scratch directory still
            # exists, i.e. before the ``with`` block exits.
            collected.extend(
                {**part, "name": f"{member.name} / {part.get('name', member.name)}"}
                for part in _read_supported_file(member, ext)
            )
        return collected
def _read_supported_file(path: Path, file_type: str) -> list[dict[str, object]]:
    """Read one file with the reader that matches its extension.

    Args:
        path: Location of the file to read.
        file_type: Lower-case extension without the leading dot.

    Returns:
        The sections produced by the selected reader. Any extension without
        a dedicated reader is handled by ``_read_text``.
    """
    # Built on every call so the reader names are looked up at call time.
    readers = {
        "pdf": _read_pdf,
        "docx": _read_docx,
        "xlsx": _read_xlsx,
        "pptx": _read_pptx,
        "csv": _read_csv,
    }
    reader = readers.get(file_type, _read_text)
    return reader(path)
def _build_preview(sections: list[dict[str, object]]) -> str: def _build_preview(sections: list[dict[str, object]]) -> str:
parts: list[str] = [] parts: list[str] = []
for section in sections: for section in sections:

View File

@@ -51,6 +51,10 @@ class SkillRoute:
def route_message_intent(conversation: Conversation, content: str) -> SkillRoute: def route_message_intent(conversation: Conversation, content: str) -> SkillRoute:
deterministic_route = _deterministic_workflow_route(conversation, content)
if deterministic_route:
return deterministic_route
attachments = list(_active_attachments(conversation)) attachments = list(_active_attachments(conversation))
try: try:
route = _route_with_llm(conversation, content, attachments) route = _route_with_llm(conversation, content, attachments)
@@ -75,6 +79,35 @@ def route_message_intent(conversation: Conversation, content: str) -> SkillRoute
return _route_with_rules(conversation, content) return _route_with_rules(conversation, content)
def _deterministic_workflow_route(conversation: Conversation, content: str) -> SkillRoute | None:
    """Pick a workflow from rule checks alone, or return None if none applies.

    The checks run in a fixed order and the first hit wins: application
    form fill, then regulatory review, then file summary. The file-summary
    trigger is only evaluated when the two keyword checks did not match.

    Args:
        conversation: Conversation the message belongs to; passed to the
            file-summary trigger evaluation.
        content: Text of the user message.

    Returns:
        A ``SkillRoute`` whose ``source`` is ``"rule_preflight"`` when a rule
        matched, otherwise ``None``.
    """

    def preflight(workflow: str, confidence: float, reason: str) -> SkillRoute:
        # Every preflight route uses the workflow name as its action too.
        return SkillRoute(
            action=workflow,
            workflow_type=workflow,
            confidence=confidence,
            reason=reason,
            source="rule_preflight",
        )

    if _matches_application_form_fill(content):
        return preflight(FORM_FILL_WORKFLOW_TYPE, 0.9, "命中明确申报文件自动填表关键词。")

    if _matches_regulatory_review(content):
        return preflight("regulatory_review", 0.9, "命中明确法规核查关键词。")

    summary_trigger = evaluate_file_summary_trigger(conversation, content)
    # A "missing_attachment" result is still routed to file_summary.
    wants_summary = summary_trigger.should_start or summary_trigger.reason == "missing_attachment"
    if not wants_summary:
        return None
    return preflight("file_summary", 0.8, summary_trigger.reason)
def _route_with_llm( def _route_with_llm(
conversation: Conversation, conversation: Conversation,
content: str, content: str,

View File

@@ -1,6 +1,6 @@
import pytest import pytest
from review_agent.models import Conversation from review_agent.models import Conversation, FileAttachment
from review_agent.skill_router import route_message_intent from review_agent.skill_router import route_message_intent
@@ -43,3 +43,31 @@ def test_rule_router_does_not_misroute_normal_chat(monkeypatch, django_user_mode
route = route_message_intent(conversation, "你好,解释一下法规背景") route = route_message_intent(conversation, "你好,解释一下法规背景")
assert route.action == "normal_chat" assert route.action == "normal_chat"
def test_application_form_fill_prompt_preempts_attachment_reader_llm(monkeypatch, tmp_path, django_user_model):
    """An explicit form-fill prompt is routed by rules without calling the LLM router.

    A conversation with one ``.rar`` attachment is set up, the LLM router is
    replaced by a stub that fails the test when called, and the resulting
    route must come from the rule preflight.
    """
    archive_name = "第1章_监管信息.rar"
    stored_file = tmp_path / archive_name
    stored_file.write_bytes(b"rar")

    owner = django_user_model.objects.create_user(username="owner", password="pass")
    chat = Conversation.objects.create(user=owner, title="会话")
    FileAttachment.objects.create(
        conversation=chat,
        user=owner,
        original_name=archive_name,
        storage_path=str(stored_file),
        file_size=stored_file.stat().st_size,
    )

    def fail_if_llm_consulted(conversation, content, attachments):
        # Reaching the LLM router means the rule preflight did not match.
        raise AssertionError("明确自动填表意图不应进入 LLM 路由")

    monkeypatch.setattr("review_agent.skill_router._route_with_llm", fail_if_llm_consulted)

    prompt = "请基于当前对话最近成功汇总的产品资料,自动提取产品关键信息并填入申报文件模板,优先生成注册证 Word 和字段来源追溯清单。"
    route = route_message_intent(chat, prompt)

    assert route.action == "application_form_fill"
    assert route.source == "rule_preflight"

View File

@@ -109,3 +109,38 @@ def test_attachment_reader_skill_returns_structured_details(settings, tmp_path,
assert result.success is True assert result.success is True
assert result.data["attachments"][0]["filename"] == "readme.txt" assert result.data["attachments"][0]["filename"] == "readme.txt"
assert "请读取这个附件" in result.data["attachments"][0]["preview_text"] assert "请读取这个附件" in result.data["attachments"][0]["preview_text"]
def test_read_attachment_extracts_files_inside_rar(monkeypatch, settings, tmp_path, django_user_model):
    """Reading a ``.rar`` attachment surfaces the text of the files inside it.

    ``extract_archive`` is replaced by a stub that writes a single text file
    into the target directory, so no real RAR tooling is needed.
    """
    from review_agent.file_summary.services.attachment_reader import read_attachment_details

    settings.MEDIA_ROOT = tmp_path
    relative_path = "uploads/第1章_监管信息.rar"
    stored_file = tmp_path / "uploads" / "第1章_监管信息.rar"
    stored_file.parent.mkdir(parents=True)
    stored_file.write_bytes(b"rar")

    owner = django_user_model.objects.create_user(username="owner", password="pass")
    chat = Conversation.objects.create(user=owner, title="会话")
    attachment = FileAttachment.objects.create(
        conversation=chat,
        user=owner,
        original_name="第1章_监管信息.rar",
        storage_path=relative_path,
        file_size=stored_file.stat().st_size,
    )

    def stub_extract_archive(path: Path, target_dir: Path):
        # Pretend the archive held exactly one UTF-8 text file.
        member = target_dir / "说明书.txt"
        member.write_text("产品名称:甲胎蛋白检测试剂盒", encoding="utf-8")
        return [member]

    monkeypatch.setattr(
        "review_agent.file_summary.services.attachment_reader.extract_archive",
        stub_extract_archive,
    )

    outcome = read_attachment_details(attachment)

    assert outcome.status == "success"
    assert outcome.file_type == "rar"
    assert "说明书.txt" in outcome.sections[0]["name"]
    assert "甲胎蛋白检测试剂盒" in outcome.preview_text