fix(application-form-fill): 优先路由填表提示并支持rar预览
This commit is contained in:
@@ -14,6 +14,11 @@ FORM_FILL_TRIGGER_KEYWORDS = [
|
|||||||
"填到申报模板",
|
"填到申报模板",
|
||||||
"自动填表",
|
"自动填表",
|
||||||
"生成表格",
|
"生成表格",
|
||||||
|
"申报文件模板",
|
||||||
|
"申报文件填表",
|
||||||
|
"产品关键信息",
|
||||||
|
"字段来源追溯清单",
|
||||||
|
"注册证 word",
|
||||||
]
|
]
|
||||||
|
|
||||||
FORM_FILL_NODE_DEFINITIONS = [
|
FORM_FILL_NODE_DEFINITIONS = [
|
||||||
|
|||||||
@@ -2,16 +2,18 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import csv
|
import csv
|
||||||
import logging
|
import logging
|
||||||
|
from tempfile import TemporaryDirectory
|
||||||
from dataclasses import asdict, dataclass, field
|
from dataclasses import asdict, dataclass, field
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
|
||||||
from review_agent.models import FileAttachment
|
from review_agent.models import FileAttachment
|
||||||
|
from review_agent.file_summary.services.archive import ARCHIVE_EXTENSIONS, extract_archive
|
||||||
|
|
||||||
|
|
||||||
TEXT_EXTENSIONS = {"txt", "md", "csv", "json", "log"}
|
TEXT_EXTENSIONS = {"txt", "md", "csv", "json", "log"}
|
||||||
SUPPORTED_EXTENSIONS = TEXT_EXTENSIONS | {"pdf", "docx", "xlsx", "pptx"}
|
SUPPORTED_EXTENSIONS = TEXT_EXTENSIONS | {"pdf", "docx", "xlsx", "pptx"} | ARCHIVE_EXTENSIONS
|
||||||
MAX_PREVIEW_CHARS = 3000
|
MAX_PREVIEW_CHARS = 3000
|
||||||
MAX_ROWS_PER_SHEET = 20
|
MAX_ROWS_PER_SHEET = 20
|
||||||
|
|
||||||
@@ -72,6 +74,8 @@ def read_attachment_details(attachment: FileAttachment) -> AttachmentReadResult:
|
|||||||
sections = _read_pptx(file_path)
|
sections = _read_pptx(file_path)
|
||||||
elif file_type == "csv":
|
elif file_type == "csv":
|
||||||
sections = _read_csv(file_path)
|
sections = _read_csv(file_path)
|
||||||
|
elif file_type in ARCHIVE_EXTENSIONS:
|
||||||
|
sections = _read_archive(file_path)
|
||||||
else:
|
else:
|
||||||
sections = _read_text(file_path)
|
sections = _read_text(file_path)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
@@ -208,6 +212,44 @@ def _read_pptx(path: Path) -> list[dict[str, object]]:
|
|||||||
return sections
|
return sections
|
||||||
|
|
||||||
|
|
||||||
|
def _read_archive(path: Path) -> list[dict[str, object]]:
    """Build preview sections for the files contained in an archive attachment.

    The archive is handed to ``extract_archive`` together with a fresh
    temporary directory. Every returned member whose extension is supported
    (and is not itself an archive) is parsed with ``_read_supported_file``;
    each parsed section is copied and its ``name`` is prefixed with the
    member's file name. Any other member contributes a placeholder section
    saying that it cannot be previewed, so nested archives are not expanded.

    Args:
        path: Location of the archive on disk.

    Returns:
        The collected sections in member order, or a single ``"archive"``
        placeholder section when extraction returned nothing.
    """
    collected: list[dict[str, object]] = []
    # Parsing stays inside the ``with`` block: the temporary directory is
    # removed on exit (presumably along with the extracted members -- this
    # assumes extract_archive writes into the directory it is given).
    with TemporaryDirectory(prefix="attachment-reader-") as scratch_dir:
        members = extract_archive(path, Path(scratch_dir))
        if not members:
            return [{"type": "archive", "name": path.name, "text": "压缩包未解出任何可读取文件。"}]
        for member in members:
            suffix = member.suffix.lower().lstrip(".")
            previewable = suffix in SUPPORTED_EXTENSIONS and suffix not in ARCHIVE_EXTENSIONS
            if previewable:
                for parsed in _read_supported_file(member, suffix):
                    # Copy before renaming so the reader's own dict is untouched.
                    entry = dict(parsed)
                    entry["name"] = f"{member.name} / {entry.get('name', member.name)}"
                    collected.append(entry)
            else:
                placeholder = {
                    "type": "file",
                    "name": member.name,
                    "text": f"暂不支持预览压缩包内的 .{suffix or 'unknown'} 文件。",
                }
                collected.append(placeholder)
    return collected
|
||||||
|
|
||||||
|
|
||||||
|
def _read_supported_file(path: Path, file_type: str) -> list[dict[str, object]]:
    """Parse ``path`` with the reader that matches ``file_type``.

    Args:
        path: File to read.
        file_type: Lower-case extension without the leading dot.

    Returns:
        Whatever the selected reader returns. Types without a dedicated
        reader (anything other than pdf, docx, xlsx, pptx and csv) are read
        with ``_read_text``.
    """
    readers = {
        "pdf": _read_pdf,
        "docx": _read_docx,
        "xlsx": _read_xlsx,
        "pptx": _read_pptx,
        "csv": _read_csv,
    }
    reader = readers.get(file_type, _read_text)
    return reader(path)
|
||||||
|
|
||||||
|
|
||||||
def _build_preview(sections: list[dict[str, object]]) -> str:
|
def _build_preview(sections: list[dict[str, object]]) -> str:
|
||||||
parts: list[str] = []
|
parts: list[str] = []
|
||||||
for section in sections:
|
for section in sections:
|
||||||
|
|||||||
@@ -51,6 +51,10 @@ class SkillRoute:
|
|||||||
|
|
||||||
|
|
||||||
def route_message_intent(conversation: Conversation, content: str) -> SkillRoute:
|
def route_message_intent(conversation: Conversation, content: str) -> SkillRoute:
|
||||||
|
deterministic_route = _deterministic_workflow_route(conversation, content)
|
||||||
|
if deterministic_route:
|
||||||
|
return deterministic_route
|
||||||
|
|
||||||
attachments = list(_active_attachments(conversation))
|
attachments = list(_active_attachments(conversation))
|
||||||
try:
|
try:
|
||||||
route = _route_with_llm(conversation, content, attachments)
|
route = _route_with_llm(conversation, content, attachments)
|
||||||
@@ -75,6 +79,35 @@ def route_message_intent(conversation: Conversation, content: str) -> SkillRoute
|
|||||||
return _route_with_rules(conversation, content)
|
return _route_with_rules(conversation, content)
|
||||||
|
|
||||||
|
|
||||||
|
def _deterministic_workflow_route(conversation: Conversation, content: str) -> SkillRoute | None:
    """Resolve a route from rules alone, or return ``None`` if none applies.

    ``route_message_intent`` consults this before LLM routing. The checks run
    in a fixed order and the first hit wins:

    1. application form-fill keywords,
    2. regulatory review keywords,
    3. the file-summary trigger, which also routes when its reason is
       ``"missing_attachment"``.

    Args:
        conversation: Conversation the message belongs to.
        content: Text of the incoming message.

    Returns:
        A ``SkillRoute`` whose ``source`` is ``"rule_preflight"``, or ``None``
        when no rule matched.
    """

    def preflight(workflow: str, confidence: float, reason: str) -> SkillRoute:
        # Every rule-based route uses the same value for action and workflow_type.
        return SkillRoute(
            action=workflow,
            workflow_type=workflow,
            confidence=confidence,
            reason=reason,
            source="rule_preflight",
        )

    if _matches_application_form_fill(content):
        return preflight(FORM_FILL_WORKFLOW_TYPE, 0.9, "命中明确申报文件自动填表关键词。")

    if _matches_regulatory_review(content):
        return preflight("regulatory_review", 0.9, "命中明确法规核查关键词。")

    trigger = evaluate_file_summary_trigger(conversation, content)
    wants_file_summary = trigger.should_start or trigger.reason == "missing_attachment"
    if wants_file_summary:
        return preflight("file_summary", 0.8, trigger.reason)

    return None
|
||||||
|
|
||||||
|
|
||||||
def _route_with_llm(
|
def _route_with_llm(
|
||||||
conversation: Conversation,
|
conversation: Conversation,
|
||||||
content: str,
|
content: str,
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from review_agent.models import Conversation
|
from review_agent.models import Conversation, FileAttachment
|
||||||
from review_agent.skill_router import route_message_intent
|
from review_agent.skill_router import route_message_intent
|
||||||
|
|
||||||
|
|
||||||
@@ -43,3 +43,31 @@ def test_rule_router_does_not_misroute_normal_chat(monkeypatch, django_user_mode
|
|||||||
route = route_message_intent(conversation, "你好,解释一下法规背景")
|
route = route_message_intent(conversation, "你好,解释一下法规背景")
|
||||||
|
|
||||||
assert route.action == "normal_chat"
|
assert route.action == "normal_chat"
|
||||||
|
|
||||||
|
|
||||||
|
def test_application_form_fill_prompt_preempts_attachment_reader_llm(monkeypatch, tmp_path, django_user_model):
    """An explicit form-fill prompt is routed by the rule preflight.

    A ``.rar`` attachment is registered on the conversation and the LLM router
    is replaced with a stub that raises, then the resulting route is checked
    for the form-fill action and the ``rule_preflight`` source.
    """
    owner = django_user_model.objects.create_user(username="owner", password="pass")
    chat = Conversation.objects.create(user=owner, title="会话")

    # The bytes are not a real RAR payload; only the attachment record is needed.
    rar_file = tmp_path / "第1章_监管信息.rar"
    rar_file.write_bytes(b"rar")
    FileAttachment.objects.create(
        conversation=chat,
        user=owner,
        original_name="第1章_监管信息.rar",
        storage_path=str(rar_file),
        file_size=rar_file.stat().st_size,
    )

    def fail_if_llm_routed(conversation, content, attachments):
        raise AssertionError("明确自动填表意图不应进入 LLM 路由")

    monkeypatch.setattr("review_agent.skill_router._route_with_llm", fail_if_llm_routed)

    prompt = "请基于当前对话最近成功汇总的产品资料,自动提取产品关键信息并填入申报文件模板,优先生成注册证 Word 和字段来源追溯清单。"
    route = route_message_intent(chat, prompt)

    assert route.action == "application_form_fill"
    assert route.source == "rule_preflight"
|
||||||
|
|||||||
@@ -109,3 +109,38 @@ def test_attachment_reader_skill_returns_structured_details(settings, tmp_path,
|
|||||||
assert result.success is True
|
assert result.success is True
|
||||||
assert result.data["attachments"][0]["filename"] == "readme.txt"
|
assert result.data["attachments"][0]["filename"] == "readme.txt"
|
||||||
assert "请读取这个附件" in result.data["attachments"][0]["preview_text"]
|
assert "请读取这个附件" in result.data["attachments"][0]["preview_text"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_read_attachment_extracts_files_inside_rar(monkeypatch, settings, tmp_path, django_user_model):
    """Reading a ``.rar`` attachment surfaces the text of the files inside it.

    ``extract_archive`` is replaced with a fake that writes one text file into
    the target directory, so no real RAR tooling is involved.
    """
    from review_agent.file_summary.services.attachment_reader import read_attachment_details

    settings.MEDIA_ROOT = tmp_path
    owner = django_user_model.objects.create_user(username="owner", password="pass")
    chat = Conversation.objects.create(user=owner, title="会话")

    # The stored path is relative ("uploads/...") and the file is created under
    # tmp_path, which is also set as MEDIA_ROOT above.
    uploads_dir = tmp_path / "uploads"
    uploads_dir.mkdir(parents=True)
    rar_file = uploads_dir / "第1章_监管信息.rar"
    rar_file.write_bytes(b"rar")

    attachment = FileAttachment.objects.create(
        conversation=chat,
        user=owner,
        original_name="第1章_监管信息.rar",
        storage_path="uploads/第1章_监管信息.rar",
        file_size=rar_file.stat().st_size,
    )

    def fake_extract_archive(path: Path, target_dir: Path):
        # Stand-in extractor: drop one text member into the target directory.
        member = target_dir / "说明书.txt"
        member.write_text("产品名称:甲胎蛋白检测试剂盒", encoding="utf-8")
        return [member]

    monkeypatch.setattr(
        "review_agent.file_summary.services.attachment_reader.extract_archive",
        fake_extract_archive,
    )

    outcome = read_attachment_details(attachment)

    assert outcome.status == "success"
    assert outcome.file_type == "rar"
    assert "说明书.txt" in outcome.sections[0]["name"]
    assert "甲胎蛋白检测试剂盒" in outcome.preview_text
|
||||||
|
|||||||
Reference in New Issue
Block a user