fix(application-form-fill): 优先路由填表提示并支持rar预览

2026-06-07 20:14:23 +08:00
parent 82c33e513f
commit ac5cf8bf7e
5 changed files with 145 additions and 2 deletions
--- a/review_agent/application_form_fill/constants.py
+++ b/review_agent/application_form_fill/constants.py
@@ -14,6 +14,11 @@ FORM_FILL_TRIGGER_KEYWORDS = [
    "填到申报模板",
    "自动填表",
    "生成表格",
+    "申报文件模板",
+    "申报文件填表",
+    "产品关键信息",
+    "字段来源追溯清单",
+    "注册证 word",
 ]

 FORM_FILL_NODE_DEFINITIONS = [
--- a/review_agent/file_summary/services/attachment_reader.py
+++ b/review_agent/file_summary/services/attachment_reader.py
@@ -2,16 +2,18 @@ from __future__ import annotations

 import csv
 import logging
+from tempfile import TemporaryDirectory
 from dataclasses import asdict, dataclass, field
 from pathlib import Path

 from django.conf import settings

 from review_agent.models import FileAttachment
+from review_agent.file_summary.services.archive import ARCHIVE_EXTENSIONS, extract_archive


 TEXT_EXTENSIONS = {"txt", "md", "csv", "json", "log"}
-SUPPORTED_EXTENSIONS = TEXT_EXTENSIONS | {"pdf", "docx", "xlsx", "pptx"}
+SUPPORTED_EXTENSIONS = TEXT_EXTENSIONS | {"pdf", "docx", "xlsx", "pptx"} | ARCHIVE_EXTENSIONS
 MAX_PREVIEW_CHARS = 3000
 MAX_ROWS_PER_SHEET = 20

@@ -72,6 +74,8 @@ def read_attachment_details(attachment: FileAttachment) -> AttachmentReadResult:
            sections = _read_pptx(file_path)
        elif file_type == "csv":
            sections = _read_csv(file_path)
+        elif file_type in ARCHIVE_EXTENSIONS:
+            sections = _read_archive(file_path)
        else:
            sections = _read_text(file_path)
    except Exception as exc:
@@ -208,6 +212,44 @@ def _read_pptx(path: Path) -> list[dict[str, object]]:
    return sections


+def _read_archive(path: Path) -> list[dict[str, object]]:
+    """Build preview sections for every file extracted from the archive at ``path``.
+
+    Unsupported members and nested archives yield a notice section instead of content.
+    """
+    collected: list[dict[str, object]] = []
+    with TemporaryDirectory(prefix="attachment-reader-") as workdir:
+        members = extract_archive(path, Path(workdir))
+        if not members:
+            return [{"type": "archive", "name": path.name, "text": "压缩包未解出任何可读取文件。"}]
+        for member in members:
+            ext = member.suffix.lower().lstrip(".")
+            previewable = ext in SUPPORTED_EXTENSIONS and ext not in ARCHIVE_EXTENSIONS
+            if not previewable:
+                notice = f"暂不支持预览压缩包内的 .{ext or 'unknown'} 文件。"
+                collected.append({"type": "file", "name": member.name, "text": notice})
+                continue
+            for part in _read_supported_file(member, ext):
+                # Prefix each section name with the member file it came from.
+                label = f"{member.name} / {part.get('name', member.name)}"
+                collected.append({**part, "name": label})
+    return collected
+
+
+def _read_supported_file(path: Path, file_type: str) -> list[dict[str, object]]:
+    """Read ``path`` with the reader registered for ``file_type`` and return its sections."""
+    readers = {
+        "pdf": _read_pdf,
+        "docx": _read_docx,
+        "xlsx": _read_xlsx,
+        "pptx": _read_pptx,
+        "csv": _read_csv,
+    }
+    # Any file type without a dedicated reader is read as plain text.
+    reader = readers.get(file_type, _read_text)
+    return reader(path)
+
+
 def _build_preview(sections: list[dict[str, object]]) -> str:
    parts: list[str] = []
    for section in sections:
--- a/review_agent/skill_router.py
+++ b/review_agent/skill_router.py
@@ -51,6 +51,10 @@ class SkillRoute:


 def route_message_intent(conversation: Conversation, content: str) -> SkillRoute:
+    deterministic_route = _deterministic_workflow_route(conversation, content)
+    if deterministic_route:
+        return deterministic_route
+
    attachments = list(_active_attachments(conversation))
    try:
        route = _route_with_llm(conversation, content, attachments)
@@ -75,6 +79,35 @@ def route_message_intent(conversation: Conversation, content: str) -> SkillRoute
        return _route_with_rules(conversation, content)


+def _deterministic_workflow_route(conversation: Conversation, content: str) -> SkillRoute | None:
+    """Return a rule-based route for ``content``, or ``None`` if no rule matches.
+
+    Form-fill keywords win over regulatory-review keywords; the file-summary trigger is
+    evaluated only when neither keyword check matches.
+    """
+    action = None
+    confidence = 0.9
+    if _matches_application_form_fill(content):
+        action, reason = FORM_FILL_WORKFLOW_TYPE, "命中明确申报文件自动填表关键词。"
+    elif _matches_regulatory_review(content):
+        action, reason = "regulatory_review", "命中明确法规核查关键词。"
+    else:
+        summary_trigger = evaluate_file_summary_trigger(conversation, content)
+        # A missing attachment still routes to file_summary.
+        if summary_trigger.should_start or summary_trigger.reason == "missing_attachment":
+            action, reason, confidence = "file_summary", summary_trigger.reason, 0.8
+    if action is None:
+        return None
+    # Every preflight route uses the action name as its workflow type.
+    return SkillRoute(
+        action=action,
+        workflow_type=action,
+        confidence=confidence,
+        reason=reason,
+        source="rule_preflight",
+    )
+
+
 def _route_with_llm(
    conversation: Conversation,
    content: str,
--- a/tests/test_application_form_fill_trigger.py
+++ b/tests/test_application_form_fill_trigger.py
@@ -1,6 +1,6 @@
 import pytest

-from review_agent.models import Conversation
+from review_agent.models import Conversation, FileAttachment
 from review_agent.skill_router import route_message_intent


@@ -43,3 +43,31 @@ def test_rule_router_does_not_misroute_normal_chat(monkeypatch, django_user_mode
    route = route_message_intent(conversation, "你好，解释一下法规背景")

    assert route.action == "normal_chat"
+
+
+def test_application_form_fill_prompt_preempts_attachment_reader_llm(monkeypatch, tmp_path, django_user_model):
+    """An explicit form-fill prompt is routed by the rule preflight, not by the LLM router."""
+    owner = django_user_model.objects.create_user(username="owner", password="pass")
+    conversation = Conversation.objects.create(user=owner, title="会话")
+    rar_name = "第1章_监管信息.rar"
+    rar_path = tmp_path / rar_name
+    rar_path.write_bytes(b"rar")
+    FileAttachment.objects.create(
+        conversation=conversation,
+        user=owner,
+        original_name=rar_name,
+        storage_path=str(rar_path),
+        file_size=rar_path.stat().st_size,
+    )
+
+    def fail_if_llm_router_runs(conversation, content, attachments):
+        raise AssertionError("明确自动填表意图不应进入 LLM 路由")
+
+    monkeypatch.setattr("review_agent.skill_router._route_with_llm", fail_if_llm_router_runs)
+    prompt = (
+        "请基于当前对话最近成功汇总的产品资料，自动提取产品关键信息并填入申报文件模板，优先生成注册证 Word 和字段来源追溯清单。"
+    )
+    route = route_message_intent(conversation, prompt)
+
+    assert route.action == "application_form_fill"
+    assert route.source == "rule_preflight"
--- a/tests/test_attachment_reader.py
+++ b/tests/test_attachment_reader.py
@@ -109,3 +109,38 @@ def test_attachment_reader_skill_returns_structured_details(settings, tmp_path,
    assert result.success is True
    assert result.data["attachments"][0]["filename"] == "readme.txt"
    assert "请读取这个附件" in result.data["attachments"][0]["preview_text"]
+
+
+def test_read_attachment_extracts_files_inside_rar(monkeypatch, settings, tmp_path, django_user_model):
+    """A .rar attachment is previewed through the files returned by the stubbed extractor."""
+    from review_agent.file_summary.services.attachment_reader import read_attachment_details
+
+    settings.MEDIA_ROOT = tmp_path
+    owner = django_user_model.objects.create_user(username="owner", password="pass")
+    conversation = Conversation.objects.create(user=owner, title="会话")
+    upload_dir = tmp_path / "uploads"
+    upload_dir.mkdir(parents=True)
+    rar_file = upload_dir / "第1章_监管信息.rar"
+    rar_file.write_bytes(b"rar")
+    attachment = FileAttachment.objects.create(
+        conversation=conversation,
+        user=owner,
+        original_name="第1章_监管信息.rar",
+        storage_path="uploads/第1章_监管信息.rar",
+        file_size=rar_file.stat().st_size,
+    )
+
+    def fake_extract_archive(path: Path, target_dir: Path):
+        # Stand-in extractor: writes a single text file into the target directory.
+        member = target_dir / "说明书.txt"
+        member.write_text("产品名称：甲胎蛋白检测试剂盒", encoding="utf-8")
+        return [member]
+
+    extractor_target = "review_agent.file_summary.services.attachment_reader.extract_archive"
+    monkeypatch.setattr(extractor_target, fake_extract_archive)
+
+    result = read_attachment_details(attachment)
+
+    assert (result.status, result.file_type) == ("success", "rar")
+    assert "说明书.txt" in result.sections[0]["name"]
+    assert "甲胎蛋白检测试剂盒" in result.preview_text