diff --git a/review_agent/application_form_fill/constants.py b/review_agent/application_form_fill/constants.py index 2fc91ba..a4082e6 100644 --- a/review_agent/application_form_fill/constants.py +++ b/review_agent/application_form_fill/constants.py @@ -14,6 +14,11 @@ FORM_FILL_TRIGGER_KEYWORDS = [ "填到申报模板", "自动填表", "生成表格", + "申报文件模板", + "申报文件填表", + "产品关键信息", + "字段来源追溯清单", + "注册证 word", ] FORM_FILL_NODE_DEFINITIONS = [ diff --git a/review_agent/file_summary/services/attachment_reader.py b/review_agent/file_summary/services/attachment_reader.py index 8f7cbb5..c8b5c69 100644 --- a/review_agent/file_summary/services/attachment_reader.py +++ b/review_agent/file_summary/services/attachment_reader.py @@ -2,16 +2,18 @@ from __future__ import annotations import csv import logging +from tempfile import TemporaryDirectory from dataclasses import asdict, dataclass, field from pathlib import Path from django.conf import settings from review_agent.models import FileAttachment +from review_agent.file_summary.services.archive import ARCHIVE_EXTENSIONS, extract_archive TEXT_EXTENSIONS = {"txt", "md", "csv", "json", "log"} -SUPPORTED_EXTENSIONS = TEXT_EXTENSIONS | {"pdf", "docx", "xlsx", "pptx"} +SUPPORTED_EXTENSIONS = TEXT_EXTENSIONS | {"pdf", "docx", "xlsx", "pptx"} | ARCHIVE_EXTENSIONS MAX_PREVIEW_CHARS = 3000 MAX_ROWS_PER_SHEET = 20 @@ -72,6 +74,8 @@ def read_attachment_details(attachment: FileAttachment) -> AttachmentReadResult: sections = _read_pptx(file_path) elif file_type == "csv": sections = _read_csv(file_path) + elif file_type in ARCHIVE_EXTENSIONS: + sections = _read_archive(file_path) else: sections = _read_text(file_path) except Exception as exc: @@ -208,6 +212,44 @@ def _read_pptx(path: Path) -> list[dict[str, object]]: return sections +def _read_archive(path: Path) -> list[dict[str, object]]: + sections: list[dict[str, object]] = [] + with TemporaryDirectory(prefix="attachment-reader-") as temp_dir: + extracted = extract_archive(path, Path(temp_dir)) + if not extracted: + return [{"type": "archive", "name": path.name, "text": "压缩包未解出任何可读取文件。"}] + for item in extracted: + file_type = item.suffix.lower().lstrip(".") + if file_type not in SUPPORTED_EXTENSIONS or file_type in ARCHIVE_EXTENSIONS: + sections.append( + { + "type": "file", + "name": item.name, + "text": f"暂不支持预览压缩包内的 .{file_type or 'unknown'} 文件。", + } + ) + continue + for section in _read_supported_file(item, file_type): + section = dict(section) + section["name"] = f"{item.name} / {section.get('name', item.name)}" + sections.append(section) + return sections + + +def _read_supported_file(path: Path, file_type: str) -> list[dict[str, object]]: + if file_type == "pdf": + return _read_pdf(path) + if file_type == "docx": + return _read_docx(path) + if file_type == "xlsx": + return _read_xlsx(path) + if file_type == "pptx": + return _read_pptx(path) + if file_type == "csv": + return _read_csv(path) + return _read_text(path) + + def _build_preview(sections: list[dict[str, object]]) -> str: parts: list[str] = [] for section in sections: diff --git a/review_agent/skill_router.py b/review_agent/skill_router.py index b0b5323..24e668a 100644 --- a/review_agent/skill_router.py +++ b/review_agent/skill_router.py @@ -51,6 +51,10 @@ class SkillRoute: def route_message_intent(conversation: Conversation, content: str) -> SkillRoute: + deterministic_route = _deterministic_workflow_route(conversation, content) + if deterministic_route: + return deterministic_route + attachments = list(_active_attachments(conversation)) try: route = _route_with_llm(conversation, content, attachments) @@ -75,6 +79,35 @@ def route_message_intent(conversation: Conversation, content: str) -> SkillRoute return _route_with_rules(conversation, content) +def _deterministic_workflow_route(conversation: Conversation, content: str) -> SkillRoute | None: + if _matches_application_form_fill(content): + return SkillRoute( + action=FORM_FILL_WORKFLOW_TYPE, + workflow_type=FORM_FILL_WORKFLOW_TYPE, + confidence=0.9, + reason="命中明确申报文件自动填表关键词。", + source="rule_preflight", + ) + if _matches_regulatory_review(content): + return SkillRoute( + action="regulatory_review", + workflow_type="regulatory_review", + confidence=0.9, + reason="命中明确法规核查关键词。", + source="rule_preflight", + ) + file_summary = evaluate_file_summary_trigger(conversation, content) + if file_summary.should_start or file_summary.reason == "missing_attachment": + return SkillRoute( + action="file_summary", + workflow_type="file_summary", + confidence=0.8, + reason=file_summary.reason, + source="rule_preflight", + ) + return None + + def _route_with_llm( conversation: Conversation, content: str, diff --git a/tests/test_application_form_fill_trigger.py b/tests/test_application_form_fill_trigger.py index 8272f29..5235c8a 100644 --- a/tests/test_application_form_fill_trigger.py +++ b/tests/test_application_form_fill_trigger.py @@ -1,6 +1,6 @@ import pytest -from review_agent.models import Conversation +from review_agent.models import Conversation, FileAttachment from review_agent.skill_router import route_message_intent @@ -43,3 +43,31 @@ def test_rule_router_does_not_misroute_normal_chat(monkeypatch, django_user_mode route = route_message_intent(conversation, "你好,解释一下法规背景") assert route.action == "normal_chat" + + +def test_application_form_fill_prompt_preempts_attachment_reader_llm(monkeypatch, tmp_path, django_user_model): + user = django_user_model.objects.create_user(username="owner", password="pass") + conversation = Conversation.objects.create(user=user, title="会话") + archive_path = tmp_path / "第1章_监管信息.rar" + archive_path.write_bytes(b"rar") + FileAttachment.objects.create( + conversation=conversation, + user=user, + original_name="第1章_监管信息.rar", + storage_path=str(archive_path), + file_size=archive_path.stat().st_size, + ) + monkeypatch.setattr( + "review_agent.skill_router._route_with_llm", + lambda conversation, content, attachments: (_ for _ in ()).throw( + AssertionError("明确自动填表意图不应进入 LLM 路由") + ), + ) + + route = route_message_intent( + conversation, + "请基于当前对话最近成功汇总的产品资料,自动提取产品关键信息并填入申报文件模板,优先生成注册证 Word 和字段来源追溯清单。", + ) + + assert route.action == "application_form_fill" + assert route.source == "rule_preflight" diff --git a/tests/test_attachment_reader.py b/tests/test_attachment_reader.py index 147f889..84afb97 100644 --- a/tests/test_attachment_reader.py +++ b/tests/test_attachment_reader.py @@ -109,3 +109,38 @@ def test_attachment_reader_skill_returns_structured_details(settings, tmp_path, assert result.success is True assert result.data["attachments"][0]["filename"] == "readme.txt" assert "请读取这个附件" in result.data["attachments"][0]["preview_text"] + + +def test_read_attachment_extracts_files_inside_rar(monkeypatch, settings, tmp_path, django_user_model): + from review_agent.file_summary.services.attachment_reader import read_attachment_details + + settings.MEDIA_ROOT = tmp_path + user = django_user_model.objects.create_user(username="owner", password="pass") + conversation = Conversation.objects.create(user=user, title="会话") + archive_path = tmp_path / "uploads" / "第1章_监管信息.rar" + archive_path.parent.mkdir(parents=True) + archive_path.write_bytes(b"rar") + attachment = FileAttachment.objects.create( + conversation=conversation, + user=user, + original_name="第1章_监管信息.rar", + storage_path="uploads/第1章_监管信息.rar", + file_size=archive_path.stat().st_size, + ) + + def fake_extract_archive(path: Path, target_dir: Path): + extracted = target_dir / "说明书.txt" + extracted.write_text("产品名称:甲胎蛋白检测试剂盒", encoding="utf-8") + return [extracted] + + monkeypatch.setattr( + "review_agent.file_summary.services.attachment_reader.extract_archive", + fake_extract_archive, + ) + + result = read_attachment_details(attachment) + + assert result.status == "success" + assert result.file_type == "rar" + assert "说明书.txt" in result.sections[0]["name"] + assert "甲胎蛋白检测试剂盒" in result.preview_text