From 1bdc7322cf6abd08397961020149c2f5f798f270 Mon Sep 17 00:00:00 2001 From: bruce Date: Sun, 7 Jun 2026 09:27:42 +0800 Subject: [PATCH] =?UTF-8?q?feat(regulatory):=20=E5=AF=B9=E9=BD=90=E9=99=84?= =?UTF-8?q?=E4=BB=B64=E7=9B=AE=E5=BD=95=E6=A0=B8=E6=9F=A5=E8=A7=84?= =?UTF-8?q?=E5=88=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../rules/nmpa_ivd_registration_v1.yaml | 515 ++++++++++++++++-- .../services/completeness_check.py | 14 +- .../services/consistency_check.py | 4 + .../regulatory_review/services/rag_index.py | 7 + .../regulatory_review/services/rule_loader.py | 21 + .../services/structure_check.py | 31 +- .../services/text_extract.py | 49 +- review_agent/regulatory_review/workflow.py | 51 +- .../regulatory/attachment4_outline.json | 8 + tests/test_regulatory_completeness.py | 27 + tests/test_regulatory_consistency.py | 13 + tests/test_regulatory_rag.py | 16 + tests/test_regulatory_rule_loader.py | 25 + tests/test_regulatory_structure.py | 12 + tests/test_regulatory_workflow.py | 3 + 15 files changed, 753 insertions(+), 43 deletions(-) create mode 100644 tests/fixtures/regulatory/attachment4_outline.json diff --git a/review_agent/regulatory_review/rules/nmpa_ivd_registration_v1.yaml b/review_agent/regulatory_review/rules/nmpa_ivd_registration_v1.yaml index 19cc16b..909b63f 100644 --- a/review_agent/regulatory_review/rules/nmpa_ivd_registration_v1.yaml +++ b/review_agent/regulatory_review/rules/nmpa_ivd_registration_v1.yaml @@ -1,58 +1,503 @@ code: nmpa_ivd_registration_v1 -name: NMPA IVD 注册资料 Demo 规则 +name: NMPA IVD 注册资料附件 4 对齐规则 rag_collection: nmpa_ivd_registration_v1 -source_material_dir: docs/0.原始材料/关于公布体外诊断试剂注册申报资料要求和批准证明文件格式的公告 +source_material_dir: docs/0.原始材料 +attachment4_required_codes: + - "1" + - "1.1" + - "1.2" + - "1.3" + - "1.4" + - "1.5" + - "1.6" + - "1.7" + - "2" + - "2.1" + - "2.2" + - "2.3" + - "2.4" + - "2.5" + - "2.6" + - "3" + - "3.1" + - "3.2" + - "3.3" + - "3.4" + - "3.5" + - "3.6" + - "3.7" + - "3.8" + - "4" + - "4.1" + - "4.2" + - "5" + - "5.1" + - "5.2" + - "5.3" + - "5.4" + - "6" + - "6.1" + - "6.2" + - "6.3" + - "6.4" + - "6.5" + - "6.6" + - "6.7" + - "6.8" + - "6.9" + - "6.10" requirements: - - code: product_technical_requirements - title: 产品技术要求 + - code: attachment4_1_regulatory_info + rule_id: A4-1 + attachment4_code: "1" + title: 监管信息 + type: chapter + severity: high + category: completeness + file_keywords: [监管信息] + aliases: [监管资料] + suggestion: 请补充监管信息章节及其目录项。 + citation_query: 附件4 监管信息 体外诊断试剂 注册申报资料 + structure_required: true + - code: attachment4_1_1_toc + rule_id: A4-1.1 + attachment4_code: "1.1" + title: 章节目录 + type: directory + severity: medium + category: completeness + file_keywords: [章节目录, 目录] + aliases: [监管信息目录] + suggestion: 请补充监管信息章节目录。 + citation_query: 附件4 监管信息 章节目录 + - code: attachment4_1_2_application_form + rule_id: A4-1.2 + attachment4_code: "1.2" + title: 申请表 type: required severity: blocking category: completeness - file_keywords: - - 产品技术要求 - suggestion: 请补充产品技术要求并确认版本与注册申请资料一致。 - citation_query: 体外诊断试剂 产品技术要求 注册申报资料 - - code: instructions_for_use - title: 说明书 + file_keywords: [申请表, 注册申请表] + aliases: [医疗器械注册申请表] + suggestion: 请补充注册申请表并核对注册类型、管理类别和分类编码。 + citation_query: 附件4 监管信息 申请表 + - code: attachment4_1_3_terms + rule_id: A4-1.3 + attachment4_code: "1.3" + title: 术语/缩写词列表 + type: recommended + severity: medium + category: completeness + file_keywords: [术语, 缩写词, 缩略语] + suggestion: 请补充术语和缩写词列表。 + citation_query: 附件4 术语 缩写词列表 + - code: attachment4_1_4_product_list + rule_id: A4-1.4 + attachment4_code: "1.4" + title: 产品列表 type: required severity: high category: completeness - file_keywords: - - 说明书 - - 使用说明 - required_sections: - - 储存条件 - - 有效期 - - 样本要求 - suggestion: 请补充说明书并核对储存条件、有效期和样本要求章节。 - citation_query: 体外诊断试剂 说明书 储存条件 有效期 样本要求 + file_keywords: [产品列表, 产品清单] + suggestion: 请补充申报产品列表。 + citation_query: 附件4 产品列表 + - code: attachment4_1_5_related_files + rule_id: A4-1.5 + attachment4_code: "1.5" + title: 关联文件 + type: conditional + severity: medium + category: completeness + file_keywords: [关联文件, 关联注册, 引用文件] + suggestion: 如存在关联注册或引用资料,请补充关联文件说明。 + citation_query: 附件4 关联文件 + - code: attachment4_1_6_pre_submission + rule_id: A4-1.6 + attachment4_code: "1.6" + title: 申报前与监管机构的联系情况和沟通记录 + type: conditional + severity: medium + category: completeness + file_keywords: [沟通记录, 监管机构, 申报前] + suggestion: 如有申报前沟通,请补充沟通记录;如无,请说明不适用。 + citation_query: 附件4 申报前 监管机构 沟通记录 + - code: attachment4_1_7_declaration + rule_id: A4-1.7 + attachment4_code: "1.7" + title: 符合性声明 + type: required + severity: blocking + category: completeness + file_keywords: [符合性声明, 声明] + suggestion: 请补充符合性声明。 + citation_query: 附件4 符合性声明 + - code: attachment4_2_summary + rule_id: A4-2 + attachment4_code: "2" + title: 综述资料 + type: chapter + severity: high + category: completeness + file_keywords: [综述资料] + suggestion: 请补充综述资料章节。 + citation_query: 附件4 综述资料 + structure_required: true + - code: attachment4_2_1_toc + rule_id: A4-2.1 + attachment4_code: "2.1" + title: 章节目录 + type: directory + severity: medium + category: completeness + file_keywords: [章节目录, 综述资料目录] + suggestion: 请补充综述资料章节目录。 + citation_query: 附件4 综述资料 章节目录 + - code: attachment4_2_2_overview + rule_id: A4-2.2 + attachment4_code: "2.2" + title: 概述 + type: required + severity: high + category: completeness + file_keywords: [概述] + suggestion: 请补充产品概述。 + citation_query: 附件4 概述 + - code: attachment4_2_3_product_description + rule_id: A4-2.3 + attachment4_code: "2.3" + title: 产品描述 + type: required + severity: high + category: completeness + file_keywords: [产品描述] + suggestion: 请补充产品描述。 + citation_query: 附件4 产品描述 + - code: attachment4_2_4_intended_use + rule_id: A4-2.4 + attachment4_code: "2.4" + title: 预期用途 + type: required + severity: high + category: completeness + file_keywords: [预期用途] + suggestion: 请补充预期用途资料。 + citation_query: 附件4 预期用途 + - code: attachment4_2_5_marketing_history + rule_id: A4-2.5 + attachment4_code: "2.5" + title: 申报产品上市历史 + type: conditional + severity: medium + category: completeness + file_keywords: [上市历史] + suggestion: 如产品已有上市历史,请补充相关说明;如无,请说明不适用。 + citation_query: 附件4 上市历史 + - code: attachment4_2_6_other_summary + rule_id: A4-2.6 + attachment4_code: "2.6" + title: 其他需说明的内容 + type: conditional + severity: medium + category: completeness + file_keywords: [其他需说明, 其他说明] + suggestion: 请补充其他需说明内容或不适用说明。 + citation_query: 附件4 其他需说明 + - code: attachment4_3_nonclinical + rule_id: A4-3 + attachment4_code: "3" + title: 非临床资料 + type: chapter + severity: high + category: completeness + file_keywords: [非临床资料] + suggestion: 请补充非临床资料章节。 + citation_query: 附件4 非临床资料 + structure_required: true + - code: attachment4_3_1_toc + rule_id: A4-3.1 + attachment4_code: "3.1" + title: 章节目录 + type: directory + severity: medium + category: completeness + file_keywords: [章节目录, 非临床资料目录] + suggestion: 请补充非临床资料章节目录。 + citation_query: 附件4 非临床资料 章节目录 + - code: attachment4_3_2_risk_management + rule_id: A4-3.2 + attachment4_code: "3.2" + title: 产品风险管理资料 + type: required + severity: high + category: completeness + file_keywords: [产品风险管理, 风险管理资料] + suggestion: 请补充产品风险管理资料。 + citation_query: 附件4 产品风险管理资料 + - code: essential_principles_checklist + rule_id: A4-3.3 + attachment4_code: "3.3" + title: 体外诊断试剂安全和性能基本原则清单 + type: recommended + severity: medium + category: completeness + file_keywords: [安全和性能基本原则, 基本原则清单] + aliases: [安全和性能基本原则清单] + suggestion: 建议补充安全和性能基本原则清单,便于审评追溯。 + citation_query: 附件4 安全和性能基本原则清单 + - code: product_technical_requirements + rule_id: A4-3.4 + attachment4_code: "3.4" + title: 产品技术要求及检验报告 + type: required + severity: blocking + category: completeness + file_keywords: [产品技术要求, 注册检验报告, 检验报告] + aliases: [产品技术要求, 注册检验报告] + required_sections: [产品技术要求, 检验报告] + suggestion: 请补充产品技术要求及注册检验报告,并确认二者覆盖型号一致。 + citation_query: 附件4 产品技术要求 检验报告 - code: registration_test_report + rule_id: A4-3.4-R + attachment4_code: "3.4" title: 注册检验报告 type: required severity: blocking category: completeness - file_keywords: - - 注册检验报告 - - 检验报告 + file_keywords: [注册检验报告, 检验报告] suggestion: 请补充注册检验报告并复核报告覆盖的产品型号。 - citation_query: 体外诊断试剂 注册检验报告 注册申报资料 + citation_query: 附件4 注册检验报告 + - code: attachment4_3_5_analytical_performance + rule_id: A4-3.5 + attachment4_code: "3.5" + title: 分析性能研究 + type: required + severity: high + category: completeness + file_keywords: [分析性能研究, 分析性能] + suggestion: 请补充分析性能研究资料。 + citation_query: 附件4 分析性能研究 + - code: attachment4_3_6_stability + rule_id: A4-3.6 + attachment4_code: "3.6" + title: 稳定性研究 + type: required + severity: high + category: completeness + file_keywords: [稳定性研究, 稳定性] + suggestion: 请补充稳定性研究资料。 + citation_query: 附件4 稳定性研究 + - code: attachment4_3_7_reference_interval + rule_id: A4-3.7 + attachment4_code: "3.7" + title: 阳性判断值或参考区间研究 + type: required + severity: high + category: completeness + file_keywords: [阳性判断值, 参考区间] + suggestion: 请补充阳性判断值或参考区间研究资料。 + citation_query: 附件4 阳性判断值 参考区间 + - code: attachment4_3_8_other_nonclinical + rule_id: A4-3.8 + attachment4_code: "3.8" + title: 其他资料 + type: conditional + severity: medium + category: completeness + file_keywords: [其他资料] + suggestion: 请补充非临床其他资料或不适用说明。 + citation_query: 附件4 非临床 其他资料 + - code: attachment4_4_clinical_evaluation + rule_id: A4-4 + attachment4_code: "4" + title: 临床评价资料 + type: chapter + severity: high + category: completeness + file_keywords: [临床评价资料, 临床资料] + suggestion: 请补充临床评价资料章节。 + citation_query: 附件4 临床评价资料 + structure_required: true + - code: attachment4_4_1_toc + rule_id: A4-4.1 + attachment4_code: "4.1" + title: 章节目录 + type: directory + severity: medium + category: completeness + file_keywords: [章节目录, 临床评价资料目录] + suggestion: 请补充临床评价资料章节目录。 + citation_query: 附件4 临床评价资料 章节目录 - code: clinical_evaluation + rule_id: A4-4.2 + attachment4_code: "4.2" title: 临床评价资料 type: conditional severity: high category: completeness - file_keywords: - - 临床评价 - - 临床试验 + file_keywords: [临床评价, 临床试验, 免临床, 同品种比对] suggestion: 请根据适用情形补充临床评价资料或说明豁免依据。 - citation_query: 体外诊断试剂 临床评价资料 注册申报 - - code: essential_principles_checklist - title: 安全和性能基本原则清单 - type: recommended + citation_query: 附件4 临床评价资料 注册申报 + - code: attachment4_5_ifu_label + rule_id: A4-5 + attachment4_code: "5" + title: 产品说明书和标签样稿 + type: chapter + severity: high + category: completeness + file_keywords: [产品说明书和标签样稿, 说明书, 标签样稿] + suggestion: 请补充产品说明书和标签样稿章节。 + citation_query: 附件4 产品说明书 标签样稿 + structure_required: true + - code: attachment4_5_1_toc + rule_id: A4-5.1 + attachment4_code: "5.1" + title: 章节目录 + type: directory severity: medium category: completeness - file_keywords: - - 安全和性能基本原则 - - 基本原则清单 - suggestion: 建议补充安全和性能基本原则清单,便于审评追溯。 - citation_query: 体外诊断试剂 安全和性能基本原则清单 + file_keywords: [章节目录, 说明书目录, 标签目录] + suggestion: 请补充产品说明书和标签样稿章节目录。 + citation_query: 附件4 说明书 标签 章节目录 + - code: instructions_for_use + rule_id: A4-5.2 + attachment4_code: "5.2" + title: 产品说明书 + type: required + severity: high + category: completeness + file_keywords: [说明书, 产品说明书, 使用说明] + aliases: [说明书] + required_sections: [储存条件, 有效期, 样本要求] + suggestion: 请补充说明书并核对储存条件、有效期和样本要求章节。 + citation_query: 附件4 产品说明书 储存条件 有效期 样本要求 + - code: attachment4_5_3_label + rule_id: A4-5.3 + attachment4_code: "5.3" + title: 标签样稿 + type: required + severity: high + category: completeness + file_keywords: [标签样稿, 标签] + suggestion: 请补充标签样稿。 + citation_query: 附件4 标签样稿 + - code: attachment4_5_4_other_ifu + rule_id: A4-5.4 + attachment4_code: "5.4" + title: 其他资料 + type: conditional + severity: medium + category: completeness + file_keywords: [其他资料] + suggestion: 请补充说明书和标签相关其他资料或不适用说明。 + citation_query: 附件4 说明书 标签 其他资料 + - code: attachment4_6_quality_system + rule_id: A4-6 + attachment4_code: "6" + title: 质量管理体系文件 + type: chapter + severity: high + category: completeness + file_keywords: [质量管理体系文件, 质量体系, 质量管理体系] + suggestion: 请补充质量管理体系文件章节。 + citation_query: 附件4 质量管理体系文件 + structure_required: true + - code: attachment4_6_1_overview + rule_id: A4-6.1 + attachment4_code: "6.1" + title: 综述 + type: required + severity: high + category: completeness + file_keywords: [综述] + suggestion: 请补充质量管理体系综述。 + citation_query: 附件4 质量管理体系 综述 + - code: attachment4_6_2_toc + rule_id: A4-6.2 + attachment4_code: "6.2" + title: 章节目录 + type: directory + severity: medium + category: completeness + file_keywords: [章节目录, 质量管理体系目录] + suggestion: 请补充质量管理体系文件章节目录。 + citation_query: 附件4 质量管理体系 章节目录 + - code: attachment4_6_3_manufacturing + rule_id: A4-6.3 + attachment4_code: "6.3" + title: 生产制造信息 + type: required + severity: high + category: completeness + file_keywords: [生产制造信息, 生产制造] + suggestion: 请补充生产制造信息。 + citation_query: 附件4 生产制造信息 + - code: attachment4_6_4_qms_procedure + rule_id: A4-6.4 + attachment4_code: "6.4" + title: 质量管理体系程序 + type: required + severity: high + category: completeness + file_keywords: [质量管理体系程序, 质量体系程序] + suggestion: 请补充质量管理体系程序。 + citation_query: 附件4 质量管理体系程序 + - code: attachment4_6_5_management + rule_id: A4-6.5 + attachment4_code: "6.5" + title: 管理职责程序 + type: required + severity: high + category: completeness + file_keywords: [管理职责程序, 管理职责] + suggestion: 请补充管理职责程序。 + citation_query: 附件4 管理职责程序 + - code: attachment4_6_6_resource + rule_id: A4-6.6 + attachment4_code: "6.6" + title: 资源管理程序 + type: required + severity: high + category: completeness + file_keywords: [资源管理程序, 资源管理] + suggestion: 请补充资源管理程序。 + citation_query: 附件4 资源管理程序 + - code: attachment4_6_7_realization + rule_id: A4-6.7 + attachment4_code: "6.7" + title: 产品实现程序 + type: required + severity: high + category: completeness + file_keywords: [产品实现程序, 产品实现] + suggestion: 请补充产品实现程序。 + citation_query: 附件4 产品实现程序 + - code: attachment4_6_8_measurement + rule_id: A4-6.8 + attachment4_code: "6.8" + title: 质量管理体系的测量/分析和改进程序 + type: required + severity: high + category: completeness + file_keywords: [测量, 分析和改进, 改进程序] + suggestion: 请补充质量管理体系测量、分析和改进程序。 + citation_query: 附件4 测量 分析 改进程序 + - code: attachment4_6_9_other_qms + rule_id: A4-6.9 + attachment4_code: "6.9" + title: 其他质量体系程序信息 + type: conditional + severity: medium + category: completeness + file_keywords: [其他质量体系程序, 其他质量体系] + suggestion: 请补充其他质量体系程序信息或不适用说明。 + citation_query: 附件4 其他质量体系程序信息 + - code: attachment4_6_10_qms_audit + rule_id: A4-6.10 + attachment4_code: "6.10" + title: 质量管理体系核查文件 + type: required + severity: high + category: completeness + file_keywords: [质量管理体系核查文件, 体系核查文件, 核查文件] + suggestion: 请补充质量管理体系核查文件。 + citation_query: 附件4 质量管理体系核查文件 diff --git a/review_agent/regulatory_review/services/completeness_check.py b/review_agent/regulatory_review/services/completeness_check.py index f1a684d..7b2b1ad 100644 --- a/review_agent/regulatory_review/services/completeness_check.py +++ b/review_agent/regulatory_review/services/completeness_check.py @@ -8,12 +8,17 @@ def run_completeness_check(batch: FileSummaryBatch, rule_set: dict) -> list[Find items = list(batch.items.order_by("file_index")) findings: list[Finding] = [] for requirement in rule_set.get("requirements", []): - if requirement.get("type") not in {"required", "conditional", "recommended"}: + if requirement.get("type") not in {"required", "conditional", "recommended", "chapter", "directory"}: continue matched = [ item for item in items - if _matches_item(item.file_name, item.relative_path, requirement.get("file_keywords", [])) + if _matches_item( + item.file_name, + item.relative_path, + item.directory_level, + [*requirement.get("file_keywords", []), *requirement.get("aliases", [])], + ) ] if matched: continue @@ -29,12 +34,13 @@ def run_completeness_check(batch: FileSummaryBatch, rule_set: dict) -> list[Find "requirement_type": requirement.get("type"), "matched_files": [], "searched_keywords": requirement.get("file_keywords", []), + "searched_fields": ["file_name", "relative_path", "directory_level"], }, ) ) return findings -def _matches_item(file_name: str, relative_path: str, keywords: list[str]) -> bool: - haystack = f"{file_name} {relative_path}".lower() +def _matches_item(file_name: str, relative_path: str, directory_level: str, keywords: list[str]) -> bool: + haystack = f"{file_name} {relative_path} {directory_level}".lower() return any(str(keyword).lower() in haystack for keyword in keywords) diff --git a/review_agent/regulatory_review/services/consistency_check.py b/review_agent/regulatory_review/services/consistency_check.py index 65782ed..1f24e17 100644 --- a/review_agent/regulatory_review/services/consistency_check.py +++ b/review_agent/regulatory_review/services/consistency_check.py @@ -10,6 +10,10 @@ FIELDS = { "产品名称": r"产品名称[::]\s*([^\n\r]+)", "型号规格": r"型号规格[::]\s*([^\n\r]+)", "预期用途": r"预期用途[::]\s*([^\n\r]+)", + "管理类别": r"管理类别[::]\s*([^\n\r]+)", + "分类编码": r"分类编码[::]\s*([^\n\r]+)", + "注册类型": r"注册类型[::]\s*([^\n\r]+)", + "临床评价路径": r"临床评价路径[::]\s*([^\n\r]+)", } diff --git a/review_agent/regulatory_review/services/rag_index.py b/review_agent/regulatory_review/services/rag_index.py index bbaca66..b6a9d5a 100644 --- a/review_agent/regulatory_review/services/rag_index.py +++ b/review_agent/regulatory_review/services/rag_index.py @@ -107,12 +107,19 @@ def collect_source_chunks(source_dir: Path) -> list[TextChunk]: try: text = extract_text_from_path(path) except RuntimeError as exc: + if _is_attachment4(path): + raise RuntimeError(f"附件 4 核心法规材料抽取失败:{path.name}") from exc logger.warning("Regulatory source extraction skipped", extra={"path": str(path), "error": str(exc)}) continue chunks.extend(chunk_text(text, source=str(path.relative_to(source_dir)))) return chunks +def _is_attachment4(path: Path) -> bool: + normalized = path.name.replace(" ", "") + return "附件4" in normalized and "体外诊断试剂注册申报资料要求及说明" in normalized + + def build_chroma_index( *, source_dir: Path, diff --git a/review_agent/regulatory_review/services/rule_loader.py b/review_agent/regulatory_review/services/rule_loader.py index bbd671f..85855ad 100644 --- a/review_agent/regulatory_review/services/rule_loader.py +++ b/review_agent/regulatory_review/services/rule_loader.py @@ -47,9 +47,30 @@ def load_rule_file(path: str | Path | None = None) -> dict: raise ValueError(f"规则 code 必须为 {DEFAULT_RULE_CODE}") if not isinstance(payload.get("requirements"), list) or not payload["requirements"]: raise ValueError("规则文件必须包含 requirements 列表。") + _validate_attachment4_requirements(payload) return payload +def _validate_attachment4_requirements(payload: dict) -> None: + requirements = payload.get("requirements") or [] + required_codes = {str(code) for code in payload.get("attachment4_required_codes") or []} + by_attachment4_code: dict[str, list[dict]] = {} + for requirement in requirements: + attachment4_code = requirement.get("attachment4_code") + if attachment4_code: + by_attachment4_code.setdefault(str(attachment4_code), []).append(requirement) + for field in ["code", "rule_id", "title", "severity", "file_keywords", "citation_query"]: + if attachment4_code and not requirement.get(field): + raise ValueError(f"附件4规则 {attachment4_code} 缺少 {field}") + missing = sorted(required_codes - set(by_attachment4_code), key=_attachment4_sort_key) + if missing: + raise ValueError(f"附件4目录项缺少规则:{', '.join(missing)}") + + +def _attachment4_sort_key(value: str) -> tuple[int, ...]: + return tuple(int(part) for part in value.split(".") if part.isdigit()) + + def check_rule_version( *, path: str | Path | None = None, diff --git a/review_agent/regulatory_review/services/structure_check.py b/review_agent/regulatory_review/services/structure_check.py index d12eac0..d57758a 100644 --- a/review_agent/regulatory_review/services/structure_check.py +++ b/review_agent/regulatory_review/services/structure_check.py @@ -5,7 +5,27 @@ from review_agent.regulatory_review.schemas import Finding def run_structure_check(document_texts: dict[str, str], rule_set: dict) -> list[Finding]: findings: list[Finding] = [] + combined_all_text = "\n".join(document_texts.values()) for requirement in rule_set.get("requirements", []): + if requirement.get("structure_required") and not _contains_any( + combined_all_text, + [requirement.get("title", ""), *requirement.get("aliases", [])], + ): + findings.append( + Finding( + rule_code=requirement["code"], + category="structure", + severity=requirement.get("severity", "medium"), + title=f"申报资料目录缺少{requirement['title']}章节", + detail=f"未在申报资料目录或章节标题候选中发现{requirement['title']}。", + suggestion=requirement.get("suggestion", ""), + evidence={ + "attachment4_code": requirement.get("attachment4_code"), + "expected_title": requirement["title"], + "aliases": requirement.get("aliases", []), + }, + ) + ) required_sections = requirement.get("required_sections") or [] if not required_sections: continue @@ -14,7 +34,7 @@ def run_structure_check(document_texts: dict[str, str], rule_set: dict) -> list[ continue combined_text = "\n".join(matching_docs.values()) for section in required_sections: - if section in combined_text: + if _contains_any(combined_text, [section]): continue findings.append( Finding( @@ -39,3 +59,12 @@ def _matching_documents(document_texts: dict[str, str], keywords: list[str]) -> if any(str(keyword).lower() in haystack for keyword in keywords): result[name] = text return result + + +def _contains_any(text: str, needles: list[str]) -> bool: + normalized = _normalize_title(text) + return any(_normalize_title(needle) in normalized for needle in needles if needle) + + +def _normalize_title(value: str) -> str: + return "".join(str(value).lower().replace("/", "").replace("/", "").split()) diff --git a/review_agent/regulatory_review/services/text_extract.py b/review_agent/regulatory_review/services/text_extract.py index 7d2d1cf..bd8dfab 100644 --- a/review_agent/regulatory_review/services/text_extract.py +++ b/review_agent/regulatory_review/services/text_extract.py @@ -1,6 +1,7 @@ from __future__ import annotations import hashlib +import re from dataclasses import dataclass from pathlib import Path @@ -14,6 +15,9 @@ class ExtractedText: status: str content_hash: str = "" error_message: str = "" + front_text: str = "" + section_candidates: list[str] | None = None + field_candidates: dict[str, str] | None = None SUPPORTED_EXTENSIONS = {".txt", ".md", ".pdf", ".docx", ".pptx", ".xlsx", ".doc"} @@ -26,6 +30,47 @@ def extract_text(path: str | Path) -> ExtractedText: try: text = extract_text_from_path(file_path) except Exception as exc: - return ExtractedText(path=file_path, text="", status="failed", error_message=str(exc)) + return ExtractedText( + path=file_path, + text="", + status="failed", + error_message=str(exc), + section_candidates=[], + field_candidates={}, + ) content_hash = hashlib.sha256(text.encode("utf-8")).hexdigest() if text else "" - return ExtractedText(path=file_path, text=text, status="success", content_hash=content_hash) + return ExtractedText( + path=file_path, + text=text, + status="success", + content_hash=content_hash, + front_text=_front_text(text), + section_candidates=_section_candidates(text), + field_candidates=_field_candidates(text), + ) + + +def _front_text(text: str, limit: int = 1200) -> str: + return text[:limit] + + +def _section_candidates(text: str) -> list[str]: + candidates = [] + for line in text.splitlines(): + normalized = line.strip() + if not normalized: + continue + if re.match(r"^([一二三四五六七八九十]+[、..]|[0-9]+(\.[0-9]+)*[、..\s])", normalized): + candidates.append(normalized[:120]) + elif any(keyword in normalized for keyword in ["章节目录", "监管信息", "综述资料", "非临床资料", "临床评价资料", "质量管理体系"]): + candidates.append(normalized[:120]) + return candidates[:80] + + +def _field_candidates(text: str) -> dict[str, str]: + fields = {} + for label in ["产品名称", "型号规格", "预期用途", "管理类别", "分类编码", "注册类型", "临床评价路径"]: + match = re.search(rf"{label}[::]\s*([^\n\r]+)", text) + if match: + fields[label] = " ".join(match.group(1).strip().split()) + return fields diff --git a/review_agent/regulatory_review/workflow.py b/review_agent/regulatory_review/workflow.py index 264b04a..f89ff8f 100644 --- a/review_agent/regulatory_review/workflow.py +++ b/review_agent/regulatory_review/workflow.py @@ -1,5 +1,6 @@ from __future__ import annotations +import json import logging from pathlib import Path from threading import Thread @@ -26,6 +27,7 @@ from review_agent.regulatory_review.services.structure_check import run_structur from review_agent.regulatory_review.services.text_extract import extract_text from .events import record_event +from .storage import save_artifact NODE_DEFINITIONS = [ @@ -105,6 +107,7 @@ class RegulatoryWorkflowExecutor: self.rule_set: dict | None = None self.findings = [] self.document_texts: dict[str, str] = {} + self.text_extract_status: dict[str, dict[str, object]] = {} def run(self) -> None: self.batch.status = RegulatoryReviewBatch.Status.RUNNING @@ -176,6 +179,13 @@ class RegulatoryWorkflowExecutor: return if node_code == "text_extract": self.document_texts = self._extract_source_texts() + save_artifact( + self.batch, + name="text_extract_status.json", + artifact_type="json", + content=json.dumps(self.text_extract_status, ensure_ascii=False, indent=2), + metadata={"artifact": "text_extract_status"}, + ) return if node_code == "structure_check": self.findings.extend(run_structure_check(self.document_texts, self._rules())) @@ -184,7 +194,29 @@ class RegulatoryWorkflowExecutor: self.findings.extend(run_consistency_check(self.document_texts)) return if node_code == "risk_assess": - persist_findings(self.batch, self.findings) + issues = persist_findings(self.batch, self.findings) + save_artifact( + self.batch, + name="rag_result_json.json", + artifact_type="json", + content=json.dumps( + { + "batch_no": self.batch.batch_no, + "text_extract_status": self.text_extract_status, + "issues": [ + { + "rule_code": issue.rule_code, + "title": issue.title, + "citations": issue.citations, + } + for issue in issues + ], + }, + ensure_ascii=False, + indent=2, + ), + metadata={"artifact": "rag_result_json"}, + ) return if node_code == "report_export": exports = export_review_results(self.batch) @@ -234,8 +266,25 @@ class RegulatoryWorkflowExecutor: if not path.is_absolute(): path = Path(settings.MEDIA_ROOT) / item.storage_path if not path.exists(): + self.text_extract_status[item.file_name] = { + "status": "missing", + "path": str(path), + "content_hash": "", + "section_candidates": [], + "field_candidates": {}, + "front_text": "", + } continue result = extract_text(path) + self.text_extract_status[item.file_name] = { + "status": result.status, + "path": str(path), + "content_hash": result.content_hash, + "section_candidates": result.section_candidates, + "field_candidates": result.field_candidates, + "front_text": result.front_text, + "error_message": result.error_message, + } if result.status == "success" and result.text: texts[item.file_name] = result.text return texts diff --git a/tests/fixtures/regulatory/attachment4_outline.json b/tests/fixtures/regulatory/attachment4_outline.json new file mode 100644 index 0000000..25d8d98 --- /dev/null +++ b/tests/fixtures/regulatory/attachment4_outline.json @@ -0,0 +1,8 @@ +[ + {"code": "1", "title": "监管信息", "children": ["章节目录", "申请表", "术语/缩写词列表", "产品列表", "关联文件", "申报前与监管机构的联系情况和沟通记录", "符合性声明"]}, + {"code": "2", "title": "综述资料", "children": ["章节目录", "概述", "产品描述", "预期用途", "申报产品上市历史", "其他需说明的内容"]}, + {"code": "3", "title": "非临床资料", "children": ["章节目录", "产品风险管理资料", "体外诊断试剂安全和性能基本原则清单", "产品技术要求及检验报告", "分析性能研究", "稳定性研究", "阳性判断值或参考区间研究", "其他资料"]}, + {"code": "4", "title": "临床评价资料", "children": ["章节目录", "临床评价资料"]}, + {"code": "5", "title": "产品说明书和标签样稿", "children": ["章节目录", "产品说明书", "标签样稿", "其他资料"]}, + {"code": "6", "title": "质量管理体系文件", "children": ["综述", "章节目录", "生产制造信息", "质量管理体系程序", "管理职责程序", "资源管理程序", "产品实现程序", "质量管理体系的测量/分析和改进程序", "其他质量体系程序信息", "质量管理体系核查文件"]} +] diff --git a/tests/test_regulatory_completeness.py b/tests/test_regulatory_completeness.py index 3a0ce5c..16467bb 100644 --- a/tests/test_regulatory_completeness.py +++ b/tests/test_regulatory_completeness.py @@ -42,3 +42,30 @@ def test_completeness_check_matches_existing_files_and_reports_missing(django_us missing = next(finding for finding in findings if finding.rule_code == "registration_test_report") assert missing.severity == "blocking" assert missing.category == "completeness" + + +def test_completeness_check_matches_attachment4_directory_names(django_user_model): + user = django_user_model.objects.create_user(username="owner", password="pass") + conversation = Conversation.objects.create(user=user, title="会话") + batch = FileSummaryBatch.objects.create( + conversation=conversation, + user=user, + batch_no="FS-A4", + status=FileSummaryBatch.Status.SUCCESS, + ) + FileSummaryItem.objects.create( + batch=batch, + file_index=1, + directory_level="1. 监管信息 / 1.2 申请表", + file_name="注册申请表.pdf", + file_type="pdf", + relative_path="1.监管信息/1.2申请表/注册申请表.pdf", + storage_path="x/app.pdf", + ) + + findings = run_completeness_check(batch, load_rule_file()) + + assert not any(finding.rule_code == "attachment4_1_2_application_form" for finding in findings) + missing_qms = next(finding for finding in findings if finding.rule_code == "attachment4_6_quality_system") + assert missing_qms.severity == "high" + assert missing_qms.evidence["searched_fields"] == ["file_name", "relative_path", "directory_level"] diff --git a/tests/test_regulatory_consistency.py b/tests/test_regulatory_consistency.py index f2b2e97..9f925e7 100644 --- a/tests/test_regulatory_consistency.py +++ b/tests/test_regulatory_consistency.py @@ -12,3 +12,16 @@ def test_consistency_check_reports_product_name_mismatch(): assert len(findings) == 1 assert findings[0].category == "consistency" assert "产品名称" in findings[0].title + + +def test_consistency_check_reports_registration_scope_fields(): + document_texts = { + "申请表.docx": "管理类别:第二类\n分类编码:6840\n注册类型:首次注册\n临床评价路径:免临床", + "综述资料.docx": "管理类别:第三类\n分类编码:6840\n注册类型:首次注册\n临床评价路径:临床试验", + } + + findings = run_consistency_check(document_texts) + titles = [finding.title for finding in findings] + + assert "管理类别在不同文件中不一致" in titles + assert "临床评价路径在不同文件中不一致" in titles diff --git a/tests/test_regulatory_rag.py b/tests/test_regulatory_rag.py index 5ea6096..356ffc6 100644 --- a/tests/test_regulatory_rag.py +++ b/tests/test_regulatory_rag.py @@ -6,6 +6,7 @@ from review_agent.regulatory_review.services.rag_citation import ( ) from review_agent.regulatory_review.services.rag_embedding import SiliconFlowEmbeddingProvider from review_agent.regulatory_review.services.rag_index import chunk_text +from review_agent.regulatory_review.services.rag_index import collect_source_chunks def test_siliconflow_embedding_provider_posts_expected_payload(monkeypatch): @@ -70,3 +71,18 @@ def test_retrieve_citations_raises_when_index_missing(settings, tmp_path): with pytest.raises(RagIndexUnavailable): retrieve_citations("注册检验报告", embedding_provider=lambda texts: [[0.1]]) + + +def test_collect_source_chunks_requires_attachment4_extraction(monkeypatch, tmp_path): + source_dir = tmp_path / "sources" + source_dir.mkdir() + attachment4 = source_dir / "附件 4 体外诊断试剂注册申报资料要求及说明.doc" + attachment4.write_bytes(b"legacy-doc") + + def fail_extract(path): + raise RuntimeError("无法通过 LibreOffice 转换法规 .doc 材料") + + monkeypatch.setattr("review_agent.regulatory_review.services.rag_index.extract_text_from_path", fail_extract) + + with pytest.raises(RuntimeError, match="附件 4"): + collect_source_chunks(source_dir) diff --git a/tests/test_regulatory_rule_loader.py b/tests/test_regulatory_rule_loader.py index e74dc88..b200b67 100644 --- a/tests/test_regulatory_rule_loader.py +++ b/tests/test_regulatory_rule_loader.py @@ -1,4 +1,5 @@ from pathlib import Path +import json import pytest from django.core.management import call_command @@ -27,6 +28,30 @@ def test_load_rule_file_reads_demo_requirements(): assert "essential_principles_checklist" in codes +def test_load_rule_file_covers_attachment4_outline(): + rule_set = load_rule_file() + requirements = rule_set["requirements"] + outline = json.loads(Path("tests/fixtures/regulatory/attachment4_outline.json").read_text(encoding="utf-8")) + + for chapter in outline: + chapter_rule = next( + item for item in requirements if item["title"] == chapter["title"] and item.get("attachment4_code") == chapter["code"] + ) + assert chapter_rule["attachment4_code"] == chapter["code"] + assert chapter_rule["severity"] == "high" + assert chapter_rule["citation_query"] + for child in chapter["children"]: + child_rule = next( + item + for item in requirements + if item["title"] == child and str(item.get("attachment4_code", "")).startswith(f"{chapter['code']}.") + ) + assert child_rule["rule_id"] + assert child_rule["file_keywords"] + assert child_rule["severity"] in {"blocking", "high", "medium"} + assert child_rule["citation_query"] + + def test_compute_file_sha256_changes_when_file_changes(tmp_path): path = tmp_path / "rule.yaml" path.write_text("code: demo\n", encoding="utf-8") diff --git a/tests/test_regulatory_structure.py b/tests/test_regulatory_structure.py index b905b6a..e883918 100644 --- a/tests/test_regulatory_structure.py +++ b/tests/test_regulatory_structure.py @@ -11,3 +11,15 @@ def test_structure_check_reports_missing_instruction_sections(): assert any(finding.rule_code == "instructions_for_use:储存条件" for finding in findings) assert all("样本要求" not in finding.title for finding in findings) + + +def test_structure_check_reports_missing_attachment4_outline_heading(): + document_texts = { + "申报资料目录.txt": "1. 监管信息\n1.2 申请表\n2. 综述资料\n3. 非临床资料\n" + } + + findings = run_structure_check(document_texts, load_rule_file()) + + missing = next(finding for finding in findings if finding.rule_code == "attachment4_4_clinical_evaluation") + assert missing.category == "structure" + assert missing.evidence["expected_title"] == "临床评价资料" diff --git a/tests/test_regulatory_workflow.py b/tests/test_regulatory_workflow.py index d175a04..51eefeb 100644 --- a/tests/test_regulatory_workflow.py +++ b/tests/test_regulatory_workflow.py @@ -7,6 +7,7 @@ from review_agent.models import ( FileSummaryItem, Message, RegulatoryIssue, + RegulatoryArtifact, RegulatoryReviewBatch, WorkflowEvent, WorkflowNodeRun, @@ -201,4 +202,6 @@ def test_workflow_generates_issues_exports_and_assistant_summary(settings, tmp_p workflow_type="regulatory_review", workflow_batch_id=batch.pk, ).count() == 3 + assert RegulatoryArtifact.objects.filter(batch=batch, name="text_extract_status.json").exists() + assert RegulatoryArtifact.objects.filter(batch=batch, name="rag_result_json.json").exists() assert conversation.messages.filter(role=Message.Role.ASSISTANT, content__contains="已完成 NMPA").exists()