feat(regulatory): 对齐附件4目录核查规则

2026-06-07 09:27:42 +08:00
parent bbd2d3532a
commit 1bdc7322cf
15 changed files with 753 additions and 43 deletions
--- a/review_agent/regulatory_review/services/completeness_check.py
+++ b/review_agent/regulatory_review/services/completeness_check.py
@@ -8,12 +8,17 @@ def run_completeness_check(batch: FileSummaryBatch, rule_set: dict) -> list[Find
    items = list(batch.items.order_by("file_index"))
    findings: list[Finding] = []
    for requirement in rule_set.get("requirements", []):
-        if requirement.get("type") not in {"required", "conditional", "recommended"}:
+        if requirement.get("type") not in {"required", "conditional", "recommended", "chapter", "directory"}:
            continue
        matched = [
            item
            for item in items
-            if _matches_item(item.file_name, item.relative_path, requirement.get("file_keywords", []))
+            if _matches_item(
+                item.file_name,
+                item.relative_path,
+                item.directory_level,
+                [*requirement.get("file_keywords", []), *requirement.get("aliases", [])],
+            )
        ]
        if matched:
            continue
@@ -29,12 +34,13 @@ def run_completeness_check(batch: FileSummaryBatch, rule_set: dict) -> list[Find
                    "requirement_type": requirement.get("type"),
                    "matched_files": [],
                    "searched_keywords": requirement.get("file_keywords", []),
+                    "searched_fields": ["file_name", "relative_path", "directory_level"],
                },
            )
        )
    return findings


-def _matches_item(file_name: str, relative_path: str, keywords: list[str]) -> bool:
-    haystack = f"{file_name} {relative_path}".lower()
+def _matches_item(file_name: str, relative_path: str, directory_level: str, keywords: list[str]) -> bool:
+    haystack = f"{file_name} {relative_path} {directory_level}".lower()
    return any(str(keyword).lower() in haystack for keyword in keywords)
--- a/review_agent/regulatory_review/services/consistency_check.py
+++ b/review_agent/regulatory_review/services/consistency_check.py
@@ -10,6 +10,10 @@ FIELDS = {
    "产品名称": r"产品名称[:：]\s*([^\n\r]+)",
    "型号规格": r"型号规格[:：]\s*([^\n\r]+)",
    "预期用途": r"预期用途[:：]\s*([^\n\r]+)",
+    "管理类别": r"管理类别[:：]\s*([^\n\r]+)",
+    "分类编码": r"分类编码[:：]\s*([^\n\r]+)",
+    "注册类型": r"注册类型[:：]\s*([^\n\r]+)",
+    "临床评价路径": r"临床评价路径[:：]\s*([^\n\r]+)",
 }


--- a/review_agent/regulatory_review/services/rag_index.py
+++ b/review_agent/regulatory_review/services/rag_index.py
@@ -107,12 +107,19 @@ def collect_source_chunks(source_dir: Path) -> list[TextChunk]:
        try:
            text = extract_text_from_path(path)
        except RuntimeError as exc:
+            if _is_attachment4(path):
+                raise RuntimeError(f"附件 4 核心法规材料抽取失败：{path.name}") from exc
            logger.warning("Regulatory source extraction skipped", extra={"path": str(path), "error": str(exc)})
            continue
        chunks.extend(chunk_text(text, source=str(path.relative_to(source_dir))))
    return chunks


+def _is_attachment4(path: Path) -> bool:
+    normalized = path.name.replace(" ", "")
+    return "附件4" in normalized and "体外诊断试剂注册申报资料要求及说明" in normalized
+
+
 def build_chroma_index(
    *,
    source_dir: Path,
--- a/review_agent/regulatory_review/services/rule_loader.py
+++ b/review_agent/regulatory_review/services/rule_loader.py
@@ -47,9 +47,30 @@ def load_rule_file(path: str | Path | None = None) -> dict:
        raise ValueError(f"规则 code 必须为 {DEFAULT_RULE_CODE}")
    if not isinstance(payload.get("requirements"), list) or not payload["requirements"]:
        raise ValueError("规则文件必须包含 requirements 列表。")
+    _validate_attachment4_requirements(payload)
    return payload


+def _validate_attachment4_requirements(payload: dict) -> None:
+    requirements = payload.get("requirements") or []
+    required_codes = {str(code) for code in payload.get("attachment4_required_codes") or []}
+    by_attachment4_code: dict[str, list[dict]] = {}
+    for requirement in requirements:
+        attachment4_code = requirement.get("attachment4_code")
+        if attachment4_code:
+            by_attachment4_code.setdefault(str(attachment4_code), []).append(requirement)
+        for field in ["code", "rule_id", "title", "severity", "file_keywords", "citation_query"]:
+            if attachment4_code and not requirement.get(field):
+                raise ValueError(f"附件4规则 {attachment4_code} 缺少 {field}")
+    missing = sorted(required_codes - set(by_attachment4_code), key=_attachment4_sort_key)
+    if missing:
+        raise ValueError(f"附件4目录项缺少规则：{', '.join(missing)}")
+
+
+def _attachment4_sort_key(value: str) -> tuple[int, ...]:
+    return tuple(int(part) for part in value.split(".") if part.isdigit())
+
+
 def check_rule_version(
    *,
    path: str | Path | None = None,
--- a/review_agent/regulatory_review/services/structure_check.py
+++ b/review_agent/regulatory_review/services/structure_check.py
@@ -5,7 +5,27 @@ from review_agent.regulatory_review.schemas import Finding

 def run_structure_check(document_texts: dict[str, str], rule_set: dict) -> list[Finding]:
    findings: list[Finding] = []
+    combined_all_text = "\n".join(document_texts.values())
    for requirement in rule_set.get("requirements", []):
+        if requirement.get("structure_required") and not _contains_any(
+            combined_all_text,
+            [requirement.get("title", ""), *requirement.get("aliases", [])],
+        ):
+            findings.append(
+                Finding(
+                    rule_code=requirement["code"],
+                    category="structure",
+                    severity=requirement.get("severity", "medium"),
+                    title=f"申报资料目录缺少{requirement['title']}章节",
+                    detail=f"未在申报资料目录或章节标题候选中发现{requirement['title']}。",
+                    suggestion=requirement.get("suggestion", ""),
+                    evidence={
+                        "attachment4_code": requirement.get("attachment4_code"),
+                        "expected_title": requirement["title"],
+                        "aliases": requirement.get("aliases", []),
+                    },
+                )
+            )
        required_sections = requirement.get("required_sections") or []
        if not required_sections:
            continue
@@ -14,7 +34,7 @@ def run_structure_check(document_texts: dict[str, str], rule_set: dict) -> list[
            continue
        combined_text = "\n".join(matching_docs.values())
        for section in required_sections:
-            if section in combined_text:
+            if _contains_any(combined_text, [section]):
                continue
            findings.append(
                Finding(
@@ -39,3 +59,12 @@ def _matching_documents(document_texts: dict[str, str], keywords: list[str]) ->
        if any(str(keyword).lower() in haystack for keyword in keywords):
            result[name] = text
    return result
+
+
+def _contains_any(text: str, needles: list[str]) -> bool:
+    normalized = _normalize_title(text)
+    return any(_normalize_title(needle) in normalized for needle in needles if needle)
+
+
+def _normalize_title(value: str) -> str:
+    return "".join(str(value).lower().replace("/", "").replace("／", "").split())
--- a/review_agent/regulatory_review/services/text_extract.py
+++ b/review_agent/regulatory_review/services/text_extract.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import hashlib
+import re
 from dataclasses import dataclass
 from pathlib import Path

@@ -14,6 +15,9 @@ class ExtractedText:
    status: str
    content_hash: str = ""
    error_message: str = ""
+    front_text: str = ""
+    section_candidates: list[str] | None = None
+    field_candidates: dict[str, str] | None = None


 SUPPORTED_EXTENSIONS = {".txt", ".md", ".pdf", ".docx", ".pptx", ".xlsx", ".doc"}
@@ -26,6 +30,47 @@ def extract_text(path: str | Path) -> ExtractedText:
    try:
        text = extract_text_from_path(file_path)
    except Exception as exc:
-        return ExtractedText(path=file_path, text="", status="failed", error_message=str(exc))
+        return ExtractedText(
+            path=file_path,
+            text="",
+            status="failed",
+            error_message=str(exc),
+            section_candidates=[],
+            field_candidates={},
+        )
    content_hash = hashlib.sha256(text.encode("utf-8")).hexdigest() if text else ""
-    return ExtractedText(path=file_path, text=text, status="success", content_hash=content_hash)
+    return ExtractedText(
+        path=file_path,
+        text=text,
+        status="success",
+        content_hash=content_hash,
+        front_text=_front_text(text),
+        section_candidates=_section_candidates(text),
+        field_candidates=_field_candidates(text),
+    )
+
+
+def _front_text(text: str, limit: int = 1200) -> str:
+    return text[:limit]
+
+
+def _section_candidates(text: str) -> list[str]:
+    candidates = []
+    for line in text.splitlines():
+        normalized = line.strip()
+        if not normalized:
+            continue
+        if re.match(r"^([一二三四五六七八九十]+[、.．]|[0-9]+(\.[0-9]+)*[、.．\s])", normalized):
+            candidates.append(normalized[:120])
+        elif any(keyword in normalized for keyword in ["章节目录", "监管信息", "综述资料", "非临床资料", "临床评价资料", "质量管理体系"]):
+            candidates.append(normalized[:120])
+    return candidates[:80]
+
+
+def _field_candidates(text: str) -> dict[str, str]:
+    fields = {}
+    for label in ["产品名称", "型号规格", "预期用途", "管理类别", "分类编码", "注册类型", "临床评价路径"]:
+        match = re.search(rf"{label}[:：]\s*([^\n\r]+)", text)
+        if match:
+            fields[label] = " ".join(match.group(1).strip().split())
+    return fields