feat(regulatory): 对齐附件4目录核查规则

This commit is contained in:
2026-06-07 09:27:42 +08:00
parent bbd2d3532a
commit 1bdc7322cf
15 changed files with 753 additions and 43 deletions

View File

@@ -8,12 +8,17 @@ def run_completeness_check(batch: FileSummaryBatch, rule_set: dict) -> list[Find
items = list(batch.items.order_by("file_index"))
findings: list[Finding] = []
for requirement in rule_set.get("requirements", []):
if requirement.get("type") not in {"required", "conditional", "recommended"}:
if requirement.get("type") not in {"required", "conditional", "recommended", "chapter", "directory"}:
continue
matched = [
item
for item in items
if _matches_item(item.file_name, item.relative_path, requirement.get("file_keywords", []))
if _matches_item(
item.file_name,
item.relative_path,
item.directory_level,
[*requirement.get("file_keywords", []), *requirement.get("aliases", [])],
)
]
if matched:
continue
@@ -29,12 +34,13 @@ def run_completeness_check(batch: FileSummaryBatch, rule_set: dict) -> list[Find
"requirement_type": requirement.get("type"),
"matched_files": [],
"searched_keywords": requirement.get("file_keywords", []),
"searched_fields": ["file_name", "relative_path", "directory_level"],
},
)
)
return findings
def _matches_item(file_name: str, relative_path: str, keywords: list[str]) -> bool:
haystack = f"{file_name} {relative_path}".lower()
def _matches_item(file_name: str, relative_path: str, directory_level: str, keywords: list[str]) -> bool:
haystack = f"{file_name} {relative_path} {directory_level}".lower()
return any(str(keyword).lower() in haystack for keyword in keywords)

View File

@@ -10,6 +10,10 @@ FIELDS = {
"产品名称": r"产品名称[:]\s*([^\n\r]+)",
"型号规格": r"型号规格[:]\s*([^\n\r]+)",
"预期用途": r"预期用途[:]\s*([^\n\r]+)",
"管理类别": r"管理类别[:]\s*([^\n\r]+)",
"分类编码": r"分类编码[:]\s*([^\n\r]+)",
"注册类型": r"注册类型[:]\s*([^\n\r]+)",
"临床评价路径": r"临床评价路径[:]\s*([^\n\r]+)",
}

View File

@@ -107,12 +107,19 @@ def collect_source_chunks(source_dir: Path) -> list[TextChunk]:
try:
text = extract_text_from_path(path)
except RuntimeError as exc:
if _is_attachment4(path):
raise RuntimeError(f"附件 4 核心法规材料抽取失败:{path.name}") from exc
logger.warning("Regulatory source extraction skipped", extra={"path": str(path), "error": str(exc)})
continue
chunks.extend(chunk_text(text, source=str(path.relative_to(source_dir))))
return chunks
def _is_attachment4(path: Path) -> bool:
normalized = path.name.replace(" ", "")
return "附件4" in normalized and "体外诊断试剂注册申报资料要求及说明" in normalized
def build_chroma_index(
*,
source_dir: Path,

View File

@@ -47,9 +47,30 @@ def load_rule_file(path: str | Path | None = None) -> dict:
raise ValueError(f"规则 code 必须为 {DEFAULT_RULE_CODE}")
if not isinstance(payload.get("requirements"), list) or not payload["requirements"]:
raise ValueError("规则文件必须包含 requirements 列表。")
_validate_attachment4_requirements(payload)
return payload
def _validate_attachment4_requirements(payload: dict) -> None:
    """Check that Attachment 4 rules are complete and cover every required code.

    Every requirement carrying an ``attachment4_code`` must have a truthy value
    for each mandatory field, and every code listed under
    ``attachment4_required_codes`` must be claimed by at least one requirement.

    Args:
        payload: Parsed rule file; ``requirements`` and
            ``attachment4_required_codes`` are read from it.

    Raises:
        ValueError: If a coded requirement lacks a mandatory field, or if a
            required code has no requirement.
    """
    mandatory_fields = ("code", "rule_id", "title", "severity", "file_keywords", "citation_query")
    entries = payload.get("requirements") or []
    expected_codes = {str(code) for code in payload.get("attachment4_required_codes") or []}

    covered_codes: set[str] = set()
    for entry in entries:
        entry_code = entry.get("attachment4_code")
        if not entry_code:
            # Requirements without an Attachment 4 code are not validated here.
            continue
        covered_codes.add(str(entry_code))
        for field_name in mandatory_fields:
            if not entry.get(field_name):
                raise ValueError(f"附件4规则 {entry_code} 缺少 {field_name}")

    uncovered = sorted(expected_codes - covered_codes, key=_attachment4_sort_key)
    if uncovered:
        raise ValueError(f"附件4目录项缺少规则{', '.join(uncovered)}")
def _attachment4_sort_key(value: str) -> tuple[int, ...]:
return tuple(int(part) for part in value.split(".") if part.isdigit())
def check_rule_version(
*,
path: str | Path | None = None,

View File

@@ -5,7 +5,27 @@ from review_agent.regulatory_review.schemas import Finding
def run_structure_check(document_texts: dict[str, str], rule_set: dict) -> list[Finding]:
findings: list[Finding] = []
combined_all_text = "\n".join(document_texts.values())
for requirement in rule_set.get("requirements", []):
if requirement.get("structure_required") and not _contains_any(
combined_all_text,
[requirement.get("title", ""), *requirement.get("aliases", [])],
):
findings.append(
Finding(
rule_code=requirement["code"],
category="structure",
severity=requirement.get("severity", "medium"),
title=f"申报资料目录缺少{requirement['title']}章节",
detail=f"未在申报资料目录或章节标题候选中发现{requirement['title']}",
suggestion=requirement.get("suggestion", ""),
evidence={
"attachment4_code": requirement.get("attachment4_code"),
"expected_title": requirement["title"],
"aliases": requirement.get("aliases", []),
},
)
)
required_sections = requirement.get("required_sections") or []
if not required_sections:
continue
@@ -14,7 +34,7 @@ def run_structure_check(document_texts: dict[str, str], rule_set: dict) -> list[
continue
combined_text = "\n".join(matching_docs.values())
for section in required_sections:
if section in combined_text:
if _contains_any(combined_text, [section]):
continue
findings.append(
Finding(
@@ -39,3 +59,12 @@ def _matching_documents(document_texts: dict[str, str], keywords: list[str]) ->
if any(str(keyword).lower() in haystack for keyword in keywords):
result[name] = text
return result
def _contains_any(text: str, needles: list[str]) -> bool:
    """Return True when any needle occurs in ``text`` after normalization.

    Both the text and each needle are passed through ``_normalize_title``
    before the substring test.

    Args:
        text: Text to search in.
        needles: Candidate titles or aliases; falsy entries are ignored.

    Returns:
        True if at least one needle with non-empty normalized form is found.
    """
    haystack = _normalize_title(text)
    normalized_needles = (_normalize_title(needle) for needle in needles if needle)
    # A needle can be truthy yet normalize to "" (for example a blank string);
    # "" is a substring of everything, so such needles must not count as a hit.
    return any(candidate and candidate in haystack for candidate in normalized_needles)
def _normalize_title(value: str) -> str:
return "".join(str(value).lower().replace("/", "").replace("", "").split())

View File

@@ -1,6 +1,7 @@
from __future__ import annotations
import hashlib
import re
from dataclasses import dataclass
from pathlib import Path
@@ -14,6 +15,9 @@ class ExtractedText:
status: str
content_hash: str = ""
error_message: str = ""
front_text: str = ""
section_candidates: list[str] | None = None
field_candidates: dict[str, str] | None = None
SUPPORTED_EXTENSIONS = {".txt", ".md", ".pdf", ".docx", ".pptx", ".xlsx", ".doc"}
@@ -26,6 +30,47 @@ def extract_text(path: str | Path) -> ExtractedText:
try:
text = extract_text_from_path(file_path)
except Exception as exc:
return ExtractedText(path=file_path, text="", status="failed", error_message=str(exc))
return ExtractedText(
path=file_path,
text="",
status="failed",
error_message=str(exc),
section_candidates=[],
field_candidates={},
)
content_hash = hashlib.sha256(text.encode("utf-8")).hexdigest() if text else ""
return ExtractedText(path=file_path, text=text, status="success", content_hash=content_hash)
return ExtractedText(
path=file_path,
text=text,
status="success",
content_hash=content_hash,
front_text=_front_text(text),
section_candidates=_section_candidates(text),
field_candidates=_field_candidates(text),
)
def _front_text(text: str, limit: int = 1200) -> str:
return text[:limit]
def _section_candidates(text: str) -> list[str]:
candidates = []
for line in text.splitlines():
normalized = line.strip()
if not normalized:
continue
if re.match(r"^([一二三四五六七八九十]+[、.]|[0-9]+(\.[0-9]+)*[、.\s])", normalized):
candidates.append(normalized[:120])
elif any(keyword in normalized for keyword in ["章节目录", "监管信息", "综述资料", "非临床资料", "临床评价资料", "质量管理体系"]):
candidates.append(normalized[:120])
return candidates[:80]
def _field_candidates(text: str) -> dict[str, str]:
fields = {}
for label in ["产品名称", "型号规格", "预期用途", "管理类别", "分类编码", "注册类型", "临床评价路径"]:
match = re.search(rf"{label}[:]\s*([^\n\r]+)", text)
if match:
fields[label] = " ".join(match.group(1).strip().split())
return fields