feat(regulatory): 对齐附件4目录核查规则
This commit is contained in:
@@ -8,12 +8,17 @@ def run_completeness_check(batch: FileSummaryBatch, rule_set: dict) -> list[Find
|
||||
items = list(batch.items.order_by("file_index"))
|
||||
findings: list[Finding] = []
|
||||
for requirement in rule_set.get("requirements", []):
|
||||
if requirement.get("type") not in {"required", "conditional", "recommended"}:
|
||||
if requirement.get("type") not in {"required", "conditional", "recommended", "chapter", "directory"}:
|
||||
continue
|
||||
matched = [
|
||||
item
|
||||
for item in items
|
||||
if _matches_item(item.file_name, item.relative_path, requirement.get("file_keywords", []))
|
||||
if _matches_item(
|
||||
item.file_name,
|
||||
item.relative_path,
|
||||
item.directory_level,
|
||||
[*requirement.get("file_keywords", []), *requirement.get("aliases", [])],
|
||||
)
|
||||
]
|
||||
if matched:
|
||||
continue
|
||||
@@ -29,12 +34,13 @@ def run_completeness_check(batch: FileSummaryBatch, rule_set: dict) -> list[Find
|
||||
"requirement_type": requirement.get("type"),
|
||||
"matched_files": [],
|
||||
"searched_keywords": requirement.get("file_keywords", []),
|
||||
"searched_fields": ["file_name", "relative_path", "directory_level"],
|
||||
},
|
||||
)
|
||||
)
|
||||
return findings
|
||||
|
||||
|
||||
def _matches_item(file_name: str, relative_path: str, keywords: list[str]) -> bool:
|
||||
haystack = f"{file_name} {relative_path}".lower()
|
||||
def _matches_item(file_name: str, relative_path: str, directory_level: str, keywords: list[str]) -> bool:
|
||||
haystack = f"{file_name} {relative_path} {directory_level}".lower()
|
||||
return any(str(keyword).lower() in haystack for keyword in keywords)
|
||||
|
||||
@@ -10,6 +10,10 @@ FIELDS = {
|
||||
"产品名称": r"产品名称[::]\s*([^\n\r]+)",
|
||||
"型号规格": r"型号规格[::]\s*([^\n\r]+)",
|
||||
"预期用途": r"预期用途[::]\s*([^\n\r]+)",
|
||||
"管理类别": r"管理类别[::]\s*([^\n\r]+)",
|
||||
"分类编码": r"分类编码[::]\s*([^\n\r]+)",
|
||||
"注册类型": r"注册类型[::]\s*([^\n\r]+)",
|
||||
"临床评价路径": r"临床评价路径[::]\s*([^\n\r]+)",
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -107,12 +107,19 @@ def collect_source_chunks(source_dir: Path) -> list[TextChunk]:
|
||||
try:
|
||||
text = extract_text_from_path(path)
|
||||
except RuntimeError as exc:
|
||||
if _is_attachment4(path):
|
||||
raise RuntimeError(f"附件 4 核心法规材料抽取失败:{path.name}") from exc
|
||||
logger.warning("Regulatory source extraction skipped", extra={"path": str(path), "error": str(exc)})
|
||||
continue
|
||||
chunks.extend(chunk_text(text, source=str(path.relative_to(source_dir))))
|
||||
return chunks
|
||||
|
||||
|
||||
def _is_attachment4(path: Path) -> bool:
|
||||
normalized = path.name.replace(" ", "")
|
||||
return "附件4" in normalized and "体外诊断试剂注册申报资料要求及说明" in normalized
|
||||
|
||||
|
||||
def build_chroma_index(
|
||||
*,
|
||||
source_dir: Path,
|
||||
|
||||
@@ -47,9 +47,30 @@ def load_rule_file(path: str | Path | None = None) -> dict:
|
||||
raise ValueError(f"规则 code 必须为 {DEFAULT_RULE_CODE}")
|
||||
if not isinstance(payload.get("requirements"), list) or not payload["requirements"]:
|
||||
raise ValueError("规则文件必须包含 requirements 列表。")
|
||||
_validate_attachment4_requirements(payload)
|
||||
return payload
|
||||
|
||||
|
||||
def _validate_attachment4_requirements(payload: dict) -> None:
|
||||
requirements = payload.get("requirements") or []
|
||||
required_codes = {str(code) for code in payload.get("attachment4_required_codes") or []}
|
||||
by_attachment4_code: dict[str, list[dict]] = {}
|
||||
for requirement in requirements:
|
||||
attachment4_code = requirement.get("attachment4_code")
|
||||
if attachment4_code:
|
||||
by_attachment4_code.setdefault(str(attachment4_code), []).append(requirement)
|
||||
for field in ["code", "rule_id", "title", "severity", "file_keywords", "citation_query"]:
|
||||
if attachment4_code and not requirement.get(field):
|
||||
raise ValueError(f"附件4规则 {attachment4_code} 缺少 {field}")
|
||||
missing = sorted(required_codes - set(by_attachment4_code), key=_attachment4_sort_key)
|
||||
if missing:
|
||||
raise ValueError(f"附件4目录项缺少规则:{', '.join(missing)}")
|
||||
|
||||
|
||||
def _attachment4_sort_key(value: str) -> tuple[int, ...]:
|
||||
return tuple(int(part) for part in value.split(".") if part.isdigit())
|
||||
|
||||
|
||||
def check_rule_version(
|
||||
*,
|
||||
path: str | Path | None = None,
|
||||
|
||||
@@ -5,7 +5,27 @@ from review_agent.regulatory_review.schemas import Finding
|
||||
|
||||
def run_structure_check(document_texts: dict[str, str], rule_set: dict) -> list[Finding]:
|
||||
findings: list[Finding] = []
|
||||
combined_all_text = "\n".join(document_texts.values())
|
||||
for requirement in rule_set.get("requirements", []):
|
||||
if requirement.get("structure_required") and not _contains_any(
|
||||
combined_all_text,
|
||||
[requirement.get("title", ""), *requirement.get("aliases", [])],
|
||||
):
|
||||
findings.append(
|
||||
Finding(
|
||||
rule_code=requirement["code"],
|
||||
category="structure",
|
||||
severity=requirement.get("severity", "medium"),
|
||||
title=f"申报资料目录缺少{requirement['title']}章节",
|
||||
detail=f"未在申报资料目录或章节标题候选中发现{requirement['title']}。",
|
||||
suggestion=requirement.get("suggestion", ""),
|
||||
evidence={
|
||||
"attachment4_code": requirement.get("attachment4_code"),
|
||||
"expected_title": requirement["title"],
|
||||
"aliases": requirement.get("aliases", []),
|
||||
},
|
||||
)
|
||||
)
|
||||
required_sections = requirement.get("required_sections") or []
|
||||
if not required_sections:
|
||||
continue
|
||||
@@ -14,7 +34,7 @@ def run_structure_check(document_texts: dict[str, str], rule_set: dict) -> list[
|
||||
continue
|
||||
combined_text = "\n".join(matching_docs.values())
|
||||
for section in required_sections:
|
||||
if section in combined_text:
|
||||
if _contains_any(combined_text, [section]):
|
||||
continue
|
||||
findings.append(
|
||||
Finding(
|
||||
@@ -39,3 +59,12 @@ def _matching_documents(document_texts: dict[str, str], keywords: list[str]) ->
|
||||
if any(str(keyword).lower() in haystack for keyword in keywords):
|
||||
result[name] = text
|
||||
return result
|
||||
|
||||
|
||||
def _contains_any(text: str, needles: list[str]) -> bool:
    """Return True when any needle occurs in ``text`` after normalization.

    Both the text and each needle are passed through ``_normalize_title``
    before the substring test.

    Args:
        text: Text to search in.
        needles: Candidate titles or aliases; falsy entries are ignored.

    Returns:
        True if at least one needle, once normalized, is a non-empty
        substring of the normalized text.
    """
    haystack = _normalize_title(text)
    normalized_needles = (_normalize_title(needle) for needle in needles if needle)
    # A needle can normalize to "" (for example " " or "/"); the empty string
    # is a substring of everything, so such needles must not count as a hit.
    return any(candidate in haystack for candidate in normalized_needles if candidate)
|
||||
|
||||
|
||||
def _normalize_title(value: str) -> str:
|
||||
return "".join(str(value).lower().replace("/", "").replace("/", "").split())
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
@@ -14,6 +15,9 @@ class ExtractedText:
|
||||
status: str
|
||||
content_hash: str = ""
|
||||
error_message: str = ""
|
||||
front_text: str = ""
|
||||
section_candidates: list[str] | None = None
|
||||
field_candidates: dict[str, str] | None = None
|
||||
|
||||
|
||||
SUPPORTED_EXTENSIONS = {".txt", ".md", ".pdf", ".docx", ".pptx", ".xlsx", ".doc"}
|
||||
@@ -26,6 +30,47 @@ def extract_text(path: str | Path) -> ExtractedText:
|
||||
try:
|
||||
text = extract_text_from_path(file_path)
|
||||
except Exception as exc:
|
||||
return ExtractedText(path=file_path, text="", status="failed", error_message=str(exc))
|
||||
return ExtractedText(
|
||||
path=file_path,
|
||||
text="",
|
||||
status="failed",
|
||||
error_message=str(exc),
|
||||
section_candidates=[],
|
||||
field_candidates={},
|
||||
)
|
||||
content_hash = hashlib.sha256(text.encode("utf-8")).hexdigest() if text else ""
|
||||
return ExtractedText(path=file_path, text=text, status="success", content_hash=content_hash)
|
||||
return ExtractedText(
|
||||
path=file_path,
|
||||
text=text,
|
||||
status="success",
|
||||
content_hash=content_hash,
|
||||
front_text=_front_text(text),
|
||||
section_candidates=_section_candidates(text),
|
||||
field_candidates=_field_candidates(text),
|
||||
)
|
||||
|
||||
|
||||
def _front_text(text: str, limit: int = 1200) -> str:
|
||||
return text[:limit]
|
||||
|
||||
|
||||
def _section_candidates(text: str) -> list[str]:
|
||||
candidates = []
|
||||
for line in text.splitlines():
|
||||
normalized = line.strip()
|
||||
if not normalized:
|
||||
continue
|
||||
if re.match(r"^([一二三四五六七八九十]+[、..]|[0-9]+(\.[0-9]+)*[、..\s])", normalized):
|
||||
candidates.append(normalized[:120])
|
||||
elif any(keyword in normalized for keyword in ["章节目录", "监管信息", "综述资料", "非临床资料", "临床评价资料", "质量管理体系"]):
|
||||
candidates.append(normalized[:120])
|
||||
return candidates[:80]
|
||||
|
||||
|
||||
def _field_candidates(text: str) -> dict[str, str]:
|
||||
fields = {}
|
||||
for label in ["产品名称", "型号规格", "预期用途", "管理类别", "分类编码", "注册类型", "临床评价路径"]:
|
||||
match = re.search(rf"{label}[::]\s*([^\n\r]+)", text)
|
||||
if match:
|
||||
fields[label] = " ".join(match.group(1).strip().split())
|
||||
return fields
|
||||
|
||||
Reference in New Issue
Block a user