feat(regulatory): 对齐附件4目录核查规则

This commit is contained in:
2026-06-07 09:27:42 +08:00
parent bbd2d3532a
commit 1bdc7322cf
15 changed files with 753 additions and 43 deletions

View File

@@ -1,6 +1,7 @@
from __future__ import annotations
import hashlib
import re
from dataclasses import dataclass
from pathlib import Path
@@ -14,6 +15,9 @@ class ExtractedText:
status: str
content_hash: str = ""
error_message: str = ""
front_text: str = ""
section_candidates: list[str] | None = None
field_candidates: dict[str, str] | None = None
# File suffixes (lower-case, leading dot included) that this module lists as supported.
# NOTE(review): presumably consulted by extract_text before extraction is attempted;
# the check itself is not visible in this hunk — confirm against the full file.
SUPPORTED_EXTENSIONS = {".txt", ".md", ".pdf", ".docx", ".pptx", ".xlsx", ".doc"}
@@ -26,6 +30,47 @@ def extract_text(path: str | Path) -> ExtractedText:
try:
text = extract_text_from_path(file_path)
except Exception as exc:
return ExtractedText(path=file_path, text="", status="failed", error_message=str(exc))
return ExtractedText(
path=file_path,
text="",
status="failed",
error_message=str(exc),
section_candidates=[],
field_candidates={},
)
content_hash = hashlib.sha256(text.encode("utf-8")).hexdigest() if text else ""
return ExtractedText(path=file_path, text=text, status="success", content_hash=content_hash)
return ExtractedText(
path=file_path,
text=text,
status="success",
content_hash=content_hash,
front_text=_front_text(text),
section_candidates=_section_candidates(text),
field_candidates=_field_candidates(text),
)
def _front_text(text: str, limit: int = 1200) -> str:
return text[:limit]
def _section_candidates(text: str) -> list[str]:
candidates = []
for line in text.splitlines():
normalized = line.strip()
if not normalized:
continue
if re.match(r"^([一二三四五六七八九十]+[、.]|[0-9]+(\.[0-9]+)*[、.\s])", normalized):
candidates.append(normalized[:120])
elif any(keyword in normalized for keyword in ["章节目录", "监管信息", "综述资料", "非临床资料", "临床评价资料", "质量管理体系"]):
candidates.append(normalized[:120])
return candidates[:80]
def _field_candidates(text: str) -> dict[str, str]:
fields = {}
for label in ["产品名称", "型号规格", "预期用途", "管理类别", "分类编码", "注册类型", "临床评价路径"]:
match = re.search(rf"{label}[:]\s*([^\n\r]+)", text)
if match:
fields[label] = " ".join(match.group(1).strip().split())
return fields