feat(regulatory): 对齐附件4目录核查规则
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
@@ -14,6 +15,9 @@ class ExtractedText:
|
||||
status: str
|
||||
content_hash: str = ""
|
||||
error_message: str = ""
|
||||
front_text: str = ""
|
||||
section_candidates: list[str] | None = None
|
||||
field_candidates: dict[str, str] | None = None
|
||||
|
||||
|
||||
SUPPORTED_EXTENSIONS = {".txt", ".md", ".pdf", ".docx", ".pptx", ".xlsx", ".doc"}
|
||||
@@ -26,6 +30,47 @@ def extract_text(path: str | Path) -> ExtractedText:
|
||||
try:
|
||||
text = extract_text_from_path(file_path)
|
||||
except Exception as exc:
|
||||
return ExtractedText(path=file_path, text="", status="failed", error_message=str(exc))
|
||||
return ExtractedText(
|
||||
path=file_path,
|
||||
text="",
|
||||
status="failed",
|
||||
error_message=str(exc),
|
||||
section_candidates=[],
|
||||
field_candidates={},
|
||||
)
|
||||
content_hash = hashlib.sha256(text.encode("utf-8")).hexdigest() if text else ""
|
||||
return ExtractedText(path=file_path, text=text, status="success", content_hash=content_hash)
|
||||
return ExtractedText(
|
||||
path=file_path,
|
||||
text=text,
|
||||
status="success",
|
||||
content_hash=content_hash,
|
||||
front_text=_front_text(text),
|
||||
section_candidates=_section_candidates(text),
|
||||
field_candidates=_field_candidates(text),
|
||||
)
|
||||
|
||||
|
||||
def _front_text(text: str, limit: int = 1200) -> str:
|
||||
return text[:limit]
|
||||
|
||||
|
||||
def _section_candidates(text: str) -> list[str]:
|
||||
candidates = []
|
||||
for line in text.splitlines():
|
||||
normalized = line.strip()
|
||||
if not normalized:
|
||||
continue
|
||||
if re.match(r"^([一二三四五六七八九十]+[、..]|[0-9]+(\.[0-9]+)*[、..\s])", normalized):
|
||||
candidates.append(normalized[:120])
|
||||
elif any(keyword in normalized for keyword in ["章节目录", "监管信息", "综述资料", "非临床资料", "临床评价资料", "质量管理体系"]):
|
||||
candidates.append(normalized[:120])
|
||||
return candidates[:80]
|
||||
|
||||
|
||||
def _field_candidates(text: str) -> dict[str, str]:
|
||||
fields = {}
|
||||
for label in ["产品名称", "型号规格", "预期用途", "管理类别", "分类编码", "注册类型", "临床评价路径"]:
|
||||
match = re.search(rf"{label}[::]\s*([^\n\r]+)", text)
|
||||
if match:
|
||||
fields[label] = " ".join(match.group(1).strip().split())
|
||||
return fields
|
||||
|
||||
Reference in New Issue
Block a user