feat(regulatory): 对齐附件4目录核查规则

2026-06-07 09:27:42 +08:00
parent bbd2d3532a
commit 1bdc7322cf
15 changed files with 753 additions and 43 deletions
--- a/review_agent/regulatory_review/services/text_extract.py
+++ b/review_agent/regulatory_review/services/text_extract.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import hashlib
+import re
 from dataclasses import dataclass
 from pathlib import Path

@@ -14,6 +15,9 @@ class ExtractedText:
    status: str
    content_hash: str = ""
    error_message: str = ""
+    front_text: str = ""
+    section_candidates: list[str] | None = None
+    field_candidates: dict[str, str] | None = None


 SUPPORTED_EXTENSIONS = {".txt", ".md", ".pdf", ".docx", ".pptx", ".xlsx", ".doc"}
@@ -26,6 +30,47 @@ def extract_text(path: str | Path) -> ExtractedText:
    try:
        text = extract_text_from_path(file_path)
    except Exception as exc:
-        return ExtractedText(path=file_path, text="", status="failed", error_message=str(exc))
+        return ExtractedText(
+            path=file_path,
+            text="",
+            status="failed",
+            error_message=str(exc),
+            section_candidates=[],
+            field_candidates={},
+        )
    content_hash = hashlib.sha256(text.encode("utf-8")).hexdigest() if text else ""
-    return ExtractedText(path=file_path, text=text, status="success", content_hash=content_hash)
+    return ExtractedText(
+        path=file_path,
+        text=text,
+        status="success",
+        content_hash=content_hash,
+        front_text=_front_text(text),
+        section_candidates=_section_candidates(text),
+        field_candidates=_field_candidates(text),
+    )
+
+
+def _front_text(text: str, limit: int = 1200) -> str:
+    return text[0:limit]  # leading `limit` characters of text (all of it when text is shorter)
+
+
+def _section_candidates(text: str) -> list[str]:
+    """Collect up to 80 heading-like lines from text, in order, each stripped and cut to 120 characters."""
+    numbered = r"^([一二三四五六七八九十]+[、.．]|[0-9]+(\.[0-9]+)*[、.．\s])"
+    keywords = ["章节目录", "监管信息", "综述资料", "非临床资料", "临床评价资料", "质量管理体系"]
+    found = []
+    for raw in text.splitlines():
+        stripped = raw.strip()
+        # Keep non-blank lines that open with a Chinese/Arabic section number or contain one of the keywords.
+        if stripped and (re.match(numbered, stripped) or any(word in stripped for word in keywords)):
+            found.append(stripped[:120])
+    return found[:80]
+
+
+def _field_candidates(text: str) -> dict[str, str]:
+    """Return {label: value} for each known form label found in text; values are whitespace-collapsed."""
+    labels = ("产品名称", "型号规格", "预期用途", "管理类别", "分类编码", "注册类型", "临床评价路径")
+    # \s* may cross a line break to reach the value, so reject a "value" that is really the next "<label>:" line.
+    tail = rf"[:：]\s*(?!(?:{'|'.join(labels)})[:：])(\S[^\n\r]*)"
+    hits = ((label, re.search(label + tail, text)) for label in labels)
+    return {label: " ".join(hit.group(1).split()) for label, hit in hits if hit}