fix(regulatory): 修复换行产品名称提取不全

2026-06-07 11:30:48 +08:00
parent 72f18167c5
commit a34684e490
3 changed files with 79 additions and 4 deletions
--- a/review_agent/regulatory_review/services/text_extract.py
+++ b/review_agent/regulatory_review/services/text_extract.py
@@ -21,6 +21,7 @@ class ExtractedText:


 SUPPORTED_EXTENSIONS = {".txt", ".md", ".pdf", ".docx", ".pptx", ".xlsx", ".doc"}
+FIELD_LABELS = ["产品名称", "型号规格", "预期用途", "管理类别", "分类编码", "注册类型", "临床评价路径"]  # labels that open a field line


 def extract_text(path: str | Path) -> ExtractedText:
@@ -69,8 +70,65 @@ def _section_candidates(text: str) -> list[str]:

 def _field_candidates(text: str) -> dict[str, str]:
+    """Collect labelled form fields from ``text``, joining wrapped values.
+
+    A field starts on a line that begins with one of ``FIELD_LABELS`` followed
+    by an ASCII or full-width colon. Its value is the rest of that line plus
+    every following line up to the first blank line, the next ``label:`` line,
+    or a line that looks like a section heading.
+
+    Args:
+        text: Text to scan.
+
+    Returns:
+        Mapping of label to value with whitespace runs collapsed to single
+        spaces. For each label the first occurrence with a non-empty value
+        wins; labels that never get a value are omitted.
+    """
    fields = {}
-    for label in ["产品名称", "型号规格", "预期用途", "管理类别", "分类编码", "注册类型", "临床评价路径"]:
-        match = re.search(rf"{label}[:：]\s*([^\n\r]+)", text)
-        if match:
-            fields[label] = " ".join(match.group(1).strip().split())
+    # A single alternation replaces one regex build per label per line; the
+    # first captured group tells us which label matched.
+    alternation = "|".join(re.escape(label) for label in FIELD_LABELS)
+    field_line = re.compile(rf"^({alternation})[:：]\s*(.*)$")
+    lines = text.splitlines()
+    for index, line in enumerate(lines):
+        match = field_line.match(line.strip())
+        if not match or match.group(1) in fields:
+            continue
+        label, first_part = match.groups()
+        value_parts = [first_part]
+        for next_line in lines[index + 1 :]:
+            continuation = next_line.strip()
+            if not continuation or _starts_field_line(continuation) or _looks_like_section_heading(continuation):
+                break
+            value_parts.append(continuation)
+        # split()/join drops an empty first part and collapses inner whitespace.
+        value = " ".join(" ".join(value_parts).split())
+        if value:
+            fields[label] = value
    return fields
+
+
+def _starts_field_line(line: str) -> bool:
+    """Return True when ``line`` opens a new ``label:`` field.
+
+    Every entry of ``FIELD_LABELS`` counts. Any other line starting with 2-24
+    characters that are neither whitespace nor a colon, followed by an ASCII or
+    full-width colon, is also treated as a field start, so labels missing from
+    ``FIELD_LABELS`` still end the previous value.
+    """
+    if any(line.startswith((f"{label}:", f"{label}：")) for label in FIELD_LABELS):
+        return True
+    return bool(re.match(r"^[^\s:：]{2,24}[:：]", line))
+
+
+def _looks_like_section_heading(line: str) -> bool:
+    """Return True when ``line`` starts with a section number.
+
+    Recognised prefixes are Chinese numerals followed by ``、``, ``.`` or ``．``
+    (``一、``) and Arabic, optionally dotted, numbers followed by ``、``, a dot
+    or whitespace (``1.``, ``1.2 ``). A dot directly followed by a digit is part
+    of a decimal, so wrapped values such as ``0.5mL`` are not mistaken for a
+    heading.
+    """
+    pattern = r"^(?:[一二三四五六七八九十]+[、.．]|[0-9]+(?:\.[0-9]+)*(?:、|[.．](?![0-9])|\s))"
+    return bool(re.match(pattern, line))
--- a/tests/test_regulatory_condition.py
+++ b/tests/test_regulatory_condition.py
@@ -82,6 +82,41 @@ def test_detect_regulatory_condition_prefers_attachment_fields_over_chapter_titl
    assert candidates["intended_use"]["suggested"] == "用于人血清中甲胎蛋白检测"


+def test_detect_regulatory_condition_keeps_wrapped_product_name(settings, tmp_path, django_user_model):
+    """The suggested product name keeps the part wrapped onto a second line."""
+    settings.MEDIA_ROOT = tmp_path
+    form_path = tmp_path / "application.txt"
+    form_path.write_text(
+        "产品名称：呼吸道合胞病毒、肺炎支原体核酸检测试剂盒\n"
+        "（荧光PCR法）\n"
+        "型号规格：24人份/盒\n"
+        "预期用途：用于呼吸道合胞病毒、肺炎支原体核酸检测\n",
+        encoding="utf-8",
+    )
+    owner = django_user_model.objects.create_user(username="owner", password="pass")
+    batch = FileSummaryBatch.objects.create(
+        conversation=Conversation.objects.create(user=owner, title="会话"),
+        user=owner,
+        batch_no="FS-COND",
+        status=FileSummaryBatch.Status.SUCCESS,
+        product_name="第1章 监管信息",
+    )
+    FileSummaryItem.objects.create(
+        batch=batch,
+        file_index=1,
+        directory_level="1. 监管信息 / 1.2 申请表",
+        file_name="申请表.txt",
+        file_type="txt",
+        relative_path="1.监管信息/申请表.txt",
+        storage_path=str(form_path),
+    )
+
+    candidates = detect_regulatory_condition_candidates(batch)
+
+    assert candidates["product_name"]["suggested"] == "呼吸道合胞病毒、肺炎支原体核酸检测试剂盒 （荧光PCR法）"
+    assert candidates["model_spec"]["suggested"] == "24人份/盒"
+
+
 def test_workflow_pauses_before_rule_scope_until_conditions_confirmed(settings, tmp_path, django_user_model):
    settings.MEDIA_ROOT = tmp_path
    user = django_user_model.objects.create_user(username="owner", password="pass")
--- a/tests/test_regulatory_text_extract.py
+++ b/tests/test_regulatory_text_extract.py
@@ -14,6 +14,21 @@ def test_extract_text_reads_plain_text(tmp_path):
    assert result.content_hash


+def test_extract_text_keeps_wrapped_product_name(tmp_path):
+    """A product name wrapped onto a second line is extracted as one value."""
+    wrapped_form = (
+        "产品名称：呼吸道合胞病毒、肺炎支原体核酸检测试剂盒\n"
+        "（荧光PCR法）\n"
+        "型号规格：24人份/盒\n"
+    )
+    source_file = tmp_path / "申请表.txt"
+    source_file.write_text(wrapped_form, encoding="utf-8")
+
+    candidates = extract_text(source_file).field_candidates
+    assert candidates["产品名称"] == "呼吸道合胞病毒、肺炎支原体核酸检测试剂盒 （荧光PCR法）"
+    assert candidates["型号规格"] == "24人份/盒"
+
+
 def test_extract_text_reports_unsupported_file(tmp_path):
    path = tmp_path / "image.png"
    path.write_bytes(b"png")