diff --git a/review_agent/regulatory_review/services/text_extract.py b/review_agent/regulatory_review/services/text_extract.py index bd8dfab..3b98e51 100644 --- a/review_agent/regulatory_review/services/text_extract.py +++ b/review_agent/regulatory_review/services/text_extract.py @@ -21,6 +21,7 @@ class ExtractedText: SUPPORTED_EXTENSIONS = {".txt", ".md", ".pdf", ".docx", ".pptx", ".xlsx", ".doc"} +FIELD_LABELS = ["产品名称", "型号规格", "预期用途", "管理类别", "分类编码", "注册类型", "临床评价路径"] def extract_text(path: str | Path) -> ExtractedText: @@ -69,8 +70,32 @@ def _section_candidates(text: str) -> list[str]: def _field_candidates(text: str) -> dict[str, str]: fields = {} - for label in ["产品名称", "型号规格", "预期用途", "管理类别", "分类编码", "注册类型", "临床评价路径"]: - match = re.search(rf"{label}[::]\s*([^\n\r]+)", text) - if match: - fields[label] = " ".join(match.group(1).strip().split()) + lines = text.splitlines() + for index, line in enumerate(lines): + normalized = line.strip() + if not normalized: + continue + for label in FIELD_LABELS: + match = re.match(rf"^{re.escape(label)}[::]\s*(.*)$", normalized) + if not match or label in fields: + continue + value_parts = [match.group(1).strip()] + for next_line in lines[index + 1 :]: + continuation = next_line.strip() + if not continuation or _starts_field_line(continuation) or _looks_like_section_heading(continuation): + break + value_parts.append(continuation) + value = " ".join(part for part in value_parts if part) + if value: + fields[label] = " ".join(value.split()) return fields + + +def _starts_field_line(line: str) -> bool: + if any(re.match(rf"^{re.escape(label)}[::]", line) for label in FIELD_LABELS): + return True + return bool(re.match(r"^[^\s::]{2,24}[::]", line)) + + +def _looks_like_section_heading(line: str) -> bool: + return bool(re.match(r"^([一二三四五六七八九十]+[、..]|[0-9]+(\.[0-9]+)*[、..\s])", line)) diff --git a/tests/test_regulatory_condition.py b/tests/test_regulatory_condition.py index d7c35f4..dfcbd28 100644 --- 
def test_detect_regulatory_condition_keeps_wrapped_product_name(settings, tmp_path, django_user_model):
    """A product name wrapped onto a second line is suggested as one joined value."""
    settings.MEDIA_ROOT = tmp_path
    owner = django_user_model.objects.create_user(username="owner", password="pass")
    chat = Conversation.objects.create(user=owner, title="会话")
    batch = FileSummaryBatch.objects.create(
        conversation=chat,
        user=owner,
        batch_no="FS-COND",
        status=FileSummaryBatch.Status.SUCCESS,
        product_name="第1章 监管信息",
    )

    # Application form whose product name continues on the following line.
    form_text = (
        "产品名称:呼吸道合胞病毒、肺炎支原体核酸检测试剂盒\n"
        "(荧光PCR法)\n"
        "型号规格:24人份/盒\n"
        "预期用途:用于呼吸道合胞病毒、肺炎支原体核酸检测\n"
    )
    form_path = tmp_path / "application.txt"
    form_path.write_text(form_text, encoding="utf-8")
    FileSummaryItem.objects.create(
        batch=batch,
        file_index=1,
        directory_level="1. 监管信息 / 1.2 申请表",
        file_name="申请表.txt",
        file_type="txt",
        relative_path="1.监管信息/申请表.txt",
        storage_path=str(form_path),
    )

    detected = detect_regulatory_condition_candidates(batch)

    assert detected["product_name"]["suggested"] == "呼吸道合胞病毒、肺炎支原体核酸检测试剂盒 (荧光PCR法)"
    assert detected["model_spec"]["suggested"] == "24人份/盒"


def test_extract_text_keeps_wrapped_product_name(tmp_path):
    """extract_text joins a wrapped product name and still reads the next field."""
    form_text = (
        "产品名称:呼吸道合胞病毒、肺炎支原体核酸检测试剂盒\n"
        "(荧光PCR法)\n"
        "型号规格:24人份/盒\n"
    )
    form_path = tmp_path / "申请表.txt"
    form_path.write_text(form_text, encoding="utf-8")

    extracted = extract_text(form_path)

    assert extracted.field_candidates["产品名称"] == "呼吸道合胞病毒、肺炎支原体核酸检测试剂盒 (荧光PCR法)"
    assert extracted.field_candidates["型号规格"] == "24人份/盒"