diff --git a/review_agent/regulatory_review/services/info_extract.py b/review_agent/regulatory_review/services/info_extract.py
index 7e23a3a..29ebbe5 100644
--- a/review_agent/regulatory_review/services/info_extract.py
+++ b/review_agent/regulatory_review/services/info_extract.py
@@ -1,6 +1,11 @@
 from __future__ import annotations
 
+from pathlib import Path
+
+from django.conf import settings
+
 from review_agent.models import FileSummaryBatch
+from review_agent.regulatory_review.services.text_extract import extract_text
 
 
 OPTION_FIELDS = {
@@ -14,9 +19,15 @@ def detect_regulatory_condition_candidates(summary_batch: FileSummaryBatch) -> d
     """Infers review-scope conditions from the summary batch and file names."""
 
     corpus_parts = [summary_batch.product_name or ""]
+    field_candidates: dict[str, str] = {}
     for item in summary_batch.items.order_by("file_index"):
         corpus_parts.extend([item.directory_level, item.file_name, item.relative_path])
+        extracted = _extract_item_fields(item)
+        # Keep the first non-empty value per field; items are visited in file_index order.
+        field_candidates.update({key: value for key, value in extracted.items() if value and key not in field_candidates})
+        corpus_parts.extend(extracted.values())
     corpus = "\n".join(part for part in corpus_parts if part)
+    product_name = field_candidates.get("产品名称") or _safe_summary_product_name(summary_batch.product_name)
 
     return {
         "product_category": {
@@ -40,21 +51,48 @@ def detect_regulatory_condition_candidates(summary_batch: FileSummaryBatch) -> d
         "product_name": {
             "label": "产品名称",
             "input_type": "text",
-            "suggested": summary_batch.product_name or "",
+            "suggested": product_name,
         },
         "model_spec": {
             "label": "型号规格",
             "input_type": "text",
-            "suggested": "",
+            "suggested": field_candidates.get("型号规格", ""),
         },
         "intended_use": {
             "label": "预期用途",
             "input_type": "text",
-            "suggested": "",
+            "suggested": field_candidates.get("预期用途", ""),
         },
     }
 
 
+def _extract_item_fields(item) -> dict[str, str]:
+    """Returns the field candidates extracted from the item's stored file, or {} when unavailable."""
+    storage_path = item.storage_path
+    if not storage_path:
+        # An empty relative path would otherwise resolve to MEDIA_ROOT itself.
+        return {}
+    path = Path(storage_path)
+    if not path.is_absolute():
+        path = Path(settings.MEDIA_ROOT) / path
+    if not path.is_file():
+        return {}
+    result = extract_text(path)
+    if result.status != "success" or not result.field_candidates:
+        return {}
+    return result.field_candidates
+
+
+def _safe_summary_product_name(product_name: str | None) -> str:
+    """Returns the stripped product name, or "" when it is empty or contains a chapter/section keyword."""
+    value = (product_name or "").strip()
+    if not value:
+        return ""
+    if any(keyword in value for keyword in ["第1章", "第2章", "监管信息", "综述资料", "非临床资料", "章节目录"]):
+        return ""
+    return value
+
+
 def _detect_product_category(corpus: str) -> str:
     if any(keyword in corpus for keyword in ["体外诊断", "检测试剂", "试剂盒", "IVD"]):
         return "体外诊断试剂"
diff --git a/tests/test_regulatory_condition.py b/tests/test_regulatory_condition.py
index ccfc7a5..d7c35f4 100644
--- a/tests/test_regulatory_condition.py
+++ b/tests/test_regulatory_condition.py
@@ -49,6 +49,40 @@ def test_detect_regulatory_condition_candidates_from_summary_items(django_user_m
     assert candidates["product_name"]["suggested"] == "甲胎蛋白检测试剂盒"
 
 
+def test_detect_regulatory_condition_prefers_attachment_fields_over_chapter_title(settings, tmp_path, django_user_model):
+    """Fields parsed from an attached file replace a chapter-title product name from the summary."""
+    settings.MEDIA_ROOT = tmp_path
+    user = django_user_model.objects.create_user(username="owner", password="pass")
+    conversation = Conversation.objects.create(user=user, title="会话")
+    summary = FileSummaryBatch.objects.create(
+        conversation=conversation,
+        user=user,
+        batch_no="FS-COND",
+        status=FileSummaryBatch.Status.SUCCESS,
+        product_name="第1章 监管信息",
+    )
+    application = tmp_path / "application.txt"
+    application.write_text(
+        "产品名称:甲胎蛋白检测试剂盒\n型号规格:20人份/盒\n预期用途:用于人血清中甲胎蛋白检测\n注册类型:首次注册\n",
+        encoding="utf-8",
+    )
+    FileSummaryItem.objects.create(
+        batch=summary,
+        file_index=1,
+        directory_level="1. 监管信息 / 1.2 申请表",
+        file_name="申请表.txt",
+        file_type="txt",
+        relative_path="1.监管信息/申请表.txt",
+        storage_path=str(application),
+    )
+
+    candidates = detect_regulatory_condition_candidates(summary)
+
+    assert candidates["product_name"]["suggested"] == "甲胎蛋白检测试剂盒"
+    assert candidates["model_spec"]["suggested"] == "20人份/盒"
+    assert candidates["intended_use"]["suggested"] == "用于人血清中甲胎蛋白检测"
+
+
 def test_workflow_pauses_before_rule_scope_until_conditions_confirmed(settings, tmp_path, django_user_model):
     settings.MEDIA_ROOT = tmp_path
     user = django_user_model.objects.create_user(username="owner", password="pass")