fix(regulatory): 修复无标签文档适用条件回显

2026-06-07 12:29:22 +08:00
parent 1b4a10b5ba
commit 9e27c4c684
8 changed files with 305 additions and 8 deletions
--- a/review_agent/regulatory_review/services/info_extract.py
+++ b/review_agent/regulatory_review/services/info_extract.py
@@ -1,10 +1,11 @@
 from __future__ import annotations

+import re
 from pathlib import Path

 from django.conf import settings

-from review_agent.models import FileSummaryBatch
+from review_agent.models import FileSummaryBatch, RegulatoryReviewBatch
 from review_agent.regulatory_review.services.llm_review import review_condition_fields
 from review_agent.regulatory_review.services.text_extract import extract_text

@@ -16,6 +17,18 @@ OPTION_FIELDS = {
 }


+def ensure_regulatory_condition_candidates(batch: RegulatoryReviewBatch) -> dict[str, dict[str, object]]:
+    """Returns the batch's condition candidates, re-detecting them when they look incomplete.
+
+    Candidates are read from ``batch.condition_json["candidates"]``. Detection is
+    re-run only while the batch is in ``WAITING_USER`` and
+    ``_condition_candidates_incomplete`` flags the stored candidates. The fresh
+    result is merged over the stored values and written back to
+    ``condition_json`` only when the merge actually changed something.
+
+    NOTE(review): the callers visible in this change are views, so a batch whose
+    candidates stay incomplete after a refresh will re-run detection on every
+    call — confirm that cost is acceptable.
+    """
+    condition_json = batch.condition_json or {}
+    candidates = condition_json.get("candidates") or {}
+    if batch.status != RegulatoryReviewBatch.Status.WAITING_USER or not _condition_candidates_incomplete(candidates):
+        return candidates
+    summary_batch = batch.source_summary_batch
+    if summary_batch is None:
+        # Nothing to re-detect from; keep what is stored instead of failing the request.
+        return candidates
+    refreshed = _merge_condition_candidates(candidates, detect_regulatory_condition_candidates(summary_batch))
+    if refreshed != candidates:
+        # Skip the write when detection produced nothing new.
+        batch.condition_json = {**condition_json, "candidates": refreshed}
+        batch.save(update_fields=["condition_json"])
+    return refreshed
+
+
 def detect_regulatory_condition_candidates(summary_batch: FileSummaryBatch) -> dict[str, dict[str, object]]:
    """Infers review-scope conditions from the summary batch and file names."""

@@ -30,6 +43,8 @@ def detect_regulatory_condition_candidates(summary_batch: FileSummaryBatch) -> d
        field_candidates.update({key: value for key, value in extracted.items() if value and key not in field_candidates})
        field_sources.update({key: value for key, value in sources.items() if value and key not in field_sources})
        corpus_parts.extend(extracted.values())
+        if review.get("front_text"):
+            corpus_parts.append(str(review["front_text"]))
    corpus = "\n".join(part for part in corpus_parts if part)
    product_name = field_candidates.get("产品名称") or _safe_summary_product_name(summary_batch.product_name)

@@ -80,13 +95,22 @@ def _extract_item_fields(item) -> dict[str, object]:
    if not path.exists():
        return {}
    result = extract_text(path)
-    if result.status != "success" or not result.field_candidates:
+    if result.status != "success" or not result.text:
        return {}
-    return review_condition_fields(
+    inferred_fields = _infer_fields_from_text(result.front_text or result.text)
+    rule_fields = {**inferred_fields, **(result.field_candidates or {})}
+    review = review_condition_fields(
        text=result.front_text or result.text,
-        rule_fields=result.field_candidates,
+        rule_fields=rule_fields,
        file_context=f"{item.directory_level}\n{item.file_name}\n{item.relative_path}",
    )
+    selected_sources = dict(review.get("selected_sources") or {})
+    for key in inferred_fields:
+        if selected_sources.get(key) == "rule" and key not in (result.field_candidates or {}):
+            selected_sources[key] = "inferred"
+    review["selected_sources"] = selected_sources
+    review["front_text"] = result.front_text or result.text[:1200]
+    return review


 def _safe_summary_product_name(product_name: str) -> str:
@@ -98,6 +122,99 @@ def _safe_summary_product_name(product_name: str) -> str:
    return value


+def _infer_fields_from_text(text: str) -> dict[str, str]:
+    """Infers labelled fields from raw text, omitting any field that could not be inferred."""
+    compact = _normalize_text_for_inference(text)
+    # Both inferences always run; only non-empty results are kept, in this order.
+    inferred = (
+        ("产品名称", _infer_product_name(compact)),
+        ("型号规格", _infer_model_spec(compact)),
+    )
+    return {label: value for label, value in inferred if value}
+
+
+def _normalize_text_for_inference(text: str) -> str:
+    """Removes all whitespace and converts full-width parentheses to ASCII ones.
+
+    A falsy ``text`` is treated as the empty string.
+    """
+    compact = re.sub(r"\s+", "", text or "")
+    return compact.translate(str.maketrans({"（": "(", "）": ")"}))
+
+
+def _infer_product_name(text: str) -> str:
+    """Extracts a kit-style product name from normalized text, or returns "" when none is found.
+
+    Patterns are tried in order and the first one that matches wins. The captured
+    name is trimmed of known heading prefixes and its parentheses are converted
+    back to full-width form.
+    """
+    attempts = (
+        re.search(pattern, text)
+        for pattern in (
+            r"体外诊断试剂(?P<name>[^。；;，,]{4,120}?试剂盒\([^()]{2,30}\))产品注册",
+            r"(?P<name>[^。；;，,]{4,120}?试剂盒\([^()]{2,30}\))",
+        )
+    )
+    # The generator is lazy, so later patterns are only searched if earlier ones miss.
+    hit = next((found for found in attempts if found), None)
+    if hit is None:
+        return ""
+    return _restore_chinese_parentheses(_trim_product_name(hit.group("name")))
+
+
+def _trim_product_name(value: str) -> str:
+    """Drops known application-heading prefixes, keeping the text after each one found.
+
+    Markers are checked in order; for every marker present, everything up to and
+    including its first occurrence is discarded.
+    """
+    trimmed = value
+    for marker in (
+        "申请境内第三类体外诊断试剂",
+        "申请境内第二类体外诊断试剂",
+        "境内第三类体外诊断试剂",
+        "境内第二类体外诊断试剂",
+    ):
+        _, found, remainder = trimmed.partition(marker)
+        if found:
+            trimmed = remainder
+    return trimmed
+
+
+def _infer_model_spec(text: str) -> str:
+    """Infers the model/specification string from normalized text, or returns "" when none is found.
+
+    Lettered specs (``规格`` followed by one ASCII or full-width capital letter) take
+    priority and are returned de-duplicated, sorted and joined with "、". Otherwise
+    the packaging-spec sentence is searched and its captured span is returned with
+    surrounding punctuation stripped and parentheses converted to full-width form.
+    """
+    lettered = set(re.findall(r"规格[A-ZＡ-Ｚ]", text))
+    if lettered:
+        return "、".join(sorted(lettered))
+    packaging = re.search(r"产品的包装规格(?P<spec>.{1,80}?(?:人份/盒|测试/盒|反应/盒)(?:[、,，].{1,30}?(?:人份/盒|测试/盒|反应/盒))*)", text)
+    if packaging is None:
+        return ""
+    return _restore_chinese_parentheses(packaging.group("spec").strip("：:，,。；;"))
+
+
+def _restore_chinese_parentheses(value: str) -> str:
+    """Converts ASCII parentheses in ``value`` to their full-width forms."""
+    return value.translate(str.maketrans("()", "（）"))
+
+
+def _condition_candidates_incomplete(candidates: dict[str, dict[str, object]]) -> bool:
+    """Reports whether stored condition candidates are missing or look unusable.
+
+    Returns True when there are no candidates at all, when the suggested product
+    name is empty or contains the Unicode replacement character, or when the
+    suggested product category is the fallback value "其他".
+    """
+    if not candidates:
+        return True
+    product_name = str((candidates.get("product_name") or {}).get("suggested") or "").strip()
+    product_category = str((candidates.get("product_category") or {}).get("suggested") or "").strip()
+    # U+FFFD is what a failed decode leaves behind; it is written as an escape so the
+    # check does not depend on how this file itself is encoded or displayed.
+    return not product_name or "\ufffd" in product_name or product_category == "其他"
+
+
+def _merge_condition_candidates(
+    current: dict[str, dict[str, object]],
+    refreshed: dict[str, dict[str, object]],
+) -> dict[str, dict[str, object]]:
+    """Merges re-detected candidates into the stored ones and returns a new dict.
+
+    A refreshed field replaces the stored one when its suggested value is judged
+    better by ``_is_better_condition_value``; fields missing from ``current`` are
+    always added. Neither input mapping is modified.
+    """
+    combined = dict(current or {})
+    for name, incoming in (refreshed or {}).items():
+        existing_text = str((combined.get(name) or {}).get("suggested") or "").strip()
+        incoming_text = str((incoming or {}).get("suggested") or "").strip()
+        if _is_better_condition_value(existing_text, incoming_text) or name not in combined:
+            combined[name] = incoming
+    return combined
+
+
+def _is_better_condition_value(current_value: str, refreshed_value: str) -> bool:
+    """Decides whether a re-detected value should replace the stored one.
+
+    An empty or garbled refreshed value never wins. Otherwise the refreshed value
+    wins when the stored value is empty or garbled, when the stored value is a
+    placeholder ("其他" or "待确认") and the refreshed one differs from it, or when
+    the refreshed value is a strictly longer string that contains the stored one.
+    "Garbled" means the value contains the Unicode replacement character.
+    """
+    # U+FFFD marks text that failed to decode; it is written as an escape so the
+    # check does not depend on how this file itself is encoded or displayed.
+    if not refreshed_value or "\ufffd" in refreshed_value:
+        return False
+    if not current_value or "\ufffd" in current_value:
+        return True
+    if current_value in ("其他", "待确认") and refreshed_value != current_value:
+        return True
+    return len(refreshed_value) > len(current_value) and current_value in refreshed_value
+
+
 def _detect_product_category(corpus: str) -> str:
    if any(keyword in corpus for keyword in ["体外诊断", "检测试剂", "试剂盒", "IVD"]):
        return "体外诊断试剂"
--- a/review_agent/regulatory_review/services/llm_review.py
+++ b/review_agent/regulatory_review/services/llm_review.py
@@ -156,7 +156,7 @@ def _clean_fields(fields: dict[str, Any]) -> dict[str, str]:
        value = fields.get(label)
        if not isinstance(value, str):
            continue
-        normalized = " ".join(value.strip().split())
+        normalized = " ".join(value.strip().split()).replace("(", "（").replace(")", "）")
        if normalized:
            clean[label] = normalized
    return clean
@@ -200,4 +200,6 @@ def _better_product_name(candidate: str, current: str) -> bool:
 def _invalid_field_value(value: str) -> bool:
+    """Reports whether a field value is empty, garbled, or looks like a chapter heading."""
     if not value:
         return True
+    # U+FFFD marks text that failed to decode; it is written as an escape so the
+    # check does not depend on how this file itself is encoded or displayed.
+    if "\ufffd" in value:
+        return True
     return any(keyword in value for keyword in ["第1章", "第2章", "第3章", "监管信息", "综述资料", "章节目录"])
--- a/review_agent/regulatory_review/views.py
+++ b/review_agent/regulatory_review/views.py
@@ -9,6 +9,7 @@ from django.contrib.auth.decorators import login_required

 from review_agent.models import FileSummaryBatch, RegulatoryReviewBatch, WorkflowNodeRun
 from review_agent.regulatory_review.events import record_event
+from review_agent.regulatory_review.services.info_extract import ensure_regulatory_condition_candidates
 from review_agent.regulatory_review.services.rectification_review import review_missing_issues
 from review_agent.regulatory_review.workflow import create_regulatory_review_batch, start_regulatory_review_workflow

@@ -19,6 +20,7 @@ def batch_status(request, batch_id: int):
    batch = RegulatoryReviewBatch.objects.filter(pk=batch_id, user=request.user).first()
    if not batch:
        raise Http404("批次不存在。")
+    condition_candidates = ensure_regulatory_condition_candidates(batch)
    nodes = WorkflowNodeRun.objects.filter(
        workflow_type="regulatory_review",
        workflow_batch_id=batch.pk,
@@ -45,12 +47,12 @@ def batch_status(request, batch_id: int):
                for node in nodes
            ],
        }
-    if batch.status == RegulatoryReviewBatch.Status.WAITING_USER and (batch.condition_json or {}).get("candidates"):
+    if batch.status == RegulatoryReviewBatch.Status.WAITING_USER and condition_candidates:
        payload["condition_confirmation"] = {
            "batch_id": batch.pk,
            "batch_no": batch.batch_no,
            "confirm_url": f"/api/review-agent/regulatory-review/{batch.pk}/conditions/",
-            "candidates": batch.condition_json["candidates"],
+            "candidates": condition_candidates,
        }
    return JsonResponse(payload)

--- a/review_agent/views.py
+++ b/review_agent/views.py
@@ -12,6 +12,7 @@ from .services import (
    stream_message,
 )
 from .models import Conversation, FileAttachment, FileSummaryBatch, RegulatoryReviewBatch, WorkflowNodeRun
+from .regulatory_review.services.info_extract import ensure_regulatory_condition_candidates


@login_required
@@ -132,6 +133,7 @@ def build_workflow_cards(conversation: Conversation) -> list[dict[str, object]]:
        )
    regulatory_batches = RegulatoryReviewBatch.objects.filter(conversation=conversation)
    for batch in regulatory_batches:
+        condition_candidates = ensure_regulatory_condition_candidates(batch)
        cards.append(
            {
                "id": batch.pk,
@@ -141,7 +143,7 @@ def build_workflow_cards(conversation: Conversation) -> list[dict[str, object]]:
                "error_message": batch.error_message,
                "risk_label": _format_risk_label(batch.risk_summary or {}),
                "condition_json": batch.condition_json or {},
-                "condition_candidates": (batch.condition_json or {}).get("candidates") or {},
+                "condition_candidates": condition_candidates,
                "notification_count": batch.notifications.count(),
                "review_record_count": batch.artifacts.filter(metadata__artifact="review_record").count(),
                "created_at": batch.created_at,