feat(application-form-fill): 实现字段抽取与冲突合并

2026-06-07 18:31:34 +08:00
parent 72890783b3
commit a48f778e09
5 changed files with 498 additions and 0 deletions
--- a/review_agent/application_form_fill/services/field_merge.py
+++ b/review_agent/application_form_fill/services/field_merge.py
@@ -0,0 +1,88 @@
+from __future__ import annotations
+
+import re
+from typing import Any
+
+from review_agent.application_form_fill.schemas import MergedField
+
+
+# Keyword -> priority rank used by rank_source(); a lower rank means the
+# source wins when merge_fields() has to choose between candidates.
+# rank_source() returns the rank of the FIRST keyword (in insertion order)
+# found in the role/file text, so entries are listed from highest to lowest
+# priority; keep that order when adding new keywords.
+SOURCE_PRIORITY = {
+    "说明书": 1,
+    "产品技术要求": 2,
+    # Both report keywords share rank 3.
+    "注册检验报告": 3,
+    "检测报告": 3,
+    "性能研究资料": 4,
+    "其他注册资料": 5,
+}
+
+
+def normalize_field_value(value: str) -> str:
+    """Return *value* lower-cased with every whitespace character removed.
+
+    Falsy inputs (``None``, ``""``, ``0`` ...) normalize to the empty string;
+    anything else is converted with ``str()`` first.
+    """
+    text = str(value) if value else ""
+    # split() with no separator cuts on any run of whitespace, so joining the
+    # pieces back together drops all whitespace, including leading/trailing.
+    squeezed = "".join(text.split())
+    return squeezed.lower()
+
+
+def rank_source(source_role: str, source_file: str = "") -> int:
+    """Return the priority rank of a source; lower numbers take precedence.
+
+    The role and file name are searched together for the keywords of
+    ``SOURCE_PRIORITY`` in that mapping's iteration order, and the rank of
+    the first keyword found is returned.  Sources matching no keyword get 9.
+    """
+    haystack = "{}\n{}".format(source_role, source_file)
+    matching_ranks = (
+        rank
+        for keyword, rank in SOURCE_PRIORITY.items()
+        if keyword in haystack
+    )
+    # The generator is lazy, so this stops at the first matching keyword.
+    return next(matching_ranks, 9)
+
+
+def merge_fields(regex_results: dict[str, Any], llm_results: dict[str, Any]) -> tuple[dict[str, MergedField], list[dict]]:
+    """Merge regex- and LLM-extracted field candidates into one value per key.
+
+    Candidates are read from the ``"fields"`` list of both inputs and grouped
+    by their ``"key"``; entries with an empty key or a blank value are
+    dropped.  For each key the winner is the candidate with the lowest
+    ``rank_source`` rank, ties broken by the highest confidence and then by
+    input order (regex results come first).
+
+    Args:
+        regex_results: Mapping whose ``"fields"`` entry lists candidate dicts.
+        llm_results: Same shape as ``regex_results``.
+
+    Returns:
+        A ``(merged, conflicts)`` tuple.  ``merged`` maps each field key to
+        its ``MergedField``.  ``conflicts`` holds one report dict for every
+        key whose candidates disagree after ``normalize_field_value``.
+    """
+    grouped: dict[str, list[dict[str, Any]]] = {}
+    for item in [*(regex_results.get("fields") or []), *(llm_results.get("fields") or [])]:
+        key = str(item.get("key") or "")
+        value = str(item.get("value") or "").strip()
+        if not key or not value:
+            continue
+        grouped.setdefault(key, []).append(item)
+
+    merged: dict[str, MergedField] = {}
+    conflicts: list[dict] = []
+    for key, candidates in grouped.items():
+        # min() returns the first minimal element, matching sorted(...)[0].
+        selected = min(
+            candidates,
+            key=lambda item: (_candidate_rank(item), -_candidate_confidence(item)),
+        )
+        selected_rank = _candidate_rank(selected)
+        selected_normalized = normalize_field_value(str(selected.get("value") or ""))
+        has_conflict = len(_distinct_values(candidates)) > 1
+        conflict_values = [
+            {
+                "value": item.get("value"),
+                "source_file": item.get("source_file", ""),
+                "source_role": item.get("source_role", ""),
+                "evidence": item.get("evidence", ""),
+            }
+            for item in candidates
+            if normalize_field_value(str(item.get("value") or "")) != selected_normalized
+        ]
+        merged_field = MergedField(
+            key=key,
+            label=str(selected.get("label") or key),
+            value=str(selected.get("value") or ""),
+            source_file=str(selected.get("source_file") or ""),
+            evidence=str(selected.get("evidence") or ""),
+            confidence=_candidate_confidence(selected),
+            has_conflict=has_conflict,
+            conflict_values=conflict_values,
+        )
+        merged[key] = merged_field
+        if has_conflict:
+            conflicts.append(
+                {
+                    "field_key": key,
+                    "field_label": merged_field.label,
+                    "selected_value": merged_field.value,
+                    "selected_source": merged_field.source_file,
+                    "conflict_values": conflict_values,
+                    # Use the same rank that drove the selection (role AND
+                    # file); ranking by file name alone mislabels a winner
+                    # that was recognised only through its source role.
+                    "handling": "说明书优先，模板内黄底红字高亮" if selected_rank == 1 else "按来源优先级采用最高优先级字段",
+                }
+            )
+    return merged, conflicts
+
+
+def _candidate_rank(item: dict[str, Any]) -> int:
+    """Return the source-priority rank of one candidate dict."""
+    return rank_source(str(item.get("source_role") or ""), str(item.get("source_file") or ""))
+
+
+def _candidate_confidence(item: dict[str, Any]) -> float:
+    """Return the candidate's confidence as a float, or 0.0 if unusable."""
+    try:
+        return float(item.get("confidence") or 0)
+    except (TypeError, ValueError):
+        # Extracted output may carry a non-numeric confidence; treat it as
+        # "no confidence" instead of aborting the whole merge.
+        return 0.0
+
+
+def _distinct_values(candidates: list[dict[str, Any]]) -> set[str]:
+    """Collect the normalized forms of all truthy candidate values."""
+    unique: set[str] = set()
+    for candidate in candidates:
+        raw = candidate.get("value")
+        if not raw:
+            # Missing or empty values never count as a distinct value.
+            continue
+        unique.add(normalize_field_value(str(raw)))
+    return unique