from __future__ import annotations import re from typing import Any from review_agent.application_form_fill.schemas import MergedField SOURCE_PRIORITY = { "说明书": 1, "产品技术要求": 2, "注册检验报告": 3, "检测报告": 3, "性能研究资料": 4, "其他注册资料": 5, } def normalize_field_value(value: str) -> str: return re.sub(r"\s+", "", str(value or "")).strip().lower() def rank_source(source_role: str, source_file: str = "") -> int: target = f"{source_role}\n{source_file}" for keyword, rank in SOURCE_PRIORITY.items(): if keyword in target: return rank return 9 def merge_fields(regex_results: dict[str, Any], llm_results: dict[str, Any]) -> tuple[dict[str, MergedField], list[dict]]: grouped: dict[str, list[dict[str, Any]]] = {} for item in list(regex_results.get("fields") or []) + list(llm_results.get("fields") or []): key = str(item.get("key") or "") value = str(item.get("value") or "").strip() if not key or not value: continue grouped.setdefault(key, []).append(item) merged: dict[str, MergedField] = {} conflicts: list[dict] = [] for key, candidates in grouped.items(): selected = sorted( candidates, key=lambda item: ( rank_source(str(item.get("source_role") or ""), str(item.get("source_file") or "")), -float(item.get("confidence") or 0), ), )[0] distinct = _distinct_values(candidates) has_conflict = len(distinct) > 1 conflict_values = [ { "value": item.get("value"), "source_file": item.get("source_file", ""), "source_role": item.get("source_role", ""), "evidence": item.get("evidence", ""), } for item in candidates if normalize_field_value(str(item.get("value") or "")) != normalize_field_value(str(selected.get("value") or "")) ] merged_field = MergedField( key=key, label=str(selected.get("label") or key), value=str(selected.get("value") or ""), source_file=str(selected.get("source_file") or ""), evidence=str(selected.get("evidence") or ""), confidence=float(selected.get("confidence") or 0), has_conflict=has_conflict, conflict_values=conflict_values, ) merged[key] = merged_field if has_conflict: conflicts.append( { "field_key": key, "field_label": merged_field.label, "selected_value": merged_field.value, "selected_source": merged_field.source_file, "conflict_values": conflict_values, "handling": "说明书优先,模板内黄底红字高亮" if rank_source(merged_field.source_file, merged_field.source_file) == 1 else "按来源优先级采用最高优先级字段", } ) return merged, conflicts def _distinct_values(candidates: list[dict[str, Any]]) -> set[str]: return {normalize_field_value(str(item.get("value") or "")) for item in candidates if item.get("value")}