from __future__ import annotations

import re
from typing import Any

from review_agent.application_form_fill.schemas import MergedField

# Source-priority table: a lower rank means a more authoritative source.
# Keywords are matched as substrings of the source role / source file name.
SOURCE_PRIORITY = {
    "说明书": 1,
    "产品技术要求": 2,
    "注册检验报告": 3,
    "检测报告": 3,
    "性能研究资料": 4,
    "其他注册资料": 5,
}

# Rank returned when no keyword of SOURCE_PRIORITY matches.
_UNKNOWN_SOURCE_RANK = 9

_WHITESPACE_RE = re.compile(r"\s+")


def normalize_field_value(value: str) -> str:
    """Return *value* in a canonical form for equality comparison.

    All whitespace is removed and the text is lower-cased. Falsy input
    (``None``, ``""``, ``0``) normalizes to the empty string.
    """
    return _WHITESPACE_RE.sub("", str(value or "")).lower()


def rank_source(source_role: str, source_file: str = "") -> int:
    """Return the priority rank of a source (lower is more authoritative).

    The role and file name are searched for the keywords of
    ``SOURCE_PRIORITY``; the best (smallest) rank among the matching keywords
    is returned, or ``9`` when nothing matches.
    """
    # The newline separator keeps a keyword from matching across the
    # boundary between the role text and the file name.
    target = f"{source_role}\n{source_file}"
    return min(
        (rank for keyword, rank in SOURCE_PRIORITY.items() if keyword in target),
        default=_UNKNOWN_SOURCE_RANK,
    )


def _to_float(value: Any) -> float:
    """Convert *value* to ``float``; missing or unparsable values become 0.0."""
    try:
        return float(value or 0)
    except (TypeError, ValueError):
        return 0.0


def _candidate_rank(item: dict[str, Any]) -> int:
    """Return the source rank of a candidate from its role and file name."""
    return rank_source(
        str(item.get("source_role") or ""),
        str(item.get("source_file") or ""),
    )


def _candidate_priority(item: dict[str, Any]) -> tuple[int, float]:
    """Sort key for candidates: best source rank first, then highest confidence."""
    return _candidate_rank(item), -_to_float(item.get("confidence"))


def merge_fields(
    regex_results: dict[str, Any],
    llm_results: dict[str, Any],
) -> tuple[dict[str, MergedField], list[dict]]:
    """Merge regex-extracted and LLM-extracted fields into one value per key.

    Candidates from both result sets (read from their ``"fields"`` entry) are
    grouped by ``key``; candidates with an empty key or empty value are
    ignored. For each key the candidate with the best source rank wins, ties
    being broken by the higher confidence and then by input order (regex
    results come before LLM results).

    Args:
        regex_results: Mapping whose ``"fields"`` entry is an iterable of
            candidate dicts.
        llm_results: Mapping of the same shape as ``regex_results``.

    Returns:
        A ``(merged, conflicts)`` tuple. ``merged`` maps each field key to its
        selected ``MergedField`` (plus agent fallbacks derived from applicant
        fields). ``conflicts`` holds one report dict per key whose candidates
        disagree after normalization.
    """
    grouped: dict[str, list[dict[str, Any]]] = {}
    for results in (regex_results, llm_results):
        for item in results.get("fields") or []:
            key = str(item.get("key") or "")
            value = str(item.get("value") or "").strip()
            if not key or not value:
                continue
            grouped.setdefault(key, []).append(item)

    merged: dict[str, MergedField] = {}
    conflicts: list[dict] = []
    for key, candidates in grouped.items():
        # min() returns the first minimal element, matching a stable sort.
        selected = min(candidates, key=_candidate_priority)
        selected_normalized = normalize_field_value(str(selected.get("value") or ""))

        has_conflict = len(_distinct_values(candidates)) > 1
        conflict_values = [
            {
                "value": item.get("value"),
                "source_file": item.get("source_file", ""),
                "source_role": item.get("source_role", ""),
                "evidence": item.get("evidence", ""),
            }
            for item in candidates
            if normalize_field_value(str(item.get("value") or "")) != selected_normalized
        ]

        merged_field = MergedField(
            key=key,
            label=str(selected.get("label") or key),
            value=str(selected.get("value") or ""),
            source_file=str(selected.get("source_file") or ""),
            evidence=str(selected.get("evidence") or ""),
            confidence=_to_float(selected.get("confidence")),
            has_conflict=has_conflict,
            conflict_values=conflict_values,
        )
        merged[key] = merged_field

        if has_conflict:
            # Use the same rank that drove the selection (role AND file name),
            # so the handling note agrees with why the value was chosen.
            if _candidate_rank(selected) == 1:
                handling = "说明书优先,模板内黄底红字高亮"
            else:
                handling = "按来源优先级采用最高优先级字段"
            conflicts.append(
                {
                    "field_key": key,
                    "field_label": merged_field.label,
                    "selected_value": merged_field.value,
                    "selected_source": merged_field.source_file,
                    "conflict_values": conflict_values,
                    "handling": handling,
                }
            )

    _apply_agent_company_fallbacks(merged)
    return merged, conflicts


def _distinct_values(candidates: list[dict[str, Any]]) -> set[str]:
    """Return the set of normalized, non-empty values among *candidates*."""
    return {
        normalize_field_value(str(item.get("value") or ""))
        for item in candidates
        if item.get("value")
    }


def _apply_agent_company_fallbacks(merged: dict[str, MergedField]) -> None:
    """Fill missing agent fields in *merged* from the applicant fields.

    Mutates *merged* in place: when ``agent_name`` / ``agent_address`` is
    absent and the corresponding applicant field is present, a copy of the
    applicant field is stored under the agent key with the agent label.
    """
    fallback_pairs = {
        "agent_name": ("applicant_name", "代理人名称"),
        "agent_address": ("applicant_address", "代理人住所"),
    }
    for target_key, (source_key, target_label) in fallback_pairs.items():
        if target_key in merged or source_key not in merged:
            continue
        source = merged[source_key]
        merged[target_key] = MergedField(
            key=target_key,
            label=target_label,
            value=source.value,
            source_file=source.source_file,
            evidence=source.evidence,
            confidence=source.confidence,
            has_conflict=source.has_conflict,
            # Copy so the fallback field does not alias the source's list.
            conflict_values=list(source.conflict_values or []),
        )