# DEMO-AGENT/review_agent/application_form_fill/services/field_merge.py

from __future__ import annotations

import re
from typing import Any

from review_agent.application_form_fill.schemas import MergedField


# Priority rank per kind of source document; a LOWER number is preferred when
# merge_fields() picks between competing candidates for the same field.
# Keys are keywords that rank_source() searches for as substrings of a
# candidate's source role / source file name. rank_source() returns the rank
# of the first key (in insertion order) that matches, so entry order matters.
SOURCE_PRIORITY = {
    "说明书": 1,  # instructions for use / product manual
    "产品技术要求": 2,  # product technical requirements
    "注册检验报告": 3,  # registration inspection report
    "检测报告": 3,  # test report (same rank as the registration inspection report)
    "性能研究资料": 4,  # performance study materials
    "其他注册资料": 5,  # other registration materials
}


def normalize_field_value(value: str) -> str:
    """Return a comparison key for *value*.

    Falsy input (``None``, ``""``, ...) yields ``""``. Otherwise the value is
    converted to ``str``, every run of whitespace is removed (not just leading
    and trailing whitespace), and the result is lower-cased.
    """
    text = str(value) if value else ""
    without_whitespace = re.sub(r"\s+", "", text)
    return without_whitespace.strip().lower()


def rank_source(source_role: str, source_file: str = "") -> int:
    """Return the priority rank of a source; lower means higher priority.

    The role and file name are joined with a newline and searched for each
    keyword of ``SOURCE_PRIORITY`` in the table's iteration order. The rank of
    the first keyword found is returned; ``9`` is returned when none matches.
    """
    # The newline separator keeps a keyword from matching across the
    # role/file boundary.
    haystack = "{}\n{}".format(source_role, source_file)
    matching_ranks = (
        rank for keyword, rank in SOURCE_PRIORITY.items() if keyword in haystack
    )
    return next(matching_ranks, 9)


def merge_fields(regex_results: dict[str, Any], llm_results: dict[str, Any]) -> tuple[dict[str, MergedField], list[dict]]:
    """Merge regex- and LLM-extracted candidates into one value per field key.

    Both arguments are read through their ``"fields"`` entry, a list of
    candidate dicts. The keys read from each candidate are ``key``, ``value``,
    ``label``, ``source_file``, ``source_role``, ``evidence`` and
    ``confidence``. Candidates with an empty key or an empty (after stripping)
    value are ignored.

    For each field key the winning candidate is the one with the lowest
    ``rank_source`` rank; ties are broken by higher confidence, then by input
    order (regex candidates come before LLM candidates).

    Returns:
        A ``(merged, conflicts)`` tuple. ``merged`` maps field key to the
        selected ``MergedField`` (agent fallbacks are applied last).
        ``conflicts`` has one entry for every field whose candidates carry
        more than one distinct normalized value.
    """
    grouped: dict[str, list[dict[str, Any]]] = {}
    for item in list(regex_results.get("fields") or []) + list(llm_results.get("fields") or []):
        key = str(item.get("key") or "")
        value = str(item.get("value") or "").strip()
        if not key or not value:
            continue
        grouped.setdefault(key, []).append(item)

    def source_rank(item: dict[str, Any]) -> int:
        # Single place that ranks a candidate by BOTH its role and file name,
        # so selection and the conflict "handling" text cannot disagree.
        return rank_source(str(item.get("source_role") or ""), str(item.get("source_file") or ""))

    merged: dict[str, MergedField] = {}
    conflicts: list[dict] = []
    for key, candidates in grouped.items():
        # min() returns the first minimal element, i.e. the same candidate a
        # stable ascending sort would place first.
        selected = min(
            candidates,
            key=lambda item: (source_rank(item), -float(item.get("confidence") or 0)),
        )
        selected_rank = source_rank(selected)
        selected_norm = normalize_field_value(str(selected.get("value") or ""))

        has_conflict = len(_distinct_values(candidates)) > 1
        # Every candidate whose normalized value differs from the selected one
        # is reported (one entry per candidate, not per distinct value).
        conflict_values = [
            {
                "value": item.get("value"),
                "source_file": item.get("source_file", ""),
                "source_role": item.get("source_role", ""),
                "evidence": item.get("evidence", ""),
            }
            for item in candidates
            if normalize_field_value(str(item.get("value") or "")) != selected_norm
        ]
        merged_field = MergedField(
            key=key,
            label=str(selected.get("label") or key),
            value=str(selected.get("value") or ""),
            source_file=str(selected.get("source_file") or ""),
            evidence=str(selected.get("evidence") or ""),
            confidence=float(selected.get("confidence") or 0),
            has_conflict=has_conflict,
            conflict_values=conflict_values,
        )
        merged[key] = merged_field
        if has_conflict:
            # Use the rank that actually won the selection (role + file). The
            # previous code passed the file name as the role, so a candidate
            # chosen for its role got the generic message when its file name
            # lacked the keyword.
            if selected_rank == 1:
                handling = "说明书优先，模板内黄底红字高亮"
            else:
                handling = "按来源优先级采用最高优先级字段"
            conflicts.append(
                {
                    "field_key": key,
                    "field_label": merged_field.label,
                    "selected_value": merged_field.value,
                    "selected_source": merged_field.source_file,
                    "conflict_values": conflict_values,
                    "handling": handling,
                }
            )
    _apply_agent_company_fallbacks(merged)
    return merged, conflicts


def _distinct_values(candidates: list[dict[str, Any]]) -> set[str]:
    """Return the set of normalized values found among *candidates*.

    Candidates whose ``"value"`` entry is missing or falsy are skipped.
    """
    unique: set[str] = set()
    for candidate in candidates:
        raw_value = candidate.get("value")
        if not raw_value:
            continue
        unique.add(normalize_field_value(str(raw_value)))
    return unique


def _apply_agent_company_fallbacks(merged: dict[str, MergedField]) -> None:
    """Fill missing agent fields from the matching applicant fields, in place.

    When ``agent_name`` / ``agent_address`` is absent from *merged* and the
    corresponding ``applicant_name`` / ``applicant_address`` is present, a new
    ``MergedField`` is added under the agent key. It copies the applicant
    field's value, source file, evidence, confidence and conflict data, and
    carries the agent-specific label. Existing agent fields are never
    overwritten.
    """
    # (agent key to fill, applicant key to copy from, label for the new field)
    fallbacks = (
        ("agent_name", "applicant_name", "代理人名称"),
        ("agent_address", "applicant_address", "代理人住所"),
    )
    for agent_key, applicant_key, agent_label in fallbacks:
        if agent_key not in merged and applicant_key in merged:
            origin = merged[applicant_key]
            merged[agent_key] = MergedField(
                key=agent_key,
                label=agent_label,
                value=origin.value,
                source_file=origin.source_file,
                evidence=origin.evidence,
                confidence=origin.confidence,
                has_conflict=origin.has_conflict,
                conflict_values=origin.conflict_values,
            )