feat(application-form-fill): 实现字段抽取与冲突合并

This commit is contained in:
2026-06-07 18:31:34 +08:00
parent 72890783b3
commit a48f778e09
5 changed files with 498 additions and 0 deletions

View File

@@ -0,0 +1,121 @@
import json
import pytest
from review_agent.application_form_fill.services.field_extract import (
extract_by_llm,
extract_by_rules,
run_parallel_extract,
save_field_extract_result,
)
from review_agent.application_form_fill.services.template_config import load_template_config
from review_agent.application_form_fill.services.template_select import select_templates
from review_agent.models import (
ApplicationFormFillArtifact,
ApplicationFormFillBatch,
Conversation,
FileSummaryBatch,
)
pytestmark = pytest.mark.django_db
def _registration_specs():
    """Return the template specs used by the extraction tests.

    Loads the template configuration and asks ``select_templates`` for the
    ``registration_certificate`` template, passing "首次注册" as the third
    argument. ``select_templates`` returns a pair; only the first element
    (the specs) is returned here and the second (risk notes) is dropped.
    """
    template_config = load_template_config()
    selected_specs, _ignored_risk_notes = select_templates(
        template_config,
        ["registration_certificate"],
        "首次注册",
    )
    return selected_specs
def test_rule_extracts_registration_certificate_fields():
    """Rule-based extraction finds the labelled fields in a single text file.

    Feeds one file of four labelled lines to ``extract_by_rules`` and checks
    the value, source role and extractor tag of the resulting fields.
    """
    manual_lines = [
        "产品名称:甲胎蛋白检测试剂盒",
        "包装规格20人份/盒",
        "预期用途:用于体外定量检测人血清中甲胎蛋白含量",
        "产品储存条件及有效期2-8℃保存有效期12个月",
    ]
    texts = {"产品说明书.txt": "\n".join(manual_lines)}

    result = extract_by_rules(texts, _registration_specs())

    # Index the extracted fields by key; a later duplicate key overwrites
    # an earlier one, exactly as a dict comprehension would.
    by_key = {}
    for extracted in result["fields"]:
        by_key[extracted["key"]] = extracted

    assert by_key["product_name"]["value"] == "甲胎蛋白检测试剂盒"
    assert by_key["intended_use"]["source_role"] == "说明书"
    assert "2-8℃保存" in by_key["storage_condition_and_validity"]["value"]
    assert by_key["package_specification"]["extractor"] == "rule"
def test_llm_extract_parses_structured_json(monkeypatch):
    """LLM extraction parses a JSON completion and tags the field as ``llm``.

    ``generate_completion`` in the ``field_extract`` module is replaced with
    a stub that returns a canned JSON document containing one field.
    """
    canned_field = {
        "key": "product_name",
        "label": "产品名称",
        "value": "甲胎蛋白检测试剂盒",
        "source_file": "说明书.txt",
        "source_role": "说明书",
        "evidence": "产品名称:甲胎蛋白检测试剂盒",
        "confidence": 0.9,
    }
    canned_reply = json.dumps(
        {"fields": [canned_field], "checklist_items": []},
        ensure_ascii=False,
    )

    def fake_completion(messages, temperature=0.0):
        # Same signature as the original inline stub; arguments are ignored.
        return canned_reply

    monkeypatch.setattr(
        "review_agent.application_form_fill.services.field_extract.generate_completion",
        fake_completion,
    )

    result = extract_by_llm({"说明书.txt": "产品名称:甲胎蛋白检测试剂盒"}, _registration_specs())

    first_field = result["fields"][0]
    assert first_field["extractor"] == "llm"
    assert first_field["value"] == "甲胎蛋白检测试剂盒"
def test_llm_extract_failure_returns_empty_result(monkeypatch):
    """A failing completion call yields no fields and a recorded error message.

    ``generate_completion`` in the ``field_extract`` module is replaced with
    a stub that raises ``TimeoutError("timeout")``. ``extract_by_llm`` is
    expected to return an empty ``fields`` list and include the text
    "timeout" in ``error_message`` rather than propagate the exception.
    """

    def failing_completion(messages, temperature=0.0):
        # A plain ``raise`` replaces the generator-throw trick that was
        # needed to raise from inside a lambda.
        raise TimeoutError("timeout")

    monkeypatch.setattr(
        "review_agent.application_form_fill.services.field_extract.generate_completion",
        failing_completion,
    )

    result = extract_by_llm({"说明书.txt": "产品名称:甲胎蛋白检测试剂盒"}, _registration_specs())

    assert result["fields"] == []
    assert "timeout" in result["error_message"]
def test_parallel_extract_preserves_rule_result_when_llm_fails(monkeypatch):
    """Rule results survive in the combined payload when the LLM call fails.

    ``generate_completion`` in the ``field_extract`` module is replaced with
    a stub that raises ``TimeoutError("timeout")``. The payload returned by
    ``run_parallel_extract`` must still carry non-empty ``regex_results``
    fields, an empty ``llm_results`` field list, and the selected template
    names.
    """

    def failing_completion(messages, temperature=0.0):
        # A plain ``raise`` replaces the generator-throw trick that was
        # needed to raise from inside a lambda.
        raise TimeoutError("timeout")

    monkeypatch.setattr(
        "review_agent.application_form_fill.services.field_extract.generate_completion",
        failing_completion,
    )

    payload = run_parallel_extract({"说明书.txt": "产品名称:甲胎蛋白检测试剂盒"}, _registration_specs())

    assert payload["regex_results"]["fields"]
    assert payload["llm_results"]["fields"] == []
    assert payload["selected_templates"] == ["registration_certificate"]
def test_save_field_extract_result_creates_json_artifact(settings, tmp_path, django_user_model):
    """Saving an extraction payload produces a JSON field-extract artifact.

    Builds the minimal chain of records the batch needs (user, conversation,
    file-summary batch, form-fill batch) with ``MEDIA_ROOT`` pointed at the
    temporary directory, then checks the type, format and content hash of
    the artifact returned by ``save_field_extract_result``.
    """
    settings.MEDIA_ROOT = tmp_path

    owner = django_user_model.objects.create_user(username="owner", password="pass")
    chat = Conversation.objects.create(user=owner, title="会话")
    summary_batch = FileSummaryBatch.objects.create(
        conversation=chat,
        user=owner,
        batch_no="FS-FIELD",
    )
    work_dir = tmp_path / "aff" / "AFF-FIELD"
    fill_batch = ApplicationFormFillBatch.objects.create(
        conversation=chat,
        user=owner,
        source_summary_batch=summary_batch,
        batch_no="AFF-FIELD",
        work_dir=str(work_dir),
    )

    empty_payload = {"regex_results": {"fields": []}, "llm_results": {"fields": []}}
    artifact = save_field_extract_result(fill_batch, empty_payload)

    artifact_cls = ApplicationFormFillArtifact
    assert artifact.artifact_type == artifact_cls.ArtifactType.FIELD_EXTRACT_RESULT
    assert artifact.file_format == artifact_cls.FileFormat.JSON
    # Only the presence of a hash is checked, not its value.
    assert artifact.content_hash

View File

@@ -0,0 +1,79 @@
import pytest
from review_agent.application_form_fill.services.field_merge import merge_fields, normalize_field_value, rank_source
def test_normalize_field_value_removes_whitespace():
    """Normalization strips leading, trailing and interior spaces and newlines."""
    raw_value = " 2-8℃ 保存 \n 有效期12个月 "
    expected = "2-8℃保存有效期12个月"

    assert normalize_field_value(raw_value) == expected
def test_rank_source_prefers_instructions():
    """The "说明书" source role ranks strictly lower than "产品技术要求"."""
    instructions_rank = rank_source("说明书")
    requirements_rank = rank_source("产品技术要求")

    assert instructions_rank < requirements_rank
def test_merge_fields_prefers_instructions_and_marks_conflict():
    """Conflicting values for one key resolve to the "说明书" candidate.

    Two rule-extracted candidates for ``storage_condition_and_validity``
    disagree. The merged field must carry the "说明书" value even though the
    other candidate has the higher confidence, must be flagged as
    conflicting, and the losing value must appear in the conflict report.
    """
    from_instructions = {
        "key": "storage_condition_and_validity",
        "label": "产品储存条件及有效期",
        "value": "2-8℃保存有效期12个月",
        "source_file": "说明书.txt",
        "source_role": "说明书",
        "evidence": "产品储存条件及有效期2-8℃保存有效期12个月",
        "confidence": 0.75,
    }
    from_requirements = {
        "key": "storage_condition_and_validity",
        "label": "产品储存条件及有效期",
        "value": "-20℃保存",
        "source_file": "产品技术要求.txt",
        "source_role": "产品技术要求",
        "evidence": "产品储存条件及有效期:-20℃保存",
        "confidence": 0.8,
    }
    regex_results = {"fields": [from_instructions, from_requirements]}
    no_llm_results = {"fields": []}

    merged, conflicts = merge_fields(regex_results, no_llm_results)

    storage_field = merged["storage_condition_and_validity"]
    assert storage_field.value == "2-8℃保存有效期12个月"
    assert storage_field.has_conflict is True

    first_conflict = conflicts[0]
    assert first_conflict["selected_value"] == "2-8℃保存有效期12个月"
    assert first_conflict["conflict_values"][0]["value"] == "-20℃保存"
def test_merge_fields_combines_consistent_values_without_conflict():
    """Values that differ only by whitespace merge without a conflict.

    The rule result and the LLM result both report ``product_name``; the LLM
    value contains an extra space. The merged field must carry the
    space-free value, must not be flagged as conflicting, and the conflict
    list must be empty.
    """
    rule_candidate = {
        "key": "product_name",
        "label": "产品名称",
        "value": "甲胎蛋白检测试剂盒",
        "source_file": "说明书.txt",
        "source_role": "说明书",
        "evidence": "产品名称:甲胎蛋白检测试剂盒",
        "confidence": 0.75,
    }
    llm_candidate = {
        "key": "product_name",
        "label": "产品名称",
        "value": "甲胎蛋白 检测试剂盒",
        "source_file": "产品技术要求.txt",
        "source_role": "产品技术要求",
        "evidence": "产品名称:甲胎蛋白 检测试剂盒",
        "confidence": 0.9,
    }
    regex_results = {"fields": [rule_candidate]}
    llm_results = {"fields": [llm_candidate]}

    merged, conflicts = merge_fields(regex_results, llm_results)

    assert merged["product_name"].value == "甲胎蛋白检测试剂盒"
    assert merged["product_name"].has_conflict is False
    assert conflicts == []