Files
DEMO-AGENT/tests/test_application_form_fill_field_extract.py

218 lines
8.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import json
import pytest
from review_agent.application_form_fill.services.field_extract import (
extract_by_llm,
extract_by_rules,
run_parallel_extract,
save_field_extract_result,
)
from review_agent.application_form_fill.services.template_config import load_template_config
from review_agent.application_form_fill.services.template_select import select_templates
from review_agent.models import (
ApplicationFormFillArtifact,
ApplicationFormFillBatch,
Conversation,
FileSummaryBatch,
)
# Module-level pytest marker: applies the `django_db` mark to every test in this file.
pytestmark = pytest.mark.django_db
def _registration_specs():
    """Return the template specs shared by the tests in this module.

    Loads the template configuration and asks ``select_templates`` for the
    ``registration_certificate`` template with ``"首次注册"`` as the third
    argument. ``select_templates`` yields a pair; only its first element
    (the specs) is returned, the second is discarded.
    """
    selection = select_templates(
        load_template_config(),
        ["registration_certificate"],
        "首次注册",
    )
    selected_specs, _unused_notes = selection
    return selected_specs
def test_rule_extracts_registration_certificate_fields():
    """Rule extraction returns the expected fields for a plain-text manual.

    Checks the extracted value, source role and extractor tag of several
    fields produced from a four-line instruction text.
    """
    manual_lines = [
        "产品名称:甲胎蛋白检测试剂盒",
        "包装规格20人份/盒",
        "预期用途:用于体外定量检测人血清中甲胎蛋白含量",
        "产品储存条件及有效期2-8℃保存有效期12个月",
    ]
    documents = {"产品说明书.txt": "\n".join(manual_lines)}

    extracted = extract_by_rules(documents, _registration_specs())

    # Index the full field records by key so each assertion can inspect metadata.
    by_key = {}
    for entry in extracted["fields"]:
        by_key[entry["key"]] = entry

    assert by_key["product_name"]["value"] == "甲胎蛋白检测试剂盒"
    assert by_key["intended_use"]["source_role"] == "说明书"
    assert "2-8℃保存" in by_key["storage_condition_and_validity"]["value"]
    assert by_key["package_specification"]["extractor"] == "rule"
def test_rule_extracts_bracket_sections_from_instructions():
    """Rule extraction reads values from ``【…】``-headed sections.

    The input places each heading on its own line followed by its content.
    The assertions check that content is attributed to the right field and
    that the text under ``【检测原理】`` does not end up in ``intended_use``.
    """
    manual_lines = [
        "【产品名称】",
        "新型冠状病毒2019-nCoV核酸检测试剂盒荧光PCR法",
        "【包装规格】",
        "规格A24人份/盒、48人份/盒、96人份/盒。",
        "规格B24人份/盒、48人份/盒、96人份/盒。",
        "【预期用途】",
        "本试剂盒用于体外定性检测咽拭子、痰液样本中新型冠状病毒2019-nCoVORF1ab和N基因。",
        "【检测原理】",
        "本段不应进入预期用途。",
        "【主要组成成分】",
        "表1 规格A大包装试剂盒组成成分",
        "组分\t规格\t数量",
        "PCR反应液\t24人份/盒\t1管",
        "【储存条件及有效期】",
        "-20±5℃的避光条件有效期12个月。",
        "反复冻融次数不得超过4次。",
        "【样本要求】",
        "适用样本类型:咽拭子、痰液。",
    ]
    documents = {"目标产品说明书.docx": "\n".join(manual_lines)}

    extracted = extract_by_rules(documents, _registration_specs())

    # Only the extracted values matter here, so map key -> value directly.
    value_of = {}
    for entry in extracted["fields"]:
        value_of[entry["key"]] = entry["value"]

    assert value_of["product_name"] == "新型冠状病毒2019-nCoV核酸检测试剂盒荧光PCR法"
    assert "规格A" in value_of["package_specification"]
    assert "检测原理" not in value_of["intended_use"]
    assert "PCR反应液" in value_of["main_components"]
    assert "-20±5℃" in value_of["storage_condition_and_validity"]
def test_rule_maps_agent_fields_to_manufacturer_company_for_now():
    """Agent fields are filled from the manufacturer lines of the manual.

    The input only contains manufacturer name, domicile and production
    address; the assertions expect ``agent_name`` / ``agent_address`` to carry
    the manufacturer name / domicile and ``manufacturer_address`` to carry the
    production address.
    """
    manual_lines = [
        "生产企业名称:卡尤迪生物科技宜兴有限公司",
        "生产企业住所江苏省宜兴经济技术开发区杏里路10号",
        "生产地址江苏省宜兴经济技术开发区杏里路10号宜兴光电产业园4幢102室",
    ]
    documents = {"目标产品说明书.docx": "\n".join(manual_lines)}

    extracted = extract_by_rules(documents, _registration_specs())

    value_of = {}
    for entry in extracted["fields"]:
        value_of[entry["key"]] = entry["value"]

    assert value_of["agent_name"] == "卡尤迪生物科技宜兴有限公司"
    assert value_of["agent_address"] == "江苏省宜兴经济技术开发区杏里路10号"
    assert value_of["manufacturer_address"] == "江苏省宜兴经济技术开发区杏里路10号宜兴光电产业园4幢102室"
def test_rule_stops_product_name_before_application_form_instructions():
    """The product name taken from an application form excludes trailing lines.

    The form text continues with applicant, authority and ``填表说明`` lines
    after the product-name line; the extracted ``product_name`` must equal
    the name alone and must not contain ``填表说明``.
    """
    form_lines = [
        "产品名称呼吸道合胞病毒、肺炎支原体核酸检测试剂盒荧光PCR法",
        "申请人:",
        "卡尤迪生物科技宜兴有限公司",
        "国家药品监督管理局",
        "填表说明",
        "1. 本表依据《体外诊断注册与备案管理办法》制定。",
    ]
    documents = {"境内体外诊断试剂注册申请表.docx": "\n".join(form_lines)}

    extracted = extract_by_rules(documents, _registration_specs())

    value_of = {}
    for entry in extracted["fields"]:
        value_of[entry["key"]] = entry["value"]

    assert value_of["product_name"] == "呼吸道合胞病毒、肺炎支原体核酸检测试剂盒荧光PCR法"
    assert "填表说明" not in value_of["product_name"]
def test_rule_ignores_generic_enterprise_name_from_application_form():
    """A stray ``企业名称`` label in a form yields no applicant or agent name.

    The form text contains ``企业名称:否`` inside an unrelated row; the
    extraction result must contain neither ``applicant_name`` nor
    ``agent_name``.
    """
    form_lines = [
        "注册人制度\t是 企业名称:否",
        "优先通道申请 应急通道 同品种首个产品首次申报",
        "临床试验",
        "临床试验机构名称: 中国医学科学院北京协和医院、晋中市第一人民医院",
        "应附资料",
    ]
    documents = {"CH1.4 申请表.docx": "\n".join(form_lines)}

    extracted = extract_by_rules(documents, _registration_specs())

    value_of = {}
    for entry in extracted["fields"]:
        value_of[entry["key"]] = entry["value"]

    assert "applicant_name" not in value_of
    assert "agent_name" not in value_of
def test_llm_extract_parses_structured_json(monkeypatch):
    """``extract_by_llm`` parses a JSON completion and tags fields as ``llm``.

    ``generate_completion`` in the field-extract module is replaced with a
    stub returning a fixed JSON document, so no real completion call is made.
    """
    canned_response = {
        "fields": [
            {
                "key": "product_name",
                "label": "产品名称",
                "value": "甲胎蛋白检测试剂盒",
                "source_file": "说明书.txt",
                "source_role": "说明书",
                "evidence": "产品名称:甲胎蛋白检测试剂盒",
                "confidence": 0.9,
            }
        ],
        "checklist_items": [],
    }

    def fake_completion(messages, temperature=0.0):
        # Serialise on every call, keeping non-ASCII characters unescaped.
        return json.dumps(canned_response, ensure_ascii=False)

    monkeypatch.setattr(
        "review_agent.application_form_fill.services.field_extract.generate_completion",
        fake_completion,
    )

    documents = {"说明书.txt": "产品名称:甲胎蛋白检测试剂盒"}
    extracted = extract_by_llm(documents, _registration_specs())

    first_field = extracted["fields"][0]
    assert first_field["extractor"] == "llm"
    assert first_field["value"] == "甲胎蛋白检测试剂盒"
def test_llm_extract_failure_returns_empty_result(monkeypatch):
    """A failing completion call produces an empty field list plus an error message.

    ``generate_completion`` is stubbed to raise ``TimeoutError("timeout")``;
    the result must have no fields and mention ``timeout`` in
    ``error_message``.
    """

    def failing_completion(messages, temperature=0.0):
        raise TimeoutError("timeout")

    monkeypatch.setattr(
        "review_agent.application_form_fill.services.field_extract.generate_completion",
        failing_completion,
    )

    documents = {"说明书.txt": "产品名称:甲胎蛋白检测试剂盒"}
    extracted = extract_by_llm(documents, _registration_specs())

    assert extracted["fields"] == []
    assert "timeout" in extracted["error_message"]
def test_parallel_extract_preserves_rule_result_when_llm_fails(monkeypatch):
    """Rule results survive in the combined payload when the LLM call fails.

    ``generate_completion`` is stubbed to raise ``TimeoutError("timeout")``;
    the payload must still carry non-empty ``regex_results`` fields, an empty
    ``llm_results`` field list, and the selected template name.
    """

    def failing_completion(messages, temperature=0.0):
        raise TimeoutError("timeout")

    monkeypatch.setattr(
        "review_agent.application_form_fill.services.field_extract.generate_completion",
        failing_completion,
    )

    documents = {"说明书.txt": "产品名称:甲胎蛋白检测试剂盒"}
    combined = run_parallel_extract(documents, _registration_specs())

    assert combined["regex_results"]["fields"]
    assert combined["llm_results"]["fields"] == []
    assert combined["selected_templates"] == ["registration_certificate"]
def test_save_field_extract_result_creates_json_artifact(settings, tmp_path, django_user_model):
    """Saving an extraction payload yields a JSON field-extract artifact.

    Builds the user, conversation, summary batch and form-fill batch rows the
    call needs, points ``MEDIA_ROOT`` at the temporary directory, and checks
    the artifact's type, file format and that a content hash is set.
    """
    settings.MEDIA_ROOT = tmp_path

    owner = django_user_model.objects.create_user(username="owner", password="pass")
    chat = Conversation.objects.create(user=owner, title="会话")
    summary_batch = FileSummaryBatch.objects.create(
        conversation=chat,
        user=owner,
        batch_no="FS-FIELD",
    )
    batch_work_dir = tmp_path / "aff" / "AFF-FIELD"
    fill_batch = ApplicationFormFillBatch.objects.create(
        conversation=chat,
        user=owner,
        source_summary_batch=summary_batch,
        batch_no="AFF-FIELD",
        work_dir=str(batch_work_dir),
    )

    empty_payload = {"regex_results": {"fields": []}, "llm_results": {"fields": []}}
    saved = save_field_extract_result(fill_batch, empty_payload)

    assert saved.artifact_type == ApplicationFormFillArtifact.ArtifactType.FIELD_EXTRACT_RESULT
    assert saved.file_format == ApplicationFormFillArtifact.FileFormat.JSON
    assert saved.content_hash