198 lines
8.0 KiB
Python
198 lines
8.0 KiB
Python
import json
|
||
|
||
import pytest
|
||
|
||
from review_agent.application_form_fill.services.field_extract import (
|
||
extract_by_llm,
|
||
extract_by_rules,
|
||
run_parallel_extract,
|
||
save_field_extract_result,
|
||
)
|
||
from review_agent.application_form_fill.services.template_config import load_template_config
|
||
from review_agent.application_form_fill.services.template_select import select_templates
|
||
from review_agent.models import (
|
||
ApplicationFormFillArtifact,
|
||
ApplicationFormFillBatch,
|
||
Conversation,
|
||
FileSummaryBatch,
|
||
)
|
||
|
||
|
||
# Module-level pytest marker: applies `django_db` to every test in this file.
pytestmark = pytest.mark.django_db
|
||
|
||
|
||
def _registration_specs():
    """Return the specs chosen for the ``registration_certificate`` template.

    Loads the template config, asks ``select_templates`` for the
    ``registration_certificate`` template with the "首次注册" argument, and
    returns only the first element of the resulting pair; the second
    element is discarded.
    """
    selection = select_templates(
        load_template_config(),
        ["registration_certificate"],
        "首次注册",
    )
    selected_specs, _ = selection
    return selected_specs
|
||
|
||
|
||
def test_rule_extracts_registration_certificate_fields():
    """Rule extraction picks up ``label:value`` lines from an instructions file."""
    manual_lines = [
        "产品名称:甲胎蛋白检测试剂盒",
        "包装规格:20人份/盒",
        "预期用途:用于体外定量检测人血清中甲胎蛋白含量",
        "产品储存条件及有效期:2-8℃保存,有效期12个月",
    ]
    documents = {"产品说明书.txt": "\n".join(manual_lines)}

    extracted = extract_by_rules(documents, _registration_specs())

    # Index the full field records by key so metadata can be checked too.
    by_key = {}
    for entry in extracted["fields"]:
        by_key[entry["key"]] = entry

    assert by_key["product_name"]["value"] == "甲胎蛋白检测试剂盒"
    assert by_key["intended_use"]["source_role"] == "说明书"
    assert "2-8℃保存" in by_key["storage_condition_and_validity"]["value"]
    assert by_key["package_specification"]["extractor"] == "rule"
|
||
|
||
|
||
def test_rule_extracts_bracket_sections_from_instructions():
    """Rule extraction reads 【section】-style headings and their bodies.

    The fixture places a sentinel sentence under 【检测原理】 directly after
    the 【预期用途】 section; neither the heading text nor the sentinel body
    may end up in the extracted ``intended_use`` value.
    """
    texts = {
        "目标产品说明书.docx": "\n".join(
            [
                "【产品名称】",
                "新型冠状病毒2019-nCoV核酸检测试剂盒(荧光PCR法)",
                "【包装规格】",
                "规格A:24人份/盒、48人份/盒、96人份/盒。",
                "规格B:24人份/盒、48人份/盒、96人份/盒。",
                "【预期用途】",
                "本试剂盒用于体外定性检测咽拭子、痰液样本中新型冠状病毒(2019-nCoV)ORF1ab和N基因。",
                "【检测原理】",
                "本段不应进入预期用途。",
                "【主要组成成分】",
                "表1 规格A大包装试剂盒组成成分",
                "组分\t规格\t数量",
                "PCR反应液\t24人份/盒\t1管",
                "【储存条件及有效期】",
                "-20±5℃的避光条件,有效期12个月。",
                "反复冻融次数不得超过4次。",
                "【样本要求】",
                "适用样本类型:咽拭子、痰液。",
            ]
        )
    }

    result = extract_by_rules(texts, _registration_specs())

    values = {field["key"]: field["value"] for field in result["fields"]}
    assert values["product_name"] == "新型冠状病毒2019-nCoV核酸检测试剂盒(荧光PCR法)"
    assert "规格A" in values["package_specification"]
    assert "检测原理" not in values["intended_use"]
    # Checking only the heading text misses an extractor that drops the
    # bracket heading but still swallows the following section's body.
    assert "本段不应进入预期用途" not in values["intended_use"]
    assert "PCR反应液" in values["main_components"]
    assert "-20±5℃" in values["storage_condition_and_validity"]
|
||
|
||
|
||
def test_rule_maps_agent_fields_to_manufacturer_company_for_now():
    """Agent name/address are expected to equal the manufacturer name/住所 lines."""
    company_lines = [
        "生产企业名称:卡尤迪生物科技宜兴有限公司",
        "生产企业住所:江苏省宜兴经济技术开发区杏里路10号",
        "生产地址:江苏省宜兴经济技术开发区杏里路10号宜兴光电产业园4幢102室",
    ]
    documents = {"目标产品说明书.docx": "\n".join(company_lines)}

    extracted = extract_by_rules(documents, _registration_specs())

    value_by_key = {}
    for entry in extracted["fields"]:
        value_by_key[entry["key"]] = entry["value"]

    assert value_by_key["agent_name"] == "卡尤迪生物科技宜兴有限公司"
    assert value_by_key["agent_address"] == "江苏省宜兴经济技术开发区杏里路10号"
    assert (
        value_by_key["manufacturer_address"]
        == "江苏省宜兴经济技术开发区杏里路10号宜兴光电产业园4幢102室"
    )
|
||
|
||
|
||
def test_rule_stops_product_name_before_application_form_instructions():
    """The product name must not absorb later lines such as the 填表说明 text."""
    form_lines = [
        "产品名称:呼吸道合胞病毒、肺炎支原体核酸检测试剂盒(荧光PCR法)",
        "申请人:",
        "卡尤迪生物科技宜兴有限公司",
        "国家药品监督管理局",
        "填表说明",
        "1. 本表依据《体外诊断注册与备案管理办法》制定。",
    ]
    documents = {"境内体外诊断试剂注册申请表.docx": "\n".join(form_lines)}

    extracted = extract_by_rules(documents, _registration_specs())

    value_by_key = {}
    for entry in extracted["fields"]:
        value_by_key[entry["key"]] = entry["value"]
    product_name = value_by_key["product_name"]

    assert product_name == "呼吸道合胞病毒、肺炎支原体核酸检测试剂盒(荧光PCR法)"
    assert "填表说明" not in product_name
|
||
|
||
|
||
def test_llm_extract_parses_structured_json(monkeypatch):
    """``extract_by_llm`` parses a JSON completion and tags fields as ``llm``.

    ``generate_completion`` in the field_extract module is replaced with a
    stub that returns a fixed JSON document containing one field.
    """
    canned_field = {
        "key": "product_name",
        "label": "产品名称",
        "value": "甲胎蛋白检测试剂盒",
        "source_file": "说明书.txt",
        "source_role": "说明书",
        "evidence": "产品名称:甲胎蛋白检测试剂盒",
        "confidence": 0.9,
    }

    def _fake_completion(messages, temperature=0.0):
        # Serialize on every call, exactly as the stubbed completion would.
        return json.dumps(
            {"fields": [canned_field], "checklist_items": []},
            ensure_ascii=False,
        )

    monkeypatch.setattr(
        "review_agent.application_form_fill.services.field_extract.generate_completion",
        _fake_completion,
    )

    documents = {"说明书.txt": "产品名称:甲胎蛋白检测试剂盒"}
    outcome = extract_by_llm(documents, _registration_specs())

    first_field = outcome["fields"][0]
    assert first_field["extractor"] == "llm"
    assert first_field["value"] == "甲胎蛋白检测试剂盒"
|
||
|
||
|
||
def test_llm_extract_failure_returns_empty_result(monkeypatch):
    """A ``TimeoutError`` from the completion call yields no fields plus an error message."""

    def _timing_out_completion(messages, temperature=0.0):
        raise TimeoutError("timeout")

    monkeypatch.setattr(
        "review_agent.application_form_fill.services.field_extract.generate_completion",
        _timing_out_completion,
    )

    documents = {"说明书.txt": "产品名称:甲胎蛋白检测试剂盒"}
    outcome = extract_by_llm(documents, _registration_specs())

    assert outcome["fields"] == []
    assert "timeout" in outcome["error_message"]
|
||
|
||
|
||
def test_parallel_extract_preserves_rule_result_when_llm_fails(monkeypatch):
    """Rule results remain in the payload even when the LLM completion raises."""

    def _timing_out_completion(messages, temperature=0.0):
        raise TimeoutError("timeout")

    monkeypatch.setattr(
        "review_agent.application_form_fill.services.field_extract.generate_completion",
        _timing_out_completion,
    )

    documents = {"说明书.txt": "产品名称:甲胎蛋白检测试剂盒"}
    combined = run_parallel_extract(documents, _registration_specs())

    rule_fields = combined["regex_results"]["fields"]
    llm_fields = combined["llm_results"]["fields"]
    assert rule_fields
    assert llm_fields == []
    assert combined["selected_templates"] == ["registration_certificate"]
|
||
|
||
|
||
def test_save_field_extract_result_creates_json_artifact(settings, tmp_path, django_user_model):
    """Saving an extract payload returns a JSON FIELD_EXTRACT_RESULT artifact with a hash.

    ``MEDIA_ROOT`` is pointed at ``tmp_path`` and the batch ``work_dir`` is
    placed under the same temporary directory.
    """
    settings.MEDIA_ROOT = tmp_path

    # Build the owning user, conversation and summary batch the form-fill batch refers to.
    owner = django_user_model.objects.create_user(username="owner", password="pass")
    chat = Conversation.objects.create(user=owner, title="会话")
    summary_batch = FileSummaryBatch.objects.create(
        conversation=chat,
        user=owner,
        batch_no="FS-FIELD",
    )
    work_dir = tmp_path / "aff" / "AFF-FIELD"
    fill_batch = ApplicationFormFillBatch.objects.create(
        conversation=chat,
        user=owner,
        source_summary_batch=summary_batch,
        batch_no="AFF-FIELD",
        work_dir=str(work_dir),
    )

    empty_payload = {
        "regex_results": {"fields": []},
        "llm_results": {"fields": []},
    }
    saved = save_field_extract_result(fill_batch, empty_payload)

    expected_type = ApplicationFormFillArtifact.ArtifactType.FIELD_EXTRACT_RESULT
    expected_format = ApplicationFormFillArtifact.FileFormat.JSON
    assert saved.artifact_type == expected_type
    assert saved.file_format == expected_format
    assert saved.content_hash
|