"""Tests for the application-form field extraction services.

Covers the rule-based extractor, the LLM-based extractor (with the completion
call monkeypatched), the combined parallel extraction and the persistence of
an extraction result as an artifact.
"""

import json

import pytest

from review_agent.application_form_fill.services.field_extract import (
    extract_by_llm,
    extract_by_rules,
    run_parallel_extract,
    save_field_extract_result,
)
from review_agent.application_form_fill.services.template_config import load_template_config
from review_agent.application_form_fill.services.template_select import select_templates
from review_agent.models import (
    ApplicationFormFillArtifact,
    ApplicationFormFillBatch,
    Conversation,
    FileSummaryBatch,
)

pytestmark = pytest.mark.django_db

# Dotted path of the completion function that the LLM tests replace.
_GENERATE_COMPLETION = (
    "review_agent.application_form_fill.services.field_extract.generate_completion"
)


def _registration_specs():
    """Return the template specs selected for a registration certificate."""
    specs, _ = select_templates(
        load_template_config(),
        ["registration_certificate"],
        "首次注册",
    )
    return specs


def _document(*lines):
    """Join the given lines into one newline-separated document text."""
    return "\n".join(lines)


def _fields_by_key(result):
    """Map each extracted field's key to the complete field record."""
    return {entry["key"]: entry for entry in result["fields"]}


def _values_by_key(result):
    """Map each extracted field's key to its extracted value."""
    return {entry["key"]: entry["value"] for entry in result["fields"]}


def _raise_timeout(messages, temperature=0.0):
    """Stand-in for the completion call that always raises ``TimeoutError``."""
    raise TimeoutError("timeout")


def test_rule_extracts_registration_certificate_fields():
    """Rule extraction reads ``label:value`` lines and tags source and extractor."""
    source_texts = {
        "产品说明书.txt": _document(
            "产品名称:甲胎蛋白检测试剂盒",
            "包装规格:20人份/盒",
            "预期用途:用于体外定量检测人血清中甲胎蛋白含量",
            "产品储存条件及有效期:2-8℃保存,有效期12个月",
        )
    }

    extracted = extract_by_rules(source_texts, _registration_specs())
    fields = _fields_by_key(extracted)

    assert fields["product_name"]["value"] == "甲胎蛋白检测试剂盒"
    assert fields["intended_use"]["source_role"] == "说明书"
    assert "2-8℃保存" in fields["storage_condition_and_validity"]["value"]
    assert fields["package_specification"]["extractor"] == "rule"


def test_rule_extracts_bracket_sections_from_instructions():
    """Rule extraction reads 【...】 sections without leaking the next section."""
    source_texts = {
        "目标产品说明书.docx": _document(
            "【产品名称】",
            "新型冠状病毒2019-nCoV核酸检测试剂盒(荧光PCR法)",
            "【包装规格】",
            "规格A:24人份/盒、48人份/盒、96人份/盒。",
            "规格B:24人份/盒、48人份/盒、96人份/盒。",
            "【预期用途】",
            "本试剂盒用于体外定性检测咽拭子、痰液样本中新型冠状病毒(2019-nCoV)ORF1ab和N基因。",
            "【检测原理】",
            "本段不应进入预期用途。",
            "【主要组成成分】",
            "表1 规格A大包装试剂盒组成成分",
            "组分\t规格\t数量",
            "PCR反应液\t24人份/盒\t1管",
            "【储存条件及有效期】",
            "-20±5℃的避光条件,有效期12个月。",
            "反复冻融次数不得超过4次。",
            "【样本要求】",
            "适用样本类型:咽拭子、痰液。",
        )
    }

    extracted = extract_by_rules(source_texts, _registration_specs())
    found = _values_by_key(extracted)

    assert found["product_name"] == "新型冠状病毒2019-nCoV核酸检测试剂盒(荧光PCR法)"
    assert "规格A" in found["package_specification"]
    assert "检测原理" not in found["intended_use"]
    assert "PCR反应液" in found["main_components"]
    assert "-20±5℃" in found["storage_condition_and_validity"]


def test_rule_maps_agent_fields_to_manufacturer_company_for_now():
    """The agent name/address fields carry the manufacturer's name and residence."""
    source_texts = {
        "目标产品说明书.docx": _document(
            "生产企业名称:卡尤迪生物科技宜兴有限公司",
            "生产企业住所:江苏省宜兴经济技术开发区杏里路10号",
            "生产地址:江苏省宜兴经济技术开发区杏里路10号宜兴光电产业园4幢102室",
        )
    }

    extracted = extract_by_rules(source_texts, _registration_specs())
    found = _values_by_key(extracted)

    assert found["agent_name"] == "卡尤迪生物科技宜兴有限公司"
    assert found["agent_address"] == "江苏省宜兴经济技术开发区杏里路10号"
    assert found["manufacturer_address"] == "江苏省宜兴经济技术开发区杏里路10号宜兴光电产业园4幢102室"


def test_rule_stops_product_name_before_application_form_instructions():
    """The product name does not absorb the form's trailing filling instructions."""
    source_texts = {
        "境内体外诊断试剂注册申请表.docx": _document(
            "产品名称:呼吸道合胞病毒、肺炎支原体核酸检测试剂盒(荧光PCR法)",
            "申请人:",
            "卡尤迪生物科技宜兴有限公司",
            "国家药品监督管理局",
            "填表说明",
            "1. 本表依据《体外诊断注册与备案管理办法》制定。",
        )
    }

    extracted = extract_by_rules(source_texts, _registration_specs())
    found = _values_by_key(extracted)

    assert found["product_name"] == "呼吸道合胞病毒、肺炎支原体核酸检测试剂盒(荧光PCR法)"
    assert "填表说明" not in found["product_name"]


def test_llm_extract_parses_structured_json(monkeypatch):
    """A JSON completion is parsed into fields marked with the ``llm`` extractor."""
    completion_payload = {
        "fields": [
            {
                "key": "product_name",
                "label": "产品名称",
                "value": "甲胎蛋白检测试剂盒",
                "source_file": "说明书.txt",
                "source_role": "说明书",
                "evidence": "产品名称:甲胎蛋白检测试剂盒",
                "confidence": 0.9,
            }
        ],
        "checklist_items": [],
    }

    def fake_completion(messages, temperature=0.0):
        # Serialise on every call, keeping non-ASCII characters unescaped.
        return json.dumps(completion_payload, ensure_ascii=False)

    monkeypatch.setattr(_GENERATE_COMPLETION, fake_completion)

    extracted = extract_by_llm(
        {"说明书.txt": "产品名称:甲胎蛋白检测试剂盒"},
        _registration_specs(),
    )
    first_field = extracted["fields"][0]

    assert first_field["extractor"] == "llm"
    assert first_field["value"] == "甲胎蛋白检测试剂盒"


def test_llm_extract_failure_returns_empty_result(monkeypatch):
    """A failing completion call yields no fields plus an error message."""
    monkeypatch.setattr(_GENERATE_COMPLETION, _raise_timeout)

    extracted = extract_by_llm(
        {"说明书.txt": "产品名称:甲胎蛋白检测试剂盒"},
        _registration_specs(),
    )

    assert extracted["fields"] == []
    assert "timeout" in extracted["error_message"]


def test_parallel_extract_preserves_rule_result_when_llm_fails(monkeypatch):
    """Rule results are still returned when the LLM completion call fails."""
    monkeypatch.setattr(_GENERATE_COMPLETION, _raise_timeout)

    combined = run_parallel_extract(
        {"说明书.txt": "产品名称:甲胎蛋白检测试剂盒"},
        _registration_specs(),
    )

    assert combined["regex_results"]["fields"]
    assert combined["llm_results"]["fields"] == []
    assert combined["selected_templates"] == ["registration_certificate"]


def test_save_field_extract_result_creates_json_artifact(settings, tmp_path, django_user_model):
    """Saving an extraction payload returns a JSON artifact with a content hash."""
    settings.MEDIA_ROOT = tmp_path

    owner = django_user_model.objects.create_user(username="owner", password="pass")
    conversation = Conversation.objects.create(user=owner, title="会话")
    summary_batch = FileSummaryBatch.objects.create(
        conversation=conversation,
        user=owner,
        batch_no="FS-FIELD",
    )
    fill_batch = ApplicationFormFillBatch.objects.create(
        conversation=conversation,
        user=owner,
        source_summary_batch=summary_batch,
        batch_no="AFF-FIELD",
        work_dir=str(tmp_path / "aff" / "AFF-FIELD"),
    )
    empty_payload = {"regex_results": {"fields": []}, "llm_results": {"fields": []}}

    artifact = save_field_extract_result(fill_batch, empty_payload)

    assert artifact.artifact_type == ApplicationFormFillArtifact.ArtifactType.FIELD_EXTRACT_RESULT
    assert artifact.file_format == ApplicationFormFillArtifact.FileFormat.JSON
    assert artifact.content_hash