# DEMO-AGENT/tests/test_regulatory_info_package_field_extract.py

import json

from review_agent.regulatory_info_package.schemas import InstructionExtractResult
from review_agent.regulatory_info_package.services.field_extract import extract_fields_by_rules, run_parallel_extract


def test_extract_fields_by_rules_finds_product_name_and_storage():
    """Labelled product-name and storage lines yield their values.

    The fixture carries the same two "label：value" lines both as separate
    paragraphs and as newline-joined front text; the extractor is expected
    to report the text after each label under ``product_name`` and
    ``storage_condition``.
    """
    labelled_lines = ["产品名称：新型冠状病毒检测试剂盒", "储存条件：2-8℃保存"]
    doc = InstructionExtractResult(
        source_file_name="目标产品说明书.docx",
        paragraphs=labelled_lines,
        sections={},
        tables=[],
        component_tables=[],
        # Front text is the same two lines separated by a newline.
        front_text="\n".join(labelled_lines),
    )

    extracted = extract_fields_by_rules(doc)

    expected_values = {
        "product_name": "新型冠状病毒检测试剂盒",
        "storage_condition": "2-8℃保存",
    }
    for field_name, expected in expected_values.items():
        assert extracted[field_name]["value"] == expected


def test_extract_fields_by_rules_uses_registrant_or_manufacturer_for_applicant():
    """Applicant fields are filled from registrant/manufacturer lines.

    The fixture has no explicit "applicant" label. The expected applicant
    name is the company named on the registrant and manufacturer lines (both
    lines name the same company here), the expected applicant address is the
    manufacturer's registered address, and the contact / production address
    come from their own labelled lines.
    """
    company = "卡尤迪生物科技宜兴有限公司"
    registered_address = "宜兴经济技术开发区杏里路10号宜兴光电产业园4幢101室、102室"
    contact = "0510-80330909, 0510-80330919"
    production_address = "江苏省宜兴经济技术开发区杏里路10号宜兴光电产业园4幢102室"

    doc = InstructionExtractResult(
        source_file_name="目标产品说明书.docx",
        paragraphs=[
            "注册人/售后服务单位名称：" + company,
            "生产企业名称：" + company,
            "生产企业住所：" + registered_address,
            # The contact line has a space after the colon; the expected
            # value below does not include it.
            "联系方式： " + contact,
            "生产地址：" + production_address,
        ],
        sections={},
        tables=[],
        component_tables=[],
        front_text="",
    )

    extracted = extract_fields_by_rules(doc)

    expected_values = {
        "applicant_name": company,
        "manufacturer_name": company,
        "applicant_address": registered_address,
        "applicant_contact": contact,
        "production_address": production_address,
    }
    for field_name, expected in expected_values.items():
        assert extracted[field_name]["value"] == expected


def test_extract_fields_by_rules_serializes_component_table_and_notes():
    """Component table comes back as JSON text; section text becomes notes.

    ``component_table`` is expected to hold a JSON string that decodes to a
    mapping with ``header`` and ``rows``, and ``component_notes`` is expected
    to equal the full text of the "【主要组成成分】" section.
    """
    notes_text = "表1  规格A大包装试剂盒组成成分\n注：不同批号试剂盒中各组分不得互换使用。"
    header = ["组分", "主要组成成分", "规格（24人份/盒）", "规格（48人份/盒）"]
    rows = [
        ["PCR反应液 I", "逆转录酶、Taq酶", "840μL/管×1管", "840μL/管×2管"],
        ["阳性对照品", "含目的片段的假病毒", "600μL/管×2管", "1200μL/管×2管"],
    ]
    doc = InstructionExtractResult(
        source_file_name="目标产品说明书.docx",
        paragraphs=[],
        sections={"【主要组成成分】": notes_text},
        tables=[],
        component_tables=[{"header": header, "rows": rows}],
        front_text="",
    )

    extracted = extract_fields_by_rules(doc)
    # The table value is a JSON document, so decode it before inspecting.
    decoded_table = json.loads(extracted["component_table"]["value"])

    assert decoded_table["header"][:2] == ["组分", "主要组成成分"]
    first_row = decoded_table["rows"][0]
    assert first_row[0] == "PCR反应液 I"
    assert extracted["component_notes"]["value"] == notes_text


def test_run_parallel_extract_keeps_rule_result_when_llm_fails():
    """A failing LLM extractor must not discard the rule-based results.

    The injected LLM callable always raises ``ValueError``; the combined
    result is expected to still carry the regex-derived product name, an
    empty ``llm_results`` mapping, and a truthy ``llm_error``.
    """

    def failing_llm(_instruction):
        # Stand-in for an LLM call that blows up on every invocation.
        raise ValueError("bad llm")

    labelled_line = "产品名称：测试产品"
    doc = InstructionExtractResult(
        source_file_name="目标产品说明书.docx",
        paragraphs=[labelled_line],
        sections={},
        tables=[],
        component_tables=[],
        front_text=labelled_line,
    )

    outcome = run_parallel_extract(doc, llm_extract_func=failing_llm)

    regex_results = outcome["regex_results"]
    assert regex_results["product_name"]["value"] == "测试产品"
    assert outcome["llm_results"] == {}
    assert outcome["llm_error"]