# DEMO-AGENT/tests/test_regulatory_text_extract.py

from pathlib import Path

from review_agent.regulatory_review.services.text_extract import extract_text


def test_extract_text_reads_plain_text(tmp_path: Path) -> None:
    """A UTF-8 ``.txt`` file is extracted with status ``"success"`` and a truthy hash."""
    body = "产品名称：甲胎蛋白检测试剂盒\n储存条件：2-8℃"
    source_file = tmp_path / "说明书.txt"
    source_file.write_text(body, encoding="utf-8")

    extracted = extract_text(source_file)

    # Content first, then the status flag, then the (non-empty) content hash.
    assert "甲胎蛋白" in extracted.text
    assert extracted.status == "success"
    assert extracted.content_hash


def test_extract_text_keeps_wrapped_product_name(tmp_path: Path) -> None:
    """A product name that wraps onto the next line is returned as one field value."""
    form_text = (
        "产品名称：呼吸道合胞病毒、肺炎支原体核酸检测试剂盒\n"
        "（荧光PCR法）\n"
        "型号规格：24人份/盒\n"
    )
    source_file = tmp_path / "申请表.txt"
    source_file.write_text(form_text, encoding="utf-8")

    extracted = extract_text(source_file)
    candidates = extracted.field_candidates

    # The continuation line is expected to be joined to the first with one space,
    # while the following labelled line stays a separate field.
    assert candidates["产品名称"] == "呼吸道合胞病毒、肺炎支原体核酸检测试剂盒 （荧光PCR法）"
    assert candidates["型号规格"] == "24人份/盒"


def test_extract_text_reports_unsupported_file(tmp_path: Path) -> None:
    """A ``.png`` file yields status ``"unsupported"`` and an empty text."""
    source_file = tmp_path / "image.png"
    # The payload is a placeholder, not a real PNG.
    source_file.write_bytes(b"png")

    extracted = extract_text(source_file)

    assert extracted.status == "unsupported"
    assert extracted.text == ""


def test_extract_text_from_docx_preserves_table_text(tmp_path: Path) -> None:
    """Table cells in a ``.docx`` come out tab-separated and in document order."""
    # Function-scope import: python-docx is only needed when this test runs.
    from docx import Document

    cell_rows = (
        ("组分", "数量"),
        ("PCR反应液", "1管"),
    )

    source_file = tmp_path / "说明书.docx"
    doc = Document()
    doc.add_paragraph("【主要组成成分】")
    table = doc.add_table(rows=2, cols=2)
    # Fill the 2x2 table row by row, left to right.
    for row, values in zip(table.rows, cell_rows):
        for cell, value in zip(row.cells, values):
            cell.text = value
    doc.add_paragraph("【储存条件及有效期】")
    doc.add_paragraph("-20±5℃保存，有效期12个月。")
    doc.save(source_file)

    extracted = extract_text(source_file)

    assert extracted.status == "success"
    assert "组分\t数量" in extracted.text
    # The table was inserted before the second heading, so its text must
    # appear earlier in the extracted output.
    table_pos = extracted.text.index("PCR反应液")
    heading_pos = extracted.text.index("【储存条件及有效期】")
    assert table_pos < heading_pos