Files
DEMO-AGENT/tests/test_regulatory_text_extract.py

62 lines
2.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from pathlib import Path
from review_agent.regulatory_review.services.text_extract import extract_text
def test_extract_text_reads_plain_text(tmp_path):
    """Verify that a UTF-8 ``.txt`` file is extracted successfully.

    A two-line text file is written into ``tmp_path`` and handed to
    ``extract_text``. The result must contain a fragment of the written
    content, report the status ``"success"`` and carry a truthy
    ``content_hash``.
    """
    source_file = tmp_path / "说明书.txt"
    body = "产品名称:甲胎蛋白检测试剂盒\n储存条件2-8℃"
    source_file.write_text(body, encoding="utf-8")

    extracted = extract_text(source_file)

    assert "甲胎蛋白" in extracted.text
    assert extracted.status == "success"
    assert extracted.content_hash
def test_extract_text_keeps_wrapped_product_name(tmp_path):
    """Verify that a product name continued on the next line stays intact.

    The fixture file holds a product-name line, a continuation line and a
    model/specification line. The expected ``field_candidates`` entry for
    the product name is the first line's value and the continuation line
    joined by a single space; the specification entry is the text that
    follows its label.
    """
    fixture_lines = [
        "产品名称:呼吸道合胞病毒、肺炎支原体核酸检测试剂盒\n",
        "荧光PCR法\n",
        "型号规格24人份/盒\n",
    ]
    form_file = tmp_path / "申请表.txt"
    form_file.write_text("".join(fixture_lines), encoding="utf-8")

    extracted = extract_text(form_file)
    candidates = extracted.field_candidates

    assert candidates["产品名称"] == "呼吸道合胞病毒、肺炎支原体核酸检测试剂盒 荧光PCR法"
    assert candidates["型号规格"] == "24人份/盒"
def test_extract_text_reports_unsupported_file(tmp_path):
    """Verify that a ``.png`` file is reported as unsupported.

    A small dummy byte payload is written to ``image.png``; the extraction
    result must have the status ``"unsupported"`` and an empty ``text``.
    """
    image_file = tmp_path / "image.png"
    image_file.write_bytes(b"png")

    outcome = extract_text(image_file)

    assert outcome.status == "unsupported"
    assert outcome.text == ""
def test_extract_text_from_docx_preserves_table_text(tmp_path):
    """Verify that DOCX table cells are extracted in document order.

    The generated document contains a heading paragraph, a 2x2 table and
    two further paragraphs. The extracted text must contain the header row
    with its cells separated by a tab, and the table content must appear
    before the heading paragraph that follows the table.
    """
    # Imported inside the test, as in the original, so python-docx is only
    # loaded when this test actually runs.
    from docx import Document

    docx_file = tmp_path / "说明书.docx"

    doc = Document()
    doc.add_paragraph("【主要组成成分】")

    grid = doc.add_table(rows=2, cols=2)
    cell_values = (
        ("组分", "数量"),
        ("PCR反应液", "1管"),
    )
    # Fill the table row by row, left to right.
    for row, row_values in zip(grid.rows, cell_values):
        for cell, value in zip(row.cells, row_values):
            cell.text = value

    doc.add_paragraph("【储存条件及有效期】")
    doc.add_paragraph("-20±5℃保存有效期12个月。")
    doc.save(docx_file)

    extracted = extract_text(docx_file)
    full_text = extracted.text

    assert extracted.status == "success"
    assert "组分\t数量" in full_text
    # The table sits between the two headings, so its text must come first.
    assert full_text.index("PCR反应液") < full_text.index("【储存条件及有效期】")