from pathlib import Path

from review_agent.regulatory_review.services.text_extract import extract_text


def test_extract_text_reads_plain_text(tmp_path):
    """Extracting a UTF-8 ``.txt`` file reports success, text, and a hash."""
    source_file = tmp_path / "说明书.txt"
    source_file.write_text("产品名称:甲胎蛋白检测试剂盒\n储存条件:2-8℃", encoding="utf-8")

    extracted = extract_text(source_file)

    assert "甲胎蛋白" in extracted.text
    assert extracted.status == "success"
    # Only truthiness is checked; the hash format is not pinned by this test.
    assert extracted.content_hash


def test_extract_text_keeps_wrapped_product_name(tmp_path):
    """A product name continued on the next line is joined with one space."""
    form_lines = (
        "产品名称:呼吸道合胞病毒、肺炎支原体核酸检测试剂盒\n",
        "(荧光PCR法)\n",
        "型号规格:24人份/盒\n",
    )
    source_file = tmp_path / "申请表.txt"
    source_file.write_text("".join(form_lines), encoding="utf-8")

    candidates = extract_text(source_file).field_candidates

    assert candidates["产品名称"] == "呼吸道合胞病毒、肺炎支原体核酸检测试剂盒 (荧光PCR法)"
    assert candidates["型号规格"] == "24人份/盒"


def test_extract_text_reports_unsupported_file(tmp_path):
    """A ``.png`` file yields the ``unsupported`` status and empty text."""
    image_file = tmp_path / "image.png"
    image_file.write_bytes(b"png")

    extracted = extract_text(image_file)

    assert extracted.status == "unsupported"
    assert extracted.text == ""


def test_extract_text_from_docx_preserves_table_text(tmp_path):
    """Table cells in a ``.docx`` are tab-joined and kept in document order."""
    # Imported locally, as in the original, so the module still imports
    # when python-docx is unavailable.
    from docx import Document

    docx_file = tmp_path / "说明书.docx"
    cell_values = (
        ("组分", "数量"),
        ("PCR反应液", "1管"),
    )

    doc = Document()
    doc.add_paragraph("【主要组成成分】")
    grid = doc.add_table(rows=2, cols=2)
    for row_idx, row_values in enumerate(cell_values):
        for col_idx, value in enumerate(row_values):
            grid.rows[row_idx].cells[col_idx].text = value
    doc.add_paragraph("【储存条件及有效期】")
    doc.add_paragraph("-20±5℃保存,有效期12个月。")
    doc.save(docx_file)

    extracted = extract_text(docx_file)

    assert extracted.status == "success"
    assert "组分\t数量" in extracted.text
    # The table body must appear before the heading that follows the table.
    table_pos = extracted.text.index("PCR反应液")
    heading_pos = extracted.text.index("【储存条件及有效期】")
    assert table_pos < heading_pos