fix(regulatory): 修复换行产品名称提取不全
This commit is contained in:
@@ -21,6 +21,7 @@ class ExtractedText:
|
|||||||
|
|
||||||
|
|
||||||
# File suffixes (lower-case, with the leading dot) listed as supported.
# NOTE(review): presumably consulted by extract_text to reject other file
# types -- confirm against the caller, which is not visible here.
SUPPORTED_EXTENSIONS = {".txt", ".md", ".pdf", ".docx", ".pptx", ".xlsx", ".doc"}

# Form-field labels recognised at the start of a line as "label: value".
# Shared by _field_candidates and _starts_field_line so both agree on the set.
FIELD_LABELS = ["产品名称", "型号规格", "预期用途", "管理类别", "分类编码", "注册类型", "临床评价路径"]
|
||||||
|
|
||||||
|
|
||||||
def extract_text(path: str | Path) -> ExtractedText:
|
def extract_text(path: str | Path) -> ExtractedText:
|
||||||
@@ -69,8 +70,32 @@ def _section_candidates(text: str) -> list[str]:
|
|||||||
|
|
||||||
def _field_candidates(text: str) -> dict[str, str]:
    """Collect ``label: value`` form fields from *text*, joining wrapped values.

    A field starts on a line that begins with one of ``FIELD_LABELS`` followed
    by a colon. The value is the rest of that line plus every following line
    until a blank line, a line that starts another field, or a line that looks
    like a section heading. This keeps values that were wrapped onto the next
    line (for example a product name whose parenthesised suffix was pushed to
    its own line).

    Args:
        text: Plain text to scan.

    Returns:
        Mapping of label to its whitespace-normalised value. Only the first
        non-empty occurrence of each label is kept; labels with no value are
        omitted.
    """
    fields: dict[str, str] = {}

    # Compile once per call instead of once per (line, label) pair.
    # Accept both the ASCII colon and the full-width colon (U+FF1A).
    label_patterns = [
        (label, re.compile(rf"^{re.escape(label)}[:\uFF1A]\s*(.*)$"))
        for label in FIELD_LABELS
    ]

    lines = [raw_line.strip() for raw_line in text.splitlines()]
    for index, line in enumerate(lines):
        if not line:
            continue
        for label, pattern in label_patterns:
            if label in fields:
                # First non-empty occurrence wins.
                continue
            match = pattern.match(line)
            if not match:
                continue

            value_parts = [match.group(1).strip()]
            for continuation in lines[index + 1 :]:
                # Stop at the end of the paragraph or at the next structural line.
                if (
                    not continuation
                    or _starts_field_line(continuation)
                    or _looks_like_section_heading(continuation)
                ):
                    break
                value_parts.append(continuation)

            value = " ".join(part for part in value_parts if part)
            if value:
                # Collapse any internal runs of whitespace to single spaces.
                fields[label] = " ".join(value.split())
    return fields
|
||||||
|
|
||||||
|
|
||||||
|
def _starts_field_line(line: str) -> bool:
    """Return True when *line* begins a new ``label:`` form field.

    Two checks are applied to the start of the line:

    1. one of the known ``FIELD_LABELS`` immediately followed by a colon;
    2. any run of 2 to 24 characters containing no whitespace and no colon,
       immediately followed by a colon (a generic, unknown label).

    Both the ASCII colon and the full-width colon (U+FF1A) are accepted.

    Args:
        line: A single line, expected to be already stripped by the caller.

    Returns:
        True if the line looks like the start of a field, otherwise False.
    """
    if any(re.match(rf"^{re.escape(label)}[:\uFF1A]", line) for label in FIELD_LABELS):
        return True
    return bool(re.match(r"^[^\s:\uFF1A]{2,24}[:\uFF1A]", line))
|
||||||
|
|
||||||
|
|
||||||
|
def _looks_like_section_heading(line: str) -> bool:
|
||||||
|
return bool(re.match(r"^([一二三四五六七八九十]+[、..]|[0-9]+(\.[0-9]+)*[、..\s])", line))
|
||||||
|
|||||||
@@ -82,6 +82,41 @@ def test_detect_regulatory_condition_prefers_attachment_fields_over_chapter_titl
|
|||||||
assert candidates["intended_use"]["suggested"] == "用于人血清中甲胎蛋白检测"
|
assert candidates["intended_use"]["suggested"] == "用于人血清中甲胎蛋白检测"
|
||||||
|
|
||||||
|
|
||||||
|
def test_detect_regulatory_condition_keeps_wrapped_product_name(settings, tmp_path, django_user_model):
    """A product name wrapped onto a second line is suggested in full.

    The application form puts the parenthesised method suffix on its own
    line; the suggested product name must contain both parts, and the
    following field must still be parsed as a separate value.
    """
    settings.MEDIA_ROOT = tmp_path
    user = django_user_model.objects.create_user(username="owner", password="pass")
    conversation = Conversation.objects.create(user=user, title="会话")
    summary = FileSummaryBatch.objects.create(
        conversation=conversation,
        user=user,
        batch_no="FS-COND",
        status=FileSummaryBatch.Status.SUCCESS,
        # The batch-level name is a chapter title, not the real product name.
        product_name="第1章 监管信息",
    )
    application = tmp_path / "application.txt"
    application.write_text(
        "产品名称:呼吸道合胞病毒、肺炎支原体核酸检测试剂盒\n"
        "(荧光PCR法)\n"
        "型号规格:24人份/盒\n"
        "预期用途:用于呼吸道合胞病毒、肺炎支原体核酸检测\n",
        encoding="utf-8",
    )
    FileSummaryItem.objects.create(
        batch=summary,
        file_index=1,
        directory_level="1. 监管信息 / 1.2 申请表",
        file_name="申请表.txt",
        file_type="txt",
        relative_path="1.监管信息/申请表.txt",
        storage_path=str(application),
    )

    candidates = detect_regulatory_condition_candidates(summary)

    assert candidates["product_name"]["suggested"] == "呼吸道合胞病毒、肺炎支原体核酸检测试剂盒 (荧光PCR法)"
    assert candidates["model_spec"]["suggested"] == "24人份/盒"
|
||||||
|
|
||||||
|
|
||||||
def test_workflow_pauses_before_rule_scope_until_conditions_confirmed(settings, tmp_path, django_user_model):
|
def test_workflow_pauses_before_rule_scope_until_conditions_confirmed(settings, tmp_path, django_user_model):
|
||||||
settings.MEDIA_ROOT = tmp_path
|
settings.MEDIA_ROOT = tmp_path
|
||||||
user = django_user_model.objects.create_user(username="owner", password="pass")
|
user = django_user_model.objects.create_user(username="owner", password="pass")
|
||||||
|
|||||||
@@ -14,6 +14,21 @@ def test_extract_text_reads_plain_text(tmp_path):
|
|||||||
assert result.content_hash
|
assert result.content_hash
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_text_keeps_wrapped_product_name(tmp_path):
    """extract_text joins a product name that wraps onto the next line.

    The continuation line is appended to the value with a single space, and
    the next labelled line is still extracted as its own field.
    """
    path = tmp_path / "申请表.txt"
    path.write_text(
        "产品名称:呼吸道合胞病毒、肺炎支原体核酸检测试剂盒\n"
        "(荧光PCR法)\n"
        "型号规格:24人份/盒\n",
        encoding="utf-8",
    )

    result = extract_text(path)

    assert result.field_candidates["产品名称"] == "呼吸道合胞病毒、肺炎支原体核酸检测试剂盒 (荧光PCR法)"
    assert result.field_candidates["型号规格"] == "24人份/盒"
|
||||||
|
|
||||||
|
|
||||||
def test_extract_text_reports_unsupported_file(tmp_path):
|
def test_extract_text_reports_unsupported_file(tmp_path):
|
||||||
path = tmp_path / "image.png"
|
path = tmp_path / "image.png"
|
||||||
path.write_bytes(b"png")
|
path.write_bytes(b"png")
|
||||||
|
|||||||
Reference in New Issue
Block a user