From d640ced7488f41e96c9b4d09f8406b8b2b4c7d23 Mon Sep 17 00:00:00 2001 From: bruce Date: Sun, 7 Jun 2026 20:26:32 +0800 Subject: [PATCH] =?UTF-8?q?fix(application-form-fill):=20=E6=B8=85?= =?UTF-8?q?=E7=90=86=E5=A1=AB=E8=A1=A8=E8=AF=B4=E6=98=8E=E5=B9=B6=E6=94=B6?= =?UTF-8?q?=E7=AA=84=E6=8C=89=E9=92=AE=E8=AF=9D=E6=9C=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../services/field_extract.py | 13 ++++++-- .../services/word_fill.py | 29 +++++++++++++++++ templates/home.html | 2 +- ...est_application_form_fill_field_extract.py | 21 ++++++++++++ tests/test_application_form_fill_word_fill.py | 32 +++++++++++++++++++ tests/test_file_summary_frontend.py | 3 +- 6 files changed, 96 insertions(+), 4 deletions(-) diff --git a/review_agent/application_form_fill/services/field_extract.py b/review_agent/application_form_fill/services/field_extract.py index 82650a0..7bb636f 100644 --- a/review_agent/application_form_fill/services/field_extract.py +++ b/review_agent/application_form_fill/services/field_extract.py @@ -28,6 +28,15 @@ FIELD_ALIASES = { "storage_condition_and_validity": ["产品储存条件及有效期", "储存条件及有效期", "储存条件", "有效期"], } +STATIC_STOP_LABELS = [ + "申请人", + "国家药品监督管理局", + "填表说明", + "注", + "保证书", + "应附资料", +] + def collect_document_texts(summary_batch: FileSummaryBatch) -> dict[str, str]: texts: dict[str, str] = {} @@ -180,7 +189,7 @@ def _field_aliases(field: dict[str, str]) -> list[str]: def _all_field_labels(fields: list[dict[str, str]]) -> list[str]: - labels: list[str] = [] + labels: list[str] = list(STATIC_STOP_LABELS) for field in fields: for label in _field_aliases(field): if label not in labels: @@ -194,7 +203,7 @@ def _extract_label_value(text: str, label: str, labels: list[str]) -> tuple[str, def _extract_colon_label_value(text: str, label: str, labels: list[str]) -> tuple[str, str]: escaped_labels = "|".join(re.escape(item) for item in labels if item != label) - stop_pattern = rf"(?=\n\s*(?:{escaped_labels})\s*[::])" if escaped_labels else r"(?=\Z)" + stop_pattern = rf"(?=\n\s*(?:{escaped_labels})(?:\s*[::]|\s*$))" if escaped_labels else r"(?=\Z)" pattern = re.compile(rf"{re.escape(label)}\s*[::]\s*(.+?)(?:{stop_pattern}|\Z)", re.S) match = pattern.search(text or "") if not match: diff --git a/review_agent/application_form_fill/services/word_fill.py b/review_agent/application_form_fill/services/word_fill.py index 195b918..9a6e11a 100644 --- a/review_agent/application_form_fill/services/word_fill.py +++ b/review_agent/application_form_fill/services/word_fill.py @@ -22,6 +22,7 @@ def fill_template( conflicts: list[dict] | None = None, ) -> Path: document = Document(str(template_path)) + remove_fill_instructions(document) conflict_keys = {item.get("field_key") for item in conflicts or []} for field_config in spec.fields: target = field_config.get("target") or {} @@ -43,6 +44,25 @@ def fill_template( return output +def remove_fill_instructions(document: Document) -> None: + removing = False + for paragraph in list(document.paragraphs): + text = _normalize_label(paragraph.text) + if text == "填表说明": + removing = True + if removing: + _remove_paragraph(paragraph) + continue + if text.startswith("注填表前") and "填表说明" in text: + _remove_paragraph(paragraph) + + for table in document.tables: + for row in list(table.rows): + row_text = _normalize_label("".join(cell.text for cell in row.cells)) + if row_text == "填表说明" or row_text.startswith("注填表前"): + _remove_row(row) + + def fill_table_row(document: Document, row_label: str, value: str, *, conflict: bool = False) -> bool: normalized_label = _normalize_label(row_label) for table in document.tables: @@ -71,6 +91,15 @@ def apply_cell_shading(cell, fill: str) -> None: shading.set(qn("w:fill"), fill) +def _remove_paragraph(paragraph) -> None: + element = paragraph._element + element.getparent().remove(element) + + +def _remove_row(row) -> None: + row._tr.getparent().remove(row._tr) + + def create_word_export( batch: ApplicationFormFillBatch, spec: TemplateSpec, diff --git a/templates/home.html b/templates/home.html index 38fa136..ef75d33 100644 --- a/templates/home.html +++ b/templates/home.html @@ -211,7 +211,7 @@ diff --git a/tests/test_application_form_fill_field_extract.py b/tests/test_application_form_fill_field_extract.py index b1e2b01..28f020b 100644 --- a/tests/test_application_form_fill_field_extract.py +++ b/tests/test_application_form_fill_field_extract.py @@ -103,6 +103,27 @@ def test_rule_maps_agent_fields_to_manufacturer_company_for_now(): assert values["manufacturer_address"] == "江苏省宜兴经济技术开发区杏里路10号宜兴光电产业园4幢102室" +def test_rule_stops_product_name_before_application_form_instructions(): + texts = { + "境内体外诊断试剂注册申请表.docx": "\n".join( + [ + "产品名称:呼吸道合胞病毒、肺炎支原体核酸检测试剂盒(荧光PCR法)", + "申请人:", + "卡尤迪生物科技宜兴有限公司", + "国家药品监督管理局", + "填表说明", + "1. 本表依据《体外诊断注册与备案管理办法》制定。", + ] + ) + } + + result = extract_by_rules(texts, _registration_specs()) + + values = {field["key"]: field["value"] for field in result["fields"]} + assert values["product_name"] == "呼吸道合胞病毒、肺炎支原体核酸检测试剂盒(荧光PCR法)" + assert "填表说明" not in values["product_name"] + + def test_llm_extract_parses_structured_json(monkeypatch): monkeypatch.setattr( "review_agent.application_form_fill.services.field_extract.generate_completion", diff --git a/tests/test_application_form_fill_word_fill.py b/tests/test_application_form_fill_word_fill.py index 04c918f..2708e5d 100644 --- a/tests/test_application_form_fill_word_fill.py +++ b/tests/test_application_form_fill_word_fill.py @@ -41,6 +41,17 @@ def _template(path): document.save(path) +def _template_with_instructions(path): + document = Document() + table = document.add_table(rows=2, cols=2) + table.rows[0].cells[0].text = "产品名称" + table.rows[1].cells[0].text = "预期用途" + document.add_paragraph("填表说明") + document.add_paragraph("1. 本表依据《体外诊断注册与备案管理办法》制定。") + document.add_paragraph("2. 本表可从国家药品监督管理局网站下载。") + document.save(path) + + def test_word_fill_writes_table_rows(tmp_path): template_path = tmp_path / "template.docx" output_path = tmp_path / "filled.docx" @@ -61,6 +72,27 @@ def test_word_fill_writes_table_rows(tmp_path): assert document.tables[0].rows[1].cells[1].text == "用于体外检测" +def test_word_fill_removes_template_fill_instructions(tmp_path): + template_path = tmp_path / "template.docx" + output_path = tmp_path / "filled.docx" + _template_with_instructions(template_path) + + fill_template( + template_path, + output_path, + _spec(), + { + "product_name": MergedField("product_name", "产品名称", "甲胎蛋白检测试剂盒", "说明书.txt", "证据", 0.8), + }, + ) + + document = Document(output_path) + text = "\n".join(paragraph.text for paragraph in document.paragraphs) + assert "填表说明" not in text + assert "本表依据" not in text + assert document.tables[0].rows[0].cells[1].text == "甲胎蛋白检测试剂盒" + + def test_word_fill_highlights_conflict_in_docx_xml(tmp_path): template_path = tmp_path / "template.docx" output_path = tmp_path / "filled.docx" diff --git a/tests/test_file_summary_frontend.py b/tests/test_file_summary_frontend.py index 5481f5b..1355619 100644 --- a/tests/test_file_summary_frontend.py +++ b/tests/test_file_summary_frontend.py @@ -251,6 +251,7 @@ def test_workspace_tool_buttons_fill_default_prompts(client, django_user_model): assert ">风险预警" not in content assert 'data-prompt-template="请对当前对话已上传的文件或压缩包自动汇总文件目录' in content assert 'data-prompt-template="请对当前对话最近成功汇总的注册资料发起 NMPA 法规核查与风险预警' in content - assert 'data-prompt-template="请基于当前对话最近成功汇总的产品资料,自动提取产品关键信息并填入申报文件模板' in content + assert 'data-prompt-template="请基于当前对话最近成功汇总的产品资料,自动提取产品关键信息并填入申报文件模板"' in content + assert "优先生成注册证 Word 和字段来源追溯清单" not in content assert "bindPromptTemplateButtons" in script assert "promptInput.value = template" in script