fix(application-form-fill): 过滤申请表噪声冲突内容

This commit is contained in:
2026-06-07 20:34:24 +08:00
parent d640ced748
commit 003ff59268
4 changed files with 81 additions and 6 deletions

View File

@@ -17,11 +17,11 @@ from review_agent.regulatory_review.services.text_extract import extract_text
FIELD_ALIASES = {
"product_name": ["产品名称"],
"applicant_name": ["注册人名称", "生产企业名称", "企业名称", "生产企业"],
"applicant_address": ["注册人住所", "生产企业住所", "企业住所", "住所"],
"applicant_name": ["注册人名称", "申请人名称", "生产企业名称"],
"applicant_address": ["注册人住所", "申请人住所", "生产企业住所"],
"manufacturer_address": ["生产地址", "生产企业地址", "生产场所"],
"agent_name": ["代理人名称", "生产企业名称", "企业名称", "生产企业", "注册人名称"],
"agent_address": ["代理人住所", "生产企业住所", "企业住所", "住所", "注册人住所"],
"agent_name": ["代理人名称", "生产企业名称", "注册人名称", "申请人名称"],
"agent_address": ["代理人住所", "生产企业住所", "注册人住所", "申请人住所"],
"package_specification": ["包装规格", "规格"],
"main_components": ["主要组成成分", "主要组成", "组成成分"],
"intended_use": ["预期用途"],
@@ -35,6 +35,14 @@ STATIC_STOP_LABELS = [
"",
"保证书",
"应附资料",
"优先通道申请",
"分类编码",
"医疗器械唯一标识",
"注册产品目前是否",
"临床评价路径",
"临床试验",
"其他需要说明的问题",
"国家药监局器审中心医疗器械",
]

View File

@@ -22,10 +22,11 @@ def build_assistant_summary(batch: ApplicationFormFillBatch, exports: list[Expor
lines.extend(["", "| 冲突字段 | 采用值 | 冲突来源 | 处理 |", "| --- | --- | --- | --- |"])
for item in conflicts:
conflict_sources = "".join(
f"{value.get('source_file', '')}{value.get('value', '')}" for value in item.get("conflict_values", [])
f"{_compact_table_text(value.get('source_file', ''))}{_compact_table_text(value.get('value', ''))}"
for value in item.get("conflict_values", [])
)
lines.append(
f"| {item.get('field_label', item.get('field_key', ''))} | {item.get('selected_value', '')} | {conflict_sources or '-'} | {item.get('handling', '')} |"
f"| {_compact_table_text(item.get('field_label', item.get('field_key', '')))} | {_compact_table_text(item.get('selected_value', ''))} | {_compact_table_text(conflict_sources or '-')} | {_compact_table_text(item.get('handling', ''))} |"
)
if trace_exports:
@@ -33,3 +34,10 @@ def build_assistant_summary(batch: ApplicationFormFillBatch, exports: list[Expor
for export in trace_exports:
lines.append(f"[下载{export.file_name}](/api/review-agent/file-summary/exports/{export.pk}/download/)")
return "\n".join(lines).strip()
def _compact_table_text(value: object, *, limit: int = 80) -> str:
text = " ".join(str(value or "").replace("|", " ").split())
if len(text) <= limit:
return text
return f"{text[:limit]}..."

View File

@@ -124,6 +124,26 @@ def test_rule_stops_product_name_before_application_form_instructions():
assert "填表说明" not in values["product_name"]
def test_rule_ignores_generic_enterprise_name_from_application_form():
    """Rule extraction must not report applicant/agent names for this form text."""
    form_lines = [
        "注册人制度\t是 企业名称:否",
        "优先通道申请 应急通道 同品种首个产品首次申报",
        "临床试验",
        "临床试验机构名称: 中国医学科学院北京协和医院、晋中市第一人民医院",
        "应附资料",
    ]
    texts = {"CH1.4 申请表.docx": "\n".join(form_lines)}

    result = extract_by_rules(texts, _registration_specs())

    # Index the extracted fields by key so absence can be checked directly.
    values = {}
    for field in result["fields"]:
        values[field["key"]] = field["value"]

    for unexpected_key in ("applicant_name", "agent_name"):
        assert unexpected_key not in values
def test_llm_extract_parses_structured_json(monkeypatch):
monkeypatch.setattr(
"review_agent.application_form_fill.services.field_extract.generate_completion",

View File

@@ -0,0 +1,39 @@
import pytest
from review_agent.application_form_fill.services.summary import build_assistant_summary
from review_agent.models import ApplicationFormFillBatch, Conversation, FileSummaryBatch
pytestmark = pytest.mark.django_db
def test_assistant_summary_compacts_long_conflict_values(django_user_model):
    """A multi-line conflict value is rendered on one bounded-length summary line."""
    owner = django_user_model.objects.create_user(username="owner", password="pass")
    conversation = Conversation.objects.create(user=owner, title="会话")
    source_summary = FileSummaryBatch.objects.create(
        conversation=conversation,
        user=owner,
        batch_no="FS-SUMMARY",
    )

    # Conflict value containing newlines and a long run of text.
    noisy_conflict_value = {
        "source_file": "CH1.4 申请表.docx",
        "value": "\n临床试验\n临床试验机构名称: 中国医学科学院北京协和医院、晋中市第一人民医院、北京市疾病预防控制中心 临床数据库.zip\n应附资料",
    }
    conflict_entry = {
        "field_key": "applicant_name",
        "field_label": "注册人名称",
        "selected_value": "卡尤迪生物科技宜兴有限公司",
        "conflict_values": [noisy_conflict_value],
        "handling": "说明书优先,模板内黄底红字高亮",
    }
    batch = ApplicationFormFillBatch.objects.create(
        conversation=conversation,
        user=owner,
        source_summary_batch=source_summary,
        batch_no="AFF-SUMMARY",
        conflict_summary=[conflict_entry],
    )

    content = build_assistant_summary(batch, [])

    assert "临床试验机构名称" in content
    matching_lines = [line for line in content.splitlines() if "临床试验机构名称" in line]
    assert len(matching_lines[0]) < 220
    assert "\n临床试验\n" not in content