fix(regulatory): 修复无标签文档适用条件回显

This commit is contained in:
2026-06-07 12:29:22 +08:00
parent 1b4a10b5ba
commit 9e27c4c684
8 changed files with 305 additions and 8 deletions

View File

@@ -1,10 +1,11 @@
from __future__ import annotations
import re
from pathlib import Path
from django.conf import settings
from review_agent.models import FileSummaryBatch
from review_agent.models import FileSummaryBatch, RegulatoryReviewBatch
from review_agent.regulatory_review.services.llm_review import review_condition_fields
from review_agent.regulatory_review.services.text_extract import extract_text
@@ -16,6 +17,18 @@ OPTION_FIELDS = {
}
def ensure_regulatory_condition_candidates(batch: RegulatoryReviewBatch) -> dict[str, dict[str, object]]:
    """Return the batch's condition candidates, re-detecting them when incomplete.

    The stored ``condition_json["candidates"]`` mapping is returned as-is
    unless the batch status is ``WAITING_USER`` *and*
    ``_condition_candidates_incomplete`` flags the stored candidates. In that
    case detection is re-run on ``batch.source_summary_batch``, the result is
    merged with the stored candidates via ``_merge_condition_candidates``, the
    merged mapping is written back under the ``"candidates"`` key, and the
    batch is saved with ``update_fields=["condition_json"]``.

    Args:
        batch: The regulatory review batch whose candidates are requested.

    Returns:
        The stored candidates, or the merged candidates after a refresh.
    """
    condition_json = batch.condition_json or {}
    candidates = condition_json.get("candidates") or {}

    # Only batches still waiting for user confirmation are refreshed.
    if batch.status != RegulatoryReviewBatch.Status.WAITING_USER:
        return candidates
    if not _condition_candidates_incomplete(candidates):
        return candidates

    detected = detect_regulatory_condition_candidates(batch.source_summary_batch)
    refreshed = _merge_condition_candidates(candidates, detected)

    # Keep every other key of condition_json; replace only the candidates.
    batch.condition_json = {**condition_json, "candidates": refreshed}
    batch.save(update_fields=["condition_json"])
    return refreshed
def detect_regulatory_condition_candidates(summary_batch: FileSummaryBatch) -> dict[str, dict[str, object]]:
"""Infers review-scope conditions from the summary batch and file names."""
@@ -30,6 +43,8 @@ def detect_regulatory_condition_candidates(summary_batch: FileSummaryBatch) -> d
field_candidates.update({key: value for key, value in extracted.items() if value and key not in field_candidates})
field_sources.update({key: value for key, value in sources.items() if value and key not in field_sources})
corpus_parts.extend(extracted.values())
if review.get("front_text"):
corpus_parts.append(str(review["front_text"]))
corpus = "\n".join(part for part in corpus_parts if part)
product_name = field_candidates.get("产品名称") or _safe_summary_product_name(summary_batch.product_name)
@@ -80,13 +95,22 @@ def _extract_item_fields(item) -> dict[str, object]:
if not path.exists():
return {}
result = extract_text(path)
if result.status != "success" or not result.field_candidates:
if result.status != "success" or not result.text:
return {}
return review_condition_fields(
inferred_fields = _infer_fields_from_text(result.front_text or result.text)
rule_fields = {**inferred_fields, **(result.field_candidates or {})}
review = review_condition_fields(
text=result.front_text or result.text,
rule_fields=result.field_candidates,
rule_fields=rule_fields,
file_context=f"{item.directory_level}\n{item.file_name}\n{item.relative_path}",
)
selected_sources = dict(review.get("selected_sources") or {})
for key in inferred_fields:
if selected_sources.get(key) == "rule" and key not in (result.field_candidates or {}):
selected_sources[key] = "inferred"
review["selected_sources"] = selected_sources
review["front_text"] = result.front_text or result.text[:1200]
return review
def _safe_summary_product_name(product_name: str) -> str:
@@ -98,6 +122,99 @@ def _safe_summary_product_name(product_name: str) -> str:
return value
def _infer_fields_from_text(text: str) -> dict[str, str]:
    """Infer condition fields from raw document text.

    The text is first passed through ``_normalize_text_for_inference``; the
    product name and model spec are then inferred from the normalized text.

    Args:
        text: Extracted document text.

    Returns:
        A dict that contains ``"产品名称"`` and/or ``"型号规格"`` only for the
        values that could be inferred (empty results are omitted).
    """
    normalized = _normalize_text_for_inference(text)
    fields: dict[str, str] = {}

    product_name = _infer_product_name(normalized)
    if product_name:
        fields["产品名称"] = product_name

    model_spec = _infer_model_spec(normalized)
    if model_spec:
        fields["型号规格"] = model_spec

    return fields
def _normalize_text_for_inference(text: str) -> str:
value = re.sub(r"\s+", "", text or "")
value = value.replace("", "(").replace("", ")")
return value
def _infer_product_name(text: str) -> str:
patterns = [
r"体外诊断试剂(?P<name>[^。;;,]{4,120}?试剂盒\([^()]{2,30}\))产品注册",
r"(?P<name>[^。;;,]{4,120}?试剂盒\([^()]{2,30}\))",
]
for pattern in patterns:
match = re.search(pattern, text)
if match:
return _restore_chinese_parentheses(_trim_product_name(match.group("name")))
return ""
def _trim_product_name(value: str) -> str:
prefixes = ["申请境内第三类体外诊断试剂", "申请境内第二类体外诊断试剂", "境内第三类体外诊断试剂", "境内第二类体外诊断试剂"]
result = value
for prefix in prefixes:
if prefix in result:
result = result.split(prefix, 1)[-1]
return result
def _infer_model_spec(text: str) -> str:
specs = sorted(set(re.findall(r"规格[A-Z-]", text)))
if specs:
return "".join(specs)
match = re.search(r"产品的包装规格(?P<spec>.{1,80}?(?:人份/盒|测试/盒|反应/盒)(?:[、,].{1,30}?(?:人份/盒|测试/盒|反应/盒))*)", text)
if not match:
return ""
return _restore_chinese_parentheses(match.group("spec").strip(":,。;;"))
def _restore_chinese_parentheses(value: str) -> str:
return value.replace("(", "").replace(")", "")
def _condition_candidates_incomplete(candidates: dict[str, dict[str, object]]) -> bool:
if not candidates:
return True
product_name = str((candidates.get("product_name") or {}).get("suggested") or "").strip()
product_category = str((candidates.get("product_category") or {}).get("suggested") or "").strip()
return not product_name or "<EFBFBD>" in product_name or product_category == "其他"
def _merge_condition_candidates(
current: dict[str, dict[str, object]],
refreshed: dict[str, dict[str, object]],
) -> dict[str, dict[str, object]]:
merged = {**(current or {})}
for field, config in (refreshed or {}).items():
current_config = merged.get(field) or {}
current_value = str(current_config.get("suggested") or "").strip()
refreshed_value = str((config or {}).get("suggested") or "").strip()
if _is_better_condition_value(current_value, refreshed_value):
merged[field] = config
elif field not in merged:
merged[field] = config
return merged
def _is_better_condition_value(current_value: str, refreshed_value: str) -> bool:
if not refreshed_value:
return False
if "<EFBFBD>" in refreshed_value:
return False
if "<EFBFBD>" in current_value:
return True
if not current_value:
return True
if current_value == "其他" and refreshed_value != "其他":
return True
if current_value == "待确认" and refreshed_value != "待确认":
return True
return len(refreshed_value) > len(current_value) and current_value in refreshed_value
def _detect_product_category(corpus: str) -> str:
if any(keyword in corpus for keyword in ["体外诊断", "检测试剂", "试剂盒", "IVD"]):
return "体外诊断试剂"