feat(regulatory): 对齐附件4目录核查规则

This commit is contained in:
2026-06-07 09:27:42 +08:00
parent bbd2d3532a
commit 1bdc7322cf
15 changed files with 753 additions and 43 deletions

View File

@@ -107,12 +107,19 @@ def collect_source_chunks(source_dir: Path) -> list[TextChunk]:
try:
text = extract_text_from_path(path)
except RuntimeError as exc:
if _is_attachment4(path):
raise RuntimeError(f"附件 4 核心法规材料抽取失败:{path.name}") from exc
logger.warning("Regulatory source extraction skipped", extra={"path": str(path), "error": str(exc)})
continue
chunks.extend(chunk_text(text, source=str(path.relative_to(source_dir))))
return chunks
def _is_attachment4(path: Path) -> bool:
    """Return True when *path*'s file name identifies the Attachment 4 document.

    A name matches when, after normalization, it contains both the marker
    "附件4" and the title "体外诊断试剂注册申报资料要求及说明". Only
    ``path.name`` is inspected; parent directories are ignored.

    Normalization is deliberately tolerant of how the file was named:
    full-width characters (for example the digit "４" or the ideographic
    space U+3000) are folded to their ASCII forms, and every kind of
    whitespace is removed, so "附件 4", "附件　4" and "附件４" all match.

    Args:
        path: Path of the candidate source file.

    Returns:
        True if the file name carries both the marker and the title,
        False otherwise.
    """
    # Function-scope import keeps this helper self-contained.
    import unicodedata

    # NFKC maps compatibility variants (full-width digits, U+3000, NBSP)
    # onto the plain forms used in the literals below.
    folded = unicodedata.normalize("NFKC", path.name)
    # str.split() without arguments splits on any run of Unicode whitespace,
    # so re-joining drops spaces, tabs and similar separators in one pass.
    normalized = "".join(folded.split())
    return "附件4" in normalized and "体外诊断试剂注册申报资料要求及说明" in normalized
def build_chroma_index(
*,
source_dir: Path,