fix(regulatory): 修复换行产品名称提取不全
This commit is contained in:
@@ -21,6 +21,7 @@ class ExtractedText:
|
||||
|
||||
|
||||
# Set of file extensions (lower-case, including the leading dot) declared as supported.
# NOTE(review): presumably consulted by extract_text to accept or reject a path; its body is not visible here, so confirm.
SUPPORTED_EXTENSIONS = {".txt", ".md", ".pdf", ".docx", ".pptx", ".xlsx", ".doc"}
|
||||
# Field labels that _field_candidates and _starts_field_line look for at the start of a line,
# immediately followed by a colon. List order is the order in which labels are tried per line.
FIELD_LABELS = ["产品名称", "型号规格", "预期用途", "管理类别", "分类编码", "注册类型", "临床评价路径"]
|
||||
|
||||
|
||||
def extract_text(path: str | Path) -> ExtractedText:
|
||||
@@ -69,8 +70,32 @@ def _section_candidates(text: str) -> list[str]:
|
||||
|
||||
def _field_candidates(text: str) -> dict[str, str]:
    """Extract labelled fields from *text*, joining values wrapped over lines.

    Every line is stripped and tested against each label in ``FIELD_LABELS``.
    A line opens a field when it starts with the label immediately followed
    by an ASCII or fullwidth colon. The remainder of that line is the first
    part of the value; following lines are appended as continuations until a
    blank line, a line accepted by ``_starts_field_line``, or a line accepted
    by ``_looks_like_section_heading`` is reached.

    Args:
        text: Plain text to scan.

    Returns:
        Mapping of label to its value with all whitespace runs collapsed to
        single spaces. Only the first occurrence of a label that yields a
        non-empty value is kept; labels with no non-empty value are absent.
    """
    fields: dict[str, str] = {}
    lines = text.splitlines()
    for index, line in enumerate(lines):
        normalized = line.strip()
        if not normalized:
            continue
        for label in FIELD_LABELS:
            # Cheap membership test first: a label already captured never
            # needs its regex evaluated again.
            if label in fields:
                continue
            # \uff1a is the fullwidth colon used in CJK text.
            match = re.match(
                rf"^{re.escape(label)}[:\uff1a]\s*(.*)$", normalized
            )
            if not match:
                continue
            value_parts = [match.group(1).strip()]
            for next_line in lines[index + 1:]:
                continuation = next_line.strip()
                if (
                    not continuation
                    or _starts_field_line(continuation)
                    or _looks_like_section_heading(continuation)
                ):
                    break
                value_parts.append(continuation)
            # Drop empty parts so a label alone on its line does not add a
            # leading space before the wrapped value.
            value = " ".join(part for part in value_parts if part)
            if value:
                fields[label] = " ".join(value.split())
            # The pattern is anchored at the line start, so no other label
            # can match this same line.
            break
    return fields
|
||||
|
||||
|
||||
def _starts_field_line(line: str) -> bool:
    """Return True when *line* looks like the start of a ``label: value`` field.

    Two checks are applied in order:

    1. The line starts with one of ``FIELD_LABELS`` immediately followed by
       an ASCII or fullwidth colon.
    2. The line starts with 2 to 24 characters that are neither whitespace
       nor a colon, immediately followed by a colon. This catches labels
       that are not listed in ``FIELD_LABELS``.

    Args:
        line: A single line; callers in this file pass it already stripped.

    Returns:
        True if either check matches, otherwise False.
    """
    # \uff1a is the fullwidth colon used in CJK text.
    if any(
        re.match(rf"^{re.escape(label)}[:\uff1a]", line)
        for label in FIELD_LABELS
    ):
        return True
    return bool(re.match(r"^[^\s:\uff1a]{2,24}[:\uff1a]", line))
|
||||
|
||||
|
||||
def _looks_like_section_heading(line: str) -> bool:
|
||||
return bool(re.match(r"^([一二三四五六七八九十]+[、..]|[0-9]+(\.[0-9]+)*[、..\s])", line))
|
||||
|
||||
Reference in New Issue
Block a user