Files

102 lines
3.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
import hashlib
import re
from dataclasses import dataclass
from pathlib import Path
from .rag_index import extract_text_from_path
@dataclass(frozen=True)
class ExtractedText:
    """Immutable record describing one text-extraction attempt.

    Only ``path``, ``text`` and ``status`` are required. Every other field
    has a default, so a record can be built with just those three values.
    """

    # File the extraction was attempted on.
    path: Path
    # Extracted text (may be empty).
    text: str
    # Outcome marker; extract_text uses "unsupported", "failed" and "success".
    status: str
    # Hex digest of the text, or "" when none was computed.
    content_hash: str = ""
    # Error description, or "" when there is none.
    error_message: str = ""
    # Leading excerpt of the text.
    front_text: str = ""
    # Heading-like lines; None when the field was never populated.
    section_candidates: list[str] | None = None
    # Field label -> value pairs; None when the field was never populated.
    field_candidates: dict[str, str] | None = None
# File suffixes (lower-case, including the leading dot) that extract_text
# will hand to the extractor; anything else is reported as "unsupported".
SUPPORTED_EXTENSIONS = {
    ".txt",
    ".md",
    ".pdf",
    ".docx",
    ".pptx",
    ".xlsx",
    ".doc",
}

# Labels that _field_candidates looks for at the start of a line.
FIELD_LABELS = [
    "产品名称",  # product name
    "型号规格",  # model / specification
    "预期用途",  # intended use
    "管理类别",  # management category
    "分类编码",  # classification code
    "注册类型",  # registration type
    "临床评价路径",  # clinical evaluation path
]
def extract_text(path: str | Path) -> ExtractedText:
    """Extract the text of *path* and wrap the outcome in an ``ExtractedText``.

    Args:
        path: Location of the file, as a string or ``Path``.

    Returns:
        * status ``"unsupported"`` with empty text when the lower-cased
          suffix is not in ``SUPPORTED_EXTENSIONS``;
        * status ``"failed"`` with the exception message and empty candidate
          collections when the extractor raises;
        * status ``"success"`` with the text, its SHA-256 hex digest (``""``
          for empty text), the front excerpt and the section/field
          candidates otherwise.
    """
    source = Path(path)

    suffix = source.suffix.lower()
    if suffix not in SUPPORTED_EXTENSIONS:
        return ExtractedText(path=source, text="", status="unsupported")

    try:
        body = extract_text_from_path(source)
    except Exception as exc:
        # Any extractor error is reported through the record, not re-raised.
        failure = {
            "path": source,
            "text": "",
            "status": "failed",
            "error_message": str(exc),
            "section_candidates": [],
            "field_candidates": {},
        }
        return ExtractedText(**failure)

    # Empty text gets an empty digest instead of the hash of b"".
    digest = ""
    if body:
        digest = hashlib.sha256(body.encode("utf-8")).hexdigest()

    excerpt = _front_text(body)
    sections = _section_candidates(body)
    fields = _field_candidates(body)
    return ExtractedText(
        path=source,
        text=body,
        status="success",
        content_hash=digest,
        front_text=excerpt,
        section_candidates=sections,
        field_candidates=fields,
    )
def _front_text(text: str, limit: int = 1200) -> str:
return text[:limit]
def _section_candidates(text: str) -> list[str]:
    """Collect lines of *text* that look like section headings.

    A stripped, non-empty line qualifies when it starts with a heading-style
    number (as decided by ``_looks_like_section_heading``) or contains one of
    a fixed set of section-title keywords.

    Args:
        text: Full document text; it is split with ``str.splitlines``.

    Returns:
        At most 80 candidates in document order, each truncated to 120
        characters.
    """
    max_candidates = 80
    max_length = 120
    keywords = (
        "章节目录",  # table of contents
        "监管信息",  # regulatory information
        "综述资料",  # overview materials
        "非临床资料",  # non-clinical materials
        "临床评价资料",  # clinical evaluation materials
        "质量管理体系",  # quality management system
    )

    candidates: list[str] = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        # Reuse the shared heading test so the numbering pattern is defined
        # in one place only.
        if _looks_like_section_heading(line) or any(word in line for word in keywords):
            candidates.append(line[:max_length])
            # Only the first `max_candidates` hits are returned, so there is
            # no point in scanning the rest of the document.
            if len(candidates) == max_candidates:
                break
    return candidates
# Separator accepted after a field label: the ASCII colon or the fullwidth
# colon U+FF1A that Chinese-language text normally uses. It is written as an
# escape so the two look-alike characters cannot be confused in the source.
_FIELD_SEPARATOR = r"[:\uFF1A]"

# Generic "label + separator" line start. The label part still excludes only
# whitespace and the ASCII colon, so every line the ASCII-only pattern
# accepted is still accepted.
_GENERIC_FIELD_LINE = re.compile(rf"^[^\s:]{{2,24}}{_FIELD_SEPARATOR}")


def _field_candidates(text: str) -> dict[str, str]:
    """Extract ``label -> value`` pairs for the labels in ``FIELD_LABELS``.

    A field starts on a stripped line of the form ``<label><colon><value>``
    where the colon is either ":" or the fullwidth "：". The value continues
    over the following lines until a blank line, another field-like line or
    a section heading is reached.

    Args:
        text: Full document text; it is split with ``str.splitlines``.

    Returns:
        A dict mapping each label to its whitespace-normalised value. Only
        the first occurrence with a non-empty value is kept; labels with no
        value are omitted.
    """
    # Compile the per-label patterns once instead of once per line.
    label_patterns = [
        (label, re.compile(rf"^{re.escape(label)}{_FIELD_SEPARATOR}\s*(.*)$"))
        for label in FIELD_LABELS
    ]

    fields: dict[str, str] = {}
    lines = text.splitlines()
    for index, line in enumerate(lines):
        normalized = line.strip()
        if not normalized:
            continue
        for label, pattern in label_patterns:
            if label in fields:
                continue
            match = pattern.match(normalized)
            if not match:
                continue
            value = _collect_field_value(match.group(1), lines, index + 1)
            if value:
                fields[label] = value
    return fields


def _collect_field_value(first_part: str, lines: list[str], start: int) -> str:
    """Join *first_part* with its continuation lines, collapsing whitespace.

    Continuation lines are read from ``lines[start:]`` and stop at the first
    blank line, field-like line or section heading.
    """
    parts = [first_part]
    for position in range(start, len(lines)):
        continuation = lines[position].strip()
        if (
            not continuation
            or _starts_field_line(continuation)
            or _looks_like_section_heading(continuation)
        ):
            break
        parts.append(continuation)
    # split()/join drops empty parts and collapses internal whitespace runs.
    return " ".join(" ".join(parts).split())


def _starts_field_line(line: str) -> bool:
    """Tell whether *line* opens a new ``label: value`` field.

    True when the line starts with one of ``FIELD_LABELS`` followed by a
    colon, or with any 2-24 character label (no whitespace or ASCII colon)
    followed by a colon. Both ":" and the fullwidth "：" count as a colon.
    """
    if any(
        re.match(rf"^{re.escape(label)}{_FIELD_SEPARATOR}", line)
        for label in FIELD_LABELS
    ):
        return True
    return _GENERIC_FIELD_LINE.match(line) is not None
def _looks_like_section_heading(line: str) -> bool:
return bool(re.match(r"^([一二三四五六七八九十]+[、.]|[0-9]+(\.[0-9]+)*[、.\s])", line))