fix(application-form-fill): 抽取说明书章节和表格字段

2026-06-07 20:14:53 +08:00
parent 13b543c99d
commit 0ccd69d3f4
4 changed files with 149 additions and 2 deletions
--- a/review_agent/regulatory_review/services/rag_index.py
+++ b/review_agent/regulatory_review/services/rag_index.py
@@ -9,6 +9,10 @@ from pathlib import Path

 from django.conf import settings
 from docx import Document
+from docx.oxml.table import CT_Tbl
+from docx.oxml.text.paragraph import CT_P
+from docx.table import Table
+from docx.text.paragraph import Paragraph
 from openpyxl import load_workbook
 from pypdf import PdfReader
 from pptx import Presentation
@@ -49,7 +53,7 @@ def extract_text_from_path(path: Path) -> str:
    if suffix == ".pdf":
        return "\n".join(page.extract_text() or "" for page in PdfReader(str(path)).pages)
    if suffix == ".docx":
-        return "\n".join(paragraph.text for paragraph in Document(str(path)).paragraphs)
+        return _extract_docx_text(path)
    if suffix == ".pptx":
        presentation = Presentation(str(path))
        lines = []
@@ -72,6 +76,31 @@ def extract_text_from_path(path: Path) -> str:
    return ""


+def _extract_docx_text(path: Path) -> str:
+    """Return the text of a .docx file, keeping paragraphs and tables in order.
+
+    Every non-empty block yielded by ``_iter_docx_blocks`` contributes lines:
+    a paragraph becomes one line, and each table row becomes one line whose
+    non-empty cell texts are joined with tabs. Whitespace-only paragraphs,
+    cells and rows are dropped.
+
+    Args:
+        path: Location of the .docx file to read.
+
+    Returns:
+        The extracted lines joined with newlines ("" when nothing is found).
+    """
+    document = Document(str(path))
+    lines: list[str] = []
+    for block in _iter_docx_blocks(document):
+        if isinstance(block, Paragraph):
+            text = block.text.strip()
+            if text:
+                lines.append(text)
+        elif isinstance(block, Table):
+            for row in block.rows:
+                values: list[str] = []
+                previous_tc = None
+                for cell in row.cells:
+                    # python-docx returns a horizontally merged cell once per
+                    # grid column it spans; skip repeats of the same <w:tc>
+                    # element so its text is not duplicated within the row.
+                    if cell._tc is previous_tc:
+                        continue
+                    previous_tc = cell._tc
+                    cell_text = cell.text.strip()
+                    if cell_text:
+                        values.append(cell_text)
+                if values:
+                    lines.append("\t".join(values))
+    return "\n".join(lines)
+
+
+def _iter_docx_blocks(document):
+    """Yield the document body's direct paragraphs and tables in body order.
+
+    Direct children of the body element that are neither a paragraph
+    (``CT_P``) nor a table (``CT_Tbl``) element are skipped.
+    """
+    for element in document.element.body.iterchildren():
+        if isinstance(element, CT_Tbl):
+            yield Table(element, document)
+        elif isinstance(element, CT_P):
+            yield Paragraph(element, document)
+
+
 def _extract_legacy_doc_with_libreoffice(path: Path) -> str:
    with tempfile.TemporaryDirectory() as tmp_dir:
        target_dir = Path(tmp_dir)