fix(application-form-fill): 抽取说明书章节和表格字段

This commit is contained in:
2026-06-07 20:14:53 +08:00
parent 13b543c99d
commit 0ccd69d3f4
4 changed files with 149 additions and 2 deletions

View File

@@ -9,6 +9,10 @@ from pathlib import Path
from django.conf import settings
from docx import Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import Table
from docx.text.paragraph import Paragraph
from openpyxl import load_workbook
from pypdf import PdfReader
from pptx import Presentation
@@ -49,7 +53,7 @@ def extract_text_from_path(path: Path) -> str:
if suffix == ".pdf":
return "\n".join(page.extract_text() or "" for page in PdfReader(str(path)).pages)
if suffix == ".docx":
return "\n".join(paragraph.text for paragraph in Document(str(path)).paragraphs)
return _extract_docx_text(path)
if suffix == ".pptx":
presentation = Presentation(str(path))
lines = []
@@ -72,6 +76,31 @@ def extract_text_from_path(path: Path) -> str:
return ""
def _extract_docx_text(path: Path) -> str:
    """Return the text of a .docx file, keeping paragraphs and tables in order.

    Each non-empty paragraph becomes one output line. Each table row becomes
    one output line made of its non-empty cell texts joined with tabs; rows
    without any text are skipped.

    Args:
        path: Location of the .docx file to read.

    Returns:
        The collected lines joined with newlines, or an empty string when the
        document body yields no text.
    """
    document = Document(str(path))
    lines: list[str] = []
    for block in _iter_docx_blocks(document):
        if isinstance(block, Paragraph):
            text = block.text.strip()
            if text:
                lines.append(text)
        elif isinstance(block, Table):
            for row in block.rows:
                values: list[str] = []
                previous_tc = None
                for cell in row.cells:
                    # python-docx reports a horizontally merged cell once per
                    # grid column it spans, all backed by the same <w:tc>
                    # element. Skip the repeats so the merged cell's text is
                    # emitted only once per row.
                    tc = cell._tc
                    if tc is previous_tc:
                        continue
                    previous_tc = tc
                    text = cell.text.strip()
                    if text:
                        values.append(text)
                if values:
                    lines.append("\t".join(values))
    return "\n".join(lines)
def _iter_docx_blocks(document):
    """Yield the body-level paragraphs and tables of *document* in order.

    Direct children of the document body that are paragraph elements are
    wrapped as ``Paragraph`` and table elements as ``Table``; any other child
    element is ignored.
    """
    # Pairs of (XML element class, wrapper class), checked in this order.
    wrappers = ((CT_P, Paragraph), (CT_Tbl, Table))
    for element in document.element.body.iterchildren():
        for element_type, wrapper in wrappers:
            if isinstance(element, element_type):
                yield wrapper(element, document)
                break
def _extract_legacy_doc_with_libreoffice(path: Path) -> str:
with tempfile.TemporaryDirectory() as tmp_dir:
target_dir = Path(tmp_dir)