fix(application-form-fill): 抽取说明书章节和表格字段
This commit is contained in:
@@ -9,6 +9,10 @@ from pathlib import Path
|
||||
|
||||
from django.conf import settings
|
||||
from docx import Document
|
||||
from docx.oxml.table import CT_Tbl
|
||||
from docx.oxml.text.paragraph import CT_P
|
||||
from docx.table import Table
|
||||
from docx.text.paragraph import Paragraph
|
||||
from openpyxl import load_workbook
|
||||
from pypdf import PdfReader
|
||||
from pptx import Presentation
|
||||
@@ -49,7 +53,7 @@ def extract_text_from_path(path: Path) -> str:
|
||||
if suffix == ".pdf":
|
||||
return "\n".join(page.extract_text() or "" for page in PdfReader(str(path)).pages)
|
||||
if suffix == ".docx":
|
||||
return "\n".join(paragraph.text for paragraph in Document(str(path)).paragraphs)
|
||||
return _extract_docx_text(path)
|
||||
if suffix == ".pptx":
|
||||
presentation = Presentation(str(path))
|
||||
lines = []
|
||||
@@ -72,6 +76,31 @@ def extract_text_from_path(path: Path) -> str:
|
||||
return ""
|
||||
|
||||
|
||||
def _extract_docx_text(path: Path) -> str:
    """Extract the text of a .docx file, keeping paragraphs and tables in order.

    Every block yielded by ``_iter_docx_blocks`` is handled as follows:

    * a paragraph contributes one line holding its stripped text (paragraphs
      that are empty after stripping are skipped);
    * a table contributes one line per row, made of the row's non-empty,
      stripped cell texts joined with tab characters (rows without any text
      are skipped).

    Args:
        path: Location of the .docx file to read.

    Returns:
        The collected lines joined with ``"\\n"``; an empty string when no
        block produced any text.
    """
    document = Document(str(path))
    lines: list[str] = []
    for block in _iter_docx_blocks(document):
        if isinstance(block, Paragraph):
            text = block.text.strip()
            if text:
                lines.append(text)
        elif isinstance(block, Table):
            for row in block.rows:
                values: list[str] = []
                previous_tc = None
                for cell in row.cells:
                    # python-docx reports a horizontally merged cell once for
                    # every grid column it spans, all backed by the same
                    # <w:tc> element. Skip those consecutive repeats so the
                    # merged cell's text appears only once in the row line.
                    tc = cell._tc
                    if tc is previous_tc:
                        continue
                    previous_tc = tc
                    text = cell.text.strip()
                    if text:
                        values.append(text)
                if values:
                    lines.append("\t".join(values))
    return "\n".join(lines)
|
||||
|
||||
|
||||
def _iter_docx_blocks(document):
    """Yield the direct children of the document body as python-docx proxies.

    Children are visited in the order they appear in the body. A ``CT_P``
    element is wrapped in ``Paragraph`` and a ``CT_Tbl`` element in ``Table``,
    each with ``document`` as its parent; any other child element is ignored.
    """
    # Element class -> proxy class used to wrap it.
    wrappers = ((CT_P, Paragraph), (CT_Tbl, Table))
    for element in document.element.body.iterchildren():
        for element_type, proxy_type in wrappers:
            if isinstance(element, element_type):
                yield proxy_type(element, document)
                break
|
||||
|
||||
|
||||
def _extract_legacy_doc_with_libreoffice(path: Path) -> str:
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
target_dir = Path(tmp_dir)
|
||||
|
||||
Reference in New Issue
Block a user