feat(regulatory-info-package): 完善目录页码与组成成分填充
This commit is contained in:
@@ -1,7 +1,10 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from pathlib import Path
|
||||
from zipfile import ZipFile
|
||||
from xml.etree import ElementTree
|
||||
|
||||
from review_agent.models import RegulatoryInfoPackageBatch
|
||||
from review_agent.regulatory_info_package.constants import GENERATED_FILE_FAILED
|
||||
@@ -18,9 +21,16 @@ def generate_package_documents(
|
||||
merged_fields: dict[str, MergedField],
|
||||
) -> list[GeneratedFileResult]:
|
||||
specs = template_specs(config)
|
||||
with ThreadPoolExecutor(max_workers=min(4, len(specs) or 1)) as executor:
|
||||
futures = [executor.submit(_generate_one, batch, config, spec, merged_fields) for spec in specs]
|
||||
return [future.result() for future in as_completed(futures)]
|
||||
directory_specs = [spec for spec in specs if spec.code == "ch1_2_directory"]
|
||||
content_specs = [spec for spec in specs if spec.code != "ch1_2_directory"]
|
||||
results: list[GeneratedFileResult] = []
|
||||
with ThreadPoolExecutor(max_workers=min(4, len(content_specs) or 1)) as executor:
|
||||
futures = [executor.submit(_generate_one, batch, config, spec, merged_fields) for spec in content_specs]
|
||||
results.extend(future.result() for future in as_completed(futures))
|
||||
page_numbers = _directory_page_numbers(results)
|
||||
for spec in directory_specs:
|
||||
results.append(_generate_one(batch, config, spec, merged_fields, directory_page_numbers=page_numbers))
|
||||
return results
|
||||
|
||||
|
||||
def _generate_one(
|
||||
@@ -28,6 +38,8 @@ def _generate_one(
|
||||
config: dict,
|
||||
spec: TemplateSpec,
|
||||
merged_fields: dict[str, MergedField],
|
||||
*,
|
||||
directory_page_numbers: dict[str, str] | None = None,
|
||||
) -> GeneratedFileResult:
|
||||
try:
|
||||
template_path = copy_template_to_batch(batch, config, spec)
|
||||
@@ -44,6 +56,7 @@ def _generate_one(
|
||||
output_path,
|
||||
merged_fields,
|
||||
template_code=spec.code,
|
||||
directory_page_numbers=directory_page_numbers,
|
||||
)
|
||||
actual_path = output_path
|
||||
actual_format = "docx"
|
||||
@@ -68,3 +81,106 @@ def _generate_one(
|
||||
status=GENERATED_FILE_FAILED,
|
||||
error_message=str(exc),
|
||||
)
|
||||
|
||||
|
||||
def _directory_page_numbers(results: list[GeneratedFileResult]) -> dict[str, str]:
|
||||
page_numbers = {"CH1.2": "1"}
|
||||
for result in results:
|
||||
if result.status not in {"success", "fallback_success"} or not result.path:
|
||||
continue
|
||||
code = _directory_code_from_file_name(result.file_name)
|
||||
if not code:
|
||||
continue
|
||||
page_numbers[code] = str(count_document_pages(result.path))
|
||||
return page_numbers
|
||||
|
||||
|
||||
def _directory_code_from_file_name(file_name: str) -> str:
|
||||
stem = Path(file_name).stem.strip()
|
||||
return stem.split()[0] if stem.startswith("CH") else ""
|
||||
|
||||
|
||||
def count_document_pages(path: str | Path) -> int:
    """Return the page count of a document, falling back to ``1``.

    Three strategies are tried in order and the first truthy (non-zero)
    answer wins: the page count stored in the DOCX properties, Word
    automation through pywin32, and Word automation through PowerShell.

    Args:
        path: Location of the document.

    Returns:
        The detected page count, or ``1`` when the file does not exist or no
        strategy produced a count.
    """
    target = Path(path)
    if not target.exists():
        return 1

    # Ordered from cheapest to most expensive; each returns 0 on failure.
    strategies = (
        _count_pages_from_docx_properties,
        _count_pages_with_pywin32,
        _count_pages_with_powershell_word,
    )
    for strategy in strategies:
        detected = strategy(target)
        if detected:
            return detected
    return 1
|
||||
|
||||
|
||||
def _count_pages_from_docx_properties(file_path: Path) -> int:
|
||||
if file_path.suffix.lower() != ".docx":
|
||||
return 0
|
||||
try:
|
||||
with ZipFile(file_path) as archive:
|
||||
root = ElementTree.fromstring(archive.read("docProps/app.xml"))
|
||||
namespace = {"ep": "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties"}
|
||||
pages = root.find("ep:Pages", namespace)
|
||||
return max(int((pages.text or "").strip()), 1) if pages is not None else 0
|
||||
except Exception:
|
||||
return 0
|
||||
|
||||
|
||||
def _count_pages_with_pywin32(file_path: Path) -> int:
|
||||
try:
|
||||
import win32com.client
|
||||
|
||||
word = win32com.client.DispatchEx("Word.Application")
|
||||
word.Visible = False
|
||||
document = None
|
||||
try:
|
||||
document = word.Documents.Open(str(file_path.resolve()), ReadOnly=True)
|
||||
document.Repaginate()
|
||||
return max(int(document.ComputeStatistics(2)), 1)
|
||||
finally:
|
||||
if document is not None:
|
||||
document.Close(False)
|
||||
word.Quit()
|
||||
except Exception:
|
||||
return 0
|
||||
|
||||
|
||||
def _count_pages_with_powershell_word(file_path: Path) -> int:
|
||||
script = r"""
|
||||
param([string]$Path)
|
||||
$word = $null
|
||||
$doc = $null
|
||||
try {
|
||||
$word = New-Object -ComObject Word.Application
|
||||
$word.Visible = $false
|
||||
$doc = $word.Documents.Open($Path, $false, $true)
|
||||
$doc.Repaginate()
|
||||
[Console]::Out.Write($doc.ComputeStatistics(2))
|
||||
exit 0
|
||||
} catch {
|
||||
[Console]::Error.Write($_.Exception.Message)
|
||||
exit 1
|
||||
} finally {
|
||||
if ($doc -ne $null) { $doc.Close($false) | Out-Null }
|
||||
if ($word -ne $null) { $word.Quit() | Out-Null }
|
||||
}
|
||||
"""
|
||||
try:
|
||||
completed = subprocess.run(
|
||||
["powershell.exe", "-NoProfile", "-ExecutionPolicy", "Bypass", "-Command", script, str(file_path.resolve())],
|
||||
capture_output=True,
|
||||
check=False,
|
||||
text=True,
|
||||
timeout=8,
|
||||
)
|
||||
except Exception:
|
||||
return 0
|
||||
if completed.returncode != 0:
|
||||
return 0
|
||||
try:
|
||||
return max(int(completed.stdout.strip()), 1)
|
||||
except ValueError:
|
||||
return 0
|
||||
|
||||
Reference in New Issue
Block a user