245 lines
8.3 KiB
Python
245 lines
8.3 KiB
Python
import re
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from typing import Dict, List, Set, Callable, Optional
|
|
|
|
from docx import Document
|
|
from docx.oxml.ns import qn
|
|
|
|
|
|
# A placeholder is written as {<letters><digits>} (for example {text1} or
# {tb12}); capture group 1 is the key without the surrounding braces.
PLACEHOLDER_PATTERN = re.compile(r"\{([a-zA-Z]+\d+)\}")

# Safety thresholds to avoid long scans on huge tables (tunable).
# Rows / columns beyond these limits are not inspected by the bounded scans.
MAX_SCAN_ROWS = 200
MAX_SCAN_COLS = 50
# A table with more cells than this is treated as "huge".
MAX_SCAN_CELLS = MAX_SCAN_ROWS * MAX_SCAN_COLS
|
|
|
|
|
|
@dataclass
class TemplateScanResult:
    """Placeholder keys found in a template, grouped by kind.

    Each attribute is a set of placeholder keys (the text between the braces,
    e.g. ``"text1"``). The grouping mirrors the prefix rules applied by
    ``scan_docx_placeholders``.
    """

    # Keys starting with "text".
    texts: Set[str]
    # Keys starting with "table".
    tables: Set[str]
    # Keys starting with "chart".
    charts: Set[str]
    # Keys starting with "tb" (manually filled tables).
    manual_tables: Set[str]
    # Keys starting with "scripttable" (compared case-insensitively).
    script_tables: Set[str]
    # Keys starting with "scriptchart" (compared case-insensitively).
    script_charts: Set[str]
    # Database text placeholders {db1} to {db9}.
    db_texts: Set[str]
|
|
|
|
|
|
def scan_docx_placeholders(path: Path) -> TemplateScanResult:
    """Collect every placeholder key used in a .docx template, grouped by kind.

    The document body and the default / first-page / even-page header and
    footer of every section are searched. In each of them, both the direct
    paragraphs and the paragraphs inside table cells are matched against
    ``PLACEHOLDER_PATTERN``.

    Args:
        path: Location of the .docx file to open.

    Returns:
        A ``TemplateScanResult`` whose sets hold the keys (without braces).
        Keys that match no prefix rule are dropped.
    """
    document = Document(str(path))
    keys: List[str] = []

    def collect_paragraphs(paragraphs) -> None:
        # Record the captured key of every placeholder in each paragraph.
        for paragraph in paragraphs:
            keys.extend(
                match.group(1)
                for match in PLACEHOLDER_PATTERN.finditer(paragraph.text)
            )

    def collect_container(container) -> None:
        # A container (body, header or footer) exposes .paragraphs and .tables.
        collect_paragraphs(container.paragraphs)
        for tbl in container.tables:
            for row in tbl.rows:
                for cell in row.cells:
                    collect_paragraphs(cell.paragraphs)

    # Body first.
    collect_container(document)

    # Then the headers followed by the footers of each section.
    part_names = (
        'header', 'first_page_header', 'even_page_header',
        'footer', 'first_page_footer', 'even_page_footer',
    )
    for section in document.sections:
        for part_name in part_names:
            part = getattr(section, part_name, None)
            if part:
                collect_container(part)

    texts: Set[str] = set()
    tables: Set[str] = set()
    charts: Set[str] = set()
    manual: Set[str] = set()
    script_tables: Set[str] = set()
    script_charts: Set[str] = set()
    db_texts: Set[str] = set()

    # Ordered rules: the first predicate that accepts a key decides its bucket.
    rules = (
        (lambda k: k.startswith("text"), texts),
        (lambda k: k.startswith("table"), tables),
        (lambda k: k.startswith("chart"), charts),
        (lambda k: k.startswith("tb"), manual),
        (lambda k: k.lower().startswith("scripttable"), script_tables),
        (lambda k: k.lower().startswith("scriptchart"), script_charts),
        # Match {db1} to {db9}: "db" followed by exactly one digit.
        (lambda k: k.startswith("db") and len(k) == 3 and k[2].isdigit(), db_texts),
    )
    for key in keys:
        for accepts, bucket in rules:
            if accepts(key):
                bucket.add(key)
                break

    return TemplateScanResult(
        texts=texts,
        tables=tables,
        charts=charts,
        manual_tables=manual,
        script_tables=script_tables,
        script_charts=script_charts,
        db_texts=db_texts,
    )
|
|
|
|
|
|
def _is_vertical_merge_continuation(cell) -> bool:
    """Tell whether *cell* is a continuation cell of a vertical merge.

    Looks at the ``vMerge`` child of the cell's ``tcPr`` element. Returns
    ``False`` when either element is missing. Otherwise returns ``True`` when
    the ``w:val`` attribute is absent or equals ``"continue"`` (compared
    case-insensitively).
    """
    cell_props = getattr(cell._tc, 'tcPr', None)
    merge_el = None if cell_props is None else getattr(cell_props, 'vMerge', None)
    if merge_el is None:
        return False

    merge_val = merge_el.get(qn('w:val'))
    if merge_val is None:
        # A vMerge element without w:val is treated as a continuation.
        return True
    return str(merge_val).lower() == 'continue'
|
|
|
|
|
|
def scan_manual_table_grids(path: Path) -> Dict[str, List[List[str]]]:
    """Extract the cell grid of every body table tagged with a ``{tbN}`` placeholder.

    A table is selected when any of its cells contains a placeholder whose key
    starts with ``"tb"``. For a selected table, a row-major grid of cell texts
    is built in which

    * cells flagged by ``_is_vertical_merge_continuation`` are emitted as ``""``;
    * every ``{key}`` of the table's ``tb`` keys is removed from the text and
      the remainder is stripped of surrounding whitespace.

    Args:
        path: Location of the .docx file to open.

    Returns:
        A mapping from each ``tb`` key to its table's grid. All keys found in
        the same table map to the same list object. The column count is taken
        from the first row of the table.
    """
    doc = Document(str(path))
    result: Dict[str, List[List[str]]] = {}

    def strip_placeholders(text: str, keys: Set[str]) -> str:
        # Remove each "{key}" marker, trimming whitespace after every removal.
        for key in keys:
            text = text.replace("{" + key + "}", "").strip()
        return text

    for table in doc.tables:
        row_count = len(table.rows)
        if row_count == 0:
            continue
        col_count = len(table.rows[0].cells)

        # Single pass: fetch every cell once, remembering its raw text while
        # collecting the tb keys. The previous two-pass form looked every cell
        # up twice.
        # NOTE(review): python-docx appears to rebuild the table's cell list on
        # each table.cell() call, which would make those lookups the dominant
        # cost - confirm against the installed version.
        keys_in_table: Set[str] = set()
        raw_grid: List[List[str]] = []
        for r in range(row_count):
            raw_row: List[str] = []
            for c in range(col_count):
                cell = table.cell(r, c)
                text = cell.text
                for m in PLACEHOLDER_PATTERN.finditer(text):
                    key = m.group(1)
                    if key.startswith("tb"):
                        keys_in_table.add(key)
                raw_row.append("" if _is_vertical_merge_continuation(cell) else text)
            raw_grid.append(raw_row)

        if not keys_in_table:
            continue

        # Placeholders can only be stripped once all keys of the table are known.
        grid: List[List[str]] = [
            [strip_placeholders(text, keys_in_table) for text in raw_row]
            for raw_row in raw_grid
        ]
        for key in keys_in_table:
            result[key] = grid
    return result
|
|
|
|
|
|
def scan_manual_table_grids_with_progress(path: Path, report: Optional[Callable[[str, int, int], None]] = None) -> Dict[str, List[List[str]]]:
    """Extract grids of ``{tbN}``-tagged body tables, reporting progress per table.

    Works in two stages:

    1. Pre-scan: every table's top-left ``MAX_SCAN_ROWS`` x ``MAX_SCAN_COLS``
       window is searched, stopping at the first placeholder whose key starts
       with ``"tb"``. Tables with a hit become candidates.
    2. Extraction: for each candidate, ``report`` is notified and the grid is
       built.

       * If the table has more than ``MAX_SCAN_CELLS`` cells, its keys are
         taken from the first window row that contains any ``tb`` key, and the
         grid is an all-empty placeholder of the window's size.
       * Otherwise every cell is read once. All ``tb`` keys of the table are
         collected, vertical-merge continuation cells become ``""``, and every
         ``{key}`` marker is removed from the remaining texts, which are then
         stripped of surrounding whitespace.

    Args:
        path: Location of the .docx file to open.
        report: Optional callback invoked as ``report(message, position, total)``
            before each candidate table is processed. Exceptions it raises are
            ignored.

    Returns:
        A mapping from each ``tb`` key to its table's grid. All keys found in
        the same table map to the same list object.

    Any exception raised while inspecting a single table causes that table to
    be skipped; it is not propagated.
    """
    doc = Document(str(path))
    tables = list(doc.tables)

    def tb_keys(text: str) -> Set[str]:
        # Keys of all placeholders in *text* that start with "tb".
        keys: Set[str] = set()
        for m in PLACEHOLDER_PATTERN.finditer(text):
            key = m.group(1)
            if key.startswith("tb"):
                keys.add(key)
        return keys

    def window_has_tb_key(table) -> bool:
        # Bounded scan that stops at the very first tb placeholder.
        row_count = len(table.rows)
        if row_count == 0:
            return False
        col_count = len(table.rows[0].cells)
        return any(
            tb_keys(table.cell(r, c).text)
            for r in range(min(row_count, MAX_SCAN_ROWS))
            for c in range(min(col_count, MAX_SCAN_COLS))
        )

    def first_row_keys_in_window(table, row_count: int, col_count: int) -> Set[str]:
        # Bounded scan that stops after the first row containing tb keys.
        for r in range(min(row_count, MAX_SCAN_ROWS)):
            keys: Set[str] = set()
            for c in range(min(col_count, MAX_SCAN_COLS)):
                keys |= tb_keys(table.cell(r, c).text)
            if keys:
                return keys
        return set()

    def strip_placeholders(text: str, keys: Set[str]) -> str:
        # Remove each "{key}" marker, trimming whitespace after every removal.
        for key in keys:
            text = text.replace("{" + key + "}", "").strip()
        return text

    # Pre-scan to find only tables that contain tb* placeholders. The count is
    # needed up front so that progress can be reported as position/total.
    candidates: List[int] = []
    for idx, table in enumerate(tables):
        try:
            if window_has_tb_key(table):
                candidates.append(idx)
        except Exception:
            # Best effort: a table that cannot be inspected is not a candidate.
            continue

    total = len(candidates)
    result: Dict[str, List[List[str]]] = {}

    def _report(i: int, msg: str) -> None:
        if report:
            try:
                report(msg, i, total if total > 0 else 1)
            except Exception:
                # A failing progress callback must not abort the scan.
                pass

    for pos, tidx in enumerate(candidates, start=1):
        table = tables[tidx]
        _report(pos, f"读取手填表 {pos}/{total}")
        try:
            row_count = len(table.rows)
            if row_count == 0:
                continue
            col_count = len(table.rows[0].cells)

            if row_count * col_count > MAX_SCAN_CELLS:
                # Huge table: avoid the full scan and hand back an empty grid
                # placeholder limited to the scan window.
                keys_in_table = first_row_keys_in_window(table, row_count, col_count)
                if not keys_in_table:
                    continue
                grid: List[List[str]] = [
                    ["" for _ in range(min(col_count, MAX_SCAN_COLS))]
                    for _ in range(min(row_count, MAX_SCAN_ROWS))
                ]
            else:
                # Every cell is visited here anyway, so collect *all* tb keys in
                # the same pass. This keeps the result in line with
                # scan_manual_table_grids when a table carries several tb
                # placeholders, and avoids a separate detection scan.
                keys_in_table = set()
                raw_grid: List[List[str]] = []
                for r in range(row_count):
                    raw_row: List[str] = []
                    for c in range(col_count):
                        cell = table.cell(r, c)
                        text = cell.text
                        keys_in_table |= tb_keys(text)
                        raw_row.append("" if _is_vertical_merge_continuation(cell) else text)
                    raw_grid.append(raw_row)
                if not keys_in_table:
                    continue
                grid = [
                    [strip_placeholders(text, keys_in_table) for text in raw_row]
                    for raw_row in raw_grid
                ]

            for key in keys_in_table:
                result[key] = grid
        except Exception:
            # Skip problematic tables instead of hanging
            continue
    return result
|