# PCM_Report/template_scanner.py

import re
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Set, Callable, Optional

from docx import Document
from docx.oxml.ns import qn


# Matches a brace-wrapped placeholder such as {text1}: one or more ASCII
# letters followed by one or more digits. Group 1 captures the key without
# the surrounding braces.
PLACEHOLDER_PATTERN = re.compile(r"\{([a-zA-Z]+\d+)\}")

# Safety thresholds to avoid long scans on huge tables (tunable)
# Row/column caps bound how much of a table is probed for tb* placeholders in
# scan_manual_table_grids_with_progress.
MAX_SCAN_ROWS = 200
MAX_SCAN_COLS = 50
# Tables with more cells than this are not read in full by the progress-aware
# scanner; they receive a blank grid instead.
MAX_SCAN_CELLS = MAX_SCAN_ROWS * MAX_SCAN_COLS


@dataclass
class TemplateScanResult:
    """Placeholder keys found in a .docx template, grouped by key prefix.

    Every set holds bare keys (the text between the braces), as classified by
    scan_docx_placeholders.
    """

    texts: Set[str]  # keys starting with "text"
    tables: Set[str]  # keys starting with "table"
    charts: Set[str]  # keys starting with "chart"
    manual_tables: Set[str]  # keys starting with "tb" (manually filled tables)
    script_tables: Set[str]  # keys starting with "scripttable" (case-insensitive)
    script_charts: Set[str]  # keys starting with "scriptchart" (case-insensitive)
    # database text placeholders {db1} to {db9}: "db" plus exactly one digit
    # (the scanner's check also admits {db0})
    db_texts: Set[str]


def scan_docx_placeholders(path: Path) -> TemplateScanResult:
    """Collect the placeholder keys used in a .docx template, grouped by kind.

    The document body is searched first, then, for every section, its
    default / first-page / even-page headers followed by the matching
    footers. Within each part the direct paragraphs are read, then the
    paragraphs of every cell of every table of that part.

    Args:
        path: Location of the .docx file to open.

    Returns:
        A TemplateScanResult whose sets contain the bare keys (braces
        removed). A key that fits none of the known prefixes is dropped.
    """
    document = Document(str(path))
    keys: List[str] = []

    def harvest(text):
        # Record the captured key of each placeholder occurring in the text.
        keys.extend(hit.group(1) for hit in PLACEHOLDER_PATTERN.finditer(text))

    def scan_part(part):
        # Direct paragraphs first, then the paragraphs inside table cells.
        for paragraph in part.paragraphs:
            harvest(paragraph.text)
        for table in part.tables:
            for row in table.rows:
                for cell in row.cells:
                    for paragraph in cell.paragraphs:
                        harvest(paragraph.text)

    scan_part(document)

    # Headers are visited before footers; a missing attribute counts as None.
    part_names = (
        'header', 'first_page_header', 'even_page_header',
        'footer', 'first_page_footer', 'even_page_footer',
    )
    for section in document.sections:
        parts = [getattr(section, name, None) for name in part_names]
        for part in parts:
            if part:
                scan_part(part)

    texts: Set[str] = set()
    tables: Set[str] = set()
    charts: Set[str] = set()
    manual: Set[str] = set()
    script_tables: Set[str] = set()
    script_charts: Set[str] = set()
    db_texts: Set[str] = set()

    # Ordered rules: the first predicate that accepts a key decides its set.
    rules = (
        (lambda k: k.startswith("text"), texts),
        (lambda k: k.startswith("table"), tables),
        (lambda k: k.startswith("chart"), charts),
        (lambda k: k.startswith("tb"), manual),
        (lambda k: k.lower().startswith("scripttable"), script_tables),
        (lambda k: k.lower().startswith("scriptchart"), script_charts),
        # "db" followed by exactly one digit, e.g. {db1} ... {db9}
        (lambda k: k.startswith("db") and len(k) == 3 and k[2].isdigit(), db_texts),
    )
    for key in keys:
        for accepts, bucket in rules:
            if accepts(key):
                bucket.add(key)
                break

    return TemplateScanResult(
        texts=texts,
        tables=tables,
        charts=charts,
        manual_tables=manual,
        script_tables=script_tables,
        script_charts=script_charts,
        db_texts=db_texts,
    )


def _is_vertical_merge_continuation(cell) -> bool:
    """Tell whether a table cell continues a vertical merge started above it.

    Reads the cell's ``tcPr`` properties and their ``vMerge`` child. When
    either is absent the cell is not part of a vertical merge. A ``vMerge``
    with no ``w:val`` attribute, or with the value "continue" (compared
    case-insensitively), is reported as a continuation.
    """
    properties = getattr(cell._tc, 'tcPr', None)
    merge = None if properties is None else getattr(properties, 'vMerge', None)
    if merge is None:
        return False
    raw_value = merge.get(qn('w:val'))
    if raw_value is None:
        # A vMerge element without an explicit value counts as a continuation.
        return True
    return str(raw_value).lower() == 'continue'


def scan_manual_table_grids(path: Path) -> Dict[str, List[List[str]]]:
    """Extract the text grid of every body table that carries a tb* placeholder.

    Each top-level table of the document body is read cell by cell. Tables
    with no ``{tb...}`` placeholder are ignored. For the others a
    row-by-column grid of cell texts is produced, with every tb* placeholder
    of that table removed and the remainder stripped; cells reported as
    vertical-merge continuations contribute an empty string.

    Each cell is looked up and its text read only once; the same snapshot
    serves both the key detection and the grid extraction.

    Args:
        path: Location of the .docx file to open.

    Returns:
        Mapping from tb* key to the grid of its table. When one table holds
        several tb* keys they all map to the same list object.
    """
    doc = Document(str(path))
    result: Dict[str, List[List[str]]] = {}

    for table in doc.tables:
        rows = len(table.rows)
        if rows == 0:
            continue
        # The column count is taken from the first row and applied to all rows.
        cols = len(table.rows[0].cells)

        # Single traversal: remember each cell together with its text.
        snapshot = []
        for r in range(rows):
            row_cells = []
            for c in range(cols):
                cell = table.cell(r, c)
                row_cells.append((cell, cell.text))
            snapshot.append(row_cells)

        keys_in_table: Set[str] = {
            m.group(1)
            for row_cells in snapshot
            for _, text in row_cells
            for m in PLACEHOLDER_PATTERN.finditer(text)
            if m.group(1).startswith("tb")
        }
        if not keys_in_table:
            continue

        # Placeholder tokens are built once per table instead of once per cell.
        tokens = ["{" + k + "}" for k in keys_in_table]
        grid: List[List[str]] = []
        for row_cells in snapshot:
            row_vals: List[str] = []
            for cell, text in row_cells:
                # NOTE(review): table.cell() may already resolve merged cells
                # to their origin cell, in which case this check would rarely
                # fire -- confirm against the python-docx version in use.
                if _is_vertical_merge_continuation(cell):
                    text = ""
                else:
                    for token in tokens:
                        text = text.replace(token, "").strip()
                row_vals.append(text)
            grid.append(row_vals)

        for k in keys_in_table:
            result[k] = grid
    return result


def scan_manual_table_grids_with_progress(path: Path, report: Optional[Callable[[str, int, int], None]] = None) -> Dict[str, List[List[str]]]:
    """Extract tb* table grids like scan_manual_table_grids, with progress and size caps.

    Work happens in two phases. A pre-scan probes at most MAX_SCAN_ROWS x
    MAX_SCAN_COLS cells of each body table and keeps only the tables in which
    a ``{tb...}`` placeholder shows up. Each kept table is then processed in
    turn:

    * ``report`` (if given) is called with a message, the 1-based position of
      the table among the kept ones, and the number of kept tables. Anything
      it raises is ignored.
    * tb* keys are gathered row by row within the same caps, stopping after
      the first row that yields any key; only those keys are registered and
      removed from the cell texts.
    * A table with more than MAX_SCAN_CELLS cells is not read; its keys map to
      a grid of empty strings sized by the capped row and column counts.
    * Otherwise the full grid is read, with the keys removed, the remainder
      stripped, and vertical-merge continuation cells left empty.

    A table that raises while being probed or processed is skipped.

    Args:
        path: Location of the .docx file to open.
        report: Optional progress callback ``report(message, current, total)``.

    Returns:
        Mapping from tb* key to the grid of its table; keys found in the same
        table share one list object.
    """
    document = Document(str(path))
    all_tables = list(document.tables)

    def tb_keys_of(text):
        # tb* keys of one cell text, in order of appearance.
        return [
            hit.group(1)
            for hit in PLACEHOLDER_PATTERN.finditer(text)
            if hit.group(1).startswith("tb")
        ]

    def has_tb_placeholder(table) -> bool:
        # Probe the capped top-left region; stop at the first tb* placeholder.
        n_rows = len(table.rows)
        if n_rows == 0:
            return False
        n_cols = len(table.rows[0].cells)
        row_limit = min(n_rows, MAX_SCAN_ROWS)
        col_limit = min(n_cols, MAX_SCAN_COLS)
        for r in range(row_limit):
            for c in range(col_limit):
                cell_text = table.cell(r, c).text
                if any(
                    hit.group(1).startswith("tb")
                    for hit in PLACEHOLDER_PATTERN.finditer(cell_text)
                ):
                    return True
        return False

    def cell_value(cell, tokens) -> str:
        # Continuation cells of a vertical merge are emitted as empty strings.
        if _is_vertical_merge_continuation(cell):
            return ""
        value = cell.text
        for token in tokens:
            value = value.replace(token, "").strip()
        return value

    # Phase 1: keep only the indexes of tables that contain a tb* placeholder.
    candidates: List[int] = []
    for index, table in enumerate(all_tables):
        try:
            if has_tb_placeholder(table):
                candidates.append(index)
        except Exception:
            continue

    total = len(candidates)
    result: Dict[str, List[List[str]]] = {}

    def notify(step: int, message: str) -> None:
        # Progress reporting is best-effort; callback failures are ignored.
        if not report:
            return
        try:
            report(message, step, max(total, 1))
        except Exception:
            pass

    # Phase 2: build the grid of each candidate table.
    for pos, table_index in enumerate(candidates, start=1):
        table = all_tables[table_index]
        notify(pos, f"读取手填表 {pos}/{total}")
        try:
            n_rows = len(table.rows)
            if n_rows == 0:
                continue
            n_cols = len(table.rows[0].cells)
            row_limit = min(n_rows, MAX_SCAN_ROWS)
            col_limit = min(n_cols, MAX_SCAN_COLS)

            # Gather keys row by row; the first row that yields any ends the search.
            keys_in_table: Set[str] = set()
            for r in range(row_limit):
                for c in range(col_limit):
                    keys_in_table.update(tb_keys_of(table.cell(r, c).text))
                if keys_in_table:
                    break
            if not keys_in_table:
                continue

            if n_rows * n_cols > MAX_SCAN_CELLS:
                # Oversized table: skip the full read and hand back a blank grid.
                grid: List[List[str]] = [[""] * col_limit for _ in range(row_limit)]
            else:
                tokens = ["{" + key + "}" for key in keys_in_table]
                grid = [
                    [cell_value(table.cell(r, c), tokens) for c in range(n_cols)]
                    for r in range(n_rows)
                ]

            for key in keys_in_table:
                result[key] = grid
        except Exception:
            # Skip problematic tables instead of hanging
            continue
    return result