PCM_Report/convert_md_to_docx.py

243 lines
8.0 KiB
Python
Raw Normal View History

2025-12-11 14:32:31 +08:00
"""
Markdown 文件转换为 DOCX 格式
使用 python-docx 库生成格式化的 Word 文档
"""
from docx import Document
from docx.shared import Pt, RGBColor, Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
import re
from pathlib import Path
# Heading rules: (Markdown prefix, heading level, font size in pt,
# RGB colour or None to keep the style's colour, centred?).
_HEADING_RULES = (
    ('# ', 1, 18, (0, 0, 0), True),
    ('## ', 2, 16, (0, 0, 128), False),
    ('### ', 3, 14, (0, 0, 128), False),
    ('#### ', 4, 12, None, False),
)
_ORDERED_ITEM_PREFIX = re.compile(r'^\d+\.\s')
_CODE_FENCE = '```'


def _is_table_row(line):
    """Return True when *line* starts with ``|`` and contains a second ``|``."""
    return line.startswith('|') and '|' in line[1:]


def _add_code_block(doc, code_lines):
    """Append *code_lines* to *doc* as one indented Consolas paragraph."""
    p = doc.add_paragraph()
    p.style = 'Normal'
    run = p.add_run('\n'.join(code_lines))
    run.font.name = 'Consolas'
    run.font.size = Pt(9)
    run.font.color.rgb = RGBColor(0, 0, 0)
    # Only indentation sets the block apart; no background shading is applied.
    p.paragraph_format.left_indent = Inches(0.5)
    p.paragraph_format.right_indent = Inches(0.5)


def _add_heading(doc, text, level, size, color, centered):
    """Append a heading to *doc* and apply the 黑体 font, size and colour."""
    heading = doc.add_heading(text, level=level)
    if centered:
        heading.alignment = WD_ALIGN_PARAGRAPH.CENTER
    for run in heading.runs:
        run.font.name = '黑体'
        # The East Asian font must be set separately for CJK glyphs.
        run._element.rPr.rFonts.set(qn('w:eastAsia'), '黑体')
        run.font.size = Pt(size)
        if color is not None:
            run.font.color.rgb = RGBColor(*color)


def parse_markdown_to_docx(md_file: str, docx_file: str):
    """Convert a Markdown file into a DOCX document.

    Recognised constructs: fenced code blocks, pipe tables, headings of
    level 1-4, bullet lists (``- `` / ``* ``), numbered lists, ``---`` rules
    and plain paragraphs with inline ``**bold**`` / `` `code` `` markup.
    Blank lines become empty paragraphs.

    Args:
        md_file: Path of the UTF-8 encoded Markdown file to read.
        docx_file: Path the generated document is saved to.
    """
    doc = Document()

    # Default body font; the East Asian font is set separately for CJK text.
    normal_style = doc.styles['Normal']
    normal_style.font.name = '宋体'
    normal_style._element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')
    normal_style.font.size = Pt(10.5)

    with open(md_file, 'r', encoding='utf-8') as f:
        lines = [raw.rstrip() for raw in f]

    total = len(lines)
    i = 0
    while i < total:
        line = lines[i]

        # Fenced code block: collect everything up to the closing fence.
        if line.startswith(_CODE_FENCE):
            i += 1
            code_lines = []
            while i < total and not lines[i].startswith(_CODE_FENCE):
                code_lines.append(lines[i])
                i += 1
            i += 1  # step over the closing fence (harmless if it is missing)
            # Emitted even when the fence is never closed, so a trailing
            # unterminated block is not silently lost.
            if code_lines:
                _add_code_block(doc, code_lines)
            continue

        # Table: consume the whole run of rows with the same predicate, so a
        # table that ends the file is still flushed.
        if _is_table_row(line):
            table_lines = []
            while i < total and _is_table_row(lines[i]):
                table_lines.append(lines[i])
                i += 1
            create_table_from_markdown(doc, table_lines)
            continue

        i += 1

        # Blank line (lines are already right-stripped).
        if not line:
            doc.add_paragraph()
            continue

        heading_rule = next(
            (rule for rule in _HEADING_RULES if line.startswith(rule[0])),
            None,
        )
        if heading_rule is not None:
            prefix, level, size, color, centered = heading_rule
            _add_heading(doc, line[len(prefix):], level, size, color, centered)
        elif line.startswith(('- ', '* ')):
            # Bullet list item.
            text = process_bold_text(line[2:])
            p = doc.add_paragraph(style='List Bullet')
            add_formatted_text(p, text)
        elif _ORDERED_ITEM_PREFIX.match(line):
            # Numbered list item.
            text = process_bold_text(_ORDERED_ITEM_PREFIX.sub('', line))
            p = doc.add_paragraph(style='List Number')
            add_formatted_text(p, text)
        elif line.startswith('---'):
            # Horizontal rule rendered as a row of underscores.
            doc.add_paragraph('_' * 50)
        else:
            # Plain paragraph.
            text = process_bold_text(line)
            p = doc.add_paragraph()
            add_formatted_text(p, text)

    doc.save(docx_file)
    print(f"文档已生成: {docx_file}")
_TABLE_SEPARATOR_CELL = re.compile(r'^:?-+:?$')


def _split_table_row(line):
    """Split one Markdown table row into stripped cell strings.

    The leading and trailing ``|`` are optional, so a row written without a
    closing pipe keeps its last cell.
    """
    body = line.strip()
    if body.startswith('|'):
        body = body[1:]
    if body.endswith('|'):
        body = body[:-1]
    return [cell.strip() for cell in body.split('|')]


def _is_separator_row(cells):
    """Return True when every cell looks like ``---`` / ``:---:``."""
    return bool(cells) and all(_TABLE_SEPARATOR_CELL.match(c) for c in cells)


def create_table_from_markdown(doc, table_lines):
    """Create a Word table in *doc* from Markdown table rows.

    The first line is the header. The second line is skipped only when it is
    a ``---`` separator row; otherwise it is kept as data. Nothing is added
    when fewer than two lines are given.

    Args:
        doc: Document object the table is appended to.
        table_lines: Raw Markdown lines that make up one table.
    """
    if len(table_lines) < 2:
        return

    headers = _split_table_row(table_lines[0])

    # Drop the separator row, but never throw away a genuine data row.
    second_row = _split_table_row(table_lines[1])
    first_data = 2 if _is_separator_row(second_row) else 1
    rows_data = [_split_table_row(line) for line in table_lines[first_data:]]

    # Size the table for the widest row so ragged rows cannot index past the
    # last column.
    col_count = max([len(headers)] + [len(row) for row in rows_data])

    table = doc.add_table(rows=1 + len(rows_data), cols=col_count)
    table.style = 'Light Grid Accent 1'

    # Header row: bold 黑体.
    header_cells = table.rows[0].cells
    for col_idx, header in enumerate(headers):
        header_cells[col_idx].text = header
        for paragraph in header_cells[col_idx].paragraphs:
            for run in paragraph.runs:
                run.font.bold = True
                run.font.name = '黑体'
                # The East Asian font must be set separately for CJK glyphs.
                run._element.rPr.rFonts.set(qn('w:eastAsia'), '黑体')

    # Data rows: inline markup is removed rather than rendered.
    for row_idx, row_data in enumerate(rows_data, start=1):
        row_cells = table.rows[row_idx].cells
        for col_idx, cell_data in enumerate(row_data):
            cell_text = cell_data.replace('**', '').replace('`', '')
            row_cells[col_idx].text = cell_text
def process_bold_text(text):
    """Return *text* unchanged.

    This is a pass-through hook: the ``**bold**`` markers are left in place
    and are interpreted later by ``add_formatted_text``.
    """
    return text
_BOLD_SPLIT = re.compile(r'(\*\*.*?\*\*)')
_INLINE_CODE_SPLIT = re.compile(r'(`.*?`)')


def add_formatted_text(paragraph, text):
    """Append *text* to *paragraph*, rendering inline Markdown markup.

    ``**bold**`` spans become bold 宋体 runs, `` `code` `` spans become 9 pt
    Consolas runs, and everything else becomes plain 宋体 runs. Inline code
    inside a bold span is not interpreted. Empty fragments add no run.

    Args:
        paragraph: Paragraph object the runs are appended to.
        text: One line of Markdown text without its block-level prefix.
    """
    # re.split with a capturing group puts the matched spans at odd indices
    # and the text between them at even indices. Classifying by index avoids
    # mistaking a lone "**" or "`", or text such as "`a` and `b`", for a
    # single delimited span.
    for bold_idx, part in enumerate(_BOLD_SPLIT.split(text)):
        if not part:
            continue

        if bold_idx % 2:
            bold_text = part[2:-2]
            if bold_text:
                run = paragraph.add_run(bold_text)
                run.font.bold = True
                run.font.name = '宋体'
                run._element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')
            continue

        for code_idx, piece in enumerate(_INLINE_CODE_SPLIT.split(part)):
            if not piece:
                continue
            if code_idx % 2:
                code_text = piece[1:-1]
                if code_text:
                    run = paragraph.add_run(code_text)
                    run.font.name = 'Consolas'
                    run.font.size = Pt(9)
            else:
                run = paragraph.add_run(piece)
                run.font.name = '宋体'
                # The East Asian font must be set separately for CJK glyphs.
                run._element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')
if __name__ == '__main__':
    # Convert the user manual that sits next to this script.
    script_dir = Path(__file__).parent
    md_file = script_dir / 'Docx报告生成器使用说明书V2.0.md'
    docx_file = script_dir / 'Docx报告生成器使用说明书V2.0.docx'

    print(f"开始转换: {md_file.name}")
    parse_markdown_to_docx(str(md_file), str(docx_file))
    print("转换完成!")
    print(f"输出文件: {docx_file}")