# 【Python实战】自动化处理Word文档:批量替换+模板填充+格式转换+水印添加
## 一、项目背景

### 1.1 痛点分析

Word文档处理是办公中最高频的场景，但手动操作效率极低：

| 场景 | 手工方式 | 时间 |
|------|----------|------|
| 50份合同改名字 | 逐份打开修改 | 3小时 |
| 100份通知填数据 | 模板复制粘贴 | 5小时 |
| 30份文档转PDF | 逐个另存为 | 1小时 |
| 批量加水印 | 逐页插入 | 2小时 |
| 总计 | - | 11小时 |

HR每月发薪资条、法务每季度改合同模板、行政群发通知…全是重复劳动。

### 1.2 技术需求

核心需求：

- 批量查找替换（支持正则）
- 模板变量填充（从Excel读取数据）
- 批量格式转换（Word↔PDF）
- 批量添加水印
- 保留原文档格式和样式

## 二、技术架构

```
数据源(Excel) → 模板加载 → 变量填充 → 格式处理 → 批量输出
     ↑              ↑           ↑           ↑           ↑
   pandas      python-docx  python-docx  docx2pdf    pathlib
                                         comtypes
```

技术栈：

- **python-docx**：Word文档读写
- **pandas**：读取Excel数据源
- **docx2pdf**：Word转PDF
- **comtypes**：调用Word COM接口（高级转换）
- **Pillow**：水印图片生成
- **pathlib**：文件路径管理

---

## 三、环境准备

### 3.1 安装依赖

```bash
pip install python-docx pandas docx2pdf pillow openpyxl
```

### 3.2 项目结构

```
word-automation/
├── main.py              # 主程序
├── replacer.py          # 批量替换模块
├── filler.py            # 模板填充模块
├── converter.py         # 格式转换模块
├── watermark.py         # 水印模块
├── config.py            # 配置
├── templates/           # 模板目录
│   └── contract.docx    # 合同模板
├── data/                # 数据目录
│   └── employees.xlsx   # 员工数据
└── output/              # 输出目录
```

## 四、核心模块实现

### 4.1 批量查找替换模块

支持段落、表格、页眉页脚的全文替换：

```python
from docx import Document
from pathlib import Path
import re
import copy


class WordReplacer:
    def __init__(self):
        self.replace_count = 0

    def replace_in_file(self, file_path, replacements, output_path=None):
        """单文件替换"""
        doc = Document(file_path)
        self.replace_count = 0

        # 替换段落
        for paragraph in doc.paragraphs:
            self._replace_in_paragraph(paragraph, replacements)

        # 替换表格
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    for paragraph in cell.paragraphs:
                        self._replace_in_paragraph(paragraph, replacements)

        # 替换页眉页脚
        for section in doc.sections:
            for paragraph in section.header.paragraphs:
                self._replace_in_paragraph(paragraph, replacements)
            for paragraph in section.footer.paragraphs:
                self._replace_in_paragraph(paragraph, replacements)

        # 保存
        save_path = output_path or file_path
        doc.save(save_path)
        return self.replace_count

    def _replace_in_paragraph(self, paragraph, replacements):
        """替换段落文字，保留格式"""
        full_text = paragraph.text
        for old_text, new_text in replacements.items():
            if old_text in full_text:
                self.replace_count += full_text.count(old_text)
                full_text = full_text.replace(old_text, new_text)

        if paragraph.text != full_text:
            # 保留第一个run的格式
            if paragraph.runs:
                first_run_format = self._get_run_format(paragraph.runs[0])
                for run in paragraph.runs:
                    run.text = ""
                paragraph.runs[0].text = full_text
                self._apply_run_format(paragraph.runs[0], first_run_format)

    def _get_run_format(self, run):
        """获取run格式"""
        return {
            "bold": run.bold,
            "italic": run.italic,
            "font_name": run.font.name,
            "font_size": run.font.size,
            "font_color": run.font.color.rgb if run.font.color and run.font.color.rgb else None
        }

    def _apply_run_format(self, run, fmt):
        """应用run格式"""
        run.bold = fmt["bold"]
        run.italic = fmt["italic"]
        if fmt["font_name"]:
            run.font.name = fmt["font_name"]
        if fmt["font_size"]:
            run.font.size = fmt["font_size"]

    def batch_replace(self, folder_path, replacements, output_folder=None):
        """批量替换文件夹中的所有Word文档"""
        folder = Path(folder_path)
        output = Path(output_folder) if output_folder else folder / "replaced"
        output.mkdir(exist_ok=True)

        files = list(folder.glob("*.docx"))
        total_count = 0

        print(f"发现 {len(files)} 个Word文档")

        for i, file in enumerate(files):
            if file.name.startswith("~$"):
                continue

            output_path = output / file.name
            count = self.replace_in_file(str(file), replacements, str(output_path))
            total_count += count
            print(f"  [{i+1}/{len(files)}] {file.name} → 替换{count}处")

        print(f"✅ 批量替换完成，共替换{total_count}处")
        return total_count
```

### 4.2 模板变量填充模块

从Excel读取数据，批量填充Word模板：

```python
import pandas as pd
from docx import Document
from pathlib import Path


class TemplateFiller:
    def __init__(self, template_path):
        self.template_path = template_path

    def fill_from_excel(self, excel_path, output_folder, filename_column=None):
        """从Excel读取数据，批量填充模板"""
        df = pd.read_excel(excel_path)
        output = Path(output_folder)
        output.mkdir(exist_ok=True)

        print(f"模板：{self.template_path}")
        print(f"数据：{len(df)}行 × {len(df.columns)}列")

        for index, row in df.iterrows():
            # 构建替换字典：{{列名}} → 值
            replacements = {}
            for col in df.columns:
                placeholder = "{{" + str(col) + "}}"
                value = str(row[col]) if pd.notna(row[col]) else ""
                replacements[placeholder] = value

            # 生成文件名
            if filename_column and filename_column in df.columns:
                filename = f"{row[filename_column]}.docx"
            else:
                filename = f"output_{index + 1}.docx"

            # 填充模板
            doc = Document(self.template_path)
            self._fill_document(doc, replacements)

            output_path = output / filename
            doc.save(str(output_path))
            print(f"  [{index+1}/{len(df)}] → {filename}")

        print(f"✅ 批量填充完成，共生成{len(df)}个文档")

    def _fill_document(self, doc, replacements):
        """填充文档中的占位符"""
        # 段落
        for paragraph in doc.paragraphs:
            self._fill_paragraph(paragraph, replacements)

        # 表格
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    for paragraph in cell.paragraphs:
                        self._fill_paragraph(paragraph, replacements)

        # 页眉页脚
        for section in doc.sections:
            for paragraph in section.header.paragraphs:
                self._fill_paragraph(paragraph, replacements)
            for paragraph in section.footer.paragraphs:
                self._fill_paragraph(paragraph, replacements)

    def _fill_paragraph(self, paragraph, replacements):
        """填充段落（处理跨run的占位符）"""
        full_text = paragraph.text
        changed = False

        for placeholder, value in replacements.items():
            if placeholder in full_text:
                full_text = full_text.replace(placeholder, value)
                changed = True

        if changed and paragraph.runs:
            # 保留格式，更新文字
            first_format = {
                "bold": paragraph.runs[0].bold,
                "italic": paragraph.runs[0].italic,
                "font_name": paragraph.runs[0].font.name,
                "font_size": paragraph.runs[0].font.size
            }
            for run in paragraph.runs:
                run.text = ""
            paragraph.runs[0].text = full_text
            paragraph.runs[0].bold = first_format["bold"]
            paragraph.runs[0].italic = first_format["italic"]
            if first_format["font_name"]:
                paragraph.runs[0].font.name = first_format["font_name"]
            if first_format["font_size"]:
                paragraph.runs[0].font.size = first_format["font_size"]

    def fill_single(self, data_dict, output_path):
        """填充单个文档"""
        doc = Document(self.template_path)
        replacements = {
            "{{" + k + "}}": str(v)
            for k, v in data_dict.items()
        }
        self._fill_document(doc, replacements)
        doc.save(output_path)
        print(f"✅ 文档已生成：{output_path}")
```

### 4.3 格式转换模块

Word↔PDF批量转换：

from pathlib import Path import subprocess import platform class FormatConverter: def word_to_pdf(self, input_path, output_pathNone): Word转PDF input_path Path(input_path) if not output_path: output_path input_path.with_suffix(.pdf) system 
platform.system() if system Windows: self._convert_windows(str(input_path), str(output_path)) else: self._convert_libreoffice(str(input_path), str(output_path)) return str(output_path) def _convert_windows(self, input_path, output_path): Windows下用COM接口转换效果最好 try: import comtypes.client word comtypes.client.CreateObject(Word.Application) word.Visible False doc word.Documents.Open(input_path) doc.SaveAs(output_path, FileFormat17) # 17 PDF doc.Close() word.Quit() except ImportError: # 降级使用docx2pdf from docx2pdf import convert convert(input_path, output_path) def _convert_libreoffice(self, input_path, output_path): Linux/Mac下用LibreOffice转换 output_dir str(Path(output_path).parent) subprocess.run([ libreoffice, --headless, --convert-to, pdf, --outdir, output_dir, input_path ], checkTrue) def batch_to_pdf(self, folder_path, output_folderNone): 批量Word转PDF folder Path(folder_path) output Path(output_folder) if output_folder else folder / pdf output.mkdir(exist_okTrue) files list(folder.glob(*.docx)) print(f发现 {len(files)} 个Word文档) success 0 for i, file in enumerate(files): if file.name.startswith(~$): continue try: output_path output / file.with_suffix(.pdf).name self.word_to_pdf(str(file), str(output_path)) success 1 print(f [{i1}/{len(files)}] {file.name} → PDF ✅) except Exception as e: print(f [{i1}/{len(files)}] {file.name} → 失败: {e}) print(f✅ 批量转换完成{success}/{len(files)})4.4 水印添加模块支持文字水印和图片水印from docx import Document from docx.shared import Pt, Inches, RGBColor, Emu from docx.oxml.ns import qn, nsdecls from docx.oxml import parse_xml from pathlib import Path class WatermarkAdder: def add_text_watermark(self, file_path, text, output_pathNone): 添加文字水印 doc Document(file_path) for section in doc.sections: header section.header header.is_linked_to_previous False # 创建水印XML watermark_xml f w:r {nsdecls(w, v, o, wp, r)} w:rPr w:noProof/ /w:rPr v:shapetype id_x0000_t136 coordsize21600,21600 o:spt136 pathm7,l8,m5,21600l6,21600e /v:shapetype v:shape idPowerP ...(truncated)...