python实现word内容替换
python实现word内容替换
·
doc 文件与 docx 文件的区别:
- 存储方式的不同: doc 是二进制存储,docx是打包文件(docx文件可以解压,能看到里面的文件结构,主要是xml 等组成的打包文件);
- docx易于跨平台,docx更小;
- docx 更适合处理公式、表格、图片等复杂对象,因为这些对象都可以通过修改其中的 XML 来操作。
1. python-docx
#pip install python-docx
import docx
# Create the document object for the Word file.
# `path` must already hold the location of the .docx file to read.
doc = docx.Document(path)

# Print the text of every paragraph.
for para in doc.paragraphs:
    print(para.text)

# Print every paragraph together with its index. enumerate() walks the
# paragraph list once instead of re-reading doc.paragraphs for each index.
for i, para in enumerate(doc.paragraphs):
    print(str(i), para.text)
# Replace a target string in every body paragraph of test.docx, even when the
# target is split across several runs of the same paragraph.
doc = docx.Document('test.docx')
for paragraph in doc.paragraphs:
    tmp = ''  # text accumulated from runs that have not been written back yet
    runs = paragraph.runs
    for i, run in enumerate(runs):
        tmp += run.text  # merge run strings so a split target can be found
        if '需要替换的字符串' in tmp:
            # The accumulated text contains the target: store it, with the
            # replacement applied, in the current run and start accumulating
            # again from scratch.
            run.text = tmp.replace('需要替换的字符串', '我是替换后的字符串')
            tmp = ''
        elif i == len(runs) - 1:
            # Last run and no match is pending: write the leftover text back
            # so nothing is lost. This must not run after a match on the last
            # run, otherwise the freshly replaced text would be overwritten
            # with the (now empty) accumulator.
            run.text = tmp
        else:
            # No match yet: empty this run; its text lives on in `tmp` and is
            # written into a later run.
            run.text = ''
def _mask_run(run):
    """Mask the sensitive substrings of a single run in place.

    Whitespace sequences in the run text are collapsed to one space, then the
    text is passed to ``re_tmp`` and ``name_identify`` (both defined elsewhere
    in the project). Every string they return is replaced by an equally long
    sequence of ``'X'``. The run is only rewritten when at least one string
    was reported; otherwise it is left untouched.
    """
    text = re.sub(r"\s+", " ", run.text)
    sensitive_datas = re_tmp(text)
    names = name_identify([text])
    if len(sensitive_datas) > 0:
        names = names + sensitive_datas
    if len(names) > 0:
        for name in names:
            text = text.replace(name, 'X' * len(name))
        run.text = text


def docx_inplace_replace(file):
    """Write a copy of a .docx file with sensitive text masked by 'X'.

    Runs in body paragraphs and in table cells are masked one by one, so a
    sensitive string that is split across several runs is not detected.

    Args:
        file: Path of the source .docx file.

    Returns:
        Path of the new file: ``rreplace(file, '.docx', '', 1)`` followed by
        ``'_.docx'``.
    """
    # NOTE(review): rreplace is defined elsewhere; presumably it removes the
    # last '.docx' occurrence from the path -- confirm.
    file_ = rreplace(file, '.docx', '', 1)
    new_file = file_ + '_.docx'
    doc = docx.Document(file)

    # Body paragraphs.
    for paragraph in doc.paragraphs:
        for run in paragraph.runs:
            _mask_run(run)

    # Every cell of every table.
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    for run in paragraph.runs:
                        _mask_run(run)

    # Save the document, then post-process the saved file in place.
    doc.save(new_file)
    # NOTE(review): remove_header_footer is defined elsewhere; presumably it
    # strips headers and footers from the saved file -- confirm.
    remove_header_footer(new_file, new_file)
    return new_file
2. 解压处理xml 数据
import zipfile
import os
import re
import tempfile
import shutil
from functools import reduce
#1. 获取xml 字符串
def getXml(docxFilename):
    """Return the main document part of a .docx file as a string.

    Args:
        docxFilename: Path of the .docx (zip) file to read.

    Returns:
        The content of ``word/document.xml`` decoded as UTF-8.

    Raises:
        KeyError: If the archive has no ``word/document.xml`` entry.
    """
    # The context manager closes the archive (and its file handle) even when
    # reading fails.
    with zipfile.ZipFile(docxFilename) as archive:
        return archive.read("word/document.xml").decode("utf-8")
#2. TODO 对xml 字符串进行替换处理
#3. 封装回docx 文件
def createNewDocx(originalDocx, xmlContent, newFilename):
    """Create a copy of a .docx file with a replaced ``word/document.xml``.

    The original archive is expanded into a temporary directory, the modified
    XML is written over ``word/document.xml``, and all entries of the original
    archive are zipped up again as the new file.

    Args:
        originalDocx: Path of the source .docx file.
        xmlContent: New content of ``word/document.xml`` (written as UTF-8).
        newFilename: Path of the .docx file to create.
    """
    # TemporaryDirectory removes the scratch directory even if a step fails.
    with tempfile.TemporaryDirectory() as tmpDir:
        # Expand the original archive and remember its entry names; the
        # archive is closed as soon as it is no longer needed.
        with zipfile.ZipFile(originalDocx) as source:
            source.extractall(tmpDir)
            filenames = source.namelist()

        # Overwrite the main document part with the modified XML.
        document_path = os.path.join(tmpDir, "word/document.xml")
        with open(document_path, "w", encoding="utf-8") as f:
            f.write(xmlContent)

        # Create the new archive and add every file of the original one.
        with zipfile.ZipFile(newFilename, "w") as target:
            for filename in filenames:
                target.write(os.path.join(tmpDir, filename), filename)
更多推荐
所有评论(0)