python实现word内容替换

桂花很香,旭很美

5874人浏览 · 2022-10-28 18:50:44

桂花很香,旭很美 · 2022-10-28 18:50:44 发布

doc文件与docx文件不同

存储方式不同：doc 是二进制存储；docx 是 zip 打包文件（解压后可以看到内部的文件结构，主要由 xml 等文件组成）；
docx易于跨平台，docx更小；
docx 处理公式、表格、图片等复杂对象更得心应手，因为这些对象都可以通过直接修改 xml 配置来处理。

1. python-docx

# pip install python-docx
import docx

# Open the Word document. `path` is not defined in this snippet; set it to
# the .docx file you want to read before running.
doc = docx.Document(path)

# Print the text of every paragraph.
for para in doc.paragraphs:
    print(para.text)

# Print every paragraph together with its index. enumerate() reads
# doc.paragraphs once instead of re-evaluating it on every iteration.
for i, para in enumerate(doc.paragraphs):
    print(i, para.text)

# Replace a target string even when it is spread over several runs of a
# paragraph. Run texts are accumulated in `tmp`; as soon as the accumulated
# text contains the target, the whole accumulated text (with the replacement
# applied) is written into the current run and the earlier runs stay empty.
# The document is only changed in memory here; nothing in this snippet saves it.
doc = docx.Document('test.docx')
for paragraph in doc.paragraphs:
    tmp = ''
    runs = paragraph.runs
    last_index = len(runs) - 1
    for i, run in enumerate(runs):
        tmp += run.text  # merge the run texts seen since the last match
        if '需要替换的字符串' in tmp:
            # Match found: the current run receives the merged text with the
            # replacement applied, and accumulation starts over.
            run.text = tmp.replace('需要替换的字符串', '我是替换后的字符串')
            tmp = ''
        elif i == last_index:
            # Last run and no match pending: flush the text that is still
            # buffered. This must be an `elif`: running it after a match on
            # the last run would overwrite the replaced text with ''.
            run.text = tmp
        else:
            # No match yet: the text now lives in `tmp`, so empty this run.
            run.text = ''


def _mask_sensitive_text(text):
    """Collapse whitespace in *text* and mask every detected sensitive item.

    Each whitespace sequence is replaced by a single space, then every string
    reported by ``re_tmp`` and ``name_identify`` is replaced by an equally
    long run of ``'X'``.

    Returns:
        A ``(masked_text, found)`` tuple; ``found`` is True when at least one
        item was reported by the detectors.
    """
    text = re.sub(r"\s+", " ", text)
    # re_tmp / name_identify are defined elsewhere; the original code
    # concatenates their results with `+`, so both are presumably lists of
    # strings -- TODO confirm.
    sensitive_datas = re_tmp(text)
    names = name_identify([text])
    if sensitive_datas:
        names = names + sensitive_datas
    for name in names:
        text = text.replace(name, 'X' * len(name))
    return text, bool(names)


def docx_inplace_replace(file):
    """Mask sensitive text in a .docx file and save the result as a new file.

    Every run of every body paragraph and of every paragraph inside a table
    cell is checked on its own, so an item that is split across two runs is
    not seen as one string. The result is saved to
    ``rreplace(file, '.docx', '', 1) + '_.docx'`` and then passed to
    ``remove_header_footer``.

    Args:
        file: Path of the source .docx document.

    Returns:
        The path of the newly written document.
    """
    # rreplace is defined elsewhere; presumably it strips the last '.docx'
    # occurrence from the path -- verify against its definition.
    file_ = rreplace(file, '.docx', '', 1)
    new_file = file_ + '_.docx'
    doc = docx.Document(file)

    # Body paragraphs: a run is rewritten only when something was masked.
    for paragraph in doc.paragraphs:
        for run in paragraph.runs:
            masked, found = _mask_sensitive_text(run.text)
            if found:
                run.text = masked

    # Table cells: every run is rewritten, so whitespace is collapsed even
    # when nothing was masked.
    # NOTE(review): this differs from the body-paragraph handling above; it
    # is kept as in the original -- confirm whether it is intentional.
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    for run in paragraph.runs:
                        masked, _ = _mask_sensitive_text(run.text)
                        run.text = masked

    # Save the document, then post-process the saved file in place.
    doc.save(new_file)
    remove_header_footer(new_file, new_file)
    return new_file

2. 解压处理xml 数据

import zipfile
import os
import re
import tempfile
import shutil
from functools import reduce
#1. 获取xml 字符串
def getXml(docxFilename):
    """Return the main document part of a .docx file as a string.

    Args:
        docxFilename: Path of the .docx (zip) file to read.

    Returns:
        The content of ``word/document.xml`` decoded as UTF-8.

    Raises:
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    # The context manager closes the archive and its file handle, which the
    # previous open()/ZipFile pair never did.
    with zipfile.ZipFile(docxFilename) as archive:
        xml_bytes = archive.read("word/document.xml")
    return xml_bytes.decode('utf-8')
#2.  TODO 对xml 字符串进行替换处理

#3. 封装回docx 文件
def createNewDocx(originalDocx, xmlContent, newFilename):
    """Write a copy of a .docx file with ``word/document.xml`` replaced.

    The original archive is expanded into a temporary directory, the modified
    XML is written over ``word/document.xml`` there, and the members listed
    in the original archive are zipped up again, in the same order, as the
    new file.

    Args:
        originalDocx: Path of the source .docx file.
        xmlContent: New content of ``word/document.xml`` as a string; it is
            stored UTF-8 encoded.
        newFilename: Path of the .docx file to create. It may be the same
            path as ``originalDocx``, because the source archive is fully
            extracted and closed before the output is opened.
    """
    # TemporaryDirectory removes the directory even when a step below raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        with zipfile.ZipFile(originalDocx) as source:
            source.extractall(tmp_dir)
            # Keep the original member list so names and order are preserved.
            filenames = source.namelist()

        # Write bytes so that no newline translation happens on any platform.
        document_path = os.path.join(tmp_dir, "word/document.xml")
        with open(document_path, "wb") as f:
            f.write(xmlContent.encode('utf-8'))

        # Compress the members; the zipfile default (ZIP_STORED) would write
        # them uncompressed.
        with zipfile.ZipFile(newFilename, "w", zipfile.ZIP_DEFLATED) as target:
            for filename in filenames:
                target.write(os.path.join(tmp_dir, filename), filename)

技术共进，成长同行——讯飞AI开发者社区

更多推荐

YOLOv8【卷积创新篇·第25节】Capsule Network胶囊卷积网络：让检测器拥有“空间想象力”！

讯飞AI开发者社区

一阶谓词逻辑及其重要子集对人工智能自然语言处理深层语义分析的影响与启示

讯飞AI开发者社区

机器学习与人工智能

python# 创建基类# 定义一对多关系# 定义多对一关系# 定义多对多关系（通过关联表）# 关联表（用于多对多关系）SQLAlchemy ORM提供了强大而灵活的数据库操作方式，通过本文的介绍，您应该能够：安装和配置SQLAlchemy定义数据模型和关系执行基本的CRUD操作构建复杂查询管理数据库事务遵循最佳实践SQLAlchemy还有更多高级特性，如混合属性、事件监听、自定义查询等，值得进一