python 按照word顺序遍历段落文本和表格,提取自动编号
用python-docx库来实现如何获取自动编号的,但是发现没有实现想要的效果。
·
搜索了很多用 python-docx 库获取自动编号的方法,但都没有实现想要的效果。
下面基于 pywin32 库,将 Word 文档的内容(包含文本和表格)按顺序提取出来。但有一个问题:表格内的每个段落都会重复命中同一张表格,因此需要记录已处理过的表格范围来去重,这增加了很多工作量,也导致速度下降。
不知道大佬们有没有更好的办法,既能实现这个效果,又能把速度提上来?
import win32com.client as win32
def extract_table_data(table):
    """Flatten a Word table into one text line per row.

    Args:
        table: An object exposing ``Range.Cells``; each cell must provide
            ``RowIndex`` and ``Range.Text`` (e.g. a Word ``Table`` COM object).

    Returns:
        A list of strings, one per row in ascending row-index order, with the
        row's cell texts joined by a single space. An empty list is returned
        when the table has no cells, so the result can always be passed to
        ``list.extend``.
    """
    rows_content = {}
    for cell in table.Range.Cells:
        # Remove the paragraph mark and the end-of-cell marker first, then
        # strip: '\x07' is not whitespace, so stripping beforehand would leave
        # any whitespace that sits in front of the trailing markers.
        cell_text = cell.Range.Text.replace('\r', '').replace('\x07', '').strip()
        rows_content.setdefault(cell.RowIndex, []).append(cell_text)
    # Join each row's cell contents into a single string.
    return [' '.join(rows_content[row]) for row in sorted(rows_content)]
def extract_content_from_doc(doc_path):
    """Extract paragraph text and table rows from a Word document in order.

    Paragraphs outside tables are emitted as their stripped text, prefixed
    with the automatic list number (``ListFormat.ListString``) when there is
    one. Each table is emitted once, as one line per row, at the position of
    its first paragraph.

    Args:
        doc_path: Path to the document. It is converted to an absolute path
            before being handed to Word.

    Returns:
        A list of strings in document order.
    """
    import os  # local import so this function does not depend on file-level imports

    # Word resolves relative paths against its own working directory rather
    # than Python's, so always pass an absolute path.
    doc_path = os.path.abspath(doc_path)

    content = []
    doc_app = win32.gencache.EnsureDispatch('Word.Application')
    doc = None
    try:
        doc_app.Visible = 0
        doc = doc_app.Documents.Open(doc_path, ReadOnly=True)

        # End offset of the last table already extracted. Paragraphs are
        # visited in document order, so any paragraph starting before this
        # offset lies inside a table that has already been emitted. One
        # integer comparison replaces the list-of-ranges scan and avoids the
        # extra COM round-trips for every paragraph inside a table.
        skip_until = -1
        for paragraph in doc.Paragraphs:
            para_range = paragraph.Range  # fetch once; each access is a COM call
            if para_range.Start < skip_until:
                continue

            tables = para_range.Tables
            if tables.Count == 0:
                paragraph_number = para_range.ListFormat.ListString
                paragraph_text = para_range.Text.strip()
                if paragraph_number:
                    content.append(f'{paragraph_number} {paragraph_text}')
                else:
                    content.append(paragraph_text)
                continue

            for table in tables:
                table_range = table.Range
                if table_range.Start < skip_until:
                    continue
                # Guard against a None result so an empty table cannot break
                # list.extend.
                content.extend(extract_table_data(table) or [])
                skip_until = max(skip_until, table_range.End)
    finally:
        # Always release the document and the Word process, even on errors,
        # otherwise a hidden WINWORD.EXE is left running.
        try:
            if doc is not None:
                doc.Close(False)
        finally:
            doc_app.Quit()
    return content
if __name__ == '__main__':
    # Demo entry point: extract the document and print one item per line.
    for line in extract_content_from_doc(r'your_path.docx'):
        print(line)
更多推荐
所有评论(0)