Python 读取 PDF 内容(推荐 pdfplumber)
摘要:在 Python 2.7 环境下,分别使用 PyPDF2(read_pdf_test1)和 pdfplumber(test2)读取 PDF 的文本内容,推荐使用 pdfplumber;完整代码见下文。
·
环境:Python 2.7
# coding=utf-8
import PyPDF2
def read_pdf_test1(file_path):
    """Print the text of every page of a PDF, extracted with PyPDF2.

    The text of all pages is concatenated, every newline, carriage return
    and space character is stripped out, and the result is printed as a
    single ``contents = ...`` line.

    Args:
        file_path: Path of the PDF file to read; opened in binary mode.

    Returns:
        None. The extracted text is only printed.
    """
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfFileReader(f)
        if reader.isEncrypted:
            # Attempt to unlock the document with an empty password.
            # NOTE(review): the result of decrypt() is not checked, so a PDF
            # protected by a real password will presumably fail on the page
            # access below -- confirm the desired behaviour for that case.
            reader.decrypt('')
        page_count = reader.getNumPages()
        # Use a dedicated index name (the original reused the page-count
        # variable as the loop variable) and ``range`` so the function is
        # not tied to the Python 2-only ``xrange``.
        page_texts = [
            reader.getPage(page_index).extractText()
            for page_index in range(page_count)
        ]
    contents = ''.join(page_texts)
    contents = contents.replace('\n', '').replace(' ', '').replace('\r', '')
    print("contents = {}".format(contents))
def test2(file_path):
    """Print the text of every page of a PDF, extracted with pdfplumber.

    A header line with the page number is printed for each page while the
    page texts are collected; the concatenated text of all pages is then
    printed as a single ``contents = ...`` line.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        None. The extracted text is only printed.
    """
    import pdfplumber  # pdfplumber==0.5.13

    page_texts = []
    with pdfplumber.open(file_path) as pdf:
        # len(pdf.pages) gives the number of pages if it is ever needed.
        for page in pdf.pages:
            print('---------- 第[%d]页 ----------' % page.page_number)
            # Get all text on the current page, including text inside tables.
            # extract_text() can return None for a page without extractable
            # characters (e.g. a blank or image-only page); treat that as an
            # empty string instead of failing on ``str + None``.
            page_texts.append(page.extract_text() or '')
    contents = ''.join(page_texts)
    print("contents = {}".format(contents))
if __name__ == "__main__":
    # Script entry point: run the pdfplumber-based reader on a sample PDF.
    #
    # Other sample inputs that were used with this script:
    #   './dlp_dengxian.pdf'
    #   './dlp_yuanyue.pdf'
    file_path = './dlp_yuanyue2.pdf'

    # The PyPDF2-based variant, read_pdf_test1(file_path), is intentionally
    # not invoked here; only the pdfplumber-based reader runs.
    test2(file_path)
更多推荐
所有评论(0)