python通过pdfminer或pdfminer3k读取pdf文件
【摘要】 python3
pip install pdfminer3k
1
# -*- encoding: utf-8 -*-
try: from urllib.request import urlopen
except: from urllib import urlopen
from io import StringIO
from pdfminer.pdfinterp...
python3
pip install pdfminer3k
- 1
# -*- encoding: utf-8 -*-
try: from urllib.request import urlopen
except: from urllib import urlopen
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
# 读取pdf的函数,返回内容
def readPdf(pdf_file): rsrcmgr = PDFResourceManager() retstr = StringIO() laparams = LAParams() device = TextConverter(rsrcmgr=rsrcmgr, outfp=retstr, laparams=laparams) process_pdf(rsrcmgr=rsrcmgr, device=device, fp=pdf_file) device.close() content = retstr.getvalue() retstr.close() return content
url = "http://www.pythonscraping.com/pages/warandpeace/chapter1.pdf"
pdf_file = urlopen(url) # 也可以换成本地pdf文件,用open rb模式打开
content = readPdf(pdf_file)
print(content)
pdf_file.close()
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
python2
下载:https://pypi.python.org/pypi/pdfminer/
pip install pdfminer
- 1
from cStringIO import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
def readPdf2(path): rsrcmgr = PDFResourceManager() retstr = StringIO() device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams()) interpreter = PDFPageInterpreter(rsrcmgr, device) with open(path, 'rb') as fp: for page in PDFPage.get_pages(fp, set()): interpreter.process_page(page) text = retstr.getvalue() device.close() retstr.close() return text
text = readPdf2("path")
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
文章来源: pengshiyu.blog.csdn.net,作者:彭世瑜,版权归原作者所有,如需转载,请联系作者。
原文链接:pengshiyu.blog.csdn.net/article/details/79918271
【版权声明】本文为华为云社区用户转载文章,如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱:
cloudbbs@huaweicloud.com
- 点赞
- 收藏
- 关注作者
评论(0)