Python爬虫实战(一):爬取豆瓣电影top250排名
【摘要】 先上代码
#coding=utf-8import reimport urllib.request def getHtml(url): page = urllib.request.urlopen(url) html = page.read() html = html.decode('utf-8') return html def getItem(html): reg...
先上代码
-
#coding=utf-8
-
import re
-
import urllib.request
-
-
def getHtml(url, timeout=10):
    """Download ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        The page content as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for an
            HTTP error status).
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Send a browser-like User-Agent: some sites refuse urllib's default
    # "Python-urllib/x.y" identifier.
    request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    # The context manager closes the response even if read() raises.
    with urllib.request.urlopen(request, timeout=timeout) as page:
        return page.read().decode('utf-8')
-
-
def getItem(html):
    """Print every movie found in ``html`` with a running rank number.

    Each printed line is ``<rank> (title, year, rating, votes)``. The rank
    continues from the module-level ``index`` and that global is updated to
    the last rank printed, so consecutive pages are numbered continuously.

    Args:
        html: Markup of one Top250 list page.
    """
    global index
    # Captures, in order: title, release year, average rating, vote count.
    # re.S lets '.' cross line breaks because one entry spans several lines.
    # The pattern starts at the title tag: findall() already scans forward,
    # so a leading '.*?' adds backtracking without changing the captures.
    reg = re.compile(r'<span class="title">(.*?)</span>.*?<p class="">.*?(\d+).*?</p>.*?<span class="rating_num" property="v:average">(.*?)</span>.*?<span>(\d+)人评价', re.S)
    items = reg.findall(html)
    # Fall back to 0 when the caller never initialised the global counter,
    # instead of failing with NameError.
    start = globals().get("index", 0) + 1
    for index, item in enumerate(items, start):
        print(index, item)
-
-
-
if __name__ == '__main__':
    # Running rank shared with getItem() through its ``global index``.
    index = 0
    # Ten list pages, 25 entries each: start = 0, 25, ..., 225.
    for offset in range(0, 226, 25):
        url = "https://movie.douban.com/top250?start=" + str(offset) + "&filter="
        getItem(getHtml(url))

    print("\nOK!All OVER!")
#关于正则表达式的一些说明
#<span class="title">(.*?)</span> 获取电影名字
#<p class="">.*?(\d+) 获取电影上映年份
#<span class="rating_num" property="v:average">(.*?)</span> 获取评分
#</span>.*?<span>(\d+)人评价 获取评价人数
如果想要代码看起来更优雅点,可以去掉
global index
index = 0
再将 index+1 改成 1。这样做的代价是:每一页的序号都会从 1 重新开始,无法跨页连续编号。
-
#coding=utf-8
-
import re
-
import urllib.request
-
-
def getHtml(url, timeout=10):
    """Fetch ``url`` and return its body as UTF-8 decoded text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        The page content as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for an
            HTTP error status).
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # A browser-like User-Agent is sent because some sites refuse urllib's
    # default "Python-urllib/x.y" identifier.
    request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    # Close the response deterministically, even when read() raises.
    with urllib.request.urlopen(request, timeout=timeout) as page:
        raw = page.read()
    return raw.decode('utf-8')
-
-
def getItem(html):
    """Print every movie found in ``html``, numbered from 1.

    Each printed line is ``<n> (title, year, rating, votes)``. Numbering
    restarts at 1 on every call, i.e. for every page.

    Args:
        html: Markup of one Top250 list page.
    """
    # Captures, in order: title, release year, average rating, vote count.
    # re.S lets '.' cross line breaks because one entry spans several lines.
    # The pattern starts at the title tag: findall() already scans forward,
    # so a leading '.*?' adds backtracking without changing the captures.
    reg = re.compile(r'<span class="title">(.*?)</span>.*?<p class="">.*?(\d+).*?</p>.*?<span class="rating_num" property="v:average">(.*?)</span>.*?<span>(\d+)人评价', re.S)
    for index, item in enumerate(reg.findall(html), 1):
        print(index, item)
-
-
-
if __name__ == '__main__':
    # Ten list pages, 25 entries each: start = 0, 25, ..., 225.
    for offset in range(0, 226, 25):
        url = "https://movie.douban.com/top250?start=" + str(offset) + "&filter="
        getItem(getHtml(url))

    print("\nOK!All OVER!")
-
#coding=utf-8
-
import re
-
import requests
-
from prettytable import PrettyTable
-
from colorama import init,Fore
-
-
def getHtml(url, timeout=10):
    """Download ``url`` with requests and return the decoded body text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; without
            a timeout a stalled connection can block indefinitely.

    Returns:
        The response body as ``str`` (``Response.text``).

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status.
    """
    # Send a browser-like User-Agent: some sites refuse the library default.
    headers = {"User-Agent": "Mozilla/5.0"}
    page = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to the parser,
    # which would silently yield zero movies.
    page.raise_for_status()
    return page.text
-
-
def getItem(html):
    """Extract the movie entries from one Top250 list page.

    Args:
        html: Markup of the page.

    Returns:
        A list of ``(title, year, rating, votes)`` string tuples in page
        order; empty when nothing matches.
    """
    # Captures, in order: title, release year, average rating, vote count.
    # re.S lets '.' cross line breaks because one entry spans several lines.
    # The pattern starts at the title tag: findall() already scans forward,
    # so a leading '.*?' adds backtracking without changing the captures.
    reg = re.compile(r'<span class="title">(.*?)</span>.*?<p class="">.*?(\d+).*?</p>.*?<span class="rating_num" property="v:average">(.*?)</span>.*?<span>(\d+)人评价', re.S)
    return reg.findall(html)
-
-
-
if __name__ == '__main__':
    # autoreset makes colorama emit a reset after each print call.
    init(autoreset=True)
    table = PrettyTable([Fore.RED + "排名", "电影名", '上映年份', '综合评分', '评价人数'])

    # Collect the entries of all ten pages (start = 0, 25, ..., 225).
    movie = []
    for offset in range(0, 226, 25):
        url = "https://movie.douban.com/top250?start=" + str(offset) + "&filter="
        movie.extend(getItem(getHtml(url)))

    # Row colour cycles with rank % 4: 0 red, 1 yellow, 2 green, 3 cyan.
    palette = (Fore.RED, Fore.YELLOW, Fore.GREEN, Fore.CYAN)
    for rank, entry in enumerate(movie, 1):
        table.add_row([palette[rank % 4] + str(rank), entry[0], entry[1], entry[2], entry[3]])

    print(table)
文章来源: blog.csdn.net,作者:悦来客栈的老板,版权归原作者所有,如需转载,请联系作者。
原文链接:blog.csdn.net/qq523176585/article/details/77836194
【版权声明】本文为华为云社区用户转载文章,如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱:
cloudbbs@huaweicloud.com
- 点赞
- 收藏
- 关注作者
评论(0)