爬虫案例
【摘要】
一、壁纸网站
本文整理了八个 Python 爬虫实战案例:壁纸网站、彼岸壁纸、快手短视频、拉勾招聘数据、王者荣耀皮肤壁纸、美图网站、表情包与酷狗音乐,均基于 requests 实现,并演示了 parsel 选择器与正则两种解析方式。
一、壁纸网站
-
# coding=utf-8
"""
Author: gaojs
Purpose: download 1920x1080 wallpapers from netbian.com list pages 2-10.
Date: 2022/3/25 19:35
"""
import os.path
import re

import requests
import parsel


def get_address():
    """Crawl wallpaper list pages and save every non-ad image under photo/.

    Side effects: creates the photo/ directory if missing, writes one .jpg
    per wallpaper, and prints progress to stdout. Returns None.
    """
    dirname = 'photo/'
    if not os.path.exists(dirname):
        os.mkdir(dirname)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.46'
    }

    for page in range(2, 11):
        print(f'=====================正在爬取第{page}页内容========================')
        url = f'http://www.netbian.com/1920x1080/index_{page}.htm'
        res = requests.get(url, headers=headers)
        # The site serves a non-UTF-8 charset; let requests detect it.
        res.encoding = res.apparent_encoding

        selector = parsel.Selector(res.text)
        for lis in selector.css('.list li'):
            title = lis.css('b::text').get()
            # Ad entries carry no <b> title -- skip them.
            if not title:
                continue
            list_url = 'http://www.netbian.com' + lis.css('a::attr(href)').get()
            detail = parsel.Selector(requests.get(list_url, headers=headers).text)
            img_url = detail.css('.pic img::attr(src)').get()

            # BUGFIX: strip characters that are illegal in file names,
            # otherwise open() raises OSError for some titles.
            safe_title = re.sub(r'[\\/:*?"<>|]', '', title)
            img_content = requests.get(img_url, headers=headers).content
            with open(dirname + safe_title + '.jpg', 'wb') as f:
                f.write(img_content)
            print(title, img_url)


get_address()
二、彼岸壁纸
-
# coding=utf-8
"""
Author: gaojs
Purpose: crawl netbian.com front page wallpapers using regex parsing.
Date: 2022/4/2 14:59
"""
import os.path
import re

import requests


if not os.path.exists('photo/'):
    os.mkdir('photo/')

url = 'http://www.netbian.com'
# List page:   http://www.netbian.com/index_2.htm
# Detail page: http://www.netbian.com/desk/26344-1920x1080.htm
headers = {
    'Host': 'www.netbian.com',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
    'Upgrade-Insecure-Requests': '1',
    'Cookie': '__yjs_duid=1_4535c561a20964f1ade88776981a0f411648389371877; Hm_lvt_0f461eb489c245a31c209d36e41fcc0f=1648389374,1648986956; Hm_lpvt_0f461eb489c245a31c209d36e41fcc0f=1648986956'
}
rsp = requests.get(url, headers=headers)
rsp.encoding = rsp.apparent_encoding

# Each list entry looks like (note: no space between href value and title):
# <a href="..."title="..." target="_blank"><img src="..." alt="..." />
url_list = re.findall('<a href="(.*?)"title="(.*?)" target="_blank"><img src=".*?" alt=".*?" />', rsp.text)

for index in url_list:
    detail_path, title = index
    new_url = url + detail_path

    # BUGFIX: pass the same headers as the list request -- the detail page
    # was previously fetched without them.
    rsp1 = requests.get(new_url, headers=headers)
    rsp1.encoding = rsp1.apparent_encoding
    img_list = re.findall('<a href=".*?" target="_blank"><img src="(.*?)" alt="(.*?)" title=".*?"></a>', rsp1.text)

    for img_url, img_title in img_list:
        content_data = requests.get(img_url).content
        # BUGFIX: remove characters that are illegal in file names.
        safe_name = re.sub(r'[\\/:*?"<>|]', '', img_title)
        with open('photo/' + safe_name + '.jpg', 'wb') as f:
            f.write(content_data)
        print(f'***************正在爬取{title}中****************')
三、某手视频
-
# coding=utf-8
"""
Author: gaojs
Purpose: download all short videos of one Kuaishou profile via its GraphQL API.
Date: 2022/4/15 20:13
"""
import json
import os.path
import pprint
import re

import requests


def get_page(pcursor):
    """Download every video of user 3xhv7zhkfr3rqag, following pagination.

    :param pcursor: cursor of the first page to fetch ("" for the start).
    Side effects: creates video/, writes one .mp4 per video, prints progress.
    """
    path = 'video/'
    if not os.path.exists(path):
        os.mkdir(path)
    # Target profile: 'https://www.kuaishou.com/profile/3xhv7zhkfr3rqag'
    url = 'https://www.kuaishou.com/graphql'
    headers = {
        'content-type': 'application/json',
        'Cookie': 'kpf=PC_WEB; kpn=KUAISHOU_VISION; clientid=3; did=web_72314bf978cb158dd7034b2370d2ae70',
        'Host': 'www.kuaishou.com',
        'Origin': 'https://www.kuaishou.com',
        'Referer': 'https://www.kuaishou.com/short-video/3x6v3xmcjsd5cki?authorId=3xhv7zhkfr3rqag&streamSource=profile&area=profilexxnull',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
    }

    # BUGFIX: paginate with a loop instead of recursion -- a profile with
    # many pages previously blew Python's recursion limit.
    while True:
        data = {
            "operationName": "visionProfilePhotoList",
            "query": "query visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {\n visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {\n result\n llsid\n webPageArea\n feeds {\n type\n author {\n id\n name\n following\n headerUrl\n headerUrls {\n cdn\n url\n __typename\n }\n __typename\n }\n tags {\n type\n name\n __typename\n }\n photo {\n id\n duration\n caption\n likeCount\n realLikeCount\n coverUrl\n coverUrls {\n cdn\n url\n __typename\n }\n photoUrls {\n cdn\n url\n __typename\n }\n photoUrl\n liked\n timestamp\n expTag\n animatedCoverUrl\n stereoType\n videoRatio\n profileUserTopPhoto\n __typename\n }\n canAddComment\n currentPcursor\n llsid\n status\n __typename\n }\n hostName\n pcursor\n __typename\n }\n}\n",
            "variables": {"userId": "3xhv7zhkfr3rqag", "pcursor": pcursor, "page": "detail", "webPageArea": "profilexxnull"}
        }
        rsp = requests.post(url=url, json=data, headers=headers)
        json_data = rsp.json()
        url_list = json_data['data']['visionProfilePhotoList']['feeds']
        pcursor = json_data['data']['visionProfilePhotoList']['pcursor']

        for key in url_list:
            # Video caption and direct media URL.
            title = key['photo']['caption']
            new_url = key['photo']['photoUrl']
            content_data = requests.get(url=new_url).content
            # BUGFIX: captions may contain newlines or characters that are
            # illegal in file names -- strip them before use as a path.
            safe_title = re.sub(r'[\\/:*?"<>|\n]', '', title)
            with open(f'video/{safe_title}.mp4', mode='wb') as f:
                f.write(content_data)
            print(f'=======================正在下载标题为 {title} 的快手短视频==========================')

        # "no_more" marks the last page of the profile feed.
        if pcursor == "no_more":
            break


get_page("")
四、拉钩数据
-
# coding=utf-8
"""
Author: gaojs
Purpose: scrape python job listings from lagou.com into a CSV plus one
         job-description .txt file per position.
Date: 2022/4/3 17:58
"""
import csv
import json
import os.path
import pprint

import requests
import re


if not os.path.exists('info/'):
    os.makedirs('info/')

# Keep the CSV handle name distinct from the per-job txt handle below,
# which previously shadowed it as `f`.
csv_file = open('info/招聘.csv', encoding='utf-8', mode='a', newline='')
csv_writer = csv.DictWriter(csv_file, fieldnames=[
    '职位名字',
    '公司名字',
    '工作城市',
    '学历要求',
    '经验要求',
    '薪资要求',
    '公司地址',
    '详情页'
])
# Write the header row once per run.
csv_writer.writeheader()

for page in range(1, 11):
    url = f'https://www.lagou.com/wn/jobs?pn={page}&fromSearch=true&kd=python'
    headers = {
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/utrack/trackMid.html?f=https%3A%2F%2Fwww.lagou.com%2Fwn%2Fjobs%3Fpn%3D2%26fromSearch%3Dtrue%26kd%3Dpython&t=1648984113&_ti=1',
        'Cookie': 'user_trace_token=20211122110451-60eec88a-fbaf-47fd-9a53-188f3632144b; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1637550277; _ga=GA1.2.1219095688.1637550277; LGUID=20211122110452-94ffa347-2c46-4c2d-8429-b83e30e86693; RECOMMEND_TIP=true; __lg_stoken__=9ec31e7a3301bab4f215bd5f80c8af0ab0dc2b8ce81af654fe848cf33ad7c4f33d0748020b30281d56a28a756342ce5d42e6c218bcfd56dbf764c51686741cbaf14de987ef24; JSESSIONID=ABAAABAABEIABCIA45B6C458598FF70789BDFD5A4574786; WEBTJ-ID=20220403173842-17feeca7ea0402-090b1b6ee61841-a3e3164-3686400-17feeca7ea15f1; sensorsdata2015session=%7B%7D; X_HTTP_TOKEN=1ca92d1d8ffe4ecb3114898461b10fa2c7054519c6; X_MIDDLE_TOKEN=3e27b9a5a69f9fa78d5d2fe99174c9c5; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%229659966%22%2C%22%24device_id%22%3A%2217d459f2858540-02719bae0efae1-4343363-2073600-17d459f2859704%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%2298.0.4758.102%22%7D%2C%22first_id%22%3A%2217d459f2858540-02719bae0efae1-4343363-2073600-17d459f2859704%22%7D',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
    }

    rsp = requests.get(url=url, headers=headers)
    print(rsp.status_code)
    # The job data is embedded as JSON in the Next.js bootstrap script tag.
    html_data = re.findall('<script id="__NEXT_DATA__" type="application/json">(.*?)</script>', rsp.text)[0]
    json_data = json.loads(html_data)
    result = json_data['props']['pageProps']['initData']['content']['positionResult']['result']

    for index in result:
        # Job description with HTML line breaks stripped.
        job_index = index['positionDetail'].replace('<br />', '').replace('<br>', '')
        # NOTE(review): lagou detail URLs normally contain '/' before the id
        # ('/wn/jobs/{id}.html') -- verify this link format against the site.
        href = f'https://www.lagou.com/wn/jobs{index["positionId"]}.html'
        dict1 = {
            '职位名字': index['positionName'],
            '公司名字': index['companyFullName'],
            '工作城市': index['city'],
            '学历要求': index['education'],
            '经验要求': index['workYear'],
            # BUGFIX: salary column previously copied index['workYear'],
            # duplicating the experience column.
            '薪资要求': index['salary'],
            '公司地址': index['positionAddress'],
            '详情页': href
        }
        csv_writer.writerow(dict1)
        title = index['positionName'] + index['companyFullName']
        # Strip characters that are illegal in file names.
        new_title = re.sub(r'[\/?:"<>|]', '', title)
        with open('info/' + new_title + '.txt', 'w', encoding='utf-8') as txt_file:
            txt_file.write(job_index)
        print(dict1)

# BUGFIX: the CSV handle was never closed; buffered rows could be lost.
csv_file.close()
五、王者荣耀英雄皮肤高清壁纸
-
# coding=utf-8
"""
Author: gaojs
Purpose: download every hero skin wallpaper from the Honor of Kings site.
Date: 2022/4/2 13:05
"""

import requests
import os
import re


# BUGFIX: the output directory was never created (the mkdir code was
# commented out), so open('photo/...') crashed on a fresh run.
os.makedirs('photo/', exist_ok=True)

url = 'https://pvp.qq.com/web201605/js/herolist.json'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.55'
}
rsp = requests.get(url, headers=headers)
print(rsp.status_code)

for index in rsp.json():
    # Hero display name and numeric id used in asset URLs.
    hero_name = index['cname']
    hero_id = index['ename']

    index_url = f'https://pvp.qq.com/web201605/herodetail/{hero_id}.shtml'
    rsp1 = requests.get(url=index_url, headers=headers)
    # Auto-detect the page encoding (the site is not UTF-8).
    rsp1.encoding = rsp1.apparent_encoding

    # Skin names live in the data-imgname attribute, '|'-separated, with
    # '&NN' suffixes to strip, e.g. "name1&0|name2&1".
    title_list = re.findall('<ul class="pic-pf-list pic-pf-list3" data-imgname="(.*?)">', rsp1.text)[0]
    # Raw string avoids the invalid-escape SyntaxWarning for \d.
    title_list = re.sub(r'&\d+', '', title_list).split('|')

    for num in range(1, len(title_list) + 1):
        # Big-skin image URLs are numbered from 1 per hero.
        img_url = f'https://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{hero_id}/{hero_id}-bigskin-{num}.jpg'
        img_title = title_list[num - 1]

        img_data = requests.get(url=img_url, headers=headers).content
        # Strip characters that are illegal in file names.
        safe_title = re.sub(r'[\\/:*?"<>|]', '', img_title)
        with open('photo/' + safe_title + '.jpg', 'wb') as f:
            print(f'=====================正在爬取{hero_name}的皮肤========================')
            f.write(img_data)
六、美图网站
-
# coding=utf-8
"""
Author: gaojs
Purpose: download all images of one vmgirls.com gallery page.
Date: 2022/3/26 12:17
"""
import os.path
from time import sleep

import requests
import re


dirname = 'photo/'
if not os.path.exists(dirname):
    os.mkdir(dirname)

url = 'https://www.vmgirls.com/17081.html'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.46'
}
res = requests.get(url, headers=headers)
print(res.status_code)

# Match only the href group of each image anchor.
url_list = re.findall('<a href="(.*?)" alt=".*?" title=".*?">', res.text)
print(url_list)

for urls in url_list:
    # The last path segment already carries its extension (e.g. 'abc.jpeg').
    # BUGFIX: do not append '.jpeg' again -- files were previously saved
    # as 'abc.jpeg.jpeg'.
    name = urls.split('/')[-1]
    new_url = 'https:' + urls
    res_content = requests.get(url=new_url, headers=headers).content
    # Be polite to the server between downloads.
    sleep(2)
    with open(dirname + name, mode='wb') as f:
        f.write(res_content)
七、表情包
-
# coding=utf-8
"""
Author: gaojs
Purpose: download meme images from fabiaoqing.com list pages.
Date: 2022/3/25 17:35
"""
import os

import requests
import re


def download_photo(name, url):
    """Fetch one image and save it under photo/ with its original suffix.

    :param name: image title used as the file name (sanitized here).
    :param url: direct image URL; its extension is kept (.jpg/.gif/...).
    """
    # BUGFIX: ensure the target directory exists before writing.
    os.makedirs('photo/', exist_ok=True)
    res = requests.get(url)
    print(res.status_code)
    suffix = url.split('.')[-1]
    # BUGFIX: titles may contain characters illegal in file names
    # (the original sanitizer was commented out).
    safe_name = re.sub(r'[\\/:*?;"<>|\n]', '_', name)
    with open('photo/' + safe_name + '.' + suffix, 'wb') as f:
        f.write(res.content)


"""
Example image URLs:
http://tva1.sinaimg.cn/large/6a2a7a61ly1gy5fd1pb7ij20iz0iz41l.jpg
http://tva1.sinaimg.cn/large/6a2a7a61ly1gy5fd3od4lg208w08wdvb.gif

List page: https://www.fabiaoqing.com/bqb/lists/page/3.html
"""


def download_page(url):
    """Parse one list page and download every lazy-loaded image on it.

    :param url: list page URL, e.g. .../biaoqing/lists/page/3.html
    """
    res1 = requests.get(url)
    # Matches (data-original URL, title, alt) of each lazy-loaded <img>.
    temp = '<img class="ui image lazy" data-original="(.*?)" src="/Public/lazyload/img/transparent.gif" title="(.*?)" alt="(.*?)" style="max-height:188;margin: 0 auto"/>'
    result1 = re.findall(temp, res1.text)
    print(result1)
    for img in result1:
        print(img)
        # img = (image_url, title, alt) -- title becomes the file name.
        download_photo(img[1], img[0])


def download_all_page():
    """Walk list pages 1-49 and download everything on each."""
    for page in range(1, 50):
        pages = 'https://www.fabiaoqing.com/biaoqing/lists/page/' + str(page) + '.html'
        download_page(pages)


download_all_page()
八、酷狗music
-
# coding=utf-8
"""
Author: gaojs
Purpose: download the kugou.com ranking-page songs as .mp3 files.
Date: 2022/4/8 12:59
"""
import os.path
import pprint
import re

import requests


if not os.path.exists('music/'):
    os.mkdir('music/')
url = 'https://www.kugou.com/yy/html/rank.html'

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36'
}

rsp = requests.get(url, headers=headers)
# Hash + album_id pairs embedded in the page JS identify each track.
hash_list = re.findall('"Hash":"(.*?)"', rsp.text)
album_list = re.findall('"album_id":(.*?),', rsp.text)

for hash1, album_id in zip(hash_list, album_list):
    index_url = 'https://wwwapi.kugou.com/yy/index.php'
    data = {
        'r': 'play/getdata',
        'hash': hash1,
        'dfid': '34dlds4MjPyk0XgC5n0MobxL',
        'appid': '1014',
        'mid': 'fbcb28bbcbd1758696a1eb4363b645d6',
        'platid': '4',
        'album_id': album_id,
        '_': '1649395118742'
    }
    rsp1 = requests.get(url=index_url, params=data, headers=headers)
    audioname = rsp1.json()['data']['audio_name']
    playurl = rsp1.json()['data']['play_url']

    # BUGFIX: VIP-only tracks come back with an empty play_url;
    # requests.get('') raises MissingSchema and killed the whole loop.
    if not playurl:
        print(f'*************************歌曲{audioname}无播放地址,跳过***********************')
        continue

    music_content = requests.get(url=playurl, headers=headers).content
    # BUGFIX: use the same 'music/' separator as the mkdir above --
    # 'music\\' produced broken file names on non-Windows systems.
    safe_name = re.sub(r'[\\/:*?"<>|]', '', audioname)
    with open('music/' + safe_name + '.mp3', 'wb') as f:
        print(f'*************************正在爬取歌曲{audioname}中***********************')
        f.write(music_content)
文章来源: blog.csdn.net,作者:懿曲折扇情,版权归原作者所有,如需转载,请联系作者。
原文链接:blog.csdn.net/qq_41332844/article/details/126837335
【版权声明】本文为华为云社区用户转载文章,如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱:
cloudbbs@huaweicloud.com
- 点赞
- 收藏
- 关注作者
评论(0)