Web Scraping Examples

Posted by 建帅小伙儿 on 2022/09/25 03:21:12

1. Wallpaper site (netbian.com)
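This script walks pages 2 through 10 of netbian.com's 1920x1080 listing, uses parsel CSS selectors to pull each wallpaper's detail-page link, skips ad entries (they carry no <b> title), and saves the full-size image from every detail page into photo/.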


   
# coding=utf-8
"""
Author: gaojs
Date: 2022/3/25 19:35
"""
import os.path

import requests
import parsel


def get_address():
    """Crawl the wallpaper list pages and download every image."""
    dirname = 'photo/'
    if not os.path.exists(dirname):
        os.mkdir(dirname)
    for page in range(2, 11):
        print(f'===================== Crawling page {page} ========================')
        url = f'http://www.netbian.com/1920x1080/index_{page}.htm'
        # url = 'http://www.netbian.com/1920x1080/'  # the first page has no index_ suffix
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.46'
        }
        res = requests.get(url, headers=headers)
        res.encoding = res.apparent_encoding
        selector = parsel.Selector(res.text)
        # Sample list entry:
        # <img src="http://img.netbian.com/file/2022/0326/small003835uYAUe1648226315.jpg" alt="绿色草地 美女刘亦菲2022年4月日历桌面壁纸护眼">
        url_lis = selector.css('.list li')
        for lis in url_lis:
            title = lis.css('b::text').get()
            # Ad entries carry no <b> title, so this check skips them
            if title:
                list_url = 'http://www.netbian.com' + lis.css('a::attr(href)').get()
                res1 = requests.get(list_url, headers=headers)
                selector1 = parsel.Selector(res1.text)
                img_url = selector1.css('.pic img::attr(src)').get()
                # Save the image
                img_content = requests.get(url=img_url).content
                with open('photo/' + title + '.jpg', 'wb') as f:
                    f.write(img_content)
                print(title, img_url)


get_address()
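One caveat that applies to every script on this page: scraped titles go straight into file paths, and characters such as / \ ? : " < > | are illegal in file names (the Lagou example below strips them for exactly this reason), so a title containing them will crash the open() call. A small helper along these lines can be dropped in front of each save; sanitize_filename is a hypothetical name, not part of the original code:

import re

def sanitize_filename(title: str) -> str:
    """Replace characters that are illegal in file names with underscores."""
    return re.sub(r'[\\/:*?"<>|\r\n]', '_', title).strip()

# Usage: open('photo/' + sanitize_filename(title) + '.jpg', 'wb')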

2. Bi'an wallpapers
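The same netbian.com site again, this time parsed with regular expressions instead of parsel: one pattern pulls the detail-page link and title off the front page, a second pulls the full-size image URL out of each detail page.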


   
# coding=utf-8
"""
Author: gaojs
Date: 2022/4/2 14:59
"""
import os.path
import re

import requests

if not os.path.exists('photo/'):
    os.mkdir('photo/')

url = 'http://www.netbian.com'
# Pagination / detail-page URL patterns:
# http://www.netbian.com/index_2.htm
# http://www.netbian.com/desk/26344-1920x1080.htm
# http://www.netbian.com/desk/26345-1920x1080.htm
headers = {
    'Host': 'www.netbian.com',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
    'Upgrade-Insecure-Requests': '1',
    'Cookie': '__yjs_duid=1_4535c561a20964f1ade88776981a0f411648389371877; Hm_lvt_0f461eb489c245a31c209d36e41fcc0f=1648389374,1648986956; Hm_lpvt_0f461eb489c245a31c209d36e41fcc0f=1648986956'
}
rsp = requests.get(url, headers=headers)
rsp.encoding = rsp.apparent_encoding
# Sample markup being matched:
# <img src="http://img.netbian.com/file/2022/0402/small004425v1bwe1648831465.jpg" alt="lol英雄联盟九尾妖狐 命运之子 阿狸壁纸"/>
url_list = re.findall('<a href="(.*?)"title="(.*?)" target="_blank"><img src=".*?" alt=".*?" />', rsp.text)
for index in url_list:
    url_lis = index[0]
    title = index[1]
    new_url = url + url_lis
    rsp1 = requests.get(new_url)
    rsp1.encoding = rsp1.apparent_encoding
    img_list = re.findall('<a href=".*?" target="_blank"><img src="(.*?)" alt="(.*?)" title=".*?"></a>', rsp1.text)
    for img in img_list:
        img_url = img[0]
        img_title = img[1]
        content_data = requests.get(img_url).content
        with open('photo/' + img_title + '.jpg', 'wb') as f:
            f.write(content_data)
        print(f'*************** Downloading {title} ****************')

3. Kuaishou short videos
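Kuaishou profile pages load their video lists from a GraphQL endpoint, so this script POSTs the visionProfilePhotoList query directly, reads each video's caption and photoUrl from the JSON response, and follows the returned pcursor from page to page until the API answers no_more.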


   
# coding=utf-8
"""
Author: gaojs
Date: 2022/4/15 20:13
"""
import os.path

import requests


def get_page(pcursor):
    path = 'video/'
    if not os.path.exists(path):
        os.mkdir(path)
    # Target profile: https://www.kuaishou.com/profile/3xhv7zhkfr3rqag
    # Individual videos look like:
    # https://www.kuaishou.com/short-video/3xw5fmcf9jdap29?authorId=3xhv7zhkfr3rqag&streamSource=profile&area=profilexxnull
    # https://www.kuaishou.com/short-video/3xf98wc5q2cuxtq?authorId=3xhv7zhkfr3rqag&streamSource=profile&area=profilexxnull
    url = 'https://www.kuaishou.com/graphql'
    headers = {
        'content-type': 'application/json',
        'Cookie': 'kpf=PC_WEB; kpn=KUAISHOU_VISION; clientid=3; did=web_72314bf978cb158dd7034b2370d2ae70',
        'Host': 'www.kuaishou.com',
        'Origin': 'https://www.kuaishou.com',
        'Referer': 'https://www.kuaishou.com/short-video/3x6v3xmcjsd5cki?authorId=3xhv7zhkfr3rqag&streamSource=profile&area=profilexxnull',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
    }
    data = {
        "operationName": "visionProfilePhotoList",
        "query": "query visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {\n visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {\n result\n llsid\n webPageArea\n feeds {\n type\n author {\n id\n name\n following\n headerUrl\n headerUrls {\n cdn\n url\n __typename\n }\n __typename\n }\n tags {\n type\n name\n __typename\n }\n photo {\n id\n duration\n caption\n likeCount\n realLikeCount\n coverUrl\n coverUrls {\n cdn\n url\n __typename\n }\n photoUrls {\n cdn\n url\n __typename\n }\n photoUrl\n liked\n timestamp\n expTag\n animatedCoverUrl\n stereoType\n videoRatio\n profileUserTopPhoto\n __typename\n }\n canAddComment\n currentPcursor\n llsid\n status\n __typename\n }\n hostName\n pcursor\n __typename\n }\n}\n",
        "variables": {"userId": "3xhv7zhkfr3rqag", "pcursor": pcursor, "page": "detail", "webPageArea": "profilexxnull"}
    }
    rsp = requests.post(url=url, json=data, headers=headers)
    # json.loads(rsp.text) would also work; rsp.json() is the shortcut
    json_data = rsp.json()
    url_list = json_data['data']['visionProfilePhotoList']['feeds']
    pcursor = json_data['data']['visionProfilePhotoList']['pcursor']
    for key in url_list:
        # Video title
        title = key['photo']['caption']
        # Video URL
        new_url = key['photo']['photoUrl']
        # Fetch the video itself
        content_data = requests.get(url=new_url).content
        # Save it
        with open(f'video/{title}.mp4', mode='wb') as f:
            f.write(content_data)
        print(f'======================= Downloading Kuaishou video: {title} ==========================')
    if pcursor != "no_more":
        get_page(pcursor)


get_page("")
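A note on the tail of get_page: every page of results adds another recursive call, so a profile with more pages than Python's default recursion limit (roughly 1000) would eventually raise RecursionError. A hedged sketch of the same pagination as a plain loop, assuming get_page is changed to return the next cursor instead of calling itself:

# Sketch only: assumes get_page() is modified to end with
#     return json_data['data']['visionProfilePhotoList']['pcursor']
# instead of the recursive call.
pcursor = ""
while pcursor != "no_more":
    pcursor = get_page(pcursor)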

4. Lagou job listings
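Lagou renders its search results into a __NEXT_DATA__ JSON blob inside the page. The script extracts that blob with a regex, walks positionResult's result list, writes one CSV row per job, and saves each job description to its own .txt file.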


   
# coding=utf-8
"""
Author: gaojs
Date: 2022/4/3 17:58
"""
import csv
import json
import os.path
import re

import requests

if not os.path.exists('info/'):
    os.makedirs('info/')

f = open('info/jobs.csv', encoding='utf-8', mode='a', newline='')
csv_writer = csv.DictWriter(f, fieldnames=[
    'Job title',
    'Company',
    'City',
    'Education',
    'Experience',
    'Salary',
    'Address',
    'Detail page'
])
# Write the header row
csv_writer.writeheader()

for page in range(1, 11):
    url = f'https://www.lagou.com/wn/jobs?pn={page}&fromSearch=true&kd=python'
    headers = {
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/utrack/trackMid.html?f=https%3A%2F%2Fwww.lagou.com%2Fwn%2Fjobs%3Fpn%3D2%26fromSearch%3Dtrue%26kd%3Dpython&t=1648984113&_ti=1',
        'Cookie': 'user_trace_token=20211122110451-60eec88a-fbaf-47fd-9a53-188f3632144b; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1637550277; _ga=GA1.2.1219095688.1637550277; LGUID=20211122110452-94ffa347-2c46-4c2d-8429-b83e30e86693; RECOMMEND_TIP=true; __lg_stoken__=9ec31e7a3301bab4f215bd5f80c8af0ab0dc2b8ce81af654fe848cf33ad7c4f33d0748020b30281d56a28a756342ce5d42e6c218bcfd56dbf764c51686741cbaf14de987ef24; JSESSIONID=ABAAABAABEIABCIA45B6C458598FF70789BDFD5A4574786; WEBTJ-ID=20220403173842-17feeca7ea0402-090b1b6ee61841-a3e3164-3686400-17feeca7ea15f1; sensorsdata2015session=%7B%7D; X_HTTP_TOKEN=1ca92d1d8ffe4ecb3114898461b10fa2c7054519c6; X_MIDDLE_TOKEN=3e27b9a5a69f9fa78d5d2fe99174c9c5; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%229659966%22%2C%22%24device_id%22%3A%2217d459f2858540-02719bae0efae1-4343363-2073600-17d459f2859704%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%2298.0.4758.102%22%7D%2C%22first_id%22%3A%2217d459f2858540-02719bae0efae1-4343363-2073600-17d459f2859704%22%7D',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
    }
    rsp = requests.get(url=url, headers=headers)
    print(rsp.status_code)
    # The job data is embedded in the page as JSON:
    # <script id="__NEXT_DATA__" type="application/json">(.*?)</script>
    html_data = re.findall('<script id="__NEXT_DATA__" type="application/json">(.*?)</script>', rsp.text)[0]
    json_data = json.loads(html_data)
    result = json_data['props']['pageProps']['initData']['content']['positionResult']['result']
    for index in result:
        # Job description, with HTML line breaks stripped
        job_index = index['positionDetail'].replace('<br />', '').replace('<br>', '')
        href = f'https://www.lagou.com/wn/jobs/{index["positionId"]}.html'
        dict1 = {
            'Job title': index['positionName'],
            'Company': index['companyFullName'],
            'City': index['city'],
            'Education': index['education'],
            'Experience': index['workYear'],
            'Salary': index['salary'],  # the original repeated workYear here by mistake
            'Address': index['positionAddress'],
            'Detail page': href
        }
        csv_writer.writerow(dict1)
        title = index['positionName'] + index['companyFullName']
        # Strip characters that are illegal in file names
        new_title = re.sub(r'[\/?:"<>|]', '', title)
        # Use a separate handle so the CSV file object f is not shadowed
        with open('info/' + new_title + '.txt', 'w', encoding='utf-8') as txt_f:
            txt_f.write(job_index)
        print(dict1)

f.close()

5. Honor of Kings hero-skin HD wallpapers
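The Honor of Kings site publishes herolist.json with every hero's name (cname) and numeric id (ename). Skin titles are scraped from each hero's detail page, and the full-size skin images follow a predictable CDN URL pattern keyed by hero id and skin number.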


   
# coding=utf-8
"""
Author: gaojs
Date: 2022/4/2 13:05
"""
import os
import re

import requests

# The loop below saves into photo/, so create it first
# (the original relied on the folder already existing)
if not os.path.exists('photo/'):
    os.mkdir('photo/')

url = 'https://pvp.qq.com/web201605/js/herolist.json'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.55'
}
rsp = requests.get(url, headers=headers)
print(rsp.status_code)
for index in rsp.json():
    # Hero name and numeric id
    hero_name = index['cname']
    hero_id = index['ename']
    # Per-hero folders, if preferred:
    # filename = f'{hero_name}\\'
    # if not os.path.exists(filename):
    #     os.mkdir(filename)
    index_url = f'https://pvp.qq.com/web201605/herodetail/{hero_id}.shtml'
    rsp1 = requests.get(url=index_url, headers=headers)
    # rsp1.encoding = 'gbk'
    rsp1.encoding = rsp1.apparent_encoding  # auto-detect the encoding
    # Skin titles sit in a data-imgname attribute, '|'-separated with '&<n>' suffixes
    temp = '<ul class="pic-pf-list pic-pf-list3" data-imgname="(.*?)">'
    title_list = re.findall(temp, rsp1.text)[0]
    title_list = re.sub(r'&\d+', '', title_list).split('|')
    for num in range(1, len(title_list) + 1):
        img_url = f'https://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{hero_id}/{hero_id}-bigskin-{num}.jpg'
        img_title = title_list[num - 1]
        img_data = requests.get(url=img_url, headers=headers).content
        with open('photo/' + img_title + '.jpg', 'wb') as f:
            print(f'===================== Downloading {hero_name} skin: {img_title} ========================')
            f.write(img_data)

6. Photo gallery site (vmgirls.com)
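A single-gallery scraper for vmgirls.com: one regex collects the protocol-relative image links from the post, and a two-second sleep between downloads keeps the request rate modest.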


   
# coding=utf-8
"""
Author: gaojs
Date: 2022/3/26 12:17
"""
import os.path
import re
from time import sleep

import requests

dirname = 'photo/'
if not os.path.exists(dirname):
    os.mkdir(dirname)

url = 'https://www.vmgirls.com/17081.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.46'
}
res = requests.get(url, headers=headers)
print(res.status_code)
# <a href="(.*?)" alt=".*?" title=".*?">
# Only the parenthesised group is captured
url_list = re.findall('<a href="(.*?)" alt=".*?" title=".*?">', res.text)
print(url_list)
for urls in url_list:
    name = urls.split('/')[-1]
    # The links are protocol-relative, so prepend the scheme
    new_url = 'https:' + urls
    res_content = requests.get(url=new_url, headers=headers).content
    sleep(2)  # pause between downloads
    # Save the file
    with open('photo/' + name + '.jpeg', mode='wb') as f:
        f.write(res_content)

7. Meme images (fabiaoqing.com)
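A meme scraper for fabiaoqing.com. The list pages lazy-load their images, so the real URL sits in each <img> tag's data-original attribute; download_photo keeps whatever extension (.jpg or .gif) the source URL carries.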


   
# coding=utf-8
"""
Author: gaojs
Date: 2022/3/25 17:35
"""
import os
import re

import requests

# download_photo() writes into photo/, so create it first
# (the original assumed the folder already existed)
if not os.path.exists('photo/'):
    os.mkdir('photo/')


def download_photo(name, url):
    res = requests.get(url)
    print(res.status_code)
    # Keep the source file's extension (.jpg or .gif)
    suffix = url.split('.')[-1]
    with open('photo/' + name + '.' + suffix, 'wb') as f:
        f.write(res.content)


"""
Sample image URLs and a list page:
http://tva1.sinaimg.cn/large/6a2a7a61ly1gy5fd1pb7ij20iz0iz41l.jpg
http://tva1.sinaimg.cn/large/6a2a7a61ly1gy5fd3od4lg208w08wdvb.gif
https://www.fabiaoqing.com/bqb/lists/page/3.html
"""


def download_page(url):
    # e.g. url = 'https://www.fabiaoqing.com/biaoqing/lists/page/3.html'
    res1 = requests.get(url)
    temp = '<img class="ui image lazy" data-original="(.*?)" src="/Public/lazyload/img/transparent.gif" title="(.*?)" alt="(.*?)" style="max-height:188;margin: 0 auto"/>'
    result1 = re.findall(temp, res1.text)
    for img in result1:
        print(img)
        # img[0] is the data-original URL, img[1] the title
        download_photo(img[1], img[0])


def download_all_page():
    for page in range(1, 50):
        pages = 'https://www.fabiaoqing.com/biaoqing/lists/page/' + str(page) + '.html'
        download_page(pages)


download_all_page()

8. Kugou Music
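The Kugou ranking page embeds each track's Hash and album_id in inline JSON. Zipping the two lists together and querying the play/getdata endpoint on wwwapi.kugou.com returns the track name and a playable MP3 URL; the dfid and mid parameters appear to be session identifiers copied from the browser.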


   
# coding=utf-8
"""
Author: gaojs
Date: 2022/4/8 12:59
"""
import os.path
import re

import requests

if not os.path.exists('music/'):
    os.mkdir('music/')

url = 'https://www.kugou.com/yy/html/rank.html'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36'
}
rsp = requests.get(url, headers=headers)
# Track hashes and album ids are embedded in the page's inline JSON
hash_list = re.findall('"Hash":"(.*?)"', rsp.text)
album_list = re.findall('"album_id":(.*?),', rsp.text)
zip_list = zip(hash_list, album_list)
for hash1, album_id in zip_list:
    index_url = 'https://wwwapi.kugou.com/yy/index.php'
    data = {
        'r': 'play/getdata',
        'hash': hash1,
        'dfid': '34dlds4MjPyk0XgC5n0MobxL',
        'appid': '1014',
        'mid': 'fbcb28bbcbd1758696a1eb4363b645d6',
        'platid': '4',
        'album_id': album_id,
        '_': '1649395118742'
    }
    rsp1 = requests.get(url=index_url, params=data, headers=headers)
    audioname = rsp1.json()['data']['audio_name']
    playurl = rsp1.json()['data']['play_url']
    music_content = requests.get(url=playurl, headers=headers).content
    with open('music/' + audioname + '.mp3', 'wb') as f:
        print(f'************************* Downloading track: {audioname} *************************')
        f.write(music_content)
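All of the media downloads above pull the whole file into memory through .content before writing it out. For the larger MP4s and MP3s, requests can stream the body in chunks instead; a minimal sketch, where save_streaming is a hypothetical helper rather than anything from the original scripts:

import requests

def save_streaming(url, path, headers=None):
    """Download url to path in 8 KB chunks instead of one in-memory read."""
    with requests.get(url, headers=headers, stream=True) as rsp:
        rsp.raise_for_status()
        with open(path, 'wb') as f:
            for chunk in rsp.iter_content(chunk_size=8192):
                f.write(chunk)

# Usage: save_streaming(playurl, 'music/' + audioname + '.mp3', headers=headers)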

Article source: blog.csdn.net. Author: 懿曲折扇情. Copyright belongs to the original author; please contact the author for permission before reprinting.

Original link: blog.csdn.net/qq_41332844/article/details/126837335
