Master Python in Five Minutes (Part 1): Python Basics — Scraping Images, Text, Video, and Audio

小鲍侃java, posted on 2021/09/11 00:55:54
[Abstract] Scrape the weather forecast and store it in a database.

Scrape the weather forecast and store it in a database


  
#!/usr/bin/python
# -*- coding: utf-8 -*-
import pymysql
import requests
from bs4 import BeautifulSoup

# Connect to the local MySQL server
db = pymysql.connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='root',
    db='mysql',
    use_unicode=True,
    charset="utf8"
)
cursor = db.cursor()


def downdata(url):
    hd = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
    req = requests.get(url, headers=hd)
    # req.encoding = 'utf-8'
    soup = BeautifulSoup(req.text, 'html.parser')
    # Each <li class="ndays-item png-fix cf"> holds one day's forecast
    da_new = soup.find_all('li', class_='ndays-item png-fix cf')
    for da in da_new:
        day = da.find('div', class_='td td2').find('p', class_='p1')
        week = da.find('div', class_='td td2').find('p', class_='p2')
        wd = da.find('div', class_='td td5').find('p', class_='p1')
        fl = da.find('div', class_='td td5').find('p', class_='p2')
        f2 = da.find('div', class_='td td3').find('div')['title']
        print('Date: ' + day.text + ', weekday: ' + week.text + ', temperature: ' + wd.text
              + ', wind: ' + fl.text + ', weather: ' + f2)
        sql = "INSERT INTO tianiq(day1, week1, wd, fl, air) VALUES ('%s','%s','%s','%s','%s')" % (
            day.text, week.text, wd.text, fl.text, f2)
        print(sql)
        cursor.execute(sql)
        db.commit()


downdata('http://tianqi.sogou.com/shenyang/15/')
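
The %-formatted INSERT above breaks as soon as a scraped value contains a quote character and is open to SQL injection. A safer variant is to let pymysql fill in %s placeholders itself; the sketch below assumes the same table and columns, and save_row is a hypothetical helper reusing the cursor and connection from the script above:

def save_row(cursor, db, day, week, wd, fl, air):
    # pymysql substitutes the %s placeholders and escapes the values itself
    sql = "INSERT INTO tianiq(day1, week1, wd, fl, air) VALUES (%s, %s, %s, %s, %s)"
    cursor.execute(sql, (day, week, wd, fl, air))
    db.commit()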

Scrape comics


  
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import re
import urllib.request


def gethtml(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = urllib.request.Request(url=url, headers=headers)
    html = urllib.request.urlopen(req).read()
    return html


def getimg(html):
    # Match the src attribute of every .jpg on the page
    reg = r'src="(.*?\.jpg)"'
    img = re.compile(reg)
    html = html.decode('utf-8')  # Python 3: decode bytes before regex matching
    imglist = re.findall(img, html)
    x = 0
    for imgurl in imglist:
        urllib.request.urlretrieve(imgurl, 'D:%s.jpg' % x)
        x = x + 1


html = gethtml("http://www.tuku.cc/")
print(getimg(html))
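
The regex only captures whatever appears in src, and on many pages those are relative paths that urlretrieve cannot fetch on its own. A minimal sketch that resolves them against the page URL first (getimg_abs is a hypothetical variant, not part of the original):

import re
import urllib.request
from urllib.parse import urljoin


def getimg_abs(html, base_url):
    html = html.decode('utf-8')
    for x, imgurl in enumerate(re.findall(r'src="(.*?\.jpg)"', html)):
        # Turn relative paths such as /upload/a.jpg into absolute URLs
        full_url = urljoin(base_url, imgurl)
        urllib.request.urlretrieve(full_url, '%s.jpg' % x)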

Working with the database


  
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import pymysql

# Open the database connection
db = pymysql.connect("localhost", "root", "root", "mysql")
# Get a cursor with the cursor() method
cursor = db.cursor()
# SQL INSERT statement
sql = "INSERT INTO tianiq(day1, \
       week1, wd, fl, air) \
       VALUES ('Mac', 'Mohan', 'M', 'M', 'M')"
try:
    # Execute the SQL statement
    cursor.execute(sql)
    # Commit the transaction
    db.commit()
    print("insert ok")
except:
    # Roll back on error
    db.rollback()
# Close the database connection
db.close()
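
The bare except above swallows the reason for a failed insert. A common variant, shown here as a sketch using the same sql, cursor and db objects as above, reports the error and puts the cleanup in a finally block:

try:
    cursor.execute(sql)
    db.commit()
    print("insert ok")
except Exception as exc:
    # Roll back and report why the insert failed instead of failing silently
    db.rollback()
    print("insert failed:", exc)
finally:
    # Runs whether the insert succeeded or not
    db.close()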

Scrape video


  
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import requests


def download(url):
    dz = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
    # .content returns the raw bytes of the response
    req = requests.get(url, headers=dz).content
    # Binary files must be opened in 'wb' mode
    with open('qq.mp4', 'wb') as fp:
        fp.write(req)


download('http://video.study.163.com/edu-video/nos/mp4/2017/04/01/1006064693_cc2842f7dc8b410c96018ec618f37ef6_sd.mp4?ak=d2e3a054a6a144f3d98805f49b4f04439064ce920ba6837d89a32d0b0294ad3c1729b01fa6a0b5a3442ba46f5001b48b1ee2fb6240fc719e1b3940ed872a11f180acad2d0d7744336d03591c3586614af455d97e99102a49b825836de913910ef0837682774232610f0d4e39d8436cb9a153bdeea4a2bfbae357803dfb6768a742fe395e87eba0c3e30b7b64ef1be06585111bf60ea26d5dad1f891edd9e94a8e167e0b04144490499ffe31e0d97a0a1babcbd7d2e007d850cc3bf7aa697e8ff')
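
Reading .content pulls the whole video into memory before anything is written. For large files a streamed download is usually preferred; the sketch below uses requests' stream mode (download_stream and the chunk size are illustrative, not part of the original):

import requests


def download_stream(url, filename='qq.mp4'):
    hd = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
    with requests.get(url, headers=hd, stream=True) as resp:
        resp.raise_for_status()
        with open(filename, 'wb') as fp:
            # Write the response in 1 MB chunks instead of holding it all in memory
            for chunk in resp.iter_content(chunk_size=1024 * 1024):
                fp.write(chunk)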

Scrape audio


  
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import json
import requests


def download(url):
    hd = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
    req = requests.get(url, headers=hd)
    reps = req.text
    # The playlist endpoint returns JSON
    result = json.loads(reps)
    datap = result['data']['tracksAudioPlay']
    for index in datap:
        title = index['trackName']
        print(index['src'])
        # Fetch the raw audio bytes of each track
        data = requests.get(index['src'], headers=hd).content
        try:
            with open('%s.mp3' % title, 'wb') as f:
                f.write(data)
        except BaseException:
            print('1')


download('http://www.ximalaya.com/revision/play/album?albumId=7371372&pageNum=1&sort=-1&pageSize=30')
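
Track names from the playlist can contain characters that are not legal in file names (for example / or ?), which is one reason the bare except above fires. A small sketch that strips them before writing (safe_name is a hypothetical helper, not part of the original):

import re


def safe_name(title):
    # Replace characters Windows/Unix do not allow in file names
    return re.sub(r'[\\/:*?"<>|]', '_', title)

# e.g. open('%s.mp3' % safe_name(title), 'wb') instead of '%s.mp3' % title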

Scrape text


  
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import requests
from bs4 import BeautifulSoup


def get_h(url):
    response = requests.get(url)
    response.encoding = 'utf-8'
    return response.text


def get_c(html):
    soup = BeautifulSoup(html, 'html.parser')
    # getText() must be called; without the parentheses a bound method is returned
    joke_content = soup.select('div.content')[0].getText()
    return joke_content


url_joke = "https://www.qiushibaike.com"
html = get_h(url_joke)
joke_content = get_c(html)
print(joke_content)
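
select('div.content') matches every joke block on the page, but get_c only reads the first one. A sketch that collects them all (get_all_c is a hypothetical variant, not part of the original):

from bs4 import BeautifulSoup


def get_all_c(html):
    soup = BeautifulSoup(html, 'html.parser')
    # Return the text of every matched block, with surrounding whitespace stripped
    return [div.get_text(strip=True) for div in soup.select('div.content')]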

Scrape images


  
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
url = 'http://www.ivsky.com/'
start_html = requests.get(url, headers=headers)
Soup = BeautifulSoup(start_html.text, 'html.parser')
all_div = Soup.find_all('div', class_='syl_pic')
for lsd in all_div:
    # Follow each category link on the home page
    lsds = 'http://www.ivsky.com' + lsd.find('a')['href']
    title = lsd.find('a').get_text()
    print(lsds)
    html = requests.get(lsds, headers=headers)
    Soup_new = BeautifulSoup(html.text, 'html.parser')
    app = Soup_new.find_all('div', class_='il_img')
    for app_new in app:
        # Follow each thumbnail to its detail page and grab the full-size image
        apptwo = 'http://www.ivsky.com' + app_new.find('a')['href']
        htmlthree = requests.get(apptwo, headers=headers)
        Soupthree = BeautifulSoup(htmlthree.text, 'html.parser')
        appthree = Soupthree.find('div', class_='pic')
        appf = appthree.find('img')['src']
        name = appf[-9:-4]
        img = requests.get(appf, headers=headers)
        f = open(name + '.jpg', 'ab')  # media files must be opened in binary mode
        f.write(img.content)  # use .content for binary (media) data
        f.close()
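
The images above land in the current working directory under five-character names sliced from the URL, which can collide. A sketch that saves them into a dedicated folder instead (save_image and the folder name are assumptions, not part of the original):

import os


def save_image(content, name, save_dir='ivsky_pics'):
    # Create the target folder once; exist_ok avoids an error if it already exists
    os.makedirs(save_dir, exist_ok=True)
    with open(os.path.join(save_dir, name + '.jpg'), 'wb') as f:
        f.write(content)

# inside the inner loop: save_image(img.content, name)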

Scrape a novel


  
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from urllib import request
from bs4 import BeautifulSoup
import sys

if __name__ == "__main__":
    # Create the output txt file
    file = open('一念永恒.txt', 'w', encoding='utf-8')
    # Table-of-contents page of the novel
    target_url = 'http://www.biqukan.com/1_1094/'
    # User-Agent
    head = {}
    head['User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'
    target_req = request.Request(url=target_url, headers=head)
    target_response = request.urlopen(target_req)
    target_html = target_response.read().decode('gbk', 'ignore')
    # Build a BeautifulSoup object for the table of contents
    listmain_soup = BeautifulSoup(target_html, 'html.parser')
    # Search the document tree for the div whose class is "listmain"
    chapters = listmain_soup.find_all('div', class_='listmain')
    # Parse the search result with a second BeautifulSoup object
    download_soup = BeautifulSoup(str(chapters), 'html.parser')
    # Estimate the number of chapters
    numbers = (len(download_soup.dl.contents) - 1) / 2 - 8
    index = 1
    # Flag marking where the main-text chapters begin, so the
    # "latest chapters" links at the top of the list are skipped
    begin_flag = False
    # Walk every child node of the dl tag
    for child in download_soup.dl.children:
        # Skip bare newlines
        if child != '\n':
            # Once the heading of the main-text volume is found, raise the flag
            if child.string == u"《一念永恒》正文卷":
                begin_flag = True
            # Fetch each chapter link and download its content
            if begin_flag == True and child.a != None:
                download_url = "http://www.biqukan.com" + child.a.get('href')
                download_req = request.Request(url=download_url, headers=head)
                download_response = request.urlopen(download_req)
                download_html = download_response.read().decode('gbk', 'ignore')
                download_name = child.string
                soup_texts = BeautifulSoup(download_html, 'html.parser')
                texts = soup_texts.find_all(id='content', class_='showtxt')
                soup_text = BeautifulSoup(str(texts), 'html.parser')
                write_flag = True
                file.write(download_name + '\n\n')
                # Write the chapter text character by character; writing stops at the
                # first 'h', which on this site marks the trailing link after the text
                for each in soup_text.div.text.replace('\xa0', ''):
                    if each == 'h':
                        write_flag = False
                    if write_flag == True and each != ' ':
                        file.write(each)
                    if write_flag == True and each == '\r':
                        file.write('\n')
                file.write('\n\n')
                # Print download progress (multiplied by 100 to show a percentage)
                sys.stdout.write("Downloaded: %.3f%%" % float(100 * index / numbers) + '\r')
                sys.stdout.flush()
                index += 1
    file.close()

Source: baocl.blog.csdn.net, author: 小黄鸡1992. Copyright belongs to the original author; please contact the author before reposting.

Original link: baocl.blog.csdn.net/article/details/95478249
