- 微信
- 微博
  
  分享文章到微博
- 复制链接
  
  复制链接到剪贴板

python学习笔记之爬虫(四) 进程、线程、协程丨【生长吧！Python】

菜鸟级攻城狮发表于 2021/07/06 21:25:22 2021/07/06

【摘要】 python学习笔记之爬虫(四) 进程、线程、协程

### 第四章 ###
''' 概述 '''

'''
本章涉及内容：
    1、快速学会多线程
    2、快速学会多进程
    3、线程池和进程池
    4、新发地实战
    5、协程
    6、多任务异步协程实现
    7、aiohttp模块详解
    8、小说实战
    9、综合训练-抓取一部电影
'''

''' 多线程 '''

# 进程是资源单位，每一个进程至少要有一个线程
# 线程是执行单位
from threading import Thread

''' 方式1 '''
def func(name):
    """Print 100 numbered lines tagged with ``name``.

    Used as a thread target. Each line has the form ``func <name> <i>``
    for ``i`` running from 0 to 99.
    """
    step = 0
    while step < 100:
        print('func', name, step)
        step += 1

if __name__ == '__main__':
    # Create one worker thread per name and start it immediately. start()
    # only marks the thread as ready to run; when it actually executes is
    # decided by the scheduler.
    for dish in ('胡辣汤', '麻辣烫'):
        worker = Thread(target=func, args=(dish,))
        worker.start()

    # The main thread keeps printing while the workers run.
    main_step = 0
    while main_step < 100:
        print('main', main_step)
        main_step += 1

''' 方式2 '''
class MyThread(Thread):
    """Thread subclass that prints 100 numbered lines tagged with its name."""

    def __init__(self, name):
        Thread.__init__(self)
        # Thread exposes ``name`` as a settable attribute.
        self.name = name

    def run(self):
        """Thread body; this fixed method name is what runs once the thread starts."""
        count = 0
        while count < 100:
            print('子线程', self.name, count)
            count += 1

if __name__ == '__main__':
    # Build a thread object per name and start each one right away.
    for dish in ('鸡公煲', '小火锅'):
        worker = MyThread(dish)
        worker.start()

    # Meanwhile the main thread prints its own counter.
    main_step = 0
    while main_step < 100:
        print('主线程', main_step)
        main_step += 1

''' 多进程 '''

from multiprocessing import Process

''' 方式1 '''
def func(name):
    """Print 10000 numbered lines tagged with ``name``.

    Used as a process target. Each line has the form ``子进程 <name> <i>``.
    """
    step = 0
    while step < 10000:
        print('子进程', name, step)
        step += 1

if __name__ == '__main__':
    # Spawn one child process per name and start it immediately.
    for snack in ('热干面', '手抓饼'):
        child = Process(target=func, args=(snack,))
        child.start()

    # The parent process prints its own counter in the meantime.
    main_step = 0
    while main_step < 10000:
        print('主进程', main_step)
        main_step += 1

''' 方式2 '''
class MyProcess(Process):
    """Process subclass that prints 10000 numbered lines tagged with its name."""

    def __init__(self, name):
        super().__init__()
        # Process exposes ``name`` as a settable attribute.
        self.name = name

    def run(self):
        """Process body; executed in the child once the process is started."""
        for i in range(10000):
            # This runs in a child *process*, so label it as one
            # (the original label said "child thread").
            print('子进程', self.name, i)

if __name__ == '__main__':
    # Start two child processes, each printing under its own name.
    p1 = MyProcess('臭豆腐')
    p1.start()

    p2 = MyProcess('螺狮粉')
    p2.start()

    # This loop runs in the parent *process*, so label it as one
    # (the original label said "main thread").
    for i in range(10000):
        print('主进程', i)

''' 线程池和进程池 '''

# 线程池：一次性开辟一些线程，用户直接给线程池提交任务，线程任务的调度交给线程池来完成
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

def func(name):
    """Print 1000 lines of the form ``<name> <i>`` for ``i`` from 0 to 999."""
    step = 0
    while step < 1000:
        print(name, step)
        step += 1

if __name__ == '__main__':
    # A pool of 50 worker threads receives 100 tasks.
    task_names = [f'线程{i}' for i in range(100)]
    with ThreadPoolExecutor(50) as pool:
        for task_name in task_names:
            pool.submit(func, name=task_name)
    # Leaving the with-block waits until every submitted task has finished,
    # so this line is printed last.
    print('End!')
'''
说明：线程池中共有50个线程，提交的100个任务由这50个线程复用执行，每个任务执行1000次打印
'''

''' 线程池案例：抓取新发地菜价 '''

# 1、提取单个页面的数据
# 2、使用线程池，多个页面同时抓取
import csv
import threading
from concurrent.futures import ThreadPoolExecutor

import requests
from lxml import etree

# Output file shared by all page downloads; newline='' lets the csv module
# control line endings itself.
wstream = open('菜价all.csv', 'w', encoding='utf-8', newline='')
# Module-level writer that download_page() uses to append rows to the file.
csvwriter = csv.writer(wstream)

# Serialises access to the shared module-level ``csvwriter``: download_page()
# runs on many pool threads at once and one csv writer must not be fed rows
# from several threads at the same time.
_csv_lock = threading.Lock()


def _clean_cell(text):
    """Replace slash, backslash and pipe characters with spaces, then strip leading spaces."""
    # NOTE(review): ``.replace(' ', ' ')`` is a no-op as written; the first
    # argument was presumably a non-breaking space originally -- confirm.
    return (text.replace(r'\\', ' ').replace('/', ' ')
                .replace('\\', ' ').replace(' ', ' ')
                .replace('|', ' ').lstrip(' '))


def download_page(url, headers, timeout=10):
    """Fetch one price-list page and append its table rows to the shared CSV.

    Args:
        url: Address of the page to download.
        headers: HTTP headers passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block a pool worker forever.

    Raises:
        IndexError: If the expected table is not found in the page.
        Exception: Whatever ``requests.get`` raises on network failure or timeout.
    """
    # Fetch the page source.
    resp = requests.get(url, headers=headers, timeout=timeout)
    html = etree.HTML(resp.text)
    # Locate the data table; [0] raises IndexError when the XPath matches nothing.
    table = html.xpath('/html/body/div[2]/div[4]/div[1]/table')[0]
    # Skip the first <tr> (header row): keep only rows at position > 1.
    trs = table.xpath('./tr[position()>1]')
    # Build every row as a concrete list *before* touching the writer. Handing
    # the writer a lazy generator would run Python code in the middle of a
    # row, which lets another thread interleave its own row.
    rows = [[_clean_cell(item) for item in tr.xpath('./td/text()')] for tr in trs]
    with _csv_lock:
        csvwriter.writerows(rows)
    print(f'{url} 提取完毕!!!')

if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3756.400 QQBrowser/10.5.4043.400',
    }
    # Downloading the pages one after another in a plain loop is extremely
    # slow, so the work is handed to a thread pool instead.
    try:
        futures = {}
        with ThreadPoolExecutor(50) as t:
            for i in range(1, 200):  # 199 pages * 20 rows = 3980 rows
                url = f'http://www.xinfadi.com.cn/marketanalysis/0/list/{i}.shtml'
                # Submit the download task to the pool, remembering its URL.
                futures[t.submit(download_page, url, headers)] = url
        # Leaving the with-block has waited for all tasks. An exception raised
        # inside a worker is stored on its future, so report it here instead
        # of losing it silently.
        for future, url in futures.items():
            error = future.exception()
            if error is not None:
                print(f'{url} 下载失败: {error!r}')
    finally:
        # Close the shared CSV file even if something above raised.
        wstream.close()
    print('全部下载完毕!!!')

''' 多任务异步协程 '''

'''
协程有4种方式

await 表达式：
    挂起 coroutine 的执行以等待一个 awaitable 对象。
    只能在 coroutine function 内部使用。
语法格式：
    await_expr ::= "await" primary
'''
import asyncio
import time

async def func():
    """Coroutine that prints a single greeting line."""
    greeting = '你好啊，我叫赛利亚'
    print(greeting)

if __name__ == '__main__':
    # Calling an async function does not execute its body; it returns a
    # coroutine object (printing it shows something like
    # <coroutine object func at 0x...>). Running it needs the asyncio module.
    asyncio.run(func())
    # 协程程序运行需要asyncio模块支持

'''方式1 (不推荐) '''
async def func1():
    """Print a greeting, suspend for 3 seconds, then print a second line."""
    before, after = '你好啊，我叫夏洛克1', '你好啊，我叫夏洛克2'
    print(before)
    # A blocking call such as time.sleep(3) or requests.get() here would stall
    # the whole program. Awaiting asyncio.sleep() suspends only this coroutine
    # so another task can run in the meantime.
    await asyncio.sleep(3)
    print(after)

async def func2():
    """Print a greeting, suspend for 2 seconds, then print a second line."""
    before, after = '你好啊，我叫马冬梅1', '你好啊，我叫马冬梅2'
    print(before)
    # Non-blocking pause; time.sleep(2) would block instead.
    await asyncio.sleep(2)
    print(after)

async def func3():
    """Print a greeting, suspend for 4 seconds, then print a second line."""
    before, after = '你好啊，我叫孙红雷1', '你好啊，我叫孙红雷2'
    print(before)
    # Non-blocking pause; time.sleep(4) would block instead.
    await asyncio.sleep(4)
    print(after)

if __name__ == '__main__':
    # Writing the coroutine plumbing directly in the entry point like this is
    # generally discouraged because it clutters the main code.

    async def _run_all():
        """Run func1, func2 and func3 concurrently and wait for all of them."""
        # asyncio.wait() stopped accepting bare coroutine objects (deprecated
        # in Python 3.8, TypeError since 3.11), so each coroutine is wrapped
        # in a Task first. create_task() needs a running event loop, hence
        # this wrapper coroutine.
        tasks = [asyncio.create_task(coro()) for coro in (func1, func2, func3)]
        await asyncio.wait(tasks)

    t1 = time.time()
    # Start all the coroutines together.
    asyncio.run(_run_all())
    t2 = time.time()
    # With blocking time.sleep() calls in the functions the run is effectively
    # sequential (about 9 s measured). With await asyncio.sleep() the waits
    # overlap and the total is about 4 s.
    print(t2 - t1)

''' 方式2 (推荐) '''
async def func1():
    """Print a greeting, suspend for 3 seconds, then print a second line."""
    before, after = '你好啊，我叫夏洛克1', '你好啊，我叫夏洛克2'
    print(before)
    await asyncio.sleep(3)
    print(after)

async def func2():
    """Print a greeting, suspend for 2 seconds, then print a second line."""
    before, after = '你好啊，我叫马冬梅1', '你好啊，我叫马冬梅2'
    print(before)
    await asyncio.sleep(2)
    print(after)

async def func3():
    """Print a greeting, suspend for 4 seconds, then print a second line."""
    before, after = '你好啊，我叫孙红雷1', '你好啊，我叫孙红雷2'
    print(before)
    await asyncio.sleep(4)
    print(after)

# 用的最多的写法：定义一个异步协程的main()函数
async def main():
    """Run func1, func2 and func3 concurrently and wait until all have finished.

    The simpler alternative -- ``await func1()``, then ``await func2()``, then
    ``await func3()`` -- would run them one after another. Wrapping each
    coroutine in a Task (the form used from Python 3.8 on) is the recommended
    way to let them overlap.
    """
    pending = []
    for factory in (func1, func2, func3):
        pending.append(asyncio.create_task(factory()))
    await asyncio.wait(pending)

if __name__ == '__main__':
    started = time.time()
    # Launch all the coroutines together through main().
    asyncio.run(main())
    finished = time.time()
    # About 4 seconds was measured: the waits overlap.
    print(finished - started)

''' 在爬虫领域的应用
该方法效率极高！
现在一切的操作是在单线程的情况下执行的。
'''
async def download(url):
    """Simulate downloading ``url``: print a start line, pause 2 seconds, print a done line.

    ``url`` is accepted but not used by the simulation.
    """
    start_msg, done_msg = '开始下载...', '下载完成!!!'
    print(start_msg)
    # Stand-in for the network request. It must be an awaitable such as
    # asyncio.sleep(); a synchronous call like requests.get() would block
    # instead of letting the other tasks run.
    await asyncio.sleep(2)
    print(done_msg)

async def main():
    """Start one download task per URL and wait until all of them finish."""
    targets = [
        'http://www.baidu.com',
        'http://www.bilibili.com',
        'http://www.163.com',
    ]

    # One Task per URL (a list comprehension would work equally well).
    jobs = []
    for target in targets:
        job = asyncio.create_task(download(target))
        jobs.append(job)
    # Wait for the whole batch.
    await asyncio.wait(jobs)

if __name__ == '__main__':
    # Build the main() coroutine and run it to completion.
    entry_point = main()
    asyncio.run(entry_point)
''' 执行结果
开始下载...
开始下载...
开始下载...
下载完成!!!
下载完成!!!
下载完成!!!
'''

''' 异步http请求：aiohttp模块 '''

'''
requests.get() 是同步请求 -> 异步请求使用aiohttp

安装aiohttp模块： asynciohttp缩写
    pip install aiohttp
    pip install aiofiles

异步爬虫，运行效率极高！！！
'''
import asyncio
import aiohttp
import aiofiles
# 参考链接：https://pypi.org/project/aiofiles/

# Image URLs to download; aiodownload() names each saved file after the last
# path segment of its URL.
urls = [
    'http://kr.shanghai-jiuxin.com/file/2020/1031/a2c58d6d726fb7ef29390becac5d8643.jpg',
    'http://kr.shanghai-jiuxin.com/file/2020/1031/191468637cab2f0206f7d1d9b175ac81.jpg',
    'http://kr.shanghai-jiuxin.com/file/2020/1031/d7de3f9faf1e0ecdea27b73139fc8d3a.jpg',
]
# Request headers sent with every download (a browser User-Agent string).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3756.400 QQBrowser/10.5.4043.400'
}

async def aiodownload(url):
    """Download ``url`` and save the response body to a local file.

    The file is named after the text following the last ``/`` in ``url`` and
    is written to the current working directory. The request is sent with the
    module-level ``headers``.
    """
    # The session plays the role that the ``requests`` module plays in
    # synchronous code.
    async with aiohttp.ClientSession() as session:
        # Split once from the right on '/' and keep the trailing piece.
        pieces = url.rsplit('/', 1)
        img_name = pieces[1]
        # Equivalent of ``resp = requests.get(...)``. On the response:
        #   await resp.content.read()  ~ resp.content
        #   await resp.text()          ~ resp.text
        #   await resp.json()          ~ resp.json()
        async with session.get(url, headers=headers) as resp:
            # Save through aiofiles so the file write is awaited too. (A plain
            # ``with open(...)`` also works, but the read must still be awaited.)
            async with aiofiles.open(img_name, 'wb') as out_file:
                # read() is a coroutine; without ``await`` the write fails with
                # "TypeError: a bytes-like object is required, not 'coroutine'".
                payload = await resp.content.read()
                await out_file.write(payload)

        print(img_name, 'Done!!!')

async def main():
    """Download every URL in the module-level ``urls`` list concurrently.

    Returns once all download tasks have finished. An exception raised by a
    download stays on its task and is not re-raised here.
    """
    # asyncio.wait() stopped accepting bare coroutine objects (deprecated in
    # Python 3.8, TypeError since 3.11), so wrap each download in a Task.
    tasks = [asyncio.create_task(aiodownload(url)) for url in urls]

    # asyncio.wait() raises ValueError on an empty collection, so skip the
    # wait when there is nothing to download.
    if tasks:
        await asyncio.wait(tasks)

if __name__ == '__main__':
    # Build the main() coroutine and run it to completion.
    entry_point = main()
    asyncio.run(entry_point)

【生长吧！Python】有奖征文火热进行中：https://bbs.huaweicloud.com/blogs/278897

点赞
收藏
关注作者

0/1000

抱歉，系统识别当前为高风险访问，暂不支持该操作

全部回复

上滑加载中

设置昵称

在此一键设置昵称，即可参与社区互动！

*长度不超过10个汉字或20个英文字符，设置后3个月内不可修改。

确认取消

加入云驻计划，成为创作者

华为云周边好礼
免费体验产品
特殊身份标识
线下官方门票
内部专家零距离
与10000+优质创作者共同成长

立即加入

python学习笔记之爬虫(四) 进程、线程、协程丨【生长吧！Python】

全部回复

设置昵称

关于作者

目录

加入云驻计划，成为创作者

python学习笔记之爬虫(四) 进程、线程、协程 丨【生长吧！Python】

全部回复

设置昵称

关于作者

目录

加入云驻计划，成为创作者

推荐阅读

相关产品

python学习笔记之爬虫(四) 进程、线程、协程丨【生长吧！Python】