if __name__ == '__main__':
    # Single-threaded baseline: fetch the home page, then request every
    # item returned by soup_url one after another, and report the elapsed
    # wall-clock time.
    # NOTE(review): req_url and soup_url are defined outside this snippet;
    # presumably req_url fetches a URL and soup_url extracts links from the
    # fetched page -- confirm against their definitions.
    start_time = time.time()
    url = 'https://www.teamssix.com'
    html = req_url(url)
    home_page = soup_url(html)
    for i in home_page:
        req_url(i)
    end_time = time.time()
    print('\n单线程:', end_time - start_time)
最终运行结果如下:
1
单线程: 29.181440114974976
单线程花费了 29 秒的时间,接下来使用多进程测试一下
0x04 测试多进程爬取方法
通过学习发现多进程的用法和多线程还是挺相似的,所以就直接放代码吧,感兴趣的可以看看参考文章。
1 2 3 4 5 6 7 8 9 10 11 12
if __name__ == '__main__':
    # Multi-process run: same workload as the single-threaded version, but
    # each req_url call is submitted to a pool of 4 worker processes.
    # NOTE(review): req_url and soup_url are defined outside this snippet.
    start_time = time.time()
    url = 'https://www.teamssix.com'
    pool = Pool(4)
    home_page = soup_url(req_url(url))
    for i in home_page:
        # Results are not collected; only the total elapsed time matters.
        pool.apply_async(req_url, args=(i,))
    # close() stops new submissions; join() waits for all queued tasks.
    pool.close()
    pool.join()
    end_time = time.time()
    print('\n多进程:', end_time - start_time)
最终运行结果如下:
1
多进程: 12.674117088317871
多进程仅用了 12 秒就完成了任务,经过多次测试,发现使用多进程基本上能比单线程快2倍以上。
为了看到多线程与多进程的差距,这里使用多线程处理了一下上面的操作,代码如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
if __name__ == '__main__':
    # Multi-threaded run: one thread per item returned by soup_url, all
    # started together and then joined before the elapsed time is printed.
    # NOTE(review): req_url and soup_url are defined outside this snippet.
    start_time = time.time()
    url = 'https://www.teamssix.com'
    thread_list = []
    home_page = soup_url(req_url(url))
    for i in home_page:
        t = threading.Thread(target=req_url, args=(i,))
        thread_list.append(t)
    # Start every thread first, then wait for all of them to finish.
    for t in thread_list:
        t.start()
    for t in thread_list:
        t.join()
    end_time = time.time()
    print('\n多线程:', end_time - start_time)
import threading
import time  # used to time each run
from multiprocessing import Pool
def math(i: int) -> None:
    """Compute ``2 ** i`` and discard the result.

    The exponentiation is the whole point: it is the workload being timed.
    Nothing is returned.
    """
    # The value is intentionally unused; only the computation cost matters.
    result2 = 2 ** i  # perform the power operation
if __name__ == '__main__':
    # Benchmark the same power workload three ways -- sequentially, in a
    # process pool, and in threads -- printing the elapsed wall-clock time
    # for each.
    # The same five exponents (0, 250000000, ..., 1000000000) are used for
    # every variant so the timings are comparable.
    exponents = range(0, 1000000001, 250000000)

    # Single-threaded run
    start_time = time.time()
    for i in exponents:
        math(i)
    end_time = time.time()
    print('\n单线程:', end_time - start_time)

    # Multi-process run
    start_time = time.time()
    pool = Pool(4)
    for i in exponents:
        # Results are not collected; only the total elapsed time matters.
        pool.apply_async(math, args=(i,))
    # close() stops new submissions; join() waits for all queued tasks.
    pool.close()
    pool.join()
    end_time = time.time()
    print('\n多进程:', end_time - start_time)

    # Multi-threaded run
    start_time = time.time()
    thread_list = []
    for i in exponents:
        t = threading.Thread(target=math, args=(i,))
        thread_list.append(t)
    # Start every thread first, then wait for all of them to finish.
    for t in thread_list:
        t.start()
    for t in thread_list:
        t.join()
    end_time = time.time()
    print('\n多线程:', end_time - start_time)