使用多线程下载图片
本章节下载的图片网站
一、基本步骤讲解
1、导包
import os import time import random import shutil import re import queue import threading import requests from lxml import etree
2、定义全局变量
path = os.path.join(os.path.dirname(__file__), 'doutula') headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36', }
3、定义一个生产者获取图片地址
class Procuder(threading.Thread): """ 定义一个生产者(抓取网页上的图片url) """ def __init__(self, page_queue, img_queue, *args, **kwargs): super(Procuder, self).__init__(*args, **kwargs) self.page_queue = page_queue self.img_queue = img_queue def run(self): while True: if self.page_queue.empty(): break # 从队列中获取url地址然后添加到方法中 url = self.page_queue.get() self.parse_page(url) def parse_page(self, url): """ 定义获取全部的图片url地址,追加到图片队列中 :param url: :return: """ response = requests.get(url=url, headers=headers) time.sleep(random.randrange(2)) if response.status_code == 200: html = etree.HTML(response.text) imgs = html.xpath('//div[@class="random_article"]/div[@class="col-xs-6 col-sm-3"]/img') for img in imgs: if img.get('class') == 'gif': continue img_url = img.xpath("./@data-original")[0] alt = img.xpath('./@alt')[0] # 正则替换中文中特殊字符 name = re.sub(re.compile('[,。\.\+]'), '', alt) # 使用os模块中splitext截取文件的后缀名 filename = name + os.path.splitext(img_url)[1] # 获取到了添加到图片队列中 self.img_queue.put((img_url, filename))
4、定义一个消费者下载图片
class Consumer(threading.Thread): """ 定义一个消费者类(从生产者中获取图片url地址下载) """ def __init__(self, page_queue, img_queue, *args, **kwargs): super(Consumer, self).__init__(*args, **kwargs) self.page_queue = page_queue self.img_queue = img_queue def run(self): while True: if self.img_queue.empty() and self.page_queue.empty(): break # 从队列里面获取一个值 img_url, filename = self.img_queue.get() time.sleep(random.randrange(2)) response = requests.get(url=img_url, headers=headers) print('正在下载==>', img_url, '===', filename) with open(os.path.join(path, filename), 'wb') as fp: fp.write(response.content)
5、定义一个
main()
主运行方法def create_dir(): """ 定义创建一个文件夹的方法(有就删除再创建没用就删除) :return: """ if os.path.exists(path): shutil.rmtree(path) os.makedirs(path) else: os.makedirs(path) def main(): """ 创建一个运行的函数 :return: """ # 先创建文件夹 create_dir() page_queue = queue.Queue(100) img_queue = queue.Queue(1000) # 下载图片为1-5页 for x in range(1, 5): url = 'http://www.doutula.com/article/list/?page={0}'.format(x) page_queue.put(url) # 开启5个生产者与5个消费者线程 for x in range(5): c = Consumer(page_queue, img_queue) p = Procuder(page_queue, img_queue) c.start() p.start() if __name__ == '__main__': main()
二、全部的代码
import os
import time
import random
import shutil
import re
import queue
import threading
import requests
from lxml import etree
path = os.path.join(os.path.dirname(__file__), 'doutula')
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
}
class Procuder(threading.Thread):
"""
定义一个生产者(抓取网页上的图片url)
"""
def __init__(self, page_queue, img_queue, *args, **kwargs):
super(Procuder, self).__init__(*args, **kwargs)
self.page_queue = page_queue
self.img_queue = img_queue
def run(self):
while True:
if self.page_queue.empty():
break
# 从队列中获取url地址然后添加到方法中
url = self.page_queue.get()
self.parse_page(url)
def parse_page(self, url):
"""
定义获取全部的图片url地址,追加到图片队列中
:param url:
:return:
"""
response = requests.get(url=url, headers=headers)
time.sleep(random.randrange(2))
if response.status_code == 200:
html = etree.HTML(response.text)
imgs = html.xpath('//div[@class="random_article"]/div[@class="col-xs-6 col-sm-3"]/img')
for img in imgs:
if img.get('class') == 'gif':
continue
img_url = img.xpath("./@data-original")[0]
alt = img.xpath('./@alt')[0]
# 正则替换中文中特殊字符
name = re.sub(re.compile('[,。\.\+]'), '', alt)
# 使用os模块中splitext截取文件的后缀名
filename = name + os.path.splitext(img_url)[1]
# 获取到了添加到图片队列中
self.img_queue.put((img_url, filename))
class Consumer(threading.Thread):
"""
定义一个消费者类(从生产者中获取图片url地址下载)
"""
def __init__(self, page_queue, img_queue, *args, **kwargs):
super(Consumer, self).__init__(*args, **kwargs)
self.page_queue = page_queue
self.img_queue = img_queue
def run(self):
while True:
if self.img_queue.empty() and self.page_queue.empty():
break
# 从队列里面获取一个值
img_url, filename = self.img_queue.get()
time.sleep(random.randrange(2))
response = requests.get(url=img_url, headers=headers)
print('正在下载==>', img_url, '===', filename)
with open(os.path.join(path, filename), 'wb') as fp:
fp.write(response.content)
def create_dir():
"""
定义创建一个文件夹的方法(有就删除再创建没用就删除)
:return:
"""
if os.path.exists(path):
shutil.rmtree(path)
os.makedirs(path)
else:
os.makedirs(path)
def main():
"""
创建一个运行的函数
:return:
"""
# 先创建文件夹
create_dir()
page_queue = queue.Queue(100)
img_queue = queue.Queue(1000)
# 下载图片为1-5页
for x in range(1, 5):
url = 'http://www.doutula.com/article/list/?page={0}'.format(x)
page_queue.put(url)
# 开启5个生产者与5个消费者线程
for x in range(5):
c = Consumer(page_queue, img_queue)
p = Procuder(page_queue, img_queue)
c.start()
p.start()
if __name__ == '__main__':
main()