使用多线程下载图片

本章节下载的图片网站

一、基本步骤讲解

  • 1、导包

    import os
    import time
    import random
    import shutil
    import re
    import queue
    import threading
    import requests
    from lxml import etree
    
  • 2、定义全局变量

    path = os.path.join(os.path.dirname(__file__), 'doutula')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
    }
    
  • 3、定义一个生产者获取图片地址

    class Procuder(threading.Thread):
        """
        定义一个生产者(抓取网页上的图片url)
        """
    
        def __init__(self, page_queue, img_queue, *args, **kwargs):
            super(Procuder, self).__init__(*args, **kwargs)
            self.page_queue = page_queue
            self.img_queue = img_queue
    
        def run(self):
            while True:
                if self.page_queue.empty():
                    break
                # 从队列中获取url地址然后添加到方法中
                url = self.page_queue.get()
                self.parse_page(url)
    
        def parse_page(self, url):
            """
            定义获取全部的图片url地址,追加到图片队列中
            :param url:
            :return:
            """
            response = requests.get(url=url, headers=headers)
            time.sleep(random.randrange(2))
            if response.status_code == 200:
                html = etree.HTML(response.text)
                imgs = html.xpath('//div[@class="random_article"]/div[@class="col-xs-6 col-sm-3"]/img')
                for img in imgs:
                    if img.get('class') == 'gif':
                        continue
                    img_url = img.xpath("./@data-original")[0]
                    alt = img.xpath('./@alt')[0]
                    # 正则替换中文中特殊字符
                    name = re.sub(re.compile('[,。\.\+]'), '', alt)
                    # 使用os模块中splitext截取文件的后缀名
                    filename = name + os.path.splitext(img_url)[1]
                    # 获取到了添加到图片队列中
                    self.img_queue.put((img_url, filename))
    
  • 4、定义一个消费者下载图片

    class Consumer(threading.Thread):
        """
        定义一个消费者类(从生产者中获取图片url地址下载)
        """
    
        def __init__(self, page_queue, img_queue, *args, **kwargs):
            super(Consumer, self).__init__(*args, **kwargs)
            self.page_queue = page_queue
            self.img_queue = img_queue
    
        def run(self):
            while True:
                if self.img_queue.empty() and self.page_queue.empty():
                    break
                # 从队列里面获取一个值
                img_url, filename = self.img_queue.get()
                time.sleep(random.randrange(2))
                response = requests.get(url=img_url, headers=headers)
                print('正在下载==>', img_url, '===', filename)
                with open(os.path.join(path, filename), 'wb') as fp:
                    fp.write(response.content)
    
  • 5、定义一个main()主运行方法

    def create_dir():
        """
        定义创建一个文件夹的方法(有就删除再创建没用就删除)
        :return:
        """
        if os.path.exists(path):
            shutil.rmtree(path)
            os.makedirs(path)
        else:
            os.makedirs(path)
    
    def main():
        """
        创建一个运行的函数
        :return:
        """
        # 先创建文件夹
        create_dir()
    
        page_queue = queue.Queue(100)
        img_queue = queue.Queue(1000)
    
        # 下载图片为1-5页
        for x in range(1, 5):
            url = 'http://www.doutula.com/article/list/?page={0}'.format(x)
            page_queue.put(url)
    
        # 开启5个生产者与5个消费者线程
        for x in range(5):
            c = Consumer(page_queue, img_queue)
            p = Procuder(page_queue, img_queue)
            c.start()
            p.start()
    
    if __name__ == '__main__':
        main()
    

二、全部的代码

import os
import time
import random
import shutil
import re
import queue
import threading
import requests
from lxml import etree

path = os.path.join(os.path.dirname(__file__), 'doutula')
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
}

class Procuder(threading.Thread):
    """
    定义一个生产者(抓取网页上的图片url)
    """

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Procuder, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            if self.page_queue.empty():
                break
            # 从队列中获取url地址然后添加到方法中
            url = self.page_queue.get()
            self.parse_page(url)

    def parse_page(self, url):
        """
        定义获取全部的图片url地址,追加到图片队列中
        :param url:
        :return:
        """
        response = requests.get(url=url, headers=headers)
        time.sleep(random.randrange(2))
        if response.status_code == 200:
            html = etree.HTML(response.text)
            imgs = html.xpath('//div[@class="random_article"]/div[@class="col-xs-6 col-sm-3"]/img')
            for img in imgs:
                if img.get('class') == 'gif':
                    continue
                img_url = img.xpath("./@data-original")[0]
                alt = img.xpath('./@alt')[0]
                # 正则替换中文中特殊字符
                name = re.sub(re.compile('[,。\.\+]'), '', alt)
                # 使用os模块中splitext截取文件的后缀名
                filename = name + os.path.splitext(img_url)[1]
                # 获取到了添加到图片队列中
                self.img_queue.put((img_url, filename))

class Consumer(threading.Thread):
    """
    定义一个消费者类(从生产者中获取图片url地址下载)
    """

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            if self.img_queue.empty() and self.page_queue.empty():
                break
            # 从队列里面获取一个值
            img_url, filename = self.img_queue.get()
            time.sleep(random.randrange(2))
            response = requests.get(url=img_url, headers=headers)
            print('正在下载==>', img_url, '===', filename)
            with open(os.path.join(path, filename), 'wb') as fp:
                fp.write(response.content)

def create_dir():
    """
    定义创建一个文件夹的方法(有就删除再创建没用就删除)
    :return:
    """
    if os.path.exists(path):
        shutil.rmtree(path)
        os.makedirs(path)
    else:
        os.makedirs(path)

def main():
    """
    创建一个运行的函数
    :return:
    """
    # 先创建文件夹
    create_dir()

    page_queue = queue.Queue(100)
    img_queue = queue.Queue(1000)

    # 下载图片为1-5页
    for x in range(1, 5):
        url = 'http://www.doutula.com/article/list/?page={0}'.format(x)
        page_queue.put(url)

    # 开启5个生产者与5个消费者线程
    for x in range(5):
        c = Consumer(page_queue, img_queue)
        p = Procuder(page_queue, img_queue)
        c.start()
        p.start()

if __name__ == '__main__':
    main()

results matching ""

    No results matching ""