自己编写下载中间件

本章节介绍几个在实际项目中常用的下载中间件

一、模拟登录时遇到需要用 POST 提交 Ajax(payload)请求的情况

  • 1、具体代码(参考作用)

    import json
    import requests
    from scrapy.http import HtmlResponse
    
    class PayLoadRequestMiddleware(object):
        """Downloader middleware that sends JSON-payload POST requests via ``requests``.

        A request opts in by setting ``meta['payloadFlag']`` to a truthy value.
        The following ``request.meta`` keys are then read:

        * ``headers``          -- dict of HTTP headers to send (default ``{}``)
        * ``payloadData``      -- object serialized with ``json.dumps`` as the body
        * ``download_timeout`` -- timeout in seconds (default ``25``)
        * ``dont_redirect``    -- when truthy, redirects are NOT followed
        """

        @staticmethod
        def _allow_redirects(meta):
            """Return whether ``requests`` may follow redirects for this request.

            ``dont_redirect`` is a negative flag (truthy means "do not follow
            redirects"), whereas ``requests`` expects the positive
            ``allow_redirects``; the value therefore has to be inverted.
            """
            return not meta.get('dont_redirect', False)

        def process_request(self, request, spider):
            """Handle payload requests here; let every other request pass through.

            :param request: the request being processed
            :param spider: the spider that issued the request (unused)
            :return: ``None`` for requests without ``payloadFlag`` so normal
                downloading continues; otherwise an ``HtmlResponse`` with
                status 200 and the response body on a 2xx answer, or an empty
                ``HtmlResponse`` with status 500 for any other status code.
            """
            meta = request.meta
            if not meta.get('payloadFlag', False):
                return None

            headers = meta.get('headers', {})
            timeout = meta.get('download_timeout', 25)
            dumpJsonData = json.dumps(meta.get('payloadData', {}))
            print(f"dumpJsonData = {dumpJsonData}")
            # NOTE: requests.post is a synchronous, blocking call, which
            # noticeably slows the crawl down.
            res = requests.post(
                request.url,
                data=dumpJsonData,
                headers=headers,
                timeout=timeout,
                allow_redirects=self._allow_redirects(meta),
            )
            if 199 < res.status_code < 300:
                return HtmlResponse(url=request.url, body=res.content, request=request,
                                    encoding='utf-8', status=200)
            # Every non-2xx answer is reported as a 500 response.
            return HtmlResponse(url=request.url, status=500, request=request)
    

二、设置随机请求头

  • 1、获取各个浏览器请求头
  • 2、测试当前浏览器的请求头
  • 3、具体代码

    import random
    
    class RandomUserAgent(object):
        """Downloader middleware that sets a random User-Agent on every request."""

        # Candidate User-Agent strings. Each entry appears exactly once so that
        # random.choice picks uniformly among them.
        USER_AGENTS = [
            'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729)',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser; GTB5; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SV1; Acoo Browser; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; Avant Browser)',
        ]

        def process_request(self, request, spider):
            """Overwrite the request's ``User-Agent`` header with a random choice.

            :param request: the request whose headers are modified in place
            :param spider: the spider that issued the request (unused)
            :return: ``None``, so the request continues through the remaining
                middlewares and is downloaded as usual
            """
            request.headers['User-Agent'] = random.choice(self.USER_AGENTS)
            return None
    
  • 4、创建一个爬虫测试

    import scrapy


    class HttpbinSpider(scrapy.Spider):
        """Test spider that prints each response body and re-requests the same URL."""

        name = 'httpbin'
        allowed_domains = ['httpbin.org']
        start_urls = ['http://httpbin.org/user-agent']

        def parse(self, response):
            """Print the response text between separator lines, then fetch again.

            :param response: the downloaded response for ``start_urls[0]``
            :return: generator yielding one follow-up request to the same URL
            """
            print('*' * 100)
            print(response.text)
            print('*' * 100)
            # dont_filter=True keeps the duplicate filter from dropping the
            # repeated URL, so this spider keeps requesting until it is stopped.
            yield scrapy.Request(self.start_urls[0], callback=self.parse, dont_filter=True)
    

三、设置随机代理IP

  • 1、可以去西刺代理获取免费IP
  • 2、测试IP地址
  • 3、中间件的代码

    import random
    
    class RandomProxy(object):
        """Downloader middleware that assigns a random proxy to every request."""

        # Pool of proxy URLs to pick from.
        PROXY_LISTS = ['https://112.252.109.128:8118', 'https://106.75.169.71:3128']

        def process_request(self, request, spider):
            """Store a randomly chosen proxy URL in ``request.meta['proxy']``.

            :param request: the request whose meta is modified in place
            :param spider: the spider that issued the request (unused)
            :return: ``None``
            """
            request.meta['proxy'] = random.choice(self.PROXY_LISTS)
            return None
    

results matching ""

    No results matching ""