Writing Your Own Downloader Middlewares
This chapter introduces several downloader middlewares that are commonly used in real projects.
I. Sending the POST (AJAX) requests we met earlier during simulated login
1. The full code (for reference)
```python
import json

import requests
from scrapy.http import HtmlResponse


class PayLoadRequestMiddleware(object):
    """Send AJAX requests via POST with a JSON payload."""

    def process_request(self, request, spider):
        # Requests flagged as carrying a payload are handled here
        if request.meta.get('payloadFlag', False):
            postUrl = request.url
            headers = request.meta.get('headers', {})
            payloadData = request.meta.get('payloadData', {})
            timeOut = request.meta.get('download_timeout', 25)
            allow_redirects = request.meta.get('dont_redirect', False)
            dumpJsonData = json.dumps(payloadData)
            print(f"dumpJsonData = {dumpJsonData}")
            # Caution: requests.post() turns out to be a synchronous,
            # blocking call, which seriously hurts crawl speed
            res = requests.post(postUrl, data=dumpJsonData, headers=headers,
                                timeout=timeOut, allow_redirects=allow_redirects)
            if 199 < res.status_code < 300:
                return HtmlResponse(url=request.url,
                                    body=res.content,
                                    request=request,
                                    encoding='utf-8',
                                    status=200)
            else:
                return HtmlResponse(url=request.url, status=500, request=request)
```
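For context, here is a minimal sketch of how a spider might flag a request so that PayLoadRequestMiddleware takes it over. The spider name, URL, and payload fields are hypothetical placeholders, not part of the original project; only the meta keys (`payloadFlag`, `headers`, `payloadData`, `download_timeout`) come from the middleware above:

```python
import scrapy


class LoginSpider(scrapy.Spider):
    name = 'login_demo'  # hypothetical name, for illustration only

    def start_requests(self):
        # payloadFlag is the switch PayLoadRequestMiddleware looks for;
        # headers, payloadData and download_timeout are read back out
        # of meta inside process_request()
        yield scrapy.Request(
            url='https://example.com/api/login',  # placeholder URL
            meta={
                'payloadFlag': True,
                'headers': {'Content-Type': 'application/json'},
                'payloadData': {'user': 'demo', 'password': 'demo'},
                'download_timeout': 25,
            },
            callback=self.parse_login,
        )

    def parse_login(self, response):
        self.logger.info(response.text)
```

Because requests.post() blocks, a lighter alternative is often to skip the middleware entirely and send the POST through Scrapy itself, e.g. `scrapy.Request(url, method='POST', body=json.dumps(payload), headers=...)`, which stays asynchronous.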
II. Setting random request headers
1. Collect request headers (User-Agent strings) from various browsers
2. Test the request headers the current browser sends
3. The middleware code
```python
import random


class RandomUserAgent(object):
    """Middleware that sets a random User-Agent header."""

    USER_AGENTS = [
        'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser; GTB5; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SV1; Acoo Browser; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; Avant Browser)'
    ]

    def process_request(self, request, spider):
        """Intercept every outgoing request and overwrite its User-Agent."""
        user_agent = random.choice(self.USER_AGENTS)
        request.headers['User-Agent'] = user_agent
        # A function with no explicit return gives back None,
        # which tells Scrapy to keep processing this request
        # return None
```
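To activate the middleware, register it under DOWNLOADER_MIDDLEWARES in settings.py. The module path below is an assumption (it presumes the class lives in a middlewares.py inside a project package named myproject); adjust it to your actual layout:

```python
# settings.py -- 'myproject.middlewares' is an assumed project layout
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgent': 543,
}
```

The value 543 is the ordering priority; downloader middlewares with lower values sit closer to the engine.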
4. Create a spider to test it
```python
import scrapy


class HttpbinSpider(scrapy.Spider):
    name = 'httpbin'
    allowed_domains = ['httpbin.org']
    start_urls = ['http://httpbin.org/user-agent']

    def parse(self, response):
        print('*' * 100)
        print(response.text)
        print('*' * 100)
        # Re-request the same URL so we can watch the User-Agent change;
        # dont_filter=True bypasses Scrapy's duplicate-request filter
        yield scrapy.Request(self.start_urls[0], callback=self.parse, dont_filter=True)
```
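Run the spider with `scrapy crawl httpbin`. The /user-agent endpoint on httpbin.org simply echoes back the User-Agent header it received as JSON, so with the middleware enabled the printed value should vary across requests instead of showing Scrapy's default.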
III. Setting random proxy IPs
1. Free IPs can be obtained from Xici Proxy (西刺代理)
2. Test the IP addresses (a quick aliveness check is sketched after the middleware code below)
3. The middleware code
```python
import random


class RandomProxy(object):
    """Middleware that sets a random proxy IP on each request."""

    PROXY_LISTS = [
        'https://112.252.109.128:8118',
        'https://106.75.169.71:3128',
    ]

    def process_request(self, request, spider):
        proxy = random.choice(self.PROXY_LISTS)
        request.meta['proxy'] = proxy
```
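As noted in item 2 above, free proxies go stale quickly, so it is worth checking that an IP is alive before adding it to PROXY_LISTS. Below is a minimal sketch of such a check using the requests library against httpbin.org/ip (which echoes the caller's origin IP); the helper name and the timeout value are arbitrary choices, not part of the original project:

```python
import requests


def proxy_works(proxy, timeout=5):
    """Return True if httpbin.org/ip is reachable through the given proxy."""
    try:
        res = requests.get('https://httpbin.org/ip',
                           proxies={'https': proxy},
                           timeout=timeout)
        return res.status_code == 200
    except requests.RequestException:
        return False


if __name__ == '__main__':
    for proxy in ['https://112.252.109.128:8118', 'https://106.75.169.71:3128']:
        print(proxy, 'alive' if proxy_works(proxy) else 'dead')
```

Like the other middlewares, RandomProxy only takes effect after it is registered under DOWNLOADER_MIDDLEWARES in settings.py.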