scrapy框架使用无界面浏览器抓取数据
一、在 scrapy 中集成 selenium 的无头浏览器步骤
1、定义一个下载中间件
from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver


class SeleniumMiddleware(object):
    """Downloader middleware that fetches every request with headless Chrome.

    One Chrome instance is started when the middleware is created and is
    reused for all requests. ``process_request`` returns an ``HtmlResponse``
    built from the browser's rendered page source, so Scrapy skips its own
    downloader for the request.
    """

    def __init__(self):
        super(SeleniumMiddleware, self).__init__()
        option = webdriver.ChromeOptions()
        option.add_argument('headless')
        # `options=` is the supported keyword; the older `chrome_options=`
        # spelling was deprecated and is rejected by newer Selenium releases.
        self.browser = webdriver.Chrome(options=option)
        # The implicit wait is a session-wide setting, so it is configured
        # once here rather than being re-sent before every request.
        self.browser.implicitly_wait(5)

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and arrange for the browser to be closed.

        Scrapy calls this factory (when present) instead of the bare
        constructor; it is used here to hook ``spider_closed`` so the Chrome
        process does not outlive the crawl.
        """
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_closed, signal=signals.spider_closed
        )
        return middleware

    def process_request(self, request, spider):
        """Load ``request.url`` in the browser and return the rendered page.

        Args:
            request: The Scrapy request being processed.
            spider: The spider that issued the request (unused).

        Returns:
            An ``HtmlResponse`` whose URL is the browser's current URL (which
            may differ from ``request.url`` after redirects) and whose body is
            the browser's page source, encoded as UTF-8.
        """
        self.browser.get(request.url)
        print('你访问的网站:{0}'.format(request.url))
        return HtmlResponse(
            url=self.browser.current_url,
            body=self.browser.page_source,
            encoding="utf-8",
            request=request,
        )

    def spider_closed(self, spider):
        """Quit the browser so no Chrome/chromedriver process is leaked."""
        self.browser.quit()
2、在 settings.py 中注册激活