Scraping Python Job Postings from Lagou
I. Steps to scrape a single page

- 1. First, collect all the job-posting links on the first page
- 2. Visit each link with `selenium`
- 3. Extract the job information
- 4. Write it to a local file

5. The full implementation:
```python
import json
import time
import random

from selenium import webdriver
from lxml import etree


class LagouSpider(object):
    """A spider for Python job postings on Lagou."""

    def __init__(self):
        self.browser = webdriver.Chrome()
        self.url = 'https://www.lagou.com/jobs/list_python?px=default&city=%E6%B7%B1%E5%9C%B3#filterBox'
        self.file_name = open('lagou.json', 'a', encoding='utf-8')

    def main(self):
        """Entry point: load the list page and parse it."""
        time.sleep(random.randrange(2))  # short random pause to look less bot-like
        self.browser.get(self.url)
        source = self.browser.page_source
        self.parse_list_page(source=source)

    def parse_list_page(self, source):
        """Collect every job-posting link on the list page."""
        html = etree.HTML(source)
        links = html.xpath('//a[@class="position_link"]/@href')
        for link in links:
            self.request_detail_page(link)

    def request_detail_page(self, url):
        """Request a single detail page."""
        time.sleep(random.randrange(2))
        self.browser.get(url=url)
        source = self.browser.page_source
        self.parse_detail_page(source)

    def parse_detail_page(self, source):
        """Extract the job fields from a detail page."""
        html = etree.HTML(source)
        position_name = html.xpath('//span[@class="name"]/text()')[0]
        job_request_spans = html.xpath('//dd[@class="job_request"]//span/text()')
        salary = job_request_spans[0].replace('/', '').strip()
        city = job_request_spans[1].replace('/', '').strip()
        work_years = job_request_spans[2].replace('/', '').strip()
        education = job_request_spans[3].replace('/', '').strip()
        desc = etree.tostring(html.xpath('//dd[@class="job_bt"]')[0],
                              encoding='utf8').decode('utf8')
        self.write_file({'name': position_name, 'salary': salary, 'city': city,
                         'work_years': work_years, 'education': education,
                         'desc': desc})

    def write_file(self, data):
        print('Writing record ==>', data['name'])
        self.file_name.write(json.dumps(data, indent=2, ensure_ascii=False) + ',\n')


if __name__ == '__main__':
    lagou = LagouSpider()
    lagou.main()
```
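One caveat worth calling out: `write_file` appends each record with a trailing comma, so `lagou.json` ends up as a comma-separated stream of JSON objects rather than a single valid JSON document. Below is a minimal sketch for loading the records back, assuming exactly the output format produced above:

```python
import json

# The spider writes every record as "<json>,\n", so the file is not itself
# valid JSON. Wrap the content in brackets and strip the trailing comma
# before parsing.
with open('lagou.json', encoding='utf-8') as f:
    raw = f.read().rstrip().rstrip(',')

jobs = json.loads('[' + raw + ']')
print(len(jobs), 'records loaded')
```

Writing one compact `json.dumps(data, ensure_ascii=False)` per line without the comma (the JSON Lines convention) would avoid the wrapping step entirely.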
II. Using the pagination feature

- 1. Find the pager and simulate clicking "next" (the last page must be detected)
- 2. Open each detail page in a new (`tab`-style) window
- 3. Once a detail page has been scraped, close its window and switch back to the job-list page

4. The updated code:
```python
import os
import json
import time
import random

from selenium import webdriver
from lxml import etree
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class LagouSpider(object):
    """A spider for Python job postings on Lagou."""

    def __init__(self):
        self.browser = webdriver.Chrome()
        self.url = 'https://www.lagou.com/jobs/list_python?px=default&city=%E6%B7%B1%E5%9C%B3#filterBox'
        # Remove any output file left over from a previous run
        path = os.path.join(os.path.dirname(__file__), 'lagou.json')
        if os.path.exists(path):
            os.remove(path)
        self.file_name = open('lagou.json', 'a', encoding='utf-8')

    def main(self):
        """Entry point: walk through every page of the job list."""
        # Load the list page once; the pager updates the page in place, so
        # re-issuing get(self.url) on every loop would reset to page 1.
        self.browser.get(self.url)
        while True:
            source = self.browser.page_source
            self.parse_list_page(source=source)
            # Find the next-page button, with an explicit wait on the pager
            wait = WebDriverWait(self.browser, 20)
            wait.until(EC.presence_of_element_located(
                (By.XPATH, '//div[@class="pager_container"]')))
            next_btn = self.browser.find_element(
                By.XPATH, '//div[@class="pager_container"]/span[last()]')
            # The button carries this class on the last page
            if 'pager_next_disabled' in next_btn.get_attribute('class'):
                break
            next_btn.click()
            time.sleep(2)  # give the next page time to render

    def parse_list_page(self, source):
        """Collect every job-posting link on the current list page."""
        html = etree.HTML(source)
        links = html.xpath('//a[@class="position_link"]/@href')
        for link in links:
            self.request_detail_page(link)

    def request_detail_page(self, url):
        """Open a detail page in a new window, parse it, then switch back."""
        self.browser.execute_script('window.open("{0}")'.format(url))
        self.browser.switch_to.window(self.browser.window_handles[1])
        # The new window may load slowly, so wait for the job description
        wait = WebDriverWait(self.browser, 10)
        wait.until(EC.presence_of_element_located(
            (By.XPATH, '//dd[@class="job_bt"]')))
        time.sleep(random.randrange(2))
        source = self.browser.page_source
        self.parse_detail_page(source)
        # Done with the detail page: close its window and return to the list
        self.browser.close()
        self.browser.switch_to.window(self.browser.window_handles[0])

    def parse_detail_page(self, source):
        """Extract the job fields from a detail page."""
        html = etree.HTML(source)
        position_name = html.xpath('//span[@class="name"]/text()')[0]
        job_request_spans = html.xpath('//dd[@class="job_request"]//span/text()')
        salary = job_request_spans[0].replace('/', '').strip()
        city = job_request_spans[1].replace('/', '').strip()
        work_years = job_request_spans[2].replace('/', '').strip()
        education = job_request_spans[3].replace('/', '').strip()
        desc = etree.tostring(html.xpath('//dd[@class="job_bt"]')[0],
                              encoding='utf8').decode('utf8')
        self.write_file({'name': position_name, 'salary': salary, 'city': city,
                         'work_years': work_years, 'education': education,
                         'desc': desc})

    def write_file(self, data):
        print('Writing record ==>', data['name'])
        self.file_name.write(json.dumps(data, indent=2, ensure_ascii=False) + ',\n')


if __name__ == '__main__':
    lagou = LagouSpider()
    lagou.main()
```
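The window juggling above hard-codes `window_handles[1]` for the detail page and `window_handles[0]` for the list page, which breaks as soon as any extra tab is open. A minimal sketch of more defensive handle bookkeeping; the `open_in_new_tab` helper is hypothetical, not part of the original spider:

```python
def open_in_new_tab(browser, url):
    """Open url in a new tab, switch to it, and return the original handle."""
    origin = browser.current_window_handle
    before = set(browser.window_handles)
    browser.execute_script('window.open("{0}")'.format(url))
    # The new handle is whichever one did not exist before the call
    new_handle = (set(browser.window_handles) - before).pop()
    browser.switch_to.window(new_handle)
    return origin

# Inside request_detail_page this would replace the hard-coded indices:
#   origin = open_in_new_tab(self.browser, url)
#   ... parse the detail page ...
#   self.browser.close()
#   self.browser.switch_to.window(origin)
```

Returning the original handle makes the switch back explicit instead of relying on the list page always being the first window Chrome opened.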