51job Crawler

Author: 十里染林 | Published 2020-09-30 19:22

spider.py

# Scrapy spider for 51job search results.
# Flow: start_requests -> parse1 (discovers the page count for each city)
#       -> parse2 (job-list pages) -> parse3 (job-detail pages),
#       yielding one dict per job posting.
import scrapy
import json
import re
from copy import deepcopy
import lxml.etree as le
import urllib.parse as up

class tool:
    # Return the first xpath match, or a default value if there is none
    @staticmethod
    def xpath_one(contentx, path, default=None):
        rets = contentx.xpath(path)
        return rets[0] if rets else default

    # Return all xpath matches
    @staticmethod
    def xpath_all(contentx, path):
        rets = contentx.xpath(path)
        return rets

    # Strip and join all text results of an xpath query into one string
    @staticmethod
    def xpath_union(contentx, path, split='', default=None):
        ret = split.join([ret.strip() for ret in contentx.xpath(path)])
        return ret if ret else default

class S1Spider(scrapy.Spider):
    name = 's1'

    def start_requests(self):
        # The search keyword comes from the Scrapy settings; URL-encode it so
        # it can be embedded directly in the search URL.
        keyword = self.settings.get('keyword')
        self.keyword = up.urlencode({'': keyword})[1:]

        # One search request per place (city) code
        for place_code in ['010000', '020000', '030200', '040000']:
            url = 'https://search.51job.com/list/{place_code},000000,0000,00,9,99,{keyword},2,{page}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='.format(
                place_code=place_code,
                keyword=self.keyword,
                page=1
            )
            yield scrapy.Request(
                url=url,
                callback=self.parse1,
                meta={
                    'place_code': place_code,
                    'keyword': self.keyword,
                },
            )

    def parse1(self, response):
        place_code = response.meta.get('place_code')
        keyword = response.meta.get('keyword')
        # The result list is embedded in the page as a JSON blob assigned
        # to window.__SEARCH_RESULT__; pull it out with a regex.
        content = response.body.decode('gbk', 'ignore')
        data = json.loads(
            re.findall(r'window\.__SEARCH_RESULT__ = (.*?)</script>', content)[0]
        )
        # Page 1 itself also carries job results; hand it to parse2 so they are not skipped
        yield from self.parse2(response)
        # Schedule the remaining result pages
        total_page = int(data['total_page'])
        curr_page = int(data['curr_page'])
        for page in range(curr_page + 1, total_page + 1):
            url = 'https://search.51job.com/list/{place_code},000000,0000,00,9,99,{keyword},2,{page}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='.format(
                place_code=place_code,
                keyword=keyword,
                page=page
            )
            yield scrapy.Request(
                url=url,
                callback=self.parse2,
                meta={'place_code': place_code, 'keyword': keyword},
            )

    def parse2(self, response):
        content = response.body.decode('gbk', 'ignore')
        data = json.loads(
            re.findall(r'window\.__SEARCH_RESULT__ = (.*?)</script>', content)[0]
        )
        # One entry per job posting; follow each job's detail page
        results = data['engine_search_result']
        for result in results:
            job_href = result['job_href']
            yield scrapy.Request(
                url=job_href,
                callback=self.parse3,
                meta={
                    # deepcopy so every detail request carries its own copy of the listing data
                    'parse1_data': deepcopy(result)
                },
            )

    def parse3(self, response):
        data = response.meta['parse1_data']
        contentx = le.HTML(response.body)
        # Job description: the first info box on the detail page
        parse2_job_detail = tool.xpath_union(
            contentx=contentx,
            path='//div[@class="tCompany_main"]/div[@class="tBorderTop_box"][1]//text()',
            split='', default=None)
        # Contact information: the second info box
        parse2_job_conn = tool.xpath_union(
            contentx=contentx,
            path='//div[@class="tCompany_main"]/div[@class="tBorderTop_box"][2]//text()',
            split='', default=None)
        # Company profile (assumed to be the third info box on the detail page)
        parse2_job_company = tool.xpath_union(
            contentx=contentx,
            path='//div[@class="tCompany_main"]/div[@class="tBorderTop_box"][3]//text()',
            split='', default=None)
        data['parse2_job_detail'] = parse2_job_detail
        data['parse2_job_conn'] = parse2_job_conn
        data['parse2_job_company'] = parse2_job_company

        yield data
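
To actually run the spider, the keyword it reads from the settings has to be supplied somehow. As a minimal sketch (not part of the original post), it could be driven by a small launcher script; the module name spider, the keyword python, and the jobs.json output path are all assumptions here:

run.py

# Minimal launcher sketch, assuming spider.py sits inside a Scrapy project
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from spider import S1Spider  # hypothetical import path for the spider above

settings = get_project_settings()
settings.set('keyword', 'python')  # search keyword read via self.settings.get('keyword')
settings.set('FEEDS', {'jobs.json': {'format': 'json'}})  # export the yielded dicts
process = CrawlerProcess(settings)
process.crawl(S1Spider)
process.start()

Alternatively, from inside the project the same thing can be done on the command line with scrapy crawl s1 -s keyword=python -o jobs.json.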