spider.py
import json
import re
import urllib.parse as up
from copy import deepcopy

import lxml.etree as le
import scrapy
class tool:
    # Return the first xpath match, or a default if nothing matches
    @staticmethod
    def xpath_one(contentx, path, default=None):
        rets = contentx.xpath(path)
        return rets[0] if rets else default

    # Return all xpath matches
    @staticmethod
    def xpath_all(contentx, path):
        return contentx.xpath(path)

    # Join all xpath matches (stripped) into one string, or return a default
    @staticmethod
    def xpath_union(contentx, path, split='', default=None):
        ret = split.join([ret.strip() for ret in contentx.xpath(path)])
        return ret if ret else default
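# Quick illustration of xpath_union (a sketch; the HTML here is made up):
#   doc = le.HTML('<div><p> Python </p><p> Scrapy </p></div>')
#   tool.xpath_union(doc, '//p/text()', split='/')    # -> 'Python/Scrapy'
#   tool.xpath_union(doc, '//h1/text()', default='')  # no match -> ''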
class S1Spider(scrapy.Spider):
    name = 's1'

    def start_requests(self):
        keyword = self.settings.get('keyword')
        # urlencode({'': v}) yields '=<encoded v>'; strip the leading '='
        self.keyword = up.urlencode({'': keyword})[1:]
        # search the keyword in several 51job city codes
        # (Beijing, Shanghai, Guangzhou, Shenzhen)
        for place_code in ['010000', '020000', '030200', '040000']:
            url = ('https://search.51job.com/list/{place_code},000000,0000,00,9,99,{keyword},2,{page}.html'
                   '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99'
                   '&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=').format(
                place_code=place_code,
                keyword=self.keyword,
                page=1,
            )
            yield scrapy.Request(
                url=url,
                callback=self.parse1,
                meta={
                    'place_code': place_code,
                    'keyword': self.keyword,
                },
            )
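    # What the keyword encoding above produces (illustrative values):
    #   up.urlencode({'': 'data analysis'})[1:]  # -> 'data+analysis'
    #   up.urlencode({'': '爬虫'})[1:]            # -> '%E7%88%AC%E8%99%AB' (UTF-8)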
    def parse1(self, response):
        place_code = response.meta.get('place_code')
        keyword = response.meta.get('keyword')
        # 51job serves gbk-encoded pages; the search results are embedded as
        # JSON in a window.__SEARCH_RESULT__ <script> block
        content = response.body.decode('gbk', 'ignore')
        data = json.loads(
            re.search(r'window\.__SEARCH_RESULT__ = (.*?)</script>', content).group(1)
        )
        total_page = int(data['total_page'])
        curr_page = int(data['curr_page'])
        # the current (first) page already carries results, so parse them too
        yield from self.parse2(response)
        for page in range(curr_page + 1, total_page + 1):
            url = ('https://search.51job.com/list/{place_code},000000,0000,00,9,99,{keyword},2,{page}.html'
                   '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99'
                   '&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=').format(
                place_code=place_code,
                keyword=keyword,
                page=page,
            )
            yield scrapy.Request(
                url=url,
                callback=self.parse2,
                meta={'place_code': place_code, 'keyword': keyword},
            )
    def parse2(self, response):
        content = response.body.decode('gbk', 'ignore')
        data = json.loads(
            re.search(r'window\.__SEARCH_RESULT__ = (.*?)</script>', content).group(1)
        )
        results = data['engine_search_result']
        # request each job's detail page, carrying the list-page fields along
        for result in results:
            job_href = result['job_href']
            yield scrapy.Request(
                url=job_href,
                callback=self.parse3,
                meta={
                    'parse1_data': deepcopy(result),
                },
            )
    def parse3(self, response):
        data = response.meta['parse1_data']
        contentb = response.body
        contentx = le.HTML(contentb)
        # tCompany_main on the detail page holds tBorderTop_box sections in
        # order: job description, contact info, company info
        parse2_job_detail = tool.xpath_union(
            contentx=contentx,
            path='//div[@class="tCompany_main"]/div[@class="tBorderTop_box"][1]//text()',
            split='', default=None)
        parse2_job_conn = tool.xpath_union(
            contentx=contentx,
            path='//div[@class="tCompany_main"]/div[@class="tBorderTop_box"][2]//text()',
            split='', default=None)
        parse2_job_company = tool.xpath_union(
            contentx=contentx,
            path='//div[@class="tCompany_main"]/div[@class="tBorderTop_box"][3]//text()',
            split='', default=None)
        data['parse2_job_detail'] = parse2_job_detail
        data['parse2_job_conn'] = parse2_job_conn
        data['parse2_job_company'] = parse2_job_company
        yield data
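# To run the spider, pass the keyword as a Scrapy setting
# (a minimal sketch; the keyword and output file name are illustrative):
#   scrapy crawl s1 -s keyword=python -o jobs.json
#
# Or standalone, without the scrapy CLI:
#   from scrapy.crawler import CrawlerProcess
#   process = CrawlerProcess(settings={'keyword': 'python'})
#   process.crawl(S1Spider)
#   process.start()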