from selenium import webdriver
from time import sleep
from urllib import request, parse
from lxml import etree
import csv, pymysql, json
# Requirements analysis: fields to scrape
# Listing (first-level) page: job title, salary, benefits, company, experience, education
# Detail (second-level) page: job description, company address, company profile
# Model the fields to be scraped as a class
class JobItem(object):
    def __init__(self, job='', salary='', fuli='', company='', jingyan='', xueli='',
                 job_info='', address='', company_info=''):
        self.job = job
        self.salary = salary
        self.fuli = fuli          # benefits
        self.company = company
        self.jingyan = jingyan    # experience required
        self.xueli = xueli        # education required
        self.job_info = job_info
        self.address = address
        self.company_info = company_info
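    # A minimal __repr__ sketch (an addition to the model, assuming only the
    # fields defined above) so that print(item) in crawl_spider shows the
    # scraped values instead of a bare object address.
    def __repr__(self):
        return "JobItem(job=%r, salary=%r, company=%r, jingyan=%r, xueli=%r)" % (
            self.job, self.salary, self.company, self.jingyan, self.xueli)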
class ZhilianSpider(object):
    def __init__(self, city, start, end, job, url):
        self.city = city
        self.start = start
        self.end = end
        self.job = job
        self.url = url
        # Other member variables
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
        # Headless Chrome renders the JavaScript-driven listing page
        opt = webdriver.ChromeOptions()
        opt.add_argument("--headless")
        self.driver = webdriver.Chrome(options=opt)
    # 1. Request module
    # 1) Fetch the listing page (via Selenium, since the listing is rendered by JavaScript)
    def request_job_list(self, url):
        self.driver.get(url)
        sleep(1)  # give the page a moment to render
        return self.driver.page_source
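    # A more robust variant would wait for the listing container instead of a
    # fixed sleep. A sketch, assuming Selenium's explicit-wait API:
    #
    #   from selenium.webdriver.support.ui import WebDriverWait
    #   from selenium.webdriver.support import expected_conditions as EC
    #   from selenium.webdriver.common.by import By
    #   WebDriverWait(self.driver, 10).until(
    #       EC.presence_of_element_located((By.ID, "listContent")))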
    # 2) Fetch the detail page with urllib and hand the HTML to the parse callback
    def request_job_info(self, url, callback, item):
        req = request.Request(url=url, headers=self.headers)
        res = request.urlopen(req)
        # Invoke the callback and return the populated item
        return callback(res.read().decode("utf-8"), item)
    # 2. Parse module
    # 1) Parse the listing page
    def anylasis_job_list(self, html):
        html_tree = etree.HTML(html)
        job_list = html_tree.xpath("//div[@id='listContent']/div")
        for job in job_list:
            # Build a job model from the listing row
            jobItem = JobItem()
            jobItem.job = job.xpath(".//span[contains(@class,'jobname__title')]/@title")[0]
            jobItem.salary = job.xpath(".//p/text()")[0]
            jobItem.fuli = " ".join(job.xpath(".//div[contains(@class,'welfare')]//text()"))
            jobItem.company = job.xpath(".//a[contains(@class,'company_title')]//text()")[0]
            jobItem.jingyan = job.xpath(".//li[2]/text()")[0]
            jobItem.xueli = job.xpath(".//li[3]/text()")[0] if job.xpath(".//li[3]/text()") else ""
            # Extract the detail-page URL and follow it
            next_url = job.xpath(".//div[contains(@class,'jobname')]/a/@href")[0]
            yield self.request_job_info(url=next_url, callback=self.anylasis_job_info, item=jobItem)
    # 2) Parse the detail page
    def anylasis_job_info(self, html, item):
        html_tree = etree.HTML(html)
        jobItem = item
        jobItem.company_info = "\n".join(html_tree.xpath("//div[@class='jianjie']//text()"))
        jobItem.address = html_tree.xpath("//p[@class='add-txt']//text()")[0] if html_tree.xpath("//p[@class='add-txt']//text()") else ""
        jobItem.job_info = "\n".join(html_tree.xpath("//div[contains(@class,'pos-common')]//text()"))
        return jobItem
    # 3. Storage module
    # Public entry point: crawl every requested page and print each item
    # (see the CSV sketch below for persisting results)
    def crawl_spider(self):
        for page in range(int(self.start), int(self.end) + 1):
            page_url = self.url % (page, self.city, self.job)
            html = self.request_job_list(url=page_url)
            # anylasis_job_list yields one fully populated JobItem per listing row
            for item in self.anylasis_job_list(html):
                print(item)
        self.driver.quit()
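    # The csv/pymysql/json imports suggest a storage step the listing never
    # implements. A minimal CSV sketch, assuming the JobItem fields above
    # (save_to_csv is a hypothetical helper, not part of the original code):
    def save_to_csv(self, items, path="jobs.csv"):
        with open(path, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(["job", "salary", "fuli", "company", "jingyan",
                             "xueli", "address", "job_info", "company_info"])
            for item in items:
                writer.writerow([item.job, item.salary, item.fuli, item.company,
                                 item.jingyan, item.xueli, item.address,
                                 item.job_info, item.company_info])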
def main():
    # URL template: p = page number, jl = city, kw = job keyword
    url = "https://sou.zhaopin.com/?p=%d&jl=%s&kw=%s&kt=3"
    city = input("Enter the city: ")
    start = input("Enter the start page: ")
    end = input("Enter the end page: ")
    job = input("Enter the job keyword: ")
    # Initialise the spider
    zhilian = ZhilianSpider(url=url, city=city, start=start, end=end, job=job)
    # Run the crawl
    zhilian.crawl_spider()
if __name__ == '__main__':
main()