8. Using Chrome to Scrape Dynamic Pages

Author: 学飞的小鸡 | Published 2018-10-31 20:56
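
The spider below drives headless Chrome through Selenium to render Zhilian Zhaopin's JavaScript-generated job listings (a plain HTTP request would return the page before the listing data exists), then parses each listing card and its detail page with lxml. Only the listing pages go through the browser; the static detail pages are fetched directly with urllib.
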
    from selenium import webdriver
    from time import sleep
    from urllib import request, parse
    
    from lxml import etree
    
    import csv
    # Requirements: fields to scrape
    # Level-1 (listing) page: job title, salary, benefits, company, experience, education
    # Level-2 (detail) page: job description, company address, company profile
    # Model the scraped content as a class
    class JobItem(object):
    
        def __init__(self, job='', salary='', fuli='', company='', jingyan='',
                     xueli='', job_info='', address='', company_info=''):
            self.job = job
            self.salary = salary
            self.fuli = fuli
            self.company = company
            self.jingyan = jingyan
            self.xueli = xueli
            self.job_info = job_info
            self.address = address
            self.company_info = company_info
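
        # A minimal __repr__ so the debug print in crawl_spider shows the
        # scraped fields instead of a bare object address.
        def __repr__(self):
            return "JobItem(job=%r, salary=%r, company=%r)" % (
                self.job, self.salary, self.company)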
    
    class ZhilianSpider(object):
    
        def __init__(self, city, start, end, job, url):
            self.city = city
            self.start = start
            self.end = end
            self.job = job
            self.url = url
            # Other member state: request headers plus a headless Chrome driver
            self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
            opt = webdriver.ChromeOptions()
            opt.add_argument("--headless")  # render pages without opening a browser window
            self.driver = webdriver.Chrome(options=opt)
    
        # 1. Request module
        # 1) Request a level-1 (listing) page. The listing is rendered by
        #    JavaScript, so it has to go through the browser.
        def request_job_list(self, url):
            self.driver.get(url)
            sleep(1)  # crude wait for the JS-rendered list; see the note after the code
            return self.driver.page_source
    
        # 2) Request a level-2 (detail) page. Detail pages are static HTML,
        #    so plain urllib is enough; the response is handed to `callback`.
        def request_job_info(self, url, callback, item):
            req = request.Request(url=url, headers=self.headers)
            res = request.urlopen(req)
            return callback(res.read().decode("utf-8"), item)
    
    
        # 2. Parse module
        # 1) Parse a level-1 (listing) page
        def parse_job_list(self, html):
            html_tree = etree.HTML(html)
    
            job_list = html_tree.xpath("//div[@id='listContent']/div")
            for job in job_list:
                # Build a job model from the listing card
                jobItem = JobItem()
                jobItem.job = job.xpath(".//span[contains(@class,'jobname__title')]/@title")[0]
                jobItem.salary = job.xpath(".//p/text()")[0]
                jobItem.fuli = " ".join(job.xpath(".//div[contains(@class,'welfare')]//text()"))
                jobItem.company = job.xpath(".//a[contains(@class,'company_title')]//text()")[0]
                jobItem.jingyan = job.xpath(".//li[2]/text()")[0]
                jobItem.xueli = job.xpath(".//li[3]/text()")[0] if job.xpath(".//li[3]/text()") else ""
                # Extract the level-2 (detail) page URL and fetch it
                next_url = job.xpath(".//div[contains(@class,'jobname')]/a/@href")[0]
                yield self.request_job_info(url=next_url, callback=self.parse_job_info, item=jobItem)
    
        # 2) Parse a level-2 (detail) page
        def parse_job_info(self, html, item):
            html_tree = etree.HTML(html)
            jobItem = item
            jobItem.company_info = "\n".join(html_tree.xpath("//div[@class='jianjie']//text()"))
            jobItem.address = html_tree.xpath("//p[@class='add-txt']//text()")[0] if html_tree.xpath("//p[@class='add-txt']//text()") else ""
            jobItem.job_info = "\n".join(html_tree.xpath("//div[contains(@class,'pos-common')]//text()"))
    
            return jobItem
    
        # 3. Storage module
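        # A minimal CSV-backed sketch of a storage method; the original left
        # this module empty. The file name and column order are assumptions.
        def save_job_item(self, item, filename="zhilian_jobs.csv"):
            with open(filename, "a", newline="", encoding="utf-8") as f:
                writer = csv.writer(f)
                writer.writerow([item.job, item.salary, item.fuli,
                                 item.company, item.jingyan, item.xueli,
                                 item.address, item.job_info, item.company_info])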
    
    
    
        # Public entry point
        def crawl_spider(self):
            for page in range(int(self.start), int(self.end) + 1):
                # quote() the city and keyword so non-ASCII input survives in the URL
                page_url = self.url % (page, parse.quote(self.city), parse.quote(self.job))
                html = self.request_job_list(url=page_url)
                for item in self.parse_job_list(html):
                    print(item)  # swap in self.save_job_item(item) to persist results
    
            self.driver.quit()
    
    
    
    def main():
        url = "https://sou.zhaopin.com/?p=%d&jl=%s&kw=%s&kt=3"
        city = input("Enter a city: ")
        start = input("Enter the start page: ")
        end = input("Enter the end page: ")
        job = input("Enter a job keyword: ")
        # Initialize the spider and run it
        zhilian = ZhilianSpider(url=url, city=city, start=start, end=end, job=job)
        zhilian.crawl_spider()
    
    if __name__ == '__main__':
        main()
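
The one-second `sleep()` after `driver.get()` is the fragile part of this approach: if the job list takes longer than a second to render, the parser sees an empty page. A sturdier variant of `request_job_list`, sketched below with Selenium's explicit waits, blocks until the `listContent` container the parser reads actually exists:

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    def request_job_list(self, url):
        self.driver.get(url)
        # Wait up to 10 seconds for the JS-rendered job list to appear
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.ID, "listContent"))
        )
        return self.driver.page_source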
