51job Big Data Job Crawler Example

Author: hongmj_0704 | Published 2017-11-01 15:11

    Environment: Python 3.6

    I'm new to this and have only just started writing crawlers. This one uses requests, lxml, and XPath; the code is fairly messy, so please go easy on it.

    It crawls the job listings returned by searching 51job for "大数据" (big data) and stores the scraped data in a MySQL database.
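
    The crawler assumes a MySQL database named db with two tables, enterprise_info and job_info, that match the positional INSERT statements in the code. The original post does not show the table definitions, so the setup script below is only a sketch: every column name and type in it is a guess reconstructed from the values the crawler inserts.

    # Hypothetical setup script. The post does not include the table schemas,
    # so all column names and types below are assumptions inferred from the
    # positional INSERT statements used by the crawler.
    import pymysql

    conn = pymysql.connect(host='127.0.0.1', port=3306,
                           user='root', passwd='123456', db='db', charset='utf8mb4')
    cur = conn.cursor()
    cur.execute("""
        CREATE TABLE IF NOT EXISTS enterprise_info (
            id INT PRIMARY KEY,
            enterprise_name VARCHAR(255),
            enterprise_form VARCHAR(64),
            enterprise_scale VARCHAR(64),
            enterprise_kind VARCHAR(128),
            enterprise_intro TEXT
        )
    """)
    cur.execute("""
        CREATE TABLE IF NOT EXISTS job_info (
            id INT PRIMARY KEY,
            job_name VARCHAR(255),
            enterprise_name VARCHAR(255),
            salary VARCHAR(64),
            job_site VARCHAR(128),
            job_intro VARCHAR(255),
            job_des TEXT,
            link VARCHAR(512)
        )
    """)
    conn.commit()
    conn.close()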

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    # @Time    : 2017/10/31 14:32
    # @Author  : mj
    # @File    : 51job.py
    
    import requests
    from lxml import etree
    import pymysql
    
    def get_info(link):
        global id_1
        global id_2
        print(link)
        see = requests.session()
        see.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
        }
        html = see.get(link)
        html.encoding = "GBK"
        selector = etree.HTML(html.text)
        """enterprise_info"""
        # 1. enterprise_name: company name
        enterprise_name = selector.xpath("//p/a/@title")
        if len(enterprise_name)<=0:
            return
        print(enterprise_name[0])
        e_name = enterprise_name[0]
        # The "msg ltype" line holds the form | scale | kind fields, separated by "|"
        f_test = selector.xpath("//p[@class='msg ltype']/text()")
        F = str(f_test[0]).split('|')
        if len(F)<3:
            return
        # 2. enterprise_form: company type
        enterprise_form = F[0].strip()
        print(enterprise_form)
        # 3. enterprise_scale: company size
        enterprise_scale = F[1].strip()
        print(enterprise_scale)
        # 4.enterprise_kind
        enterprise_kind = F[2].strip()
        print(enterprise_kind)
        # 5.enterprise_intro
        e_intro = selector.xpath("//div/div[@class='tmsg inbox']/text()")
        enterprise_intro = ""
        for e in e_intro:
            enterprise_intro = enterprise_intro + e.strip()
        print(enterprise_intro)
    
        values_enterprise_info = [int(id_1), e_name, enterprise_form, enterprise_scale, enterprise_kind, enterprise_intro]
        print(values_enterprise_info)
        """job_info"""
        # 1. job_name: job title
        job_name = selector.xpath("//div/h1/@title")
        print(job_name[0])
        # 2. enterprise_name: company name (captured above as e_name)
    
        # 3. salary
        salary = selector.xpath("//div[@class='cn']/strong/text()")
        if len(salary)<1:
            return
        print(salary[0])
        # 4. job_site: work location
        job_site = selector.xpath("//div/span[@class='lname']/text()")
        if len(job_site) < 1:
            return
        print(job_site[0])
        # 5. job_intro: job info tags
        job_intro = selector.xpath("//div[@class='t1']/span[@class='sp4']/text()")
        print(job_intro)
        # 6. job_des: job description
        des = selector.xpath("//div[@class='bmsg job_msg inbox']/text()")
        job_des = ""
        for i in des:
            job_des =  job_des+ i.strip()
        print(job_des)
        # 7. link: URL of the job posting
        # job_name and job_site are lists returned by xpath(), so take the first element
        values_job_info = [int(id_2), str(job_name[0]), e_name, str(salary[0]), str(job_site[0]), " ".join(job_intro), job_des, link]
        print(values_job_info)
    
        conn1 = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            passwd='123456',
            db='db',)
        # Get a cursor for executing SQL statements
        cur1 = conn1.cursor()
        try:
            cur1.execute("insert into enterprise_info VALUES(%s,%s,%s,%s,%s,%s);", values_enterprise_info)
            conn1.commit()
        except Exception as e:
            print(e)
            conn1.rollback()
        finally:
            id_1 = id_1+1
            conn1.close()
    
        conn2 = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            passwd='123456',
            db='db',)
        # Get a cursor for executing SQL statements
        cur2 = conn2.cursor()
        try:
            cur2.execute("insert into job_info VALUES(%s,%s,%s,%s,%s,%s,%s,%s);",values_job_info)
            conn2.commit()
        except Exception as e:
            print(e)
            conn2.rollback()
        finally:
            id_2 = id_2+1
            conn2.close()
    
    
    
    
    see = requests.session()
    see.headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"
    }
    id_1 = 1  # primary-key counter for enterprise_info
    id_2 = 1  # primary-key counter for job_info
    pages = set()  # job-posting URLs that have already been visited
    
    j = 0  # number of job postings scraped
    # Search-result URL template; the {} placeholder is the page number
    base_url = ("http://search.51job.com/list/000000,000000,0000,00,9,99,"
                "%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,{}.html?"
                "lang=c&stype=1&postchannel=0000&workyear=99&cotype=99"
                "&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0"
                "&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0"
                "&address=&line=&specialarea=00&from=&welfare=")

    for i in range(1, 9999):
        url = base_url.format(i)  # page i of the search results
        print(url)
        html = see.get(url)
        html.encoding = "GBK"
        selector = etree.HTML(html.text)
        # Links to the individual job postings on this result page
        links = selector.xpath("//span/a[@onmousedown='']/@href")
        for link in links:
            if link not in pages:
                pages.add(link)
                get_info(link)
                print(link)
                j = j + 1
    # j = number of records scraped
    print(j)
    

    Partial results

    [Screenshots of the scraped results: 图片1.png, 图片2.png]
