51job Big Data Job Scraper Example

Author: hongmj_0704 | Published 2017-11-01 15:11

Environment: Python 3.6

I'm a beginner who has only just started writing crawlers. This one uses requests, lxml, and XPath; the code is messy, so please go easy on me.

I chose to scrape the job postings that a 51job search for 大数据 (big data) returns, and to store the scraped data in a MySQL database.
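The script assumes a local MySQL database named db that already contains two tables, enterprise_info and job_info. The original post does not show the DDL, so the following is only a minimal sketch reconstructed from the two value lists the script inserts; the column names and types are my assumptions.

import pymysql

# Assumed schema: six columns for enterprise_info and eight for job_info,
# matching the order of values_enterprise_info and values_job_info below.
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       passwd='123456', db='db', charset='utf8mb4')
cur = conn.cursor()
cur.execute("""
CREATE TABLE IF NOT EXISTS enterprise_info (
    id INT PRIMARY KEY,
    enterprise_name VARCHAR(255),
    enterprise_form VARCHAR(64),
    enterprise_scale VARCHAR(64),
    enterprise_kind VARCHAR(128),
    enterprise_intro TEXT
)""")
cur.execute("""
CREATE TABLE IF NOT EXISTS job_info (
    id INT PRIMARY KEY,
    job_name VARCHAR(255),
    enterprise_name VARCHAR(255),
    salary VARCHAR(64),
    job_site VARCHAR(128),
    job_intro VARCHAR(255),
    job_des TEXT,
    link VARCHAR(512)
)""")
conn.commit()
conn.close()

The full scraper: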

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time    : 2017/10/31 14:32
# @Author  : mj
# @File    : 51job.py

import requests
from lxml import etree
import pymysql

def get_info(link):
    """Fetch one job-detail page and write company and job info to MySQL."""
    global id_1
    global id_2
    print(link)
    see = requests.session()
    see.headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
    }
    html = see.get(link)
    html.encoding = "GBK"  # 51job serves its pages in GBK
    selector = etree.HTML(html.text)
    """enterprise_info"""
    # 1.enterprise_name 企业名称
    enterprise_name = selector.xpath("//p/a/@title")
    if len(enterprise_name)<=0:
        return
    print(enterprise_name[0])
    e_name = enterprise_name[0]
    f_test = selector.xpath("//p[@class='msg ltype']/text()")
    F = str(f_test[0]).split('|')
    if len(F)<3:
        return
    # 2.enterprise_form 企业形式
    enterprise_form = F[0].strip()
    print(enterprise_form)
    # 3.enterprise_scale 企业规模
    enterprise_scale = F[1].strip()
    print(enterprise_scale)
    # 4.enterprise_kind
    enterprise_kind = F[2].strip()
    print(enterprise_kind)
    # 5.enterprise_intro
    e_intro = selector.xpath("//div/div[@class='tmsg inbox']/text()")
    enterprise_intro = ""
    for e in e_intro:
        enterprise_intro = enterprise_intro + e.strip()
    print(enterprise_intro)

    values_entreprise_info = [int(id_1),e_name,enterprise_form,enterprise_scale,enterprise_kind,enterprise_intro]
    print(values_entreprise_info)
    """job_info"""
    #1.job_name 职位名称
    job_name = selector.xpath("//div/h1/@title")
    print(job_name[0])
    #2.enterprise_name 企业名称

    #3.salary 薪资
    salary = selector.xpath("//div[@class='cn']/strong/text()")
    if len(salary)<1:
        return
    print(salary[0])
    #4.job_site 工作地点
    job_site = selector.xpath("//div/span[@class='lname']/text()")
    if len(job_site)<0:
        return
    print(job_site[0])
    #5.job_intro职位信息
    job_intro = selector.xpath("//div[@class='t1']/span[@class='sp4']/text()")
    print(job_intro)
    #6.job_des 职位描述
    des = selector.xpath("//div[@class='bmsg job_msg inbox']/text()")
    job_des = ""
    for i in des:
        job_des =  job_des+ i.strip()
    print(job_des)
    #7.link 职位连接
    values_job_info = [int(id_2),job_name,e_name,str(salary[0]),job_site,str(job_intro),job_des,link]
    print(values_job_info)

    # one connection is enough for both inserts
    conn = pymysql.connect(
        host='127.0.0.1',
        port=3306,
        user='root',
        passwd='123456',
        db='db',)
    # get a cursor to execute SQL
    cur = conn.cursor()
    try:
        cur.execute("insert into enterprise_info VALUES(%s,%s,%s,%s,%s,%s);", values_enterprise_info)
        conn.commit()
    except Exception as e:
        print(e)
        conn.rollback()
    finally:
        id_1 = id_1 + 1

    try:
        cur.execute("insert into job_info VALUES(%s,%s,%s,%s,%s,%s,%s,%s);", values_job_info)
        conn.commit()
    except Exception as e:
        print(e)
        conn.rollback()
    finally:
        id_2 = id_2 + 1
        conn.close()




see = requests.session()
see.headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"
}
id_1 = 1        # primary-key counter for enterprise_info
id_2 = 1        # primary-key counter for job_info
pages = set()   # detail-page URLs already visited, to skip duplicates
j = 1           # record counter

# search-results URL for the keyword 大数据 (big data); {} is the page number
url_template = "http://search.51job.com/list/000000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,{}.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="

for i in range(1, 9999):
    url = url_template.format(i)   # build the URL for results page i
    print(url)
    html = see.get(url)
    html.encoding = "GBK"
    selector = etree.HTML(html.text)
    # each result row links to its detail page via an <a onmousedown=''> element
    links = selector.xpath("//span/a[@onmousedown='']/@href")
    for link in links:
        if link not in pages:
            pages.add(link)
            get_info(link)
            print(link)
            j = j + 1
# j = total number of records scraped
print(j)
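A side note on the long percent-escaped blob in the search URL: it is the keyword 大数据 ("big data") URL-encoded twice, once into its UTF-8 percent form and once more so that each % becomes %25. The round trip is easy to verify, and the same trick builds search URLs for other keywords:

from urllib.parse import quote, unquote

kw = "%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE"
print(unquote(unquote(kw)))    # -> 大数据
print(quote(quote("大数据")))   # -> the doubly encoded form used in the URL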

Partial results:

[Screenshots: 图片1.png, 图片2.png]
