A Python scraper and preliminary analysis of 51job and Zhilian job listings

Author: luomagaoshou | Published 2018-02-02 14:31

    Job hunting, oh job hunting

    With the end of the year approaching, many of you are probably mulling over your next move. Perhaps, like the author, you have found yourself frantically scrolling through listings trying to find a company close to home, only to discover that many listing summaries do not show an address, so you have to click into each posting to see the company's exact location.
    So I wrote a scraper to make the job hunt easier.

    Project overview

    The main code consists of job_spider.py (the scraper) and jobs_data_analyse.py (the job-data analysis).
    The scraper first fetches the job listings to get brief introductions, then fetches each posting's details; once the download finishes, the analysis runs. Both scripts import a shared program/config module for paths and request headers, which is sketched just below.
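
    The program.config module is not included in the post. The following is only a minimal sketch of what it presumably defines; every path, file name, and header value here is an assumption inferred from how config is used in the two scripts, so adjust it to your own layout.

    import os

    # Hypothetical program/config.py -- all values below are assumptions inferred
    # from the usage in job_spider.py and jobs_data_analyse.py.
    http_headers = {'User-Agent': 'Mozilla/5.0'}                       # request headers for both sites

    base_dir = os.path.dirname(os.path.abspath(__file__))
    job_data_dir = os.path.join(base_dir, 'data')                      # holds 51job_area_code.txt and the CSVs
    jieba_dir = os.path.join(base_dir, 'jieba')                        # user_dict.txt and stopwords.txt
    os.makedirs(job_data_dir, exist_ok=True)

    jobs_data_path = os.path.join(job_data_dir, 'jobs_data.csv')
    jobs_data_introduce_path = os.path.join(job_data_dir, 'jobs_data_introduce.csv')
    jobs_require_word_freq_path = os.path.join(job_data_dir, 'jobs_require_word_freq.csv')
    wordcloud_png_path = os.path.join(job_data_dir, 'wordcloud.png')

    wc_font_path = 'C:/Windows/Fonts/msyh.ttc'                         # any font that can render Chinese
    # alice_png = os.path.join(base_dir, 'alice.png')                  # only needed for the masked word cloud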

    The full code follows.

    job_spider.py

    from bs4 import BeautifulSoup
    import requests
    import os
    from enum import Enum
    from program import config
    import pandas as pd
    
    
    
    pd.set_option('expand_frame_repr', False)  # do not wrap wide DataFrames when printing
    
    class WEBTYPE(Enum):
        _51job = '_51job'  # 51job
        zhilian = 'zhilian'  # Zhilian
        all = 3  # all sites
    
    # global variable: counts how many detail pages have been scraped
    SPIDER_REQUIRE_COUNT = 0
    
    # fetch the mapping from 51job area codes to area names
    def get_51job_area_code():
        dic = {}
        for i in range(1, 37):
            url = 'http://search.51job.com/list/{}0000,000000,0000,00,9,99,ios,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='.format('%02d' % i)
            r = requests.get(url, headers=config.http_headers).content.decode('gbk')
            area_name = BeautifulSoup(r, 'lxml').find(id="work_position_input")['value']
            print(area_name, i)
            dic[area_name] = i
        file_path = os.path.join(config.job_data_dir, '51job_area_code.txt')
        with open(file_path, "w+", encoding="utf-8") as f:  # the with block closes the file for us
            f.write(str(dic))
        print('51job area-code file saved to', file_path)
    
    
    # check whether the 51job area-code file exists locally; fetch it automatically if missing
    def check_area_name():
        file_path = os.path.join(config.job_data_dir, '51job_area_code.txt')
        if os.path.exists(file_path):
            with open(file_path, "r", encoding="utf-8") as f:
                result = f.read()
                dic = eval(result)  # sanity check: the file should contain a valid dict literal
        else:
            print('51job area-code file missing, fetching it now')
            get_51job_area_code()
            check_area_name()
    
    
    
    
    
    def fetch_data(web_type=WEBTYPE.all, keywords=['iOS'], page_count=5, area='深圳'):
        if os.path.exists(config.jobs_data_path):
            os.remove(config.jobs_data_path)
            print('removed previously scraped data')
    
        if web_type == WEBTYPE._51job:
            _fetch_data(web_type, keywords, page_count, area)
        elif web_type == WEBTYPE.zhilian:
            _fetch_data(web_type, keywords, page_count, area)
        elif web_type == WEBTYPE.all:
            for site in list(WEBTYPE)[0: -1]:  # every member except WEBTYPE.all
                _fetch_data(site, keywords, page_count, area)
    
    def _fetch_data(web_type, keywords, page_count, area):
        df = fetch_job_introduce(web_type, keywords, page_count, area)
        df = fetch_job_detail(df)
        df.fillna(value='', inplace=True)
    
        if os.path.exists(config.jobs_data_path):
            df_existed = pd.read_csv(config.jobs_data_path, encoding='utf-8', index_col=0)
            df = df.append(df_existed, ignore_index=True)
    
        df.sort_values(by=['地区'], inplace=True)
        df.reset_index(drop=True, inplace=True)
        df.to_csv(config.jobs_data_path, mode='w', encoding='utf-8')
    
        # drop the requirements column so the introduce file is easier to browse
        df_no_require = df.drop(['要求'], axis=1)
        df_no_require['薪酬'] = df_no_require['薪酬'].apply(_make_introduce_beautiful, min_length=12)
        df_no_require['地区'] = df_no_require['地区'].apply(_make_introduce_beautiful, min_length=12)
        df_no_require['详细地址'] = df_no_require['详细地址'].apply(_make_introduce_beautiful, min_length=30)
        df_no_require['链接'] = df_no_require['链接'].apply(_make_introduce_beautiful, min_length=60)
        df_no_require.to_csv(config.jobs_data_introduce_path, mode='w', encoding='utf-8')
    
    
    # make the introduce file easier to read: left-align each field and pad with spaces
    def _make_introduce_beautiful(txt, min_length):
        try:
            return txt.ljust(min_length)
        except Exception as e:
            print(e)
            return ''.ljust(min_length)
    
    
    # fetch the job introductions (the search-result listings)
    def fetch_job_introduce(web_type, keywords, page_count, area):
        url = ""
        decode_type = ""
        # each site uses a different URL format
        area_need = ""
        if web_type == WEBTYPE._51job:
            url = "http://search.51job.com/list/{}0000,000000" \
                  ",0000,00,9,99,{},2,{}.html?lang=c&stype=1&postchannel=0000&workyear=99&" \
                  "cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0" \
                  "&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
            decode_type = 'gbk'
            file_path = os.path.join(config.job_data_dir, '51job_area_code.txt')
            with open(file_path, mode='r', encoding='utf-8') as f:
                result = f.read()
                dic = eval(result)
                area_need = '%02d' % dic[area]
        elif web_type == WEBTYPE.zhilian:
            url = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl={}&kw={}&isadv=0&sg=7e9e61449fd14593a5604fff81aec46a&p={}"
            decode_type = "utf-8"
            area_need = area  # Zhilian takes the city name directly in the jl parameter
        # page numbers on both sites start at 1, hence the +1
        urls = [url.format(area_need, ' '.join(keywords), p+1) for p in range(0, page_count)]
        df = fetch_companies(urls, decode_type, web_type)
        return df
    
    
    
    
    def fetch_companies(urls, decode_type, web_type):
        df = pd.DataFrame(columns=['薪酬', '地区', '详细地址', '链接', '工作', '公司', '来源', '要求'])
    
        # walk every listing page that was requested
        for url in urls:
            r = requests.get(url, headers=config.http_headers).content.decode(decode_type)
            if web_type == WEBTYPE._51job:
                bs = BeautifulSoup(r, 'lxml').find("div", class_="dw_table").find_all("div", class_="el")
                for b in bs:
                    try:
                        href, job_name = b.find('a')['href'], b.find('a')['title']
                        company_name = b.find('span', class_='t2').text
                        locate = b.find('span', class_='t3').text
                        salary = b.find('span', class_='t4').text
    
                        dic = {'工作': job_name,
                               '地区': locate,
                               '详细地址': '',
                               '薪酬': salary,
                               '公司': company_name,
                               '链接': href,
                               '来源': web_type.value,
                               '要求': ''}
                        index = df.shape[0]
                        df.loc[index] = dic
                        # print(df)
                    except Exception as e:
                        print(e, "failed to parse a listing")
            elif web_type == WEBTYPE.zhilian:
                bs = BeautifulSoup(r, 'lxml').find(id="newlist_list_content_table").find_all("table", class_="newlist")
                for b in bs:
                    try:
                        # the first table is a header row without job data; the except below skips it
                        href = b.find("td", class_="zwmc").find("div").find("a")["href"]
                        job_name = b.find("td", class_="zwmc").find("div").find("a").text
                        company_name = b.find("td", class_='gsmc').find("a").text
                        locate = b.find("td", class_="gzdd").text
                        salary = b.find("td", class_="zwyx").text
                        dic = {'工作': job_name,
                               '地区': locate,
                               '详细地址': '',
                               '薪酬': salary,
                               '公司': company_name,
                               '链接': href,
                               '来源': web_type.value,
                               '要求': ''}
                        index = df.shape[0]
                        df.loc[index] = dic
                        # print(df)
                    except Exception as e:
                        print(e, "failed to parse a listing")
        return df
    
    # fetch the job details (address and requirements) for every row
    def fetch_job_detail(df):
    
        for i in range(0, df.shape[0]):
            introduce = df.loc[i]
            location, require = _fetch_location_and_require_from_detail(introduce)
            df.loc[i, '详细地址'] = location
            df.loc[i, '要求'] = require
    
        return df
    
    # fetch the detailed address and the requirements from a detail page
    def _fetch_location_and_require_from_detail(introduce):
        global SPIDER_REQUIRE_COUNT
        web_type = introduce['来源']
        href = introduce['链接']
        company_name = introduce['公司']
        if web_type == WEBTYPE._51job.value:
            SPIDER_REQUIRE_COUNT += 1
            print("scraping requirements for company #{} ({})\n{}".format(SPIDER_REQUIRE_COUNT, company_name, href))
            try:
                r = requests.get(href, headers=config.http_headers).content.decode("gbk")
                location_detail = _fetch_location_from_detail(r, introduce)
                bs = BeautifulSoup(r, 'lxml').find('div', class_="bmsg job_msg inbox")
                useless_bs1 = bs.find('p', class_='fp')
                useless_bs2 = bs.find('div', class_='share')
                require = bs.text.replace(useless_bs1.text, '').replace(useless_bs2.text, '')\
                    .replace("\t", "").replace("\n", "").replace("\r", "")
                return location_detail, require
            except Exception as e:
                print(e, "failed to parse the job requirements")
                return "", ""
    
        elif web_type == WEBTYPE.zhilian.value:
            SPIDER_REQUIRE_COUNT += 1
            print("scraping requirements for company #{} ({})\n{}".format(SPIDER_REQUIRE_COUNT, company_name, href))
    
            try:
                r = requests.get(href, headers=config.http_headers).content.decode("utf-8")
                location_detail = _fetch_location_from_detail(r, introduce)
                bs = BeautifulSoup(r, 'lxml').find('div', class_="tab-inner-cont")
                useless_bs1 = bs.find('b')
                useless_bs2 = bs.find('h2')
                useless_bs3 = bs.find(id='applyVacButton1')
                require = bs.text.replace(useless_bs1.text, '').replace(useless_bs2.text, '').replace(useless_bs3.text, '')\
                    .replace("\t", "").replace("\n", "").replace("\r", "")
                return location_detail, require
            except Exception as e:
                print(e, "failed to parse the job requirements")
                return "", ""
    
        # unknown source: return empty strings so the caller can still unpack the result
        return "", ""
    
    # fetch the detailed work address
    def _fetch_location_from_detail(h5_content, introduce):
    
        """Extract the company's detailed work address from a detail page."""
        web_type = introduce['来源']
        if web_type == WEBTYPE._51job.value:
            bs = BeautifulSoup(h5_content, 'lxml').find_all('p', class_="fp")
            for b in bs:
                try:
                    location = b.text
                    if "上班地址" in location:  # the "work address" label on the 51job detail page
                        location = location.replace("上班地址:", "").replace("\t", "").replace("\n", "")
                        return location
                except Exception as e:
                    print(e, 'failed to parse the work address')
                    return introduce['地区']
            # no address found on the page: fall back to the listing's area
            return introduce['地区']
        elif web_type == WEBTYPE.zhilian.value:
    
            bs = BeautifulSoup(h5_content, 'lxml').find('div', class_="tab-inner-cont")
    
            try:
                location = bs.find("h2").text
                # strip whitespace and the "查看职位地图" (view on map) link text
                location = location.replace("\t", "").replace("\n", "").replace("\r", "").replace(" ", "").replace("查看职位地图", "")
                return location
    
            except Exception as e:
                print(e, 'failed to parse the work address')
                return introduce['地区']
    
    

    jobs_data_analyse.py

    import os
    from program import config
    import pandas as pd
    import math
    import jieba
    import jieba.posseg
    import csv
    import matplotlib.pyplot as plt
    
    from program.job_spider import *
    import numpy as np
    from PIL import Image
    from collections import Counter
    from wordcloud import WordCloud
    pd.set_option('expand_frame_repr', False)  # do not wrap wide DataFrames when printing
    
    
    def jobs_data_analyse():
        df = pd.read_csv(config.jobs_data_path, encoding='utf-8')
        df['薪酬'] = df['薪酬'].apply(unify_salary_form)
        salary_analyse(df)
        require_analyse(df)
    
    # normalize the salary format
    def unify_salary_form(salary):
        """Convert strings such as '1-1.5万/月' or '12-24万/年' into monthly RMB.
        Returns (min, max, 'min-max') as strings, or None if the salary cannot be parsed."""
        if type(salary) == float and math.isnan(salary):
            return None
        month = 1
        if salary.endswith('/年'):    # per year
            month = 12
            salary = salary.replace('/年', '')
        elif salary.endswith('/月'):  # per month
            month = 1
            salary = salary.replace('/月', '')
    
        multiple = 1
        if salary.endswith('千'):     # thousands
            multiple = 1000
            salary = salary.replace('千', '')
        elif salary.endswith('万'):   # ten-thousands
            multiple = 10000
            salary = salary.replace('万', '')
    
        # print(salary)
        try:
            low = int(float(salary.split('-')[0]) * multiple / month)
            high = int(float(salary.split('-')[1]) * multiple / month)
            return str(low), str(high), str(low) + '-' + str(high)
        except Exception as e:
            print(e)
            return None
    
    # analyse salaries
    def salary_analyse(df):
        df['low_薪酬'] = df['薪酬'].apply(lambda x: None if x is None else int(x[0]))
        df['high_薪酬'] = df['薪酬'].apply(lambda x: None if x is None else int(x[1]))
    
        print('average monthly salary for these listings: ', df.dropna(subset=['薪酬'])[['low_薪酬', 'high_薪酬']].mean().mean())
        index_max_salary = df['high_薪酬'].idxmax()
        index_min_salary = df['low_薪酬'].idxmin()
        print('highest-paying company: %s, salary: %d, link:\n%s' % (df.loc[index_max_salary, '公司'], df['high_薪酬'].max(), df.loc[index_max_salary, '链接']))
        print('lowest-paying company: %s, salary: %d, link:\n%s' % (df.loc[index_min_salary, '公司'], df['low_薪酬'].min(), df.loc[index_min_salary, '链接']))
    
        for area, group in df.dropna(subset=['薪酬']).groupby('地区'):
            average_salary = group[['low_薪酬', 'high_薪酬']].mean().mean()
            print('average monthly salary in area %s: %d' % (area, average_salary))
    
    
    # analyse the job requirements
    def require_analyse(df):
        all_require = ''
        for require in df['要求']:
            if type(require) == float and math.isnan(require):
                continue
            all_require += require
        _require_word_freq(all_require)
        _require_word_cloud()
    
    def _require_word_freq(all_require):
        # load the user dictionary
        jieba.load_userdict(os.path.join(config.jieba_dir, "user_dict.txt"))
        seg_lst = jieba.posseg.cut(all_require)
        counter = Counter()
        # load the stop words
        stopwords_path = os.path.join(config.jieba_dir, "stopwords.txt")
        stopwords = [line.strip() for line in open(stopwords_path, "r", encoding="utf-8").readlines()]
    
        for seg in seg_lst:
            if seg.word in stopwords:
                continue
            # skip punctuation and other symbols (POS flag 'x')
            elif seg.flag == 'x':
                continue
            counter[seg.word] += 1
        counter_sorted = sorted(counter.items(), key=lambda value: value[1], reverse=True)
    
        # newline='' keeps csv.writer from inserting blank lines on Windows
        with open(config.jobs_require_word_freq_path, "w+", encoding="utf-8", newline='') as f:
            f_csv = csv.writer(f)
            f_csv.writerows(counter_sorted)
            print('word-frequency file saved to:', config.jobs_require_word_freq_path)
    
    def _require_word_cloud():
        word_freq_dic = dict()
        with open(config.jobs_require_word_freq_path, mode='r', encoding='utf-8') as f:
            f_csv = csv.reader(f)
            # print(f_csv)
            for row in f_csv:
                word_freq_dic[row[0]] = int(row[1])
            # print(word_freq_dic)
    
        # to use an image as the word-cloud mask, uncomment the lines below
        # (the Alice image comes from http://blog.csdn.net/fontthrone/article/details/72775865)
        # alice_coloring = np.array(Image.open(config.alice_png))
        # wc = WordCloud(font_path=config.wc_font_path, background_color='white', mask=alice_coloring,
        #                max_words=150, max_font_size=100, min_font_size=20)\
        #     .generate_from_frequencies(word_freq_dic)
    
        wc = WordCloud(font_path=config.wc_font_path,
                       max_words=150, height=800, width=1400).generate_from_frequencies(word_freq_dic)
        plt.imshow(wc, interpolation="bilinear")
        plt.axis('off')
        plt.show()
        wc.to_file(config.wordcloud_png_path)
    
    
    def start():
        check_area_name()
        fetch_data(web_type=WEBTYPE.all, keywords=['iOS'], area='深圳', page_count=5)
        jobs_data_analyse()
    
    start()
    

    Usage

    Open jobs_data_analyse.py in the project and run it; tweak the parameters to your own needs, for example as sketched below.
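
    For instance, to scrape ten pages of Java listings from 51job only (keeping the author's default city of 深圳), you might edit start() at the bottom of jobs_data_analyse.py like this; a sketch, with the keyword and page count as purely illustrative values:

    def start():
        check_area_name()
        # example tweak: Java listings in Shenzhen, 51job only, 10 listing pages
        fetch_data(web_type=WEBTYPE._51job, keywords=['Java'], area='深圳', page_count=10)
        jobs_data_analyse()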

    Once it runs, the script starts collecting data.

    (Screenshot: the scraper at work)

    When collection finishes, the script runs a brief analysis of the salary data it gathered.

    (Screenshot: salary analysis output)
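
    As a quick sanity check of the normalization step, unify_salary_form converts typical salary strings into monthly RMB; a sketch run inside jobs_data_analyse.py, with the input strings chosen as examples:

    print(unify_salary_form('1-1.5万/月'))   # ('10000', '15000', '10000-15000')
    print(unify_salary_form('12-24万/年'))   # yearly figures are divided by 12: ('10000', '20000', '10000-20000')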

    Finally, a word cloud is generated from the job requirements.

    (Word cloud: term frequencies for iOS jobs in Shenzhen)

    To make it easy to browse jobs by area, the job introductions are saved to jobs_data_introduce.csv; search the file for the area you care about, or filter it with pandas as sketched below.

    (Screenshot: the job-introduction CSV)
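
    A quick way to filter that CSV by area with pandas; a sketch in which the column names match the ones written by job_spider.py and '南山' (a Shenzhen district) is just an example search term:

    import pandas as pd

    # load the introduce file written by _fetch_data() and keep the rows whose
    # area or detailed address mentions the neighbourhood you care about
    df = pd.read_csv('jobs_data_introduce.csv', encoding='utf-8', index_col=0)
    nearby = df[df['地区'].str.contains('南山', na=False) |
                df['详细地址'].str.contains('南山', na=False)]
    print(nearby[['工作', '公司', '薪酬', '详细地址']])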

    This demo was written to suit the author's own needs and is provided for reference only.

    Demo address
