A Python scraper and preliminary analysis of 51job and Zhilian job listings

Author: luomagaoshou | Published 2018-02-02 14:31

    Job hunting, oh job hunting

    With the end of the year approaching, many of you are probably mulling over your next move. Perhaps, like the author, you have found yourself frantically scrolling through listings trying to find a company close to home, only to discover that many listing summaries do not show an address, so you have to click into each posting to see the company's exact location.
    So I wrote a scraper to make the job hunt easier.

    Project overview

    The main code consists of job_spider.py (the scraper) and jobs_data_analyse.py (the job-data analysis).
    The scraper first fetches the job listings to get brief introductions, then fetches each posting's details; once the download finishes, the analysis runs. Both scripts import a shared program/config module for paths and request headers, which is sketched just below.
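
    The program.config module is not included in the post. The following is only a minimal sketch of what it presumably defines; every path, file name, and header value here is an assumption inferred from how config is used in the two scripts, so adjust it to your own layout.

    import os

    # Hypothetical program/config.py -- all values below are assumptions inferred
    # from the usage in job_spider.py and jobs_data_analyse.py.
    http_headers = {'User-Agent': 'Mozilla/5.0'}                       # request headers for both sites

    base_dir = os.path.dirname(os.path.abspath(__file__))
    job_data_dir = os.path.join(base_dir, 'data')                      # holds 51job_area_code.txt and the CSVs
    jieba_dir = os.path.join(base_dir, 'jieba')                        # user_dict.txt and stopwords.txt
    os.makedirs(job_data_dir, exist_ok=True)

    jobs_data_path = os.path.join(job_data_dir, 'jobs_data.csv')
    jobs_data_introduce_path = os.path.join(job_data_dir, 'jobs_data_introduce.csv')
    jobs_require_word_freq_path = os.path.join(job_data_dir, 'jobs_require_word_freq.csv')
    wordcloud_png_path = os.path.join(job_data_dir, 'wordcloud.png')

    wc_font_path = 'C:/Windows/Fonts/msyh.ttc'                         # any font that can render Chinese
    # alice_png = os.path.join(base_dir, 'alice.png')                  # only needed for the masked word cloud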

    The full code follows.

    job_spider.py

    from bs4 import BeautifulSoup
    import requests
    import os
    from enum import Enum
    from program import config
    import pandas as pd
    
    
    
    pd.set_option('expand_frame_repr', False)  # do not wrap wide DataFrames when printing
    
    class WEBTYPE(Enum):
        _51job = '_51job'  # 51job
        zhilian = 'zhilian'  # Zhilian
        all = 3  # all sites
    
    # global variable: counts how many detail pages have been scraped
    SPIDER_REQUIRE_COUNT = 0
    
    # fetch the mapping from 51job area codes to area names
    def get_51job_area_code():
        dic = {}
        for i in range(1, 37):
            url = 'http://search.51job.com/list/{}0000,000000,0000,00,9,99,ios,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='.format('%02d' % i)
            r = requests.get(url, headers=config.http_headers).content.decode('gbk')
            area_name = BeautifulSoup(r, 'lxml').find(id="work_position_input")['value']
            print(area_name, i)
            dic[area_name] = i
        file_path = os.path.join(config.job_data_dir, '51job_area_code.txt')
        with open(file_path, "w+", encoding="utf-8") as f:  # the with block closes the file for us
            f.write(str(dic))
        print('51job area-code file saved to', file_path)
    
    
    # check whether the 51job area-code file exists locally; fetch it automatically if missing
    def check_area_name():
        file_path = os.path.join(config.job_data_dir, '51job_area_code.txt')
        if os.path.exists(file_path):
            with open(file_path, "r", encoding="utf-8") as f:
                result = f.read()
                dic = eval(result)  # sanity check: the file should contain a valid dict literal
        else:
            print('51job area-code file missing, fetching it now')
            get_51job_area_code()
            check_area_name()
    
    
    
    
    
    def fetch_data(web_type=WEBTYPE.all, keywords=['iOS'], page_count=5, area='深圳'):
        if os.path.exists(config.jobs_data_path):
            os.remove(config.jobs_data_path)
            print('removed previously scraped data')
    
        if web_type == WEBTYPE._51job:
            _fetch_data(web_type, keywords, page_count, area)
        elif web_type == WEBTYPE.zhilian:
            _fetch_data(web_type, keywords, page_count, area)
        elif web_type == WEBTYPE.all:
            for site in list(WEBTYPE)[0: -1]:  # every member except WEBTYPE.all
                _fetch_data(site, keywords, page_count, area)
    
    def _fetch_data(web_type, keywords, page_count, area):
        df = fetch_job_introduce(web_type, keywords, page_count, area)
        df = fetch_job_detail(df)
        df.fillna(value='', inplace=True)
    
        if os.path.exists(config.jobs_data_path):
            df_existed = pd.read_csv(config.jobs_data_path, encoding='utf-8', index_col=0)
            df = df.append(df_existed, ignore_index=True)
    
        df.sort_values(by=['地区'], inplace=True)
        df.reset_index(drop=True, inplace=True)
        df.to_csv(config.jobs_data_path, mode='w', encoding='utf-8')
    
        # drop the requirements column so the introduce file is easier to browse
        df_no_require = df.drop(['要求'], axis=1)
        df_no_require['薪酬'] = df_no_require['薪酬'].apply(_make_introduce_beautiful, min_length=12)
        df_no_require['地区'] = df_no_require['地区'].apply(_make_introduce_beautiful, min_length=12)
        df_no_require['详细地址'] = df_no_require['详细地址'].apply(_make_introduce_beautiful, min_length=30)
        df_no_require['链接'] = df_no_require['链接'].apply(_make_introduce_beautiful, min_length=60)
        df_no_require.to_csv(config.jobs_data_introduce_path, mode='w', encoding='utf-8')
    
    
    # make the introduce file easier to read: left-align each field and pad with spaces
    def _make_introduce_beautiful(txt, min_length):
        try:
            return txt.ljust(min_length)
        except Exception as e:
            print(e)
            return ''.ljust(min_length)
    
    
    # fetch the job introductions (the search-result listings)
    def fetch_job_introduce(web_type, keywords, page_count, area):
        url = ""
        decode_type = ""
        # each site uses a different URL format
        area_need = ""
        if web_type == WEBTYPE._51job:
            url = "http://search.51job.com/list/{}0000,000000" \
                  ",0000,00,9,99,{},2,{}.html?lang=c&stype=1&postchannel=0000&workyear=99&" \
                  "cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0" \
                  "&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
            decode_type = 'gbk'
            file_path = os.path.join(config.job_data_dir, '51job_area_code.txt')
            with open(file_path, mode='r', encoding='utf-8') as f:
                result = f.read()
                dic = eval(result)
                area_need = '%02d' % dic[area]
        elif web_type == WEBTYPE.zhilian:
            url = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl={}&kw={}&isadv=0&sg=7e9e61449fd14593a5604fff81aec46a&p={}"
            decode_type = "utf-8"
            area_need = area  # Zhilian takes the city name directly in the jl parameter
        # page numbers on both sites start at 1, hence the +1
        urls = [url.format(area_need, ' '.join(keywords), p+1) for p in range(0, page_count)]
        df = fetch_companies(urls, decode_type, web_type)
        return df
    
    
    
    
    def fetch_companies(urls, decode_type, web_type):
        df = pd.DataFrame(columns=['薪酬', '地区', '详细地址', '链接', '工作', '公司', '来源', '要求'])
    
        # walk every listing page that was requested
        for url in urls:
            r = requests.get(url, headers=config.http_headers).content.decode(decode_type)
            if web_type == WEBTYPE._51job:
                bs = BeautifulSoup(r, 'lxml').find("div", class_="dw_table").find_all("div", class_="el")
                for b in bs:
                    try:
                        href, job_name = b.find('a')['href'], b.find('a')['title']
                        company_name = b.find('span', class_='t2').text
                        locate = b.find('span', class_='t3').text
                        salary = b.find('span', class_='t4').text
    
                        dic = {'工作': job_name,
                               '地区': locate,
                               '详细地址': '',
                               '薪酬': salary,
                               '公司': company_name,
                               '链接': href,
                               '来源': web_type.value,
                               '要求': ''}
                        index = df.shape[0]
                        df.loc[index] = dic
                        # print(df)
                    except Exception as e:
                        print(e, "failed to parse a listing")
            elif web_type == WEBTYPE.zhilian:
                bs = BeautifulSoup(r, 'lxml').find(id="newlist_list_content_table").find_all("table", class_="newlist")
                for b in bs:
                    try:
                        # the first table is a header row without job data; the except below skips it
                        href = b.find("td", class_="zwmc").find("div").find("a")["href"]
                        job_name = b.find("td", class_="zwmc").find("div").find("a").text
                        company_name = b.find("td", class_='gsmc').find("a").text
                        locate = b.find("td", class_="gzdd").text
                        salary = b.find("td", class_="zwyx").text
                        dic = {'工作': job_name,
                               '地区': locate,
                               '详细地址': '',
                               '薪酬': salary,
                               '公司': company_name,
                               '链接': href,
                               '来源': web_type.value,
                               '要求': ''}
                        index = df.shape[0]
                        df.loc[index] = dic
                        # print(df)
                    except Exception as e:
                        print(e, "failed to parse a listing")
        return df
    
    # fetch the job details (address and requirements) for every row
    def fetch_job_detail(df):
    
        for i in range(0, df.shape[0]):
            introduce = df.loc[i]
            location, require = _fetch_location_and_require_from_detail(introduce)
            df.loc[i, '详细地址'] = location
            df.loc[i, '要求'] = require
    
        return df
    
    # fetch the detailed address and the requirements from a detail page
    def _fetch_location_and_require_from_detail(introduce):
        global SPIDER_REQUIRE_COUNT
        web_type = introduce['来源']
        href = introduce['链接']
        company_name = introduce['公司']
        if web_type == WEBTYPE._51job.value:
            SPIDER_REQUIRE_COUNT += 1
            print("scraping requirements for company #{} ({})\n{}".format(SPIDER_REQUIRE_COUNT, company_name, href))
            try:
                r = requests.get(href, headers=config.http_headers).content.decode("gbk")
                location_detail = _fetch_location_from_detail(r, introduce)
                bs = BeautifulSoup(r, 'lxml').find('div', class_="bmsg job_msg inbox")
                useless_bs1 = bs.find('p', class_='fp')
                useless_bs2 = bs.find('div', class_='share')
                require = bs.text.replace(useless_bs1.text, '').replace(useless_bs2.text, '')\
                    .replace("\t", "").replace("\n", "").replace("\r", "")
                return location_detail, require
            except Exception as e:
                print(e, "failed to parse the job requirements")
                return "", ""
    
        elif web_type == WEBTYPE.zhilian.value:
            SPIDER_REQUIRE_COUNT += 1
            print("scraping requirements for company #{} ({})\n{}".format(SPIDER_REQUIRE_COUNT, company_name, href))
    
            try:
                r = requests.get(href, headers=config.http_headers).content.decode("utf-8")
                location_detail = _fetch_location_from_detail(r, introduce)
                bs = BeautifulSoup(r, 'lxml').find('div', class_="tab-inner-cont")
                useless_bs1 = bs.find('b')
                useless_bs2 = bs.find('h2')
                useless_bs3 = bs.find(id='applyVacButton1')
                require = bs.text.replace(useless_bs1.text, '').replace(useless_bs2.text, '').replace(useless_bs3.text, '')\
                    .replace("\t", "").replace("\n", "").replace("\r", "")
                return location_detail, require
            except Exception as e:
                print(e, "failed to parse the job requirements")
                return "", ""
    
        # unknown source: return empty strings so the caller can still unpack the result
        return "", ""
    
    # fetch the detailed work address
    def _fetch_location_from_detail(h5_content, introduce):
    
        """Extract the company's detailed work address from a detail page."""
        web_type = introduce['来源']
        if web_type == WEBTYPE._51job.value:
            bs = BeautifulSoup(h5_content, 'lxml').find_all('p', class_="fp")
            for b in bs:
                try:
                    location = b.text
                    if "上班地址" in location:  # the "work address" label on the 51job detail page
                        location = location.replace("上班地址:", "").replace("\t", "").replace("\n", "")
                        return location
                except Exception as e:
                    print(e, 'failed to parse the work address')
                    return introduce['地区']
            # no address found on the page: fall back to the listing's area
            return introduce['地区']
        elif web_type == WEBTYPE.zhilian.value:
    
            bs = BeautifulSoup(h5_content, 'lxml').find('div', class_="tab-inner-cont")
    
            try:
                location = bs.find("h2").text
                # strip whitespace and the "查看职位地图" (view on map) link text
                location = location.replace("\t", "").replace("\n", "").replace("\r", "").replace(" ", "").replace("查看职位地图", "")
                return location
    
            except Exception as e:
                print(e, 'failed to parse the work address')
                return introduce['地区']
    
    

    jobs_data_analyse.py

    import os
    from program import config
    import pandas as pd
    import math
    import jieba
    import jieba.posseg
    import csv
    import matplotlib.pyplot as plt
    
    from program.job_spider import *
    import numpy as np
    from PIL import Image
    from collections import Counter
    from wordcloud import WordCloud
    pd.set_option('expand_frame_repr', False)  # do not wrap wide DataFrames when printing
    
    
    def jobs_data_analyse():
        df = pd.read_csv(config.jobs_data_path, encoding='utf-8')
        df['薪酬'] = df['薪酬'].apply(unify_salary_form)
        salary_analyse(df)
        require_analyse(df)
    
    # normalize the salary format
    def unify_salary_form(salary):
        """Convert strings such as '1-1.5万/月' or '12-24万/年' into monthly RMB.
        Returns (min, max, 'min-max') as strings, or None if the salary cannot be parsed."""
        if type(salary) == float and math.isnan(salary):
            return None
        month = 1
        if salary.endswith('/年'):    # per year
            month = 12
            salary = salary.replace('/年', '')
        elif salary.endswith('/月'):  # per month
            month = 1
            salary = salary.replace('/月', '')
    
        multiple = 1
        if salary.endswith('千'):     # thousands
            multiple = 1000
            salary = salary.replace('千', '')
        elif salary.endswith('万'):   # ten-thousands
            multiple = 10000
            salary = salary.replace('万', '')
    
        # print(salary)
        try:
            low = int(float(salary.split('-')[0]) * multiple / month)
            high = int(float(salary.split('-')[1]) * multiple / month)
            return str(low), str(high), str(low) + '-' + str(high)
        except Exception as e:
            print(e)
            return None
    
    # analyse salaries
    def salary_analyse(df):
        df['low_薪酬'] = df['薪酬'].apply(lambda x: None if x is None else int(x[0]))
        df['high_薪酬'] = df['薪酬'].apply(lambda x: None if x is None else int(x[1]))
    
        print('average monthly salary for these listings: ', df.dropna(subset=['薪酬'])[['low_薪酬', 'high_薪酬']].mean().mean())
        index_max_salary = df['high_薪酬'].idxmax()
        index_min_salary = df['low_薪酬'].idxmin()
        print('highest-paying company: %s, salary: %d, link:\n%s' % (df.loc[index_max_salary, '公司'], df['high_薪酬'].max(), df.loc[index_max_salary, '链接']))
        print('lowest-paying company: %s, salary: %d, link:\n%s' % (df.loc[index_min_salary, '公司'], df['low_薪酬'].min(), df.loc[index_min_salary, '链接']))
    
        for area, group in df.dropna(subset=['薪酬']).groupby('地区'):
            average_salary = group[['low_薪酬', 'high_薪酬']].mean().mean()
            print('average monthly salary in area %s: %d' % (area, average_salary))
    
    
    # analyse the job requirements
    def require_analyse(df):
        all_require = ''
        for require in df['要求']:
            if type(require) == float and math.isnan(require):
                continue
            all_require += require
        _require_word_freq(all_require)
        _require_word_cloud()
    
    def _require_word_freq(all_require):
        # load the user dictionary
        jieba.load_userdict(os.path.join(config.jieba_dir, "user_dict.txt"))
        seg_lst = jieba.posseg.cut(all_require)
        counter = Counter()
        # load the stop words
        stopwords_path = os.path.join(config.jieba_dir, "stopwords.txt")
        stopwords = [line.strip() for line in open(stopwords_path, "r", encoding="utf-8").readlines()]
    
        for seg in seg_lst:
            if seg.word in stopwords:
                continue
            # skip punctuation and other symbols (POS flag 'x')
            elif seg.flag == 'x':
                continue
            counter[seg.word] += 1
        counter_sorted = sorted(counter.items(), key=lambda value: value[1], reverse=True)
    
        # newline='' keeps csv.writer from inserting blank lines on Windows
        with open(config.jobs_require_word_freq_path, "w+", encoding="utf-8", newline='') as f:
            f_csv = csv.writer(f)
            f_csv.writerows(counter_sorted)
            print('word-frequency file saved to:', config.jobs_require_word_freq_path)
    
    def _require_word_cloud():
        word_freq_dic = dict()
        with open(config.jobs_require_word_freq_path, mode='r', encoding='utf-8') as f:
            f_csv = csv.reader(f)
            # print(f_csv)
            for row in f_csv:
                word_freq_dic[row[0]] = int(row[1])
            # print(word_freq_dic)
    
        # to use an image as the word-cloud mask, uncomment the lines below
        # (the Alice image comes from http://blog.csdn.net/fontthrone/article/details/72775865)
        # alice_coloring = np.array(Image.open(config.alice_png))
        # wc = WordCloud(font_path=config.wc_font_path, background_color='white', mask=alice_coloring,
        #                max_words=150, max_font_size=100, min_font_size=20)\
        #     .generate_from_frequencies(word_freq_dic)
    
        wc = WordCloud(font_path=config.wc_font_path,
                       max_words=150, height=800, width=1400).generate_from_frequencies(word_freq_dic)
        plt.imshow(wc, interpolation="bilinear")
        plt.axis('off')
        plt.show()
        wc.to_file(config.wordcloud_png_path)
    
    
    def start():
        check_area_name()
        fetch_data(web_type=WEBTYPE.all, keywords=['iOS'], area='深圳', page_count=5)
        jobs_data_analyse()
    
    start()
    

    Usage

    Open jobs_data_analyse.py in the project and run it; tweak the parameters to your own needs, for example as sketched below.
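
    For instance, to scrape ten pages of Java listings from 51job only (keeping the author's default city of 深圳), you might edit start() at the bottom of jobs_data_analyse.py like this; a sketch, with the keyword and page count as purely illustrative values:

    def start():
        check_area_name()
        # example tweak: Java listings in Shenzhen, 51job only, 10 listing pages
        fetch_data(web_type=WEBTYPE._51job, keywords=['Java'], area='深圳', page_count=10)
        jobs_data_analyse()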

    Once it runs, the script starts collecting data.

    (Screenshot: the scraper at work)

    When collection finishes, the script runs a brief analysis of the salary data it gathered.

    (Screenshot: salary analysis output)
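
    As a quick sanity check of the normalization step, unify_salary_form converts typical salary strings into monthly RMB; a sketch run inside jobs_data_analyse.py, with the input strings chosen as examples:

    print(unify_salary_form('1-1.5万/月'))   # ('10000', '15000', '10000-15000')
    print(unify_salary_form('12-24万/年'))   # yearly figures are divided by 12: ('10000', '20000', '10000-20000')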

    Finally, a word cloud is generated from the job requirements.

    (Word cloud: term frequencies for iOS jobs in Shenzhen)

    To make it easy to browse jobs by area, the job introductions are saved to jobs_data_introduce.csv; search the file for the area you care about, or filter it with pandas as sketched below.

    (Screenshot: the job-introduction CSV)
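
    A quick way to filter that CSV by area with pandas; a sketch in which the column names match the ones written by job_spider.py and '南山' (a Shenzhen district) is just an example search term:

    import pandas as pd

    # load the introduce file written by _fetch_data() and keep the rows whose
    # area or detailed address mentions the neighbourhood you care about
    df = pd.read_csv('jobs_data_introduce.csv', encoding='utf-8', index_col=0)
    nearby = df[df['地区'].str.contains('南山', na=False) |
                df['详细地址'].str.contains('南山', na=False)]
    print(nearby[['工作', '公司', '薪酬', '详细地址']])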

    This demo was written to suit the author's own needs and is provided for reference only.

    Demo address
