First, a note on the data source: everything in this article was scraped from BOSS直聘 postings for the "数据分析师" (data analyst) position. The analysis covers the overall salary picture, salary distributions across cities and education levels, salary by work experience in Beijing and Shanghai, the demand for data-analysis roles in Beijing, Shanghai, Guangzhou, and Shenzhen, and a word cloud of the industries the hiring companies belong to.
1. Data collection
2. Data cleaning and processing
3. Data analysis
Data collection
import requests
from fake_useragent import UserAgent
from lxml import etree
import pymysql
import pymongo
import json
import time
from requests import RequestException
mongo_url = 'localhost'
mongo_db = 'zhaopin'
ua = UserAgent()
class Boss(object):
    def __init__(self):
        self.url = 'https://www.zhipin.com/{}/?query=数据分析&page={}'
        self.headers = {'user-agent': ua.random,
                        'referer': 'https://www.zhipin.com/c101020100/?query=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90&page=1',
                        'cookie': ''}
        self.client = pymongo.MongoClient(mongo_url)
        self.db = self.client[mongo_db]
        self.cityList = {'广州': 'c101280100', '北京': 'c101010100', '上海': 'c101020100', '深圳': 'c101280600',
                         '杭州': 'c101210100', '天津': 'c101030100', '西安': 'c101110100', '苏州': 'c101190400',
                         '武汉': 'c101200100', '厦门': 'c101230200', '长沙': 'c101250100', '成都': 'c101270100',
                         '郑州': 'c101180100', '重庆': 'c101040100'}

    # def get_proxy(self):
    #     PROXY_POOL_URL = 'http://localhost:5555/random'
    #     try:
    #         response = requests.get(PROXY_POOL_URL)
    #         if response.status_code == 200:
    #             return response.text
    #     except ConnectionError:
    #         return None
    def get_one_page(self, url):
        try:
            # proxy = self.get_proxy()
            # proxies = {'http': proxy}
            # print(proxies)
            response = requests.get(url, headers=self.headers)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            print("request error")
    def parse_one_page(self, html):
        html = etree.HTML(html)
        content = html.xpath("//li/div[@class='job-primary']")
        for con in content:
            pos_name = con.xpath(".//div[@class='job-title']/text()")[0]
            comp_name = con.xpath(".//div[@class='info-company']/div/h3/a/text()")[0]
            salary = con.xpath(".//h3/a/span/text()")[0]
            scale = con.xpath("./div[@class='info-company']//p/text()[last()]")[0]
            education = con.xpath("./div/p/text()[3]")[0]
            industry = con.xpath(".//div[@class='company-text']/p//text()")[0]
            workyear = con.xpath("./div[@class='info-primary']/p/text()")[1]
            location = con.xpath("./div[@class='info-primary']/p/text()")[0]
            item = {'pos_name': pos_name,
                    'comp_name': comp_name,
                    'salary': salary,
                    'scale': scale,
                    'education': education,
                    'industry': industry,
                    'workyear': workyear,
                    'location': location}
            yield item
    def write_to_file(self, item):
        with open('boss.txt', 'a', encoding='utf-8') as f:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')

    def write_to_csv(self, item):
        with open('爬虫BOSS直聘.txt', 'a', encoding='utf-8') as file:
            line = str(item['pos_name']) + ',' + str(item['comp_name']) + ',' + str(item['salary']) + ',' + \
                   str(item['scale']) + ',' + str(item['education']) + ',' + str(item['industry']) + ',' + \
                   str(item['workyear']) + ',' + str(item['location']) + '\n'
            file.write(line)

    def save_to_mongo(self, item):
        result = self.db['boss'].insert_one(item)
        if result.inserted_id:
            print("save successfully")

    def save_to_mysql(self, item):
        conn = pymysql.connect(host='localhost', user='root', password='', db='test7', port=3306,
                               charset='utf8')
        cur = conn.cursor()
        insert_data = "INSERT INTO boss(pos_name, comp_name, salary, scale, education, industry, workyear, location) VALUES(%s, %s, %s, %s, %s, %s, %s, %s)"
        val = (item['pos_name'], item['comp_name'], item['salary'], item['scale'], item['education'], item['industry'], item['workyear'], item['location'])
        cur.execute(insert_data, val)
        conn.commit()
        conn.close()
    def run(self):
        title = u'posName,companyName,salary,scale,education,industry,workyear,location' + '\n'
        file = open('%s.txt' % '爬虫BOSS直聘', 'w', encoding='utf-8')  # create the output file and write the header row
        file.write(title)
        file.close()
        for city in self.cityList.values():
            for page in range(1, 11):
                url = self.url.format(city, page)
                response = self.get_one_page(url)
                if not response:
                    continue
                for item in self.parse_one_page(response):
                    self.write_to_csv(item)
                time.sleep(3)


if __name__ == '__main__':
    boss = Boss()
    boss.run()
Data cleaning and processing
Looking first at the scraped location field: it is far more detailed than we need, so we keep only the first two characters, which is just the city name.
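A minimal pandas sketch of that step, assuming we work from the 爬虫BOSS直聘.txt file written by write_to_csv above (the column names follow the header written in run):

import pandas as pd

# load the comma-separated file written by the crawler
df = pd.read_csv('爬虫BOSS直聘.txt', encoding='utf-8')

# keep only the first two characters of location, i.e. the city name
df['location'] = df['location'].astype(str).str[:2]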
The salary field also needs some work: it is stored as a range, so we use a small function to split it into a minimum, a maximum, and an average, which are much easier to analyze.
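Here is one way that cleanup could look, assuming the strings follow the usual '15k-25k' format on BOSS直聘; parse_salary and the three new column names are just illustrative:

def parse_salary(s):
    # '15k-25k' -> (15.0, 25.0, 20.0); anything unexpected becomes missing values
    try:
        low, high = s.lower().replace('k', '').split('-')
        low, high = float(low), float(high)
        return low, high, (low + high) / 2
    except (ValueError, AttributeError):
        return None, None, None

df[['salary_min', 'salary_max', 'salary_avg']] = df['salary'].apply(
    lambda s: pd.Series(parse_salary(s)))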
Data analysis
Overall salary distribution
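One way to draw such a chart, assuming the salary_avg column from the cleaning step above (the unit is k RMB per month):

import matplotlib.pyplot as plt

# histogram of average monthly salary
df['salary_avg'].plot.hist(bins=20)
plt.xlabel('average salary (k/month)')
plt.ylabel('number of postings')
plt.show()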
Salary distribution across cities
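A sketch of the per-city comparison, again using the cleaned DataFrame (a Chinese-capable matplotlib font is needed for the city labels to render):

# box plot of average salary grouped by city
df.boxplot(column='salary_avg', by='location', rot=45)
plt.ylabel('average salary (k/month)')
plt.show()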
Salary distribution by education level
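A sketch of the comparison by education, assuming the same DataFrame:

# mean salary per required education level
df.groupby('education')['salary_avg'].mean().sort_values().plot.barh()
plt.xlabel('average salary (k/month)')
plt.show()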
Next, a closer look at the detailed hiring numbers.
Now let's look at the salary distribution by work experience in Beijing and Shanghai.
Next, the demand for data-analysis positions in Beijing, Shanghai, Guangzhou, and Shenzhen.
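Counting postings per city is a simple value_counts; restricting it to the four first-tier cities here is just one way such a chart could be built:

# number of data-analysis postings in Beijing, Shanghai, Guangzhou and Shenzhen
top4 = df[df['location'].isin(['北京', '上海', '广州', '深圳'])]
top4['location'].value_counts().plot.bar(rot=0)
plt.ylabel('number of postings')
plt.show()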
Finally, a word-cloud analysis of the industries the hiring companies belong to.
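A sketch of how such a word cloud can be generated with the wordcloud package; the font path is a placeholder, and any font that can render Chinese will do:

from wordcloud import WordCloud

# join all industry labels and let WordCloud count how often each one appears
text = ' '.join(df['industry'].astype(str))
wc = WordCloud(font_path='simhei.ttf', background_color='white',
               width=800, height=600).generate(text)
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()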
We can see that demand for data analysts is concentrated in the Internet, mobile Internet, e-commerce, and finance sectors, so applying in these fields gives you a much better chance of success.