First, a note on the data source: everything in this article was scraped from BOSS直聘 postings for the "数据分析师" (data analyst) position. The analysis covers the overall salary picture, salary distributions across cities and education levels, salary by work experience in Beijing and Shanghai, the demand for data-analysis roles in Beijing, Shanghai, Guangzhou, and Shenzhen, and a word cloud of the industries the hiring companies belong to.
1. Data collection
2. Data cleaning and processing
3. Data analysis
Data collection
import requests
from fake_useragent import UserAgent
from lxml import etree
import pymysql
import pymongo
import json
import time
from requests import RequestException
mongo_url = 'localhost'
mongo_db = 'zhaopin'
ua = UserAgent()
class Boss(object):
    def __init__(self):
        self.url = 'https://www.zhipin.com/{}/?query=数据分析&page={}'
        self.headers = {'user-agent': ua.random,
                        'referer': 'https://www.zhipin.com/c101020100/?query=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90&page=1',
                        'cookie': ''}
        self.client = pymongo.MongoClient(mongo_url)
        self.db = self.client[mongo_db]
        self.cityList = {'广州': 'c101280100', '北京': 'c101010100', '上海': 'c101020100', '深圳': 'c101280600',
                         '杭州': 'c101210100', '天津': 'c101030100', '西安': 'c101110100', '苏州': 'c101190400',
                         '武汉': 'c101200100', '厦门': 'c101230200', '长沙': 'c101250100', '成都': 'c101270100',
                         '郑州': 'c101180100', '重庆': 'c101040100'}

    # def get_proxy(self):
    #     PROXY_POOL_URL = 'http://localhost:5555/random'
    #     try:
    #         response = requests.get(PROXY_POOL_URL)
    #         if response.status_code == 200:
    #             return response.text
    #     except ConnectionError:
    #         return None
    def get_one_page(self, url):
        try:
            # proxy = self.get_proxy()
            # proxies = {'http': proxy}
            # print(proxies)
            response = requests.get(url, headers=self.headers)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            print("request error")
    def parse_one_page(self, html):
        html = etree.HTML(html)
        content = html.xpath("//li/div[@class='job-primary']")
        for con in content:
            pos_name = con.xpath(".//div[@class='job-title']/text()")[0]
            comp_name = con.xpath(".//div[@class='info-company']/div/h3/a/text()")[0]
            salary = con.xpath(".//h3/a/span/text()")[0]
            scale = con.xpath("./div[@class='info-company']//p/text()[last()]")[0]
            education = con.xpath("./div/p/text()[3]")[0]
            industry = con.xpath(".//div[@class='company-text']/p//text()")[0]
            workyear = con.xpath("./div[@class='info-primary']/p/text()")[1]
            location = con.xpath("./div[@class='info-primary']/p/text()")[0]
            item = {'pos_name': pos_name,
                    'comp_name': comp_name,
                    'salary': salary,
                    'scale': scale,
                    'education': education,
                    'industry': industry,
                    'workyear': workyear,
                    'location': location}
            yield item
    def write_to_file(self, item):
        with open('boss.txt', 'a', encoding='utf-8') as f:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')

    def write_to_csv(self, item):
        with open('爬虫BOSS直聘.txt', 'a', encoding='utf-8') as file:
            line = str(item['pos_name']) + ',' + str(item['comp_name']) + ',' + str(item['salary']) + ',' + \
                   str(item['scale']) + ',' + str(item['education']) + ',' + str(item['industry']) + ',' + \
                   str(item['workyear']) + ',' + str(item['location']) + '\n'
            file.write(line)

    def save_to_mongo(self, item):
        result = self.db['boss'].insert_one(item)
        if result.inserted_id:
            print("save successfully")

    def save_to_mysql(self, item):
        conn = pymysql.connect(host='localhost', user='root', password='', db='test7', port=3306,
                               charset='utf8')
        cur = conn.cursor()
        insert_data = "INSERT INTO boss(pos_name, comp_name, salary, scale, education, industry, workyear, location) VALUES(%s, %s, %s, %s, %s, %s, %s, %s)"
        val = (item['pos_name'], item['comp_name'], item['salary'], item['scale'], item['education'], item['industry'], item['workyear'], item['location'])
        cur.execute(insert_data, val)
        conn.commit()
        conn.close()
    def run(self):
        title = u'posName,companyName,salary,scale,education,industry,workyear,location' + '\n'
        file = open('%s.txt' % '爬虫BOSS直聘', 'w', encoding='utf-8')  # create the output file and write the header row
        file.write(title)
        file.close()
        for city in self.cityList.values():
            for page in range(1, 11):
                url = self.url.format(city, page)
                response = self.get_one_page(url)
                if not response:
                    continue
                for item in self.parse_one_page(response):
                    self.write_to_csv(item)
                time.sleep(3)


if __name__ == '__main__':
    boss = Boss()
    boss.run()
Data cleaning and processing
Looking first at the scraped location field: it is far more detailed than we need, so we keep only the first two characters, which is just the city name.
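A minimal pandas sketch of that step, assuming we work from the 爬虫BOSS直聘.txt file written by write_to_csv above (the column names follow the header written in run):

import pandas as pd

# load the comma-separated file written by the crawler
df = pd.read_csv('爬虫BOSS直聘.txt', encoding='utf-8')

# keep only the first two characters of location, i.e. the city name
df['location'] = df['location'].astype(str).str[:2]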
The salary field also needs some work: it is stored as a range, so we use a small function to split it into a minimum, a maximum, and an average, which are much easier to analyze.
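Here is one way that cleanup could look, assuming the strings follow the usual '15k-25k' format on BOSS直聘; parse_salary and the three new column names are just illustrative:

def parse_salary(s):
    # '15k-25k' -> (15.0, 25.0, 20.0); anything unexpected becomes missing values
    try:
        low, high = s.lower().replace('k', '').split('-')
        low, high = float(low), float(high)
        return low, high, (low + high) / 2
    except (ValueError, AttributeError):
        return None, None, None

df[['salary_min', 'salary_max', 'salary_avg']] = df['salary'].apply(
    lambda s: pd.Series(parse_salary(s)))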
Data analysis
Overall salary distribution
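One way to draw such a chart, assuming the salary_avg column from the cleaning step above (the unit is k RMB per month):

import matplotlib.pyplot as plt

# histogram of average monthly salary
df['salary_avg'].plot.hist(bins=20)
plt.xlabel('average salary (k/month)')
plt.ylabel('number of postings')
plt.show()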
Salary distribution across cities
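A sketch of the per-city comparison, again using the cleaned DataFrame (a Chinese-capable matplotlib font is needed for the city labels to render):

# box plot of average salary grouped by city
df.boxplot(column='salary_avg', by='location', rot=45)
plt.ylabel('average salary (k/month)')
plt.show()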
Salary distribution by education level
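A sketch of the comparison by education, assuming the same DataFrame:

# mean salary per required education level
df.groupby('education')['salary_avg'].mean().sort_values().plot.barh()
plt.xlabel('average salary (k/month)')
plt.show()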
Next, a closer look at the detailed hiring numbers.
Now let's look at the salary distribution by work experience in Beijing and Shanghai.
Next, the demand for data-analysis positions in Beijing, Shanghai, Guangzhou, and Shenzhen.
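Counting postings per city is a simple value_counts; restricting it to the four first-tier cities here is just one way such a chart could be built:

# number of data-analysis postings in Beijing, Shanghai, Guangzhou and Shenzhen
top4 = df[df['location'].isin(['北京', '上海', '广州', '深圳'])]
top4['location'].value_counts().plot.bar(rot=0)
plt.ylabel('number of postings')
plt.show()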
Finally, a word-cloud analysis of the industries the hiring companies belong to.
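A sketch of how such a word cloud can be generated with the wordcloud package; the font path is a placeholder, and any font that can render Chinese will do:

from wordcloud import WordCloud

# join all industry labels and let WordCloud count how often each one appears
text = ' '.join(df['industry'].astype(str))
wc = WordCloud(font_path='simhei.ttf', background_color='white',
               width=800, height=600).generate(text)
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()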
We can see that demand for data analysts is concentrated in the Internet, mobile Internet, e-commerce, and finance sectors, so applying in these fields gives you a much better chance of success.