一.数据来源
网上扒拉的一份拉勾网的数据文件。目的:进行描述性统计。
二.数据处理
1.读取数据
import pandas as pd
import numpy as np
# Labels assigned to the CSV fields; header=None below means the first row of
# the file is read as data rather than as a header.
column_names = ['岗位名称', '公司名称', '城市', '地点', '薪资', '基本要求', '公司状况', '岗位技能', '公司福利']
data_lg = pd.read_csv(
    "lagou_recruitment.csv",
    header=None,
    names=column_names,
    index_col=None,
    engine='python',
    encoding='utf-8',
)
# Preview the first rows.
data_lg.head()
所读取的数据
image.png
查看数据,所有列均为字符串,有空值列,接下来处理时应该注意空值列。
2.将基本要求列进行拆分
# Cast the basic-requirement column to str so the string operations below
# accept every row. NOTE(review): pandas renders a missing value as the
# literal text 'nan' here - confirm downstream code tolerates that.
data_lg['基本要求'] = data_lg['基本要求'].astype(str)
def str_cl(i):
    """Return the text between the first and second "/" of *i*, spaces removed.

    Args:
        i: Requirement text such as "经验3-5年 / 本科".

    Returns:
        The second "/"-separated field with every space stripped, or None
        when *i* contains no "/" (for example the 'nan' text produced by
        casting a missing value to str), so one bad row cannot abort the
        whole ``apply``.
    """
    fields = i.split('/')
    if len(fields) < 2:
        # No separator: there is nothing to extract from this row.
        return None
    return fields[1].replace(" ", "")
# Education requirement: the field after the first "/" (extracted by str_cl).
data_lg['学历要求'] = data_lg['基本要求'].map(str_cl)
# Experience requirement: the second space-separated token of the same text.
requirement_tokens = data_lg["基本要求"].str.split(" ", expand=True)
data_lg['经验要求'] = requirement_tokens[1]
# The combined column is no longer needed once both parts are extracted.
data_lg.drop(columns='基本要求', inplace=True)
看下处理情况
image.png
3.将公司状况拆分形成:行业+级别+人数
# Split the company-status text on spaces once and reuse the result for all
# three derived columns instead of re-splitting the column for each one.
# Tokens 0, 2 and 4 are kept; presumably tokens 1 and 3 are separator
# characters - verify against the raw data.
company_parts = data_lg["公司状况"].str.split(" ", expand=True)
data_lg["行业"] = company_parts[0]  # industry
data_lg["级别"] = company_parts[2]  # funding stage / level
data_lg["人数"] = company_parts[4]  # head count
del data_lg["公司状况"]
# Remove the square brackets from the location text. regex=False makes the
# replacement a literal one: "[" is a regex metacharacter and the default of
# the `regex` argument differs between pandas versions.
data_lg["地点"] = (
    data_lg["地点"]
    .str.replace('[', '', regex=False)
    .str.replace(']', '', regex=False)
)
image.png
4.处理下薪资
# Turn a range such as "10k-20k" into two numeric bounds. The k/K suffix is
# expanded to "000" and the text is split on "-" once; both bounds are then
# read from the same split result instead of repeating the whole chain.
salary_bounds = (
    data_lg["薪资"]
    .str.replace('k', "000", regex=False)
    .str.replace('K', '000', regex=False)
    .str.split('-', expand=True)
)
data_lg["最低薪资"] = salary_bounds[0].astype('int')  # lower bound
data_lg["最高薪资"] = salary_bounds[1].astype('int')  # upper bound
del data_lg["薪资"]
# Midpoint of the advertised range, used as the salary figure in the analysis.
data_lg["平均薪资"] = (data_lg["最低薪资"] + data_lg["最高薪资"]) / 2
image.png
三.数据分析
1.看下哪些城市招聘数据分析师
# Number of postings per city (value_counts orders by descending frequency).
city_number = data_lg['城市'].value_counts()
from matplotlib import pyplot as plt
%matplotlib inline
plt.rcParams['font.family'] = ['simhei'] # use a CJK-capable font so Chinese labels render
# Default Series plot (a line chart) of the per-city counts.
city_number.plot()
image.png
还是用pyecharts画图吧
# Bar chart of postings per city, drawn with pyecharts.
from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.faker import Faker

city_labels = city_number.index.tolist()
posting_counts = city_number.tolist()
b = (
    Bar()
    .add_xaxis(city_labels)
    .add_yaxis('城市数量', posting_counts)
    .set_global_opts(title_opts=opts.TitleOpts(title="数据分析工作地区分布"))
)
b.render_notebook()
image.png
2.地理位置上对分析师的需求
# Inspect the (city, count) pairs that feed the map.
city_number.to_dict().items()
from pyecharts import options as opts
from pyecharts.charts import Geo
from pyecharts.faker import Faker

# Plot the per-city counts on a map of China, with point labels hidden and a
# visual-map legend added.
city_demand = city_number.to_dict().items()
c = (
    Geo()
    .add_schema(maptype="china")
    .add("geo", city_demand)
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(
        visualmap_opts=opts.VisualMapOpts(),
        title_opts=opts.TitleOpts(title="各城市对分析师需求数量"),
    )
)
c.render_notebook()
image.png
3.薪资特征分布
# Kernel-density plot of the average salary over the 0-60000 range.
salary_axis_limit = 60000
salary_tick_step = 3000
data_lg['平均薪资'].plot.density(
    grid=True,
    figsize=(12, 5),
    xlim=(0, salary_axis_limit),  # x-axis range
    xticks=np.arange(0, salary_axis_limit, salary_tick_step),  # x-axis tick positions
);
image.png
4.对平均薪资做统计
# Per-city maximum, minimum, median and mean of the average salary.
salary_by_city = data_lg.groupby('城市')['平均薪资']
salary_by_city.agg(['max', 'min', 'median', 'mean'])
image.png
一线城市的平均薪资情况
from pyecharts import options as opts
from pyecharts.charts import Boxplot

# Box plot of the average salary in the four cities listed below; one salary
# series is selected per city, in the same order as the axis labels.
first_tier_cities = ['北京', '上海', '广州', '深圳']
city_salaries = [
    data_lg.loc[data_lg['城市'] == city, '平均薪资'] for city in first_tier_cities
]
bp = Boxplot()
bp.add_xaxis(first_tier_cities)
bp.add_yaxis('平均薪资', bp.prepare_data(city_salaries))
bp.set_global_opts(title_opts=opts.TitleOpts(title="一线城市平均薪资情况"))
bp.render_notebook()
image.png
5.根据工作年限的不同,统计平均薪资的箱线图
# Box plot of the average salary per experience requirement.
#
# The "经验" prefix is stripped once and the SAME cleaned series is used both
# for the axis labels and for selecting rows, so labels and data always match.
# One salary series is built for every label, so the number of boxes always
# equals the number of categories, however many there are.
experience = data_lg['经验要求'].str.replace('经验', '', regex=False)
temp = experience.value_counts().index  # categories, most frequent first
salary_by_experience = [
    data_lg.loc[experience == level, '平均薪资'] for level in temp
]
box1 = Boxplot()
box1.add_xaxis(list(temp))
box1.add_yaxis("平均薪资", box1.prepare_data(salary_by_experience))
box1.reversal_axis()  # swap the X and Y axes
box1.set_global_opts(title_opts=opts.TitleOpts(title="工作年限平均薪资分布"))
box1.render_notebook()
image.png
6.不同学历的薪资情况
image.png
7.上海的情况
# Postings located in Shanghai, copied so later edits do not touch data_lg.
data_lg_sh = data_lg[data_lg['城市'] == '上海'].copy()

# Lookup from a location name to its Shanghai district. Locations missing
# from this table map to NaN and are therefore left out of the counts below.
# NOTE(review): several assignments look doubtful (e.g. '北外滩', '长征',
# '八佰伴') - verify them against an authoritative district list.
district_of = {
    '七宝': '闵行区', '世纪公园': '浦东新区', '东外滩': '黄浦区', '东方路': '浦东新区',
    '中山公园': '长宁区', '中远两湾城': '普陀区', '五角场': '杨浦区', '五里桥': '黄浦区',
    '人民广场': '黄浦区', '八佰伴': '黄浦区', '凉城': '虹口区', '北京西路': '黄浦区',
    '北外滩': '黄浦区', '北新泾': '长宁区', '华漕': '闵行区', '南京东路': '黄浦区',
    '南方商城': '闵行区', '南码头': '浦东新区', '吴淞': '宝山区', '周家嘴路': '虹口区',
    '陆家嘴': '浦东新区', '漕河泾': '徐汇区', '龙华': '徐汇区', '田林': '徐汇区',
    '长阳路': '杨浦区', '天山路': '长宁区', '长征': '黄浦区', '四平路': '杨浦区',
    '江湾': '杨浦区', '斜土路': '徐汇区', '城隍庙': '黄浦区', '虹梅路': '徐汇区',
    '虹桥': '闵行区', '复旦大学': '杨浦区', '外滩': '黄浦区', '平凉路': '杨浦区',
    '洋泾': '浦东新区', '潍坊': '浦东新区', '遵义路': '长宁区', '洞泾': '松江区',
    '塘桥': '浦东新区', '徐汇区': '徐汇区', '长宁区': '长宁区', '杨浦区': '杨浦区',
    '黄浦区': '黄浦区', '浦东新区': '浦东新区', '闵行区': '闵行区', '宝山区': '宝山区',
    '普陀区': '普陀区', '虹口区': '虹口区', '松江区': '松江区', '静安区': '静安区',
    '闸北区': '闸北区', '青浦区': '青浦区',
}
# Number of Shanghai postings per district.
loc_sh = data_lg_sh['地点'].map(district_of).value_counts()
image.png
8.公司福利词云
import collections

import jieba

# Tokenise every company-benefits text with jieba. Rows with a missing value
# are dropped first because jieba.lcut only accepts strings.
benefit_texts = data_lg['公司福利'].dropna()
data_lg_cut = benefit_texts.apply(jieba.lcut).tolist()
# Flatten the per-row token lists, skipping single-space tokens.
data_lg_cuts = [token for tokens in data_lg_cut for token in tokens if token != ' ']
data_lg_cuts[:10]
# Raw token frequencies.
collections.Counter(data_lg_cuts)
# Remove stop words; the word list is read from the '停用词' column of the CSV.
stopwords = pd.read_csv('停用词.csv', engine='python', header=0, encoding='utf-8')
stop_list = stopwords['停用词'].tolist()
# A set gives constant-time membership tests inside the filter below.
stop_set = set(stop_list)
# Keep tokens that are not stop words and are longer than one character.
split_words = [x for x in data_lg_cuts if x not in stop_set and len(x) > 1]
collections.Counter(split_words)
from pyecharts import options as opts
from pyecharts.charts import Page, WordCloud
from pyecharts.globals import SymbolType

# The 200 most frequent tokens as (word, count) pairs.
words = collections.Counter(split_words).most_common(200)
w = WordCloud()
# Feed the pairs computed above (the previous code passed an undefined name).
w.add("", words, word_size_range=[20, 100])
w.set_global_opts(title_opts=opts.TitleOpts(title="公司描述"))
w.render_notebook()  # render inline in the notebook
image.png
9.岗位技能要求
# Pitfall: the skills column contains missing values, so the split result is
# NaN for those rows (presumably row 221 is one of them). Drop the missing
# values before flattening.
data_lg.岗位技能.str.split(' ')[221]
jineng = data_lg.岗位技能.dropna().str.split(' ').tolist()
# Flatten the per-row skill lists, leaving out the '数据分析' token itself.
jineng_word = [j for i in jineng for j in i if j != '数据分析']
# (skill, count) pairs, counted in a single pass over the tokens.
from collections import Counter
jineng_li = list(Counter(jineng_word).items())
# Draw the word cloud with matplotlib.
from wordcloud import STOPWORDS
import wordcloud
plt.rcParams['figure.figsize'] = (6, 10)
newtxt = ' '.join(jineng_word)  # space-separated text for the generator
# NOTE(review): `backgroud_Image` is not defined in the visible part of this
# file - load the mask image before running this cell.
w = wordcloud.WordCloud(width=1000, height=700, mask=backgroud_Image, max_words=50,
                        stopwords=STOPWORDS, font_path="C:/Windows/Fonts/SIMYOU.TTF",
                        background_color='white')
# Bind the result to its own name so the imported `wordcloud` module is not
# overwritten and this cell can be re-run.
skill_cloud = w.generate(newtxt)
plt.imshow(skill_cloud)
plt.axis('off')
image.png
网友评论