青年千人数据分析

作者: Thinkando | 来源:发表于2017-12-07 23:21 被阅读84次

    序章

    本文是对《对青年千人表格信息的可视化探索》的重复,作者(天地本无心大神)
    https://mp.weixin.qq.com/s/GrVyv3RocrzJijVx7Ymr6Q

    刚开始学习python的我,会跑知乎社区看看别人比较R与python那个才是最好的分析软件,现在当我看到天地本无心大神拿着两把利器游刃有余的分析时,才顿悟:花时间去比较谁强谁弱的时候,早已有大神点满了这两项技能。

    1. 数据下载

    https://github.com/wandering513/Data_workshop/tree/master/1000_youth_talent
    

    2. python 实战

    import pandas as pd
    import numpy as np
    from matplotlib import pyplot as plt
    import matplotlib
    
    # 用pandas 读取excel
    qian = pd.read_excel("./1000_youth_talent.xlsx",header=0, skiprows=1)
    # 自己加表头
    qian.columns = ["ID","Name","Gender","year-month-day","year_birth","university","major"]
    print(qian.head())
    
    image.png
    2.1 看一下男女分布
    • 本届千人共630人,男549,女81
    sex =qian.groupby("Gender").size()
    #调节图形大小,宽,高
    plt.figure(figsize=(6,9))
    #定义饼状图的标签,标签是列表
    labels = ['Men','Female']
    #每个标签占多大,会自动去算百分比
    sizes = [sex['男'],sex['女']]
    colors = ['yellowgreen','red']
    #将某部分爆炸出来, 使用括号,将第一块分割出来,数值的大小是分割出来的与其他两块的间隙
    explode = (0.05,0)
    
    patches,l_text,p_text = plt.pie(sizes,explode=explode,labels=labels,colors=colors,labeldistance = 1.1,autopct = '%3.1f%%',shadow = False,startangle = 90,pctdistance = 0.6)
    
    #改变文本的大小
    #方法是把每一个text遍历。调用set_size方法设置它的属性
    for t in l_text:
        t.set_size=(30)
    for t in p_text:
        t.set_size=(20)
    # 设置x,y轴刻度一致,这样饼图才能是圆的
    plt.axis('equal')
    plt.legend()
    plt.show()
    
    image.png
    2.2 看一下年龄分布
    • 大神dataframe 用的出神入化了。。。
    #Code to get the age distribution of male and female
    # 抽取男女的dataframe
    qian_male = qian[qian["Gender"] == u"男"]
    qian_female = qian[qian["Gender"] == u"女"]
    # 算出男女的年龄,设为index
    qian_male_age = pd.Series(2017 - np.array(qian_male["year_birth"])).value_counts().sort_index()
    qian_female_age = pd.Series(2017 - np.array(qian_female["year_birth"])).value_counts().sort_index()
    # 统计每个年龄出现的人数
    qian_male_age = pd.DataFrame(qian_male_age).reset_index(); qian_male_age.columns = ["age","male_num"]
    qian_female_age = pd.DataFrame(qian_female_age).reset_index(); qian_female_age.columns = ["age","female_num"]
    # 合并
    age_count = qian_male_age.merge(qian_female_age,how="outer")
    age_count = age_count.fillna(0); age_count["female_num"] = age_count["female_num"].astype("int")
    
    # Code to draw plot
    font = {'family': 'serif', 'weight': 'normal', 'size': 14}
    matplotlib.rc('font', **font)
    plt.rcParams['figure.figsize'] = (15, 9)
    bar_labels = age_count["age"].map(lambda x: str(x) + "-year-old").values
    male = age_count["male_num"].values
    female = age_count["female_num"].values
    minus_female = [-int(i) for i in female]
    
    X = np.arange(16)
    for x, y in zip(X, male):
        plt.text(x + 0.1, y + 0.08, '%s' % y, ha='center', va="bottom")
    for x, y in zip(X, minus_female):
        plt.text(x + 0.1, y - 0.08, "%s" % -y, ha='center', va='top')
    
    plt.bar(X, male, facecolor="#9999ff", edgecolor="white", label="Male 549")
    plt.bar(X, minus_female, facecolor="#ff9999", edgecolor="white", label="Female 81")
    x_pos = [x + 0.1 for x in X]
    plt.xticks(x_pos, bar_labels, fontsize=15)
    plt.setp(plt.gca().get_xticklabels(), rotation=45, horizontalalignment="right")
    
    plt.setp(plt.gca().spines['right'].set_color('none'))
    plt.setp(plt.gca().spines['top'].set_color('none'))
    plt.setp(plt.gca().xaxis.set_ticks_position('bottom'))
    plt.setp(plt.gca().yaxis.set_ticks_position('left'))
    plt.yticks(plt.yticks()[0], [int(i) for i in abs((plt.yticks()[0]))])
    plt.legend()
    plt.show()
    
    image.png
    2.3 看一下出生月数分布
    # Code to get the month distribution matrix in each year
    # 提取月份
    qian["month"] = pd.Series([array[1] for array in qian["year-month-day"].str.split("/")])
    # 提取年龄
    qian["age"] = pd.Series(2017 - np.array(qian["year_birth"]))
    # 制作一个dataframe, 年龄+不同月份人数的list
    temp1 = pd.DataFrame(qian.groupby("age")["month"].apply(list))
    #print(temp1)
    
    list_month = []
    for i in temp1["month"]:
        list = [0] * 12
        for j in i:
            list[int(j)-1] +=1
        list_month.append(list)
    columns = [ x for x in range(1,13)]
    index =[str(x)+"-years-old" for x in range(25,41)]
    month_matrix =pd.DataFrame(
        list_month, columns = columns, index =index)
    
    plt.rcParams['figure.figsize'] = (6,4)
    month_matrix.sum().plot(kind="bar")
    plt.show()
    
    image.png
    import seaborn as sns
    plt.figure(figsize=(8,4))
    sns.heatmap(month_matrix,  annot=True, cmap="YlGnBu")
    plt.show()
    
    image.png
    2.4 看一下哪个城市居多

    在mac 中运行中文乱码,这里稍作修改
    1 https://github.com/dolbydu/font/tree/master/unicode 下载“STKaiti”文件, 放到matplotlib的ttf文件夹下。找不到路径用matplotlib.matplotlib_fname()这个找。
    2 终端输入rm -rf ~/.matplotlib/*.cache.
    3 在python脚本中添加一行代码mpl.rcParams['font.sans-serif'] = u'SimHei'

    qian = pd.read_excel("1000_youth_talent.xlsx",header=0, skiprows=1)
    qian.columns = ["ID","Name","Gender","year-month-day","year_birth","university","major"]
    font = {'weight' : 'normal','size': 14}
    matplotlib.rc('font', **font)
    plt.rcParams['figure.figsize'] = (20,9)
    plt.rcParams['font.sans-serif'] = ['STKaiti']
    school_count = pd.DataFrame(qian["university"].value_counts()).reset_index()
    school_count.columns = ["university","num"]
    Pcity = pd.read_excel("university_province_city.xls",header=0)
    temp2 = school_count.merge(Pcity)
    temp2 = temp2.groupby("City").sum().reset_index()
    temp2.columns = ["city","num"]
    temp2 = temp2.sort_values("num", ascending=False)
    temp2 = temp2.set_index("city")
    
    # 这里保存一个csv, 后面R作图用
    temp2.to_csv('city_num.csv',sep=',',header=True,index=True,encoding='utf-8')
    
    
    temp2.plot(kind="bar", color="skyblue")
    plt.legend().set_visible(False)
    plt.setp(plt.gca().spines['right'].set_color('none'))
    plt.setp(plt.gca().spines['top'].set_color('none'))
    plt.setp(plt.gca().xaxis.set_ticks_position('bottom'))
    plt.setp(plt.gca().yaxis.set_ticks_position('left'))
    plt.show()
    
    image.png
    2.5 各行业分布
    temp3 = qian["major"].value_counts().reset_index(); temp3.columns = ["major","num"]
    temp3.loc[3], temp3.loc[6] = temp3.loc[6], temp3.loc[3]
    temp3 = temp3.set_index("major")
    import brewer2mpl
    set2 = brewer2mpl.get_map('Set2','qualitative','8').mpl_colors + brewer2mpl.get_map('Accent','qualitative','6').mpl_colors
    font = {'family' : 'STKaiti', 'weight' : 'normal','size': 14}
    matplotlib.rc('font', **font)
    plt.rcParams['figure.figsize'] = (10,10)
    explode = (0, 0,0.05,0.05, 0, 0, 0, 0, 0, 0)
    X = temp3.index
    Y = temp3["num"]
    figlabel = []
    for x,y in zip(X,Y):
        figlabel.append(x +"-"+ str(y))
    temp3.plot(kind="pie",y="num",colors=set2,explode=explode)
    plt.setp(plt.gca().legend(labels = figlabel, loc="best",bbox_to_anchor=(0.01,0.75), fontsize=20))
    plt.show()
    
    image.png

    3 R实战

    # install REmap
    install.packages("devtools")
    library(devtools)
    install_github('lchiffon/REmap')
    
    setwd('/Users/chengkai/Desktop/file/learn/project/1000talient')
    library(REmap)
    library(dplyr)
    rm(list=ls())
    Sys.setlocale("LC_ALL","Chinese")
    city_num <- read.csv("city_num.csv",sep=",",fileEncoding = "utf-8")
    
    cities = city_num$city
    cities = as.character(cities)
    cities_Geo <- get_geo_position(cities)
    write.csv(cities_Geo, file="R_Geo.csv")
    #city_num
    data_final <- cities_Geo %>% 
      left_join(city_num)
    
    data_final <- data_final[,-3]
    #temp_file <- data.frame(cites_Geo,city_num$num)[,-3]
    #names(temp_file)[names(temp_file)=="city_num.num"]="num"
    
    remapH(data_final,
           maptype = "china",
           theme = get_theme("Bright"),
           blurSize = 35,
           color="red",
           minAlpha = 10,
           opacity = 2)
    
    image.png

    参考文献

    1. http://www.jianshu.com/p/883c8d9268a8(数据分析神器之Pandas)

    相关文章

      网友评论

        本文标题:青年千人数据分析

        本文链接:https://www.haomeiwen.com/subject/spfqmxtx.html