美文网首页
《青春有你2》参赛选手数据分析【python,pandas,ma

《青春有你2》参赛选手数据分析【python,pandas,matplotlib】

作者: Mead170810 | 来源:发表于2020-06-08 09:10 被阅读0次

    《青春有你2》参赛选手数据分析
    本文章主要介绍《青春有你2》的数据分析流程。

    任务描述:使用python爬取《青春有你2》所有参赛选手的信息,然后进行数据可视化分析
    实践平台:windows
    实践环境:Python3 + requests + beautifulsoup4 + pandas + matplotlib(代码中的 open(..., encoding=...) 写法仅适用于 Python3)
    
    分析的结果
    1. 使用python爬取《青春有你2》所有参赛选手信息:
      选手数据来源百度百科:https://baike.baidu.com/item/青春有你第二季/23802025?fromtitle=青春有你2&fromid=24266334
      数据来源百度百科
    # pip install beautifulsoup4
    
    # fetch data
    import requests
    import json
    from bs4 import BeautifulSoup
    
    # Scrape the contestant table from the show's Baidu Baike page and save it
    # to data.json as a list of rows (each row is a list of cell strings).
    url = "https://baike.baidu.com/item/青春有你第二季/23802025?fromtitle=青春有你2&fromid=24266334"
    # NOTE(review): the Cookie value below looks like a real logged-in browser
    # session (it carries a BDUSS entry). Confirm, invalidate it, and load the
    # value from the environment instead of publishing it in source.
    headers={
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
        'Cookie':'BIDUPSID=A89BD119C7B73C53021773EB9D924AD9; PSTM=1572051557; BAIDUID=A89BD119C7B73C53BD48595D9C682DE7:FG=1; MCITY=-289%3A; BDUSS=R2cDVoQVRoZkEyeVhSLVdudVNIakRZdFU5M3VoODd0QVFpRWFCYkVxMXhkUFZlSVFBQUFBJCQAAAAAAAAAAAEAAAAnp285TWVhZDIwMTQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAHHnzV5x581eRG; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; ai-studio-ticket=D8284B48976948689935A404927822C1BAF6BBFC966E40B3B04C71226E350C68; H_PS_PSSID=31729_1435_31672_21107_31605_30824_31844_26350; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; delPer=0; PSINO=5; ZD_ENTRY=baidu'
    }
    # requests waits indefinitely by default; a timeout stops the script from
    # hanging on a stalled connection.
    html = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an HTTP error instead of parsing an error page as data.
    html.raise_for_status()
    soup = BeautifulSoup(html.text, "html.parser")

    # The table is picked purely by position (third from the end of the page),
    # so this breaks if the page layout changes.
    xuanshou_table = soup.find_all("table")[-3]
    # One inner list per <tr>; each entry is the text of a <td> with newlines removed.
    data = [
        [td.get_text().replace("\n", "") for td in tr.find_all("td")]
        for tr in xuanshou_table.find_all("tr")
    ]

    # ensure_ascii=False keeps the Chinese text readable in the file.
    with open("data.json", "w", encoding='utf-8') as file:
        json.dump(data, file, indent=2, ensure_ascii=False)
    
    data.json中存储的数据

    2.将json数据放入 pandas.DataFrame

    # read data
    import pandas as pd
    import numpy as np 
    
    # Load the scraped rows back from disk and build the DataFrame:
    # row 0 supplies the column names, the remaining rows are the records.
    with open("data.json", 'r', encoding='UTF-8') as fp:
        json_array = json.load(fp)

    header_row = json_array[0]
    body_rows = json_array[1:]
    df = pd.DataFrame(body_rows, columns=header_row)
    df
    
    df输出结果

    拼装数据

    # Count contestants per country/region, largest group first.
    # (Author's note on the result: Shandong, Sichuan, Taiwan, Beijing.)
    per_zone_counts = df.groupby(['国家/地区']).count()
    per_zone_sorted = per_zone_counts.sort_values(by=['姓名'], ascending=False)
    zone_cnt_dict = per_zone_sorted['姓名']
    zone_cnt_dict
    # 狮子座,摩羯座,白羊座
    #xinzuo_cnt_dict = df.groupby(['星座']).count().sort_values(by='姓名', ascending=False)['姓名']
    #xinzuo_cnt_dict
    # 168, 170, 163
    #shengan_cnt_dict = df.groupby(['身高']).count().sort_values(by='姓名', ascending=False)['姓名']
    #shengan_cnt_dict
    # 45-50
    #weight_cnt_dict = df.groupby(['体重']).count().sort_values(by='姓名', ascending=False)['姓名']
    #weight_cnt_dict
    
    image.png
    # Bar chart: number of contestants per country/region.
    # pyplot must be bound to `plt`, the name every call below uses; the
    # original alias `pyt` left `plt` undefined (NameError).
    from matplotlib import pyplot as plt

    from matplotlib.font_manager import FontProperties
    # msyhbd.ttf is the Microsoft YaHei font; it is used so that Chinese text
    # renders correctly instead of as garbled glyphs.
    myfont = FontProperties(fname=r"msyhbd.ttf",size=12)

    # Index labels become the x categories, the counts become the bar heights.
    x = zone_cnt_dict.keys().tolist()
    y = zone_cnt_dict.values.tolist()
    plt.figure(figsize=(20, 15))
    plt.xticks(rotation=45, fontsize=20, fontproperties=myfont)
    plt.bar(x, y, align='center')
    # No plt.legend(): the bars carry no label, so a legend would only emit a
    # "no artists with labels" warning.
    plt.title("《青春有你2》参赛选手区域排名", fontproperties=myfont)
    plt.ylabel("人数", fontproperties=myfont)
    plt.xlabel("城市", fontproperties=myfont)
    plt.show()
    
    image.png
    # Bar chart: number of contestants per zodiac sign, largest group first.
    # (Author's note on the result: Leo, Capricorn, Aries.)
    xinzuo_cnt_dict = df.groupby(['星座']).count().sort_values(by='姓名', ascending=False)['姓名']
    xinzuo_cnt_dict
    # pyplot must be bound to `plt`, the name every call below uses; the
    # original alias `pyt` left `plt` undefined (NameError).
    from matplotlib import pyplot as plt

    from matplotlib.font_manager import FontProperties
    # Font file passed to every text element so Chinese labels render.
    myfont = FontProperties(fname=r"msyhbd.ttf",size=12)

    # Index labels become the x categories, the counts become the bar heights.
    x = xinzuo_cnt_dict.keys().tolist()
    y = xinzuo_cnt_dict.values.tolist()
    plt.figure(figsize=(20, 15))
    plt.xticks(rotation=45, fontsize=20, fontproperties=myfont)
    plt.bar(x, y, align='center')
    # No plt.legend(): the bars carry no label, so a legend would only emit a
    # "no artists with labels" warning.
    plt.title("《青春有你2》参赛选手数星座排名", fontproperties=myfont)
    plt.ylabel("人数", fontproperties=myfont)
    plt.xlabel("星座", fontproperties=myfont)
    plt.show()
    
    image.png
    # Bar chart: number of contestants per height value, largest group first.
    # (Author's note on the result: 168, 170, 163.)
    shengan_cnt_dict = df.groupby(['身高']).count().sort_values(by='姓名', ascending=False)['姓名']
    # pyplot must be bound to `plt`, the name every call below uses; the
    # original alias `pyt` left `plt` undefined (NameError).
    from matplotlib import pyplot as plt

    from matplotlib.font_manager import FontProperties
    # Font file passed to every text element so Chinese labels render.
    myfont = FontProperties(fname=r"msyhbd.ttf",size=12)

    # Index labels become the x categories, the counts become the bar heights.
    x = shengan_cnt_dict.keys().tolist()
    y = shengan_cnt_dict.values.tolist()
    plt.figure(figsize=(20, 15))
    plt.xticks(rotation=45, fontsize=20, fontproperties=myfont)
    plt.bar(x, y, align='center')
    # No plt.legend(): the bars carry no label, so a legend would only emit a
    # "no artists with labels" warning.
    plt.title("《青春有你2》参赛选手数身高排名", fontproperties=myfont)
    plt.ylabel("人数", fontproperties=myfont)
    plt.xlabel("身高", fontproperties=myfont)
    plt.show()
    
    image.png
    # Bar chart: number of contestants per weight value, largest group first.
    # (Author's note on the result: 45-50.)
    weight_cnt_dict = df.groupby(['体重']).count().sort_values(by='姓名', ascending=False)['姓名']
    # pyplot must be bound to `plt`, the name every call below uses; the
    # original alias `pyt` left `plt` undefined (NameError).
    from matplotlib import pyplot as plt

    from matplotlib.font_manager import FontProperties
    # Font file passed to every text element so Chinese labels render.
    myfont = FontProperties(fname=r"msyhbd.ttf",size=12)

    # Index labels become the x categories, the counts become the bar heights.
    x = weight_cnt_dict.keys().tolist()
    y = weight_cnt_dict.values.tolist()
    plt.figure(figsize=(20, 15))
    plt.xticks(rotation=45, fontsize=20, fontproperties=myfont)
    plt.bar(x, y, align='center')
    # No plt.legend(): the bars carry no label, so a legend would only emit a
    # "no artists with labels" warning.
    plt.title("《青春有你2》参赛选手数体重排名", fontproperties=myfont)
    plt.ylabel("人数", fontproperties=myfont)
    plt.xlabel("体重", fontproperties=myfont)
    plt.show()
    
    image.png

    相关文章

      网友评论

          本文标题:《青春有你2》参赛选手数据分析【python,pandas,ma

          本文链接:https://www.haomeiwen.com/subject/jsgbzhtx.html