《青春有你2》参赛选手数据分析【python, pandas, matplotlib】

作者: Mead170810 | 来源:发表于2020-06-08 09:10 被阅读0次

《青春有你2》参赛选手数据分析【python, pandas, matplotlib】
Python（金融）数据分析（二）Pandas
python入门 -安装python与环境配置
Python--pandas--基础概念学习
pandas简介
数据分析学习计划
Pandas基础教程
Python实现数据分析1
Python之Pandas使用教程
萝卜头学python:pandas 对EXCEL处理

《青春有你2》参赛选手数据分析
本文章主要介绍《青春有你2》的数据分析流程。

任务描述：使用python爬取《青春有你2》所有参赛选手的信息，然后进行数据可视化分析
实践平台：windows
实践环境：Python 3 + requests + beautifulsoup4 + pandas + matplotlib（代码中使用了 open(..., encoding=...)，该参数仅 Python 3 支持）

分析的结果

1.使用 python 爬取《青春有你2》所有参赛选手信息：
选手数据来源百度百科：https://baike.baidu.com/item/青春有你第二季/23802025?fromtitle=青春有你2&fromid=24266334
数据来源百度百科

# pip install beautifulsoup4

# fetch data
import requests
import json
from bs4 import BeautifulSoup

# Scrape the contestant table from the Baidu Baike page and save it as JSON.
url = "https://baike.baidu.com/item/青春有你第二季/23802025?fromtitle=青春有你2&fromid=24266334"
# NOTE(review): the Cookie below is a captured browser session and includes a
# BDUSS login token. A real session cookie should not be published or
# committed; load it from the environment, or drop it if the page can be
# fetched without it.
headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
    'Cookie':'BIDUPSID=A89BD119C7B73C53021773EB9D924AD9; PSTM=1572051557; BAIDUID=A89BD119C7B73C53BD48595D9C682DE7:FG=1; MCITY=-289%3A; BDUSS=R2cDVoQVRoZkEyeVhSLVdudVNIakRZdFU5M3VoODd0QVFpRWFCYkVxMXhkUFZlSVFBQUFBJCQAAAAAAAAAAAEAAAAnp285TWVhZDIwMTQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAHHnzV5x581eRG; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; ai-studio-ticket=D8284B48976948689935A404927822C1BAF6BBFC966E40B3B04C71226E350C68; H_PS_PSSID=31729_1435_31672_21107_31605_30824_31844_26350; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; delPer=0; PSINO=5; ZD_ENTRY=baidu'
}
# A timeout keeps the script from hanging on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as if it were
# the article.
html = requests.get(url, headers=headers, timeout=10)
html.raise_for_status()
soup = BeautifulSoup(html.text, "html.parser")

# The contestant table is taken as the third <table> counting from the end of
# the page. NOTE(review): this index depends on the page layout and will need
# adjusting if the page changes.
xuanshou_table = soup.find_all("table")[-3]

# One list of cell texts per <tr>, with newlines inside a cell removed.
data = [
    [td.get_text().replace("\n", "") for td in tr.find_all("td")]
    for tr in xuanshou_table.find_all("tr")
]

# Save the rows as indented UTF-8 JSON; ensure_ascii=False keeps the Chinese
# text readable instead of \u-escaped.
with open("data.json", "w", encoding='utf-8') as file:
    json.dump(data, file, indent=2, ensure_ascii=False)

data.json中存储的数据

2.将json数据放入 pandas.DataFrame

# read data
import pandas as pd
import numpy as np 

# Load the scraped rows back from disk.
with open("data.json", encoding='UTF-8') as json_file:
    json_array = json.load(json_file)

# The first row holds the column headers; every following row is one record.
column_names = json_array[0]
records = json_array[1:]
df = pd.DataFrame(records, columns=column_names)
df

df输出结果

拼装数据

# Count contestants per country/region, most frequent first.
# Author's note on the leading entries: Shandong, Sichuan, Taiwan and Beijing
# (all China).
rows_by_zone = df.groupby(['国家/地区'])
zone_counts = rows_by_zone.count()
zone_counts_sorted = zone_counts.sort_values(by=['姓名'], ascending=False)
zone_cnt_dict = zone_counts_sorted['姓名']
zone_cnt_dict
# The same count-and-sort is applied to zodiac sign ('星座'), height ('身高')
# and weight ('体重') in the plotting snippets further down.

image.png

# Bar chart: number of contestants per country/region.
# pyplot must be bound to the name `plt`, because every call below uses it.
from matplotlib import pyplot as plt

from matplotlib.font_manager import FontProperties
# msyhbd.ttf is a Microsoft YaHei font file, loaded explicitly so the Chinese
# labels are not rendered as garbled characters. The relative path means the
# file has to be in the working directory.
myfont = FontProperties(fname=r"msyhbd.ttf",size=12)

# Region names on the x axis, contestant counts on the y axis.
x = zone_cnt_dict.index.tolist()
y = zone_cnt_dict.tolist()
plt.figure(figsize=(20, 15))
plt.bar(x, y, align='center')
# Style the ticks after the bars are drawn so the settings apply to the
# category ticks that are actually shown.
plt.xticks(rotation=45, fontsize=20, fontproperties=myfont)
# No legend: the single bar series carries no label, so a legend would be
# empty and matplotlib would only emit a warning.
plt.title("《青春有你2》参赛选手区域排名", fontproperties=myfont)
plt.ylabel("人数", fontproperties=myfont)
plt.xlabel("城市", fontproperties=myfont)
plt.show()

image.png

# Bar chart: number of contestants per zodiac sign, most frequent first.
# Author's note on the leading signs: Leo, Capricorn, Aries.
xinzuo_cnt_dict = df.groupby(['星座']).count().sort_values(by='姓名', ascending=False)['姓名']
xinzuo_cnt_dict
# pyplot must be bound to the name `plt`, because every call below uses it.
from matplotlib import pyplot as plt

from matplotlib.font_manager import FontProperties
# Explicit font file so the Chinese labels render correctly; the relative
# path means the file has to be in the working directory.
myfont = FontProperties(fname=r"msyhbd.ttf",size=12)

# Zodiac signs on the x axis, contestant counts on the y axis.
x = xinzuo_cnt_dict.index.tolist()
y = xinzuo_cnt_dict.tolist()
plt.figure(figsize=(20, 15))
plt.bar(x, y, align='center')
# Style the ticks after the bars are drawn so the settings apply to the
# category ticks that are actually shown.
plt.xticks(rotation=45, fontsize=20, fontproperties=myfont)
# No legend: the single bar series carries no label, so a legend would be
# empty and matplotlib would only emit a warning.
plt.title("《青春有你2》参赛选手数星座排名", fontproperties=myfont)
plt.ylabel("人数", fontproperties=myfont)
plt.xlabel("星座", fontproperties=myfont)
plt.show()

image.png

# Bar chart: number of contestants per height value, most frequent first.
# Author's note on the leading heights: 168, 170, 163.
shengan_cnt_dict = df.groupby(['身高']).count().sort_values(by='姓名', ascending=False)['姓名']
# pyplot must be bound to the name `plt`, because every call below uses it.
from matplotlib import pyplot as plt

from matplotlib.font_manager import FontProperties
# Explicit font file so the Chinese labels render correctly; the relative
# path means the file has to be in the working directory.
myfont = FontProperties(fname=r"msyhbd.ttf",size=12)

# Height values on the x axis (ordered by frequency, not numerically),
# contestant counts on the y axis.
x = shengan_cnt_dict.index.tolist()
y = shengan_cnt_dict.tolist()
plt.figure(figsize=(20, 15))
plt.bar(x, y, align='center')
# Style the ticks after the bars are drawn so the settings apply to the
# category ticks that are actually shown.
plt.xticks(rotation=45, fontsize=20, fontproperties=myfont)
# No legend: the single bar series carries no label, so a legend would be
# empty and matplotlib would only emit a warning.
plt.title("《青春有你2》参赛选手数身高排名", fontproperties=myfont)
plt.ylabel("人数", fontproperties=myfont)
plt.xlabel("身高", fontproperties=myfont)
plt.show()

image.png

# Bar chart: number of contestants per weight value, most frequent first.
# Author's note on the most common range: 45-50.
weight_cnt_dict = df.groupby(['体重']).count().sort_values(by='姓名', ascending=False)['姓名']
# pyplot must be bound to the name `plt`, because every call below uses it.
from matplotlib import pyplot as plt

from matplotlib.font_manager import FontProperties
# Explicit font file so the Chinese labels render correctly; the relative
# path means the file has to be in the working directory.
myfont = FontProperties(fname=r"msyhbd.ttf",size=12)

# Weight values on the x axis (ordered by frequency, not numerically),
# contestant counts on the y axis.
x = weight_cnt_dict.index.tolist()
y = weight_cnt_dict.tolist()
plt.figure(figsize=(20, 15))
plt.bar(x, y, align='center')
# Style the ticks after the bars are drawn so the settings apply to the
# category ticks that are actually shown.
plt.xticks(rotation=45, fontsize=20, fontproperties=myfont)
# No legend: the single bar series carries no label, so a legend would be
# empty and matplotlib would only emit a warning.
plt.title("《青春有你2》参赛选手数体重排名", fontproperties=myfont)
plt.ylabel("人数", fontproperties=myfont)
plt.xlabel("体重", fontproperties=myfont)
plt.show()

image.png