爬取代表数据
import requests
import re
import time
from lxml import html
from selenium import webdriver
# Download the article that lists the delegates.  requests.get() only returns
# once the whole response has been received, so no sleep is needed afterwards.
# The timeout keeps the script from hanging on a stalled connection, and
# raise_for_status() stops early on an HTTP error instead of parsing an
# error page.
r = requests.get(url='http://www.sohu.com/a/223885358_118392', timeout=30)
r.raise_for_status()
# NOTE: this rebinds the name `html` imported from lxml above; the name is
# kept because the regex extraction below reads the page text from it.
html = r.text
正则表达式提取需要的数据
# Each match is a 3-tuple: (region name, delegate count, text of the paragraph
# that follows the region heading).
# The character class lists the characters that may follow the short region
# name; it no longer contains stray `u` and `"` characters.  `\\d` is written
# with an escaped backslash because this is not a raw string.
content = re.findall(
    u'<p>([\u2E80-\u9FFF]{2,3})[市省特壮回维自人].{0,6}'
    u'\uff08(\\d*)\u540d\uff09 </p>.*?<p>(.*?)</p>',
    html,
    re.S,
)
from pyecharts import Map
# Region name and delegate count for every matched region.
data = [_[0:2] for _ in content]
# Named china_map rather than `map` so the builtin map() is not shadowed for
# the rest of the script.
china_map = Map("人大代表分布", title_pos="center", width=800, height=500)
attr, value = china_map.cast(data)
# The last matched entry is relabelled to the "南海诸岛" map area.
attr[-1] = u"南海诸岛"
china_map.add(
    "", attr, value, maptype='china',
    is_label_show=True, label_pos="inside", label_text_color="#000",
    is_visualmap=True, visual_text_color='#000', visual_range=[12, 172],
    visual_range_text=['低', '高'],
    visual_pos=[500, 500]
)
# china_map.show_config()
china_map.render()
china_map
![](https://img.haomeiwen.com/i6559992/fb8fe3ae39f69be3.gif)
使用pandas,将数据转化为pandas表
import pandas as pd
import re
# Keep the region and the raw delegate text (names with their optional
# sex / ethnic-group notes) from every match.
data = [(match[0], match[2]) for match in content]
def _parse_delegate(entry):
    """Split one delegate entry into ``[name, sex, race]``.

    An entry is a name optionally followed by a full-width parenthesised
    note: ``name``, ``name（sex）``, ``name（race）`` or ``name（sex，race）``.
    Missing sex defaults to ``男`` and missing race to ``汉族``.

    Raises:
        ValueError: if the entry does not split into one of those shapes.
    """
    fields = re.split(u'\uff08|\uff0c|\uff09', entry)
    count = len(fields)
    if count == 1:
        # Bare name, no note at all.
        return fields + [u'男', u'汉族']
    if count == 3:
        # "name（note）" splits into [name, note, ''].
        name, note = fields[0], fields[1]
        if len(note) == 1:
            # A single-character note is treated as the sex.
            return [name, note, u'汉族']
        # Any longer note is treated as the ethnic group.
        return [name, u'男', note]
    if count == 4:
        # "name（sex，race）" splits into [name, sex, race, ''].
        name, sex, race = fields[:3]
        if u'族' not in race:
            race = u'汉族'
        return [name, sex, race]
    raise ValueError('unrecognised delegate entry: %r' % (entry,))


def get_locality_name_sex_race(param):
    """Append one row per delegate of a region to the module-level ``df``.

    Args:
        param: a ``(locality, delegates)`` pair where ``delegates`` is a
            string of delegate entries separated by ``、``.

    Returns:
        None.  Rows ``[locality, name, sex, race]`` are appended to ``df``.

    Raises:
        ValueError: if an entry cannot be split into name / sex / race.
    """
    locality = param[0]
    for el in param[1].split(u'、'):
        el = el.strip()
        if not el:
            # A leading/trailing separator yields an empty piece; a row with
            # an empty name would be meaningless and would break any later
            # step that indexes into the name.
            continue
        df.loc[df.shape[0]] = [locality] + _parse_delegate(el)
# Create the empty table that get_locality_name_sex_race() fills in.
df = pd.DataFrame(columns=('locality', 'name', 'sex', 'race'))
# Use an explicit loop: on Python 3 map() is lazy, so a bare
# map(get_locality_name_sex_race, data) never calls the function and the
# table would stay empty.
for region_row in data:
    get_locality_name_sex_race(region_row)
df
![](https://img.haomeiwen.com/i6559992/0ccf8e250f3683f7.png)
def get_attr_sex_v(race_groupby_sex):
    """Return the group labels and their counts for a grouped table.

    Args:
        race_groupby_sex: a pandas groupby object (grouped by sex by the
            callers in this file).

    Returns:
        A ``(labels, counts)`` pair of lists.  ``counts`` holds the first
        column of ``count()`` for the first two groups; ``labels`` holds the
        index labels of those same groups, so both lists always have the
        same length.
    """
    # count() is evaluated once and reused for both the labels and the values.
    counts = race_groupby_sex.count()
    attr_sex = counts.index.values[0:2].tolist()
    attr_v = counts.values[0:2, 0].tolist()
    return attr_sex, attr_v
# Bar is needed here; the file's own import of it only appears further down.
from pyecharts import Bar

# Head count per sex among the rows whose race is 汉族.
race_groupby_sex = df[df.race == u'汉族'].groupby(df['sex'])
attr_sex, attr_v = get_attr_sex_v(race_groupby_sex)
bar = Bar("", "", width=600, height=400)
bar.add("汉族", attr_sex, attr_v, is_more_utils=True)
# Same breakdown for every other race, added as a second series.
race_groupby_sex = df[df.race != u'汉族'].groupby(df['sex'])
minorities_sex, minorities_v = get_attr_sex_v(race_groupby_sex)
bar.add("少数民族", minorities_sex, minorities_v, is_more_utils=True)
bar.render()
bar
![](https://img.haomeiwen.com/i6559992/983c3d1584021c40.gif)
还可以把上面数据画成饼图
# Pie must be imported before use.
from pyecharts import Pie

# attr_v and minorities_v are the per-sex counts computed for the bar chart.
# The hard-coded labels assume each list holds the 女 count followed by the
# 男 count.
pie = Pie("")
pie.add(
    "",
    ['汉族女', '汉族男', '少数民族女', '少数民族男'],
    attr_v + minorities_v,
    is_label_show=True,
    is_legend_show=False
)
pie.render()
pie
![](https://img.haomeiwen.com/i6559992/5828162702adf123.gif)
民族词云图
from pyecharts import WordCloud
# Delegate count per race.  It is computed here because the word cloud is
# drawn before the later step that also derives race_name / race_values.
race_nums = df['race'].value_counts()
race_name = race_nums.index.values.tolist()
race_values = race_nums.values.tolist()
wordcloud = WordCloud(width=1300, height=620)
wordcloud.add("", race_name, race_values, word_size_range=[20, 100])
wordcloud.render()
wordcloud
![](https://img.haomeiwen.com/i6559992/26e66b5f5f097cf9.gif)
from pyecharts import Bar
# Delegate count per race; value_counts() lists the most frequent race first.
race_nums = df['race'].value_counts()
race_name = race_nums.index.tolist()
race_values = race_nums.tolist()
bar = Bar("", "少数民族")
# Drop the first (most frequent) race so only the remaining ones are plotted.
bar.add(
    "少数民族",
    race_name[1:],
    race_values[1:],
    is_label_show=True,
    is_more_utils=True
)
bar.render()
bar
![](https://img.haomeiwen.com/i6559992/6541228d0c9e99e9.gif)
统计代表姓氏人数
# The first character of each name is used as the surname.
df['surname'] = [full_name[0] for full_name in df['name']]
surname_Data = df['surname'].value_counts()
bar = Bar("", "", width=800, height=400)
# Only the first 30 entries of the surname counts are plotted.
bar.add(
    "",
    surname_Data.index.tolist()[:30],
    surname_Data.tolist()[:30],
    is_label_show=True,
    is_legend_show=True,
    is_more_utils=True
)
bar.render()
bar
![](https://img.haomeiwen.com/i6559992/9cb328f220ffd8ca.gif)
姓氏词云
from pyecharts import WordCloud
# Word cloud built from every surname and its count.
wordcloud = WordCloud(width=1200, height=600)
wordcloud.add(
    "",
    surname_Data.index.tolist(),
    surname_Data.tolist(),
    word_size_range=[20, 100]
)
wordcloud.render()
wordcloud
![](https://img.haomeiwen.com/i6559992/0031e8b2bd8bfc46.png)
网友评论
map(get_locality_name_sex_race, data)
df
无法得到数据。
list(map(get_locality_name_sex_race, data))
df
就可以得到数据了。。。
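不大明白为什么。。。
（补充说明：在 Python 3 中，`map()` 返回的是惰性迭代器，只有在被遍历时（例如用 `list()` 包裹）才会真正调用函数；而在 Python 2 中 `map()` 会立即执行并返回列表，所以原文的写法只在 Python 2 下才能填充 df。）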
$ pip install echarts-countries-pypkg
$ pip install echarts-china-provinces-pypkg
$ pip install echarts-china-cities-pypkg
https://github.com/pyecharts/pyecharts