![](https://img.haomeiwen.com/i6581981/c09f1a9b23c815ce.jpg)
Assignment

- Scrape the full article list of the Big Data column and save the output to a file
- For each article, collect: author, title, article URL, abstract, thumbnail URL, read count, comment count, like count, and reward count
Local environment

- Windows 7, 64-bit
- Anaconda Navigator 1.6.2
- Jupyter Notebook 5.0.0
- Python 3.6.1
Import modules

```python
# coding=utf-8
import re
import time
import urllib.request   # urlopen lives in the request submodule
import urllib.error     # URLError / HTTPError live in the error submodule
import pandas as pd
import matplotlib.pyplot as plt   # needed for the plots further down
from bs4 import BeautifulSoup
```
Define a function to download a page's source

```python
def download(url, retry=2):
    """Fetch the HTML of url, retrying on 5xx server errors."""
    print(url, 'Is analyzing ...')
    try:
        html = urllib.request.urlopen(url).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if retry > 0:
            # Retry only on 5xx server errors
            if hasattr(e, 'code') and 500 <= e.code < 600:
                print(e.code)
                return download(url, retry - 1)
    time.sleep(0.3)  # short pause between requests, to be polite to the server
    return html
```
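Note that jianshu.com may reject requests carrying urllib's default User-Agent. Below is a minimal sketch of a variant that sends a browser-like header; the function name and header string are illustrative, not part of the original code:

```python
def download_with_headers(url, retry=2):
    # Hypothetical variant of download(): wrap the URL in a Request carrying a
    # browser-like User-Agent, since some sites block urllib's default agent.
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        return urllib.request.urlopen(req).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        if retry > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download_with_headers(url, retry - 1)
        return None
```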
Define a function to parse a page

```python
def analyse(url, url_root):
    """Parse one list page and return its articles as a DataFrame."""
    name_list = []
    title_list = []
    link_list = []
    abstract_list = []
    have_img_list = []
    pic_list = []
    meta_list = []
    read_list = []
    comments_list = []
    like_list = []
    money_list = []
    data = pd.DataFrame()
    html = download(url)
    soup = BeautifulSoup(html, "html.parser")
    # Author names
    names = soup.find_all('a', {'class': 'blue-link'})
    for i in names:
        name_list.append(i.text)
    # Titles and article links share the same <a class="title"> tag
    titles = soup.find_all('a', {'class': 'title'})
    for i in titles:
        title_list.append(i.text)
    links = soup.find_all('a', {'class': 'title'})
    for i in links:
        link_list.append(url_root + i.get('href'))
    # Abstracts
    abstracts = soup.find_all('p', {'class': 'abstract'})
    for i in abstracts:
        abstract_list.append(i.text.strip())  # trim surrounding whitespace/newlines
    # Whether each article <li> carries the 'have-img' class, i.e. has a thumbnail
    have_imgs = soup.find_all('li', {'id': re.compile(r'note-\d+')})
    for i in have_imgs:
        have_img_list.append('have-img' in i.get('class', []))
    # Thumbnail URLs; articles without a thumbnail get an empty slot so the
    # list stays aligned with the other columns
    pics = soup.find_all('img', {'alt': '120'})
    for i in pics:
        pic_list.append(i.attrs['src'])
    for i in range(len(have_img_list)):
        if not have_img_list[i]:
            pic_list.insert(i, '')
    # Read / comment / like / reward counts sit in the trailing meta <div>;
    # the bracketed [...] is a character class matching the next closing-tag character
    metas = soup.find_all('div', {'class': 'meta'})
    for i in metas:
        meta = re.findall(r'</i>\s?(\d*)\s?[</a>|</span>]', str(i))
        meta_list.append(meta)
    data['name'] = name_list
    data['title'] = title_list
    data['link'] = link_list
    data['abstract_list'] = abstract_list
    data['pic'] = pic_list
    # The reward count is present only when an article has been rewarded
    for i in range(len(meta_list)):
        read_list.append(int(meta_list[i][0]))
        comments_list.append(int(meta_list[i][1]))
        like_list.append(int(meta_list[i][2]))
        if len(meta_list[i]) == 4:
            money_list.append(int(meta_list[i][3]))
        else:
            money_list.append(0)
    data['read'] = read_list
    data['comments'] = comments_list
    data['like'] = like_list
    data['money'] = money_list
    return data
```
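Before crawling every page, it helps to sanity-check the parser on a single page. A quick smoke test (assumes the live page still matches the selectors above):

```python
# Parse only the first list page and inspect the resulting DataFrame.
page1 = analyse('http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=1',
                'http://www.jianshu.com')
print(len(page1), 'articles found')
print(page1[['name', 'title', 'read', 'like']].head())
```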
Define a few variables

```python
url_root = 'http://www.jianshu.com'
url_seed = 'http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=%d'
```
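url_seed is an old-style %-format string: substituting a page number yields the URL of each list page, for example:

```python
print(url_seed % 3)
# -> http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=3
```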
Main execution

```python
flag = True
i = 1
df = pd.DataFrame()
while flag:
    url = url_seed % i
    data = analyse(url, url_root)
    if len(data) == 0:
        # An empty page means we ran past the last page of the column
        print('page', i, 'is nothing.')
        flag = False
        break
    else:
        df = pd.concat((df, data), ignore_index=True)
        i += 1
print('end. total:', i - 1, 'page.')
```
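The assignment also asks for the results to be written to a file. A minimal sketch using pandas' to_csv; the filename and encoding are illustrative choices (utf-8-sig keeps the Chinese text readable in Excel):

```python
# Persist the scraped article list to disk.
df.to_csv('jianshu_bigdata_articles.csv', index=False, encoding='utf-8-sig')
```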
Run results

```
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=1 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=2 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=3 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=4 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=5 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=6 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=7 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=8 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=9 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=10 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=11 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=12 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=13 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=14 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=15 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=16 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=17 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=18 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=19 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=20 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=21 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=22 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=23 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=24 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=25 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=26 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=27 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=28 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=29 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=30 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=31 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=32 Is analyzing ...
page 32 is nothing.
end. total: 31 page.
```
The scraped data, summarized in a table:
```python
pd.set_option('max_colwidth', 20)
df
```
![](https://img.haomeiwen.com/i6581981/c986d59e3d5c7469.jpg)
Further processing of the data

```python
aaa = df[['name', 'title']].groupby('name').count()   # article count per author
aaa.columns = ['article']
bbb = df[['name', 'read', 'comments', 'like', 'money']].groupby('name').sum()  # totals per author
ccc = aaa.join(bbb)
ccc.columns = ['article', 'read', 'comments', 'like', 'money']
ccc['mean_read'] = ccc['read'] / ccc['article']  # average reads per article
ccc['mean_like'] = ccc['like'] / ccc['article']  # average likes per article
```
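As an aside, the same per-author table can be built in one pass with groupby().agg(). A sketch, assuming a pandas recent enough for named aggregation (0.25+); column names mirror the ccc frame above:

```python
# Hypothetical one-pass equivalent of the aaa/bbb/ccc construction.
stats = df.groupby('name').agg(
    article=('title', 'count'),
    read=('read', 'sum'),
    comments=('comments', 'sum'),
    like=('like', 'sum'),
    money=('money', 'sum'),
)
stats['mean_read'] = stats['read'] / stats['article']
stats['mean_like'] = stats['like'] / stats['article']
```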
Authors ranked by total article count, TOP 20

```python
ccc.sort_values('article', ascending=False).head(20)
```
![](https://img.haomeiwen.com/i6581981/f5f959ed6326d1a9.png)
Authors ranked by total read count, TOP 20

```python
ccc.sort_values('read', ascending=False).head(20)
```
![](https://img.haomeiwen.com/i6581981/5ee44ace3b6edf15.png)
Authors ranked by total like count, TOP 20

```python
ccc.sort_values('like', ascending=False).head(20)
```
![](https://img.haomeiwen.com/i6581981/2e94a8f74fcf2214.png)
Authors ranked by total reward count, TOP 20

```python
ccc.sort_values('money', ascending=False).head(20)
```
![](https://img.haomeiwen.com/i6581981/c480ff9f17ff4ce6.png)
Plotting
```python
plt.rcParams['font.sans-serif'] = ['SimHei']  # render the Chinese labels correctly
plt.figure(figsize=(7, 5))
plt.hist(df.name.value_counts(), rwidth=0.9, bins=17)
plt.title('专栏作者投稿篇数对应的人数 直方分布图', fontsize=20)  # authors per submission count, histogram
plt.xlabel('作者投稿篇数', fontsize=13)  # articles submitted per author
plt.ylabel('对应人数', fontsize=13)      # number of authors
plt.xticks(range(17))
plt.hlines(ccc.article.mean(), 1, 17, colors="r", linestyles="dashed",
           label="平均每人投稿 %.2f 篇" % ccc.article.mean())  # "on average %.2f articles per author"
plt.legend(loc='upper right', fontsize=12)  # legend in the upper-right corner
plt.show()
```
![](https://img.haomeiwen.com/i6581981/b96d5d176bdfeeed.png)
```python
plt.rcParams['font.sans-serif'] = ['SimHei']  # render the Chinese labels correctly
plt.figure(figsize=(15, 15))
size = (ccc.money + 0.5) * 200  # marker size proportional to the reward count
plt.scatter(x=ccc.article, y=ccc.like, s=size, alpha=0.35)  # s sets the marker size
# plt.xscale('symlog')
plt.yscale('log')
tick_val = [1, 5, 10, 20, 50, 100, 400]  # tick positions for the log y-axis
tick_lab = ['1', '5', '10', '20', '50', '100', '400']
plt.yticks(tick_val, tick_lab)
plt.xlabel('作者总投稿文章篇数', fontsize=20)  # total articles per author
plt.ylabel('作者文章总喜欢(点赞)数', fontsize=20)  # total likes per author
plt.title('<解密大数据专栏> 投稿作者文章篇数点赞关系图', fontsize=25)  # articles vs. likes, per author
plt.text(2, 200, '泡泡越大,该作者收到的打赏次数越多', fontsize=20, color='#666666')  # "bigger bubble = more rewards received"
# Annotate the 30 most-liked authors with their names
dd = ccc.sort_values('like', ascending=False).head(30)
x = list(dd.article)
y = list(dd.like)
z = list(dd.index)
for a, b, c in zip(x, y, z):
    plt.text(a, b, c, ha='center', va='center', fontsize=12, alpha=0.8)
plt.grid(True)  # add a grid
plt.show()
```
![](https://img.haomeiwen.com/i6581981/ee04babbf93db8c4.png)