Crawler Assignment 04: Scraping More Data About a Column's Articles

Author: pnjoe | Published 2017-08-08 16:57, read 34 times

Assignment

  • Scrape the full article list of the 解密大数据 (Decrypting Big Data) column and save the results to a file
  • For each article, collect: author, title, article URL, abstract, thumbnail URL, read count, comment count, like count, and reward count (a sample record is sketched below)
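
For reference, a single scraped record should end up looking roughly like this (purely illustrative; the field names and values are placeholders, not the exact DataFrame columns built later):

# a made-up example of one target record
record = {
    'author':    'pnjoe',                                  # article author
    'title':     'some article title',
    'link':      'http://www.jianshu.com/p/xxxxxxxxxxxx',  # article URL (placeholder)
    'abstract':  'first few lines of the article ...',
    'thumbnail': '',                                       # thumbnail URL, empty if the article has none
    'read':      120,                                      # read count
    'comments':  3,                                        # comment count
    'like':      8,                                        # like count
    'money':     1,                                        # reward (tip) count
}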

Local environment

Windows 7, 64-bit
Anaconda Navigator 1.6.2
Jupyter Notebook 5.0.0
Python 3.6.1

Import modules

#coding=utf-8

import re
import time
import urllib.request            # urlopen lives in urllib.request under Python 3
import urllib.error              # URLError / HTTPError, used by the download retry logic
import pandas as pd
import matplotlib.pyplot as plt  # needed for the charts at the end
from bs4 import BeautifulSoup

Define a function that downloads a page's HTML source

def download(url, retry=2):
    print(url, 'Is analyzing ...')

    try:
        html = urllib.request.urlopen(url).read()
    except urllib.error.URLError as e:   # catch URLError/HTTPError, not the urllib.error module itself
        print('Download error:', e.reason)
        html = None
        if retry > 0:
            # retry only on 5xx server errors
            if hasattr(e, 'code') and 500 <= e.code < 600:
                print(e.code)
                return download(url, retry - 1)

    time.sleep(0.3)   # short pause between requests to stay polite
    return html
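
A quick way to sanity-check the downloader is to fetch the first page of the column (the same URL pattern the main loop below uses):

html = download('http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=1')
if html:
    print(len(html), 'bytes downloaded')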

Define a function that parses a page and collects the article data

def analyse(url, url_root):
    
    
    name_list=[]
    title_list=[]
    link_list=[]
    abstract_list=[]
    have_img_list=[]
    pic_list=[]
    meta_list=[]
    read_list=[]
    comments_list=[]
    like_list=[]
    money_list=[]
    data = pd.DataFrame()
    
    
    html = download(url)    
    soup = BeautifulSoup(html,"html.parser")
    
    names = soup.find_all('a',{'class':'blue-link'})
    for i in names:
        name_list.append(i.text)

    titles = soup.find_all('a',{'class':'title'})
    for i in titles:
        title_list.append(i.text)

    links = soup.find_all('a',{'class':'title'})
    for i in links:
        link_list.append(url_root+ i.get('href'))

    abstracts = soup.find_all('p',{'class':'abstract'})
    for i in abstracts:
        abstract = str(i.text).replace('\n ', '')
        abstract_list.append(abstract)

    # each article is an <li id="note-XXXX">; those with a thumbnail carry the class 'have-img'
    have_imgs = soup.find_all('li',{'id':re.compile(r"note-\d+")})
    for i in have_imgs:
        img = str(i.attrs['class']).replace("[\'", '').replace("\']", '')
        if img:
            have_img_list.append(True)
        else:
            have_img_list.append(False)

    pics = soup.find_all('img',{'alt':'120'})
    for i in pics:
        pic_list.append(i.attrs['src'])


    for i in range(len(have_img_list)):
        if have_img_list[i] == False:   # if the article has no thumbnail, insert an empty entry into pic_list
            pic_list.insert(i,'')


    # the read/comment/like/reward counts all sit inside the article's 'meta' div
    metas = soup.find_all('div',{'class':'meta'})
    for i in metas:
        meta = re.findall(r'</i>\s?(\d*)\s?[</a>|</span>]',str(i))
        meta_list.append(meta)

    data['name'] = name_list
    data['title'] = title_list
    data['link'] = link_list
    data['abstract_list'] = abstract_list
    data['pic'] = pic_list

    for i in range(len(meta_list)):
        read_list.append(int(meta_list[i][0]))
        comments_list.append(int(meta_list[i][1]))
        like_list.append(int(meta_list[i][2]))
        if len(meta_list[i]) == 4:
            money_list.append(int(meta_list[i][3]))
        else:
            money_list.append(int(0))

    data['read'] = read_list
    data['comments'] = comments_list
    data['like'] = like_list
    data['money'] = money_list
    
    return data
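
To make the meta parsing concrete, here is the same regex applied to a simplified, hypothetical version of an article's 'meta' block (the real Jianshu markup is more verbose, but the shape is the same). Articles that never received a reward yield only three numbers, which is why the code above checks len(meta_list[i]) == 4:

# hypothetical, simplified 'meta' div for one article
sample_meta = ('<div class="meta">'
               '<a class="read"><i></i> 343</a>'
               '<a class="comments"><i></i> 12</a>'
               '<span class="like"><i></i> 25</span>'
               '<span class="money"><i></i> 2</span>'
               '</div>')
print(re.findall(r'</i>\s?(\d*)\s?[</a>|</span>]', sample_meta))
# -> ['343', '12', '25', '2']   i.e. read, comments, like, money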

Define a couple of variables

url_root = 'http://www.jianshu.com'
url_seed='http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=%d'
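
url_seed is a plain format string; the page number is substituted in with the % operator, for example:

print(url_seed % 3)
# http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=3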

Run the main crawl

flag = True
i = 1
df = pd.DataFrame()
while flag:
    url = url_seed % i
    data = analyse(url, url_root)
    if len(data) == 0:
        print ('page',i,'is nothing.')
        flag = False
        break
        
    else:
        df = pd.concat((df,data),ignore_index=True)
        i += 1
        
print ('end. total:',i-1,'page.')

Run results

http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=1 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=2 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=3 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=4 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=5 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=6 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=7 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=8 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=9 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=10 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=11 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=12 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=13 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=14 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=15 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=16 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=17 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=18 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=19 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=20 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=21 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=22 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=23 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=24 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=25 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=26 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=27 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=28 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=29 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=30 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=31 Is analyzing ...
http://www.jianshu.com/c/9b4685b6357c?order_by=added_at&page=32 Is analyzing ...
page 32 is nothing.
end. total: 31 page.

Put the scraped data into a table and take a look

pd.set_option('max_colwidth',20)
df
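
The assignment also asks for the scraped list to be written out to a file. A minimal sketch, with the filename and encoding being arbitrary choices:

# write the full article table to CSV; utf-8-sig keeps the Chinese text readable when opened in Excel on Windows
df.to_csv('jianshu_column_articles.csv', index=False, encoding='utf-8-sig')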

Process the data a bit further

aaa = df[['name','title']].groupby('name').count()                         # number of articles per author
aaa.columns = ['article']
bbb = df[['name','read','comments','like','money']].groupby('name').sum()  # per-author totals
ccc = aaa.join(bbb)
ccc.columns = ['article', 'read', 'comments', 'like', 'money']
ccc['mean_read'] = ccc['read'] / ccc['article']                            # average reads per article
ccc['mean_like'] = ccc['like'] / ccc['article']                            # average likes per article
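
As an aside, the same per-author summary can be built in one pass with named aggregation; this needs a newer pandas than the version listed above, so treat it as an alternative sketch rather than the method used here:

# alternative: a single groupby with named aggregation (requires pandas >= 0.25)
ccc_alt = df.groupby('name').agg(
    article=('title', 'count'),
    read=('read', 'sum'),
    comments=('comments', 'sum'),
    like=('like', 'sum'),
    money=('money', 'sum'),
)
ccc_alt['mean_read'] = ccc_alt['read'] / ccc_alt['article']
ccc_alt['mean_like'] = ccc_alt['like'] / ccc_alt['article']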

Top 20 authors by total number of articles

ccc.sort_values('article',ascending=False).head(20)

Top 20 authors by total reads

ccc.sort_values('read',ascending=False).head(20)

Top 20 authors by total likes

ccc.sort_values('like',ascending=False).head(20)

Top 20 authors by total number of rewards received

ccc.sort_values('money',ascending=False).head(20)

Plots

plt.rcParams['font.sans-serif'] = ['SimHei']   # use SimHei so the Chinese labels render correctly
plt.figure(figsize=(7,5)) 
plt.hist(df.name.value_counts(),rwidth=0.9,bins=17)
plt.title('专栏作者投稿篇数对应的人数 直方分布图',fontsize=20)
plt.xlabel('作者投稿篇数',fontsize=13)  
plt.ylabel('对应人数',fontsize=13)  
plt.xticks(range(17))


plt.hlines(ccc.article.mean(), 1,17,  colors = "r", linestyles = "dashed",label="平均每人投稿 %.2f 篇" % ccc.article.mean())
plt.legend(loc='upper right', fontsize=12)   # show the legend in the upper right corner
plt.show()
plt.rcParams['font.sans-serif'] = ['SimHei']   # use SimHei so the Chinese labels render correctly
plt.figure(figsize=(15,15))
size = (ccc.money + 0.5) * 200   # marker size, proportional to the number of rewards received
plt.scatter(x=ccc.article, y=ccc.like, s=size, alpha=0.35)   # s sets the marker size

#plt.xscale('symlog')
plt.yscale('log')
# y-axis tick positions and labels (total like counts)
tick_val = [1,5,10,20,50,100,400]
tick_lab = ['1','5','10','20','50','100','400']
plt.yticks(tick_val, tick_lab)

plt.xlabel('作者总投稿文章篇数',fontsize=20)  
plt.ylabel('作者文章总喜欢(点赞)数',fontsize=20)  
plt.title('<解密大数据专栏> 投稿作者文章篇数点赞关系图',fontsize=25)

plt.text(2, 200, '泡泡越大,该作者收到的打赏次数越多',fontsize=20 ,color='#666666')

dd=ccc.sort_values('like',ascending=False).head(30)
x=list(dd.article)
y=list(dd.like)
z=list(dd.index)
for a,b,c in zip(x,y,z):
    plt.text(a, b,c,ha='center',va='center',fontsize=12 ,alpha=0.8)

plt.grid(True)   # add a grid
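
# (optional, not part of the original assignment) save the scatter chart to an image file;
# note this must run before plt.show(), and the filename/dpi here are arbitrary choices
plt.savefig('column_articles_scatter.png', dpi=150, bbox_inches='tight')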

plt.show()
