本文仍是大鹏老师课程的复习笔记:这节课对《复仇者联盟4》(复联4)的豆瓣短评进行爬取与分析。
1、数据爬取 (函数式编程,requests包和BeautifulSoup包的使用)
# 导入工具包
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Build the comment-page URLs (pages differ only in the `start` offset)
def get_urls(n):
    """Return the URLs of the first ``n`` short-comment pages.

    Every page is requested with ``limit=20``, so page ``i`` starts at
    offset ``i * 20``.

    Args:
        n: Number of pages to generate URLs for.

    Returns:
        A list of ``n`` URL strings, in page order.
    """
    url_template = ('https://movie.douban.com/subject/26100958/comments'
                    '?start=%i&limit=20&sort=new_score&status=P')
    return [url_template % (i * 20) for i in range(n)]


urllsts = get_urls(50)
# Download one comment page and parse its comments
def _parse_comment(infor):
    """Turn one ``div.comment-item`` tag into a dict of comment fields.

    Raises AttributeError / TypeError / IndexError / KeyError / ValueError
    when the tag does not have the expected structure (for example when the
    second span in ``comment-info`` is not a rating span).
    """
    info = infor.find('span', class_="comment-info")
    return {
        '评论者': info.find('a').text,
        # The rating is taken from the last two characters of the first CSS
        # class of the second span inside ``comment-info``.
        '评分': int(info.find_all('span')[1]['class'][0][-2:]),
        '评论时间': infor.find('span', class_="comment-time").text.replace(' ', '').replace('\n', ''),
        '有用数量': int(infor.find('span', class_="votes").text),
        '评论内容': infor.find('p').text.replace('\n', ''),
    }


def get_data(urli):
    """Fetch the page at ``urli`` and return its comments as a list of dicts.

    Args:
        urli: URL of one comment page.

    Returns:
        A list of dicts with the keys '评论者', '评分', '评论时间',
        '有用数量' and '评论内容'. The list is empty when the request fails
        or the page has no ``div#comments`` container. Comments that do not
        match the expected structure are skipped individually.
    """
    try:
        # Use the function's own argument (not a module-level loop variable)
        # and bound the wait so a stalled connection cannot hang the crawl.
        ri = requests.get(url=urli, timeout=10)
    except requests.RequestException:
        return []
    ri.encoding = 'utf-8'
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    soupi = BeautifulSoup(ri.text, 'html.parser')
    container = soupi.find('div', id="comments")
    if container is None:
        return []
    datalsti = []
    for infor in container.find_all('div', class_="comment-item"):
        try:
            datalsti.append(_parse_comment(infor))
        except (AttributeError, IndexError, KeyError, TypeError, ValueError):
            # Best effort: drop only the malformed comment, keep the rest.
            continue
    return datalsti
# Fetch every page in turn and collect the parsed comments
datalst = []
for u in urllsts:
    datalst.extend(get_data(u))
    # Report how many comments have really been collected so far; a page
    # that yields nothing must not be counted as 20 successful records.
    print('成功获取%i条数据' % len(datalst))
# One row per comment, one column per extracted field
df = pd.DataFrame(datalst)
image.png
2、数据查看 (直方图和散点图的使用)
# Distribution of comment length (number of characters per comment)
df['评论字数'] = df['评论内容'].str.len()
fig_hist = plt.figure(figsize=(12, 5))
ax_hist = fig_hist.gca()
ax_hist.set_title('评论字数数据分布')
df['评论字数'].hist(bins=20, edgecolor='white', ax=ax_hist)
ax_hist.grid(linestyle='--')

# Relationship between comment length and number of "useful" votes
fig_scatter = plt.figure(figsize=(12, 5))
ax_scatter = fig_scatter.gca()
ax_scatter.set_title('评论字数与有用数量关系')
ax_scatter.scatter(df['评论字数'], df['有用数量'], alpha=0.4)
ax_scatter.set_xlabel('评论字数')
ax_scatter.set_ylabel('有用数量')
ax_scatter.grid(linestyle='--')

# Comments with more than 5000 useful votes, most voted first.
# NOTE(review): bare expression -- presumably relies on notebook display; confirm.
df[df['有用数量'] > 5000].sort_values(by='有用数量', ascending=False)
3、关键人物分析 (人物出现次数)
# Count how many comments mention a given keyword
def name_count(namei, s):
    """Return the number of items in ``s`` that contain ``namei``.

    Each item is counted at most once, no matter how many times the keyword
    appears inside it, so the result is "items mentioning the keyword"
    rather than the total number of occurrences.

    Args:
        namei: Keyword (substring) to look for.
        s: Iterable of strings, e.g. a column of comment texts.

    Returns:
        The count of matching items; 0 for an empty iterable.
    """
    return sum(1 for text in s if namei in text)
# Quick check: number of comments that mention 美队
name_count('美队', df['评论内容'])

# Count the comments mentioning each character keyword
namelst = [
    '美队', '钢铁侠', '灭霸', '黑寡妇', '雷神', '浩克', '惊奇队长',
    '鹰眼', '蚁人', '奇异博士', '蜘蛛侠', '星云', '黑豹',
]
lst = [
    {'关键词': keyword, '出现频率': name_count(keyword, df['评论内容'])}
    for keyword in namelst
]

# Save the keyword table as CSV
result = pd.DataFrame(lst)
result.to_csv('/home/kesci/result.csv')
网友评论