import pandas as pd
import os
# Exploratory analysis of movie ratings: load the users / ratings / movies
# tables, join them, and compare average ratings by gender and by title.
# The bare expressions (e.g. ``sorted_by_diff[:10]``) are meant to be inspected
# in an interactive session or notebook; run as a plain script they display
# nothing.

# Show the working directory the relative data-file paths are resolved against.
os.getcwd()

# Load the three '::'-delimited tables. A multi-character separator is treated
# as a regular expression by pandas, hence engine='python'.
# NOTE(review): no encoding is passed, so the pandas default is used — confirm
# that the .dat files decode cleanly with it.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table(r'users.dat', sep='::', header=None, names=unames, engine='python')
# Fixed: 'timestamp' was missing its opening quote, which made the whole file
# a SyntaxError.
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table(r'ratings.dat', sep='::', header=None, names=rnames, engine='python')
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table(r'movies.dat', sep='::+', header=None, names=mnames, engine='python')

# Join ratings with users (shared column: user_id), then join the result with
# movies (shared column: movie_id). pd.merge infers the keys from the
# overlapping column names.
data = pd.merge(pd.merge(ratings, users), movies)

# Mean rating of every title, one column per gender value.
mean_ratings = data.pivot_table('rating', index='title', columns='gender', aggfunc='mean')

# Number of ratings per title (size() gives a Series of group sizes).
ratings_by_title = data.groupby('title').size()

# Titles that received at least 250 ratings.
active_titles = ratings_by_title.index[ratings_by_title >= 250]

# Restrict the per-gender means to those frequently rated titles.
mean_ratings = mean_ratings.loc[active_titles]

# Titles ordered by the female ('F') mean rating, highest first.
top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)

# Gap between male and female mean rating, sorted ascending: the most negative
# values (rated higher by women) come first.
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
sorted_by_diff = mean_ratings.sort_values(by='diff')

# Ten titles women rated highest relative to men.
sorted_by_diff[:10]

# Reverse the row order, then take ten rows: titles men rated highest relative
# to women.
sorted_by_diff[::-1][:10]

# Standard deviation of the ratings for each title.
rating_std_by_title = data.groupby('title')['rating'].std()

# Keep only the titles with at least 250 ratings.
rating_std_by_title = rating_std_by_title.loc[active_titles]

# Ten titles whose ratings vary the most, i.e. viewers disagree the most.
rating_std_by_title.sort_values(ascending=False)[:10]
# Reader comments (heading left over from the source web page; not part of the script)