# 读取 .dat 文件
import pandas as pd
import numpy as np
pd.options.display.max_rows = 10  # show fewer rows when a frame is displayed


def _read_dat(path, columns):
    """Load a header-less, '::'-separated .dat file using the given column names."""
    return pd.read_table(path, sep='::', header=None, names=columns, engine='python')


# Users table: one row per user.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = _read_dat('users.dat', unames)
users.head(5)

# Ratings table: one row per (user, movie) rating.
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = _read_dat('ratings.dat', rnames)
ratings.head(5)

# Movies table: one row per movie.
mnames = ['movie_id', 'title', 'genres']
movies = _read_dat('movies.dat', mnames)
movies.head(5)
# 连接这三个表
# Combine the three tables into one frame. No join keys are passed, so each
# merge joins on the column names the two frames have in common
# ('user_id' for ratings/users, then 'movie_id' for the result and movies).
ratings_with_users = ratings.merge(users)
data = ratings_with_users.merge(movies)
data
# Mean rating of every movie, split by gender: one row per title and one
# column per gender value (the 'F' and 'M' columns are used below).
mean_ratings = pd.pivot_table(
    data,
    values='rating',
    index='title',
    columns='gender',
    aggfunc='mean',
)
mean_ratings.head(5)

# Drop movies that received fewer than 250 ratings.
ratings_by_title = data.groupby('title').size()
has_enough_ratings = ratings_by_title >= 250
active_title = ratings_by_title.index[has_enough_ratings]
active_title
ratings_by_title[has_enough_ratings].index  # equivalent to the expression above
# Original author's note: 1216 movies reach the 250-rating threshold.
# Restrict the per-gender means to those movies.
mean_ratings = mean_ratings.loc[active_title]
mean_ratings

# Top movies among female viewers (descending by the 'F' column).
top_female_ratings = mean_ratings.sort_values('F', ascending=False)
top_female_ratings.head(10)

# Top movies among male viewers (descending by the 'M' column).
top_male_ratings = mean_ratings.sort_values('M', ascending=False)
top_male_ratings.head(10)

# Measure rating disagreement: male mean minus female mean per movie.
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
sorted_by_diff = mean_ratings.sort_values('diff', ascending=False)
sorted_by_diff.head(10)  # the 10 movies men rated highest relative to women
sorted_by_diff.iloc[::-1].head(10)  # the 10 movies women rated highest relative to men

# Most divisive movies: largest standard deviation of ratings,
# restricted to the active titles by reusing the index built earlier.
rating_std_by_title = data.groupby('title')['rating'].std().loc[active_title]
rating_std_by_title.sort_values(ascending=False)
# 思路很关键!
# 网友评论