美文网首页
电影评分数据

电影评分数据

作者: 彭健平6点30 | 来源:发表于2017-03-28 00:44 被阅读398次
    import pandas as pd
    import os #导入os模块
    encoding = 'latin1'  # text encoding handed to every read_csv call below
    # Read each table into its own pandas DataFrame.
    # pandas has two core containers: a DataFrame is a table-like structure, and a
    # Series is a one-dimensional array-like object that holds an array of data
    # together with an associated array of labels called the index.

    # os.path is mainly used to inspect and manipulate file paths:
    #   os.path.expanduser(path)  replaces "~" / "~user" in path with that user's home directory
    #   os.path.expandvars(path)  substitutes "$name" / "${name}" from environment variables
    # The relative paths below contain no "~", so expanduser returns them unchanged.
    upath = os.path.expanduser('ch02/movielens/users.dat')
    rpath = os.path.expanduser('ch02/movielens/ratings.dat')
    mpath = os.path.expanduser('ch02/movielens/movies.dat')


    unames = ['user_id', 'gender', 'age', 'occupation', 'zip']  # column names for each table
    rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
    mnames = ['movie_id', 'title', 'genres']

    # sep='::' means fields are separated by "::" (a comma-separated file would use
    # sep=','). A separator longer than one character is interpreted as a regex,
    # which only the 'python' parser engine supports; naming the engine explicitly
    # avoids the ParserWarning about falling back from the 'c' engine.
    # header=None tells pandas the files have no header row, so names= supplies
    # the column labels.
    users = pd.read_csv(upath, sep='::', header=None, names=unames,
                        encoding=encoding, engine='python')
    ratings = pd.read_csv(rpath, sep='::', header=None, names=rnames,
                          encoding=encoding, engine='python')
    movies = pd.read_csv(mpath, sep='::', header=None, names=mnames,
                         encoding=encoding, engine='python')
    
    /Users/zhongyaode/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
      if __name__ == '__main__':
    /Users/zhongyaode/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:2: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
      from ipykernel import kernelapp as app
    /Users/zhongyaode/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:3: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
      app.launch_new_instance()
    

    users[:20]#切语法;查看DataFrame

    ratings[:5]

    movies[:5]

    ratings

    # Combine ratings with users, and then with movies, into a single table.
    # When no key is given pandas joins on the column names the two frames share;
    # here those are 'user_id' (ratings/users) and 'movie_id' (result/movies), so
    # the keys are spelled out to make the join explicit. The merge is done once:
    # repeating the identical merge only recomputes the same table.
    data = ratings.merge(users, on='user_id').merge(movies, on='movie_id')

    data

    data['rating'].mean()
    
    3.5815644530293169
    
    data.loc[1]  # row whose index label is 1 (.ix is no longer available in current pandas)
    
    user_id                                            2
    movie_id                                        1193
    rating                                             5
    timestamp                                  978298413
    gender                                             M
    age                                               56
    occupation                                        16
    zip                                            70072
    title         One Flew Over the Cuckoo's Nest (1975)
    genres                                         Drama
    Name: 1, dtype: object
    

    按性别计算每部电影的平均得分,可以使用pivot_table

    # Mean rating per movie title, with one column per gender value.
    mean_ratings = data.pivot_table(
        values='rating',
        index='title',
        columns='gender',
        aggfunc='mean',
    )
    mean_ratings.head(7)

    # Goal: filter out movies with fewer than 250 ratings. Group the rows by title
    # and call size() to obtain a Series holding the number of rows in each group.
    title_groups = data.groupby('title')
    ratings_by_title = title_groups.size()

    ratings_by_title.head(10)
    
    title
    $1,000,000 Duck (1971)                37
    'Night Mother (1986)                  70
    'Til There Was You (1997)             52
    'burbs, The (1989)                   303
    ...And Justice for All (1979)        199
    1-900 (1994)                           2
    10 Things I Hate About You (1999)    700
    101 Dalmatians (1961)                565
    101 Dalmatians (1996)                364
    12 Angry Men (1957)                  616
    dtype: int64
    
    # Keep only the titles whose rating count is at least 250.
    frequently_rated = ratings_by_title[ratings_by_title >= 250]
    active_titles = frequently_rated.index
    active_titles
    
    Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
           '101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)',
           '13th Warrior, The (1999)', '2 Days in the Valley (1996)',
           '20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
           '2010 (1984)',
           ...
           'X-Men (2000)', 'Year of Living Dangerously (1982)',
           'Yellow Submarine (1968)', 'You've Got Mail (1998)',
           'Young Frankenstein (1974)', 'Young Guns (1988)',
           'Young Guns II (1990)', 'Young Sherlock Holmes (1985)',
           'Zero Effect (1998)', 'eXistenZ (1999)'],
          dtype='object', name='title', length=1216)
    

    # Restrict the per-gender means to the frequently rated titles. Label-based
    # selection uses .loc; the old .ix indexer is no longer available in pandas.
    mean_ratings = mean_ratings.loc[active_titles]
    mean_ratings

    为了了解女性观众最喜欢的电影,对 F 列进行降序排列

    # Order the movies by the 'F' column, highest mean first
    # (sort_values is used because sort_index(by=...) was deprecated).
    top_female_ratings = mean_ratings.sort_values('F', ascending=False)
    top_female_ratings.head(10)

    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    from scipy import stats
    
    
    data['age'].mean()
    
    29.738313692438279
    
    data['age'].max()
    
    56
    
    data['age'].min()
    
    1
    
    data['age'].var()#var方差std标准差
    
    138.10909427256377
    
    # Draw a histogram of the 'age' column.
    x = data['age']
    numBins = 5
    fig, ax = plt.subplots()  # one figure holding a single axes
    ax.hist(x, bins=numBins, color='red', alpha=0.8, rwidth=0.5)
    ax.set_title(u'age')
    plt.show()
    
    
    import numpy as np
    import pandas as pd 
    import matplotlib.pyplot as plt 
    %matplotlib inline
    %config InlineBackend.figure_format='retina'
    def normfun(x, mu, sigma):
        """Return the normal (Gaussian) probability density evaluated at x.

        x may be a scalar or a NumPy array; mu is the mean and sigma the
        standard deviation of the distribution.
        """
        exponent = -((x - mu) ** 2) / (2 * sigma ** 2)
        normalizer = sigma * np.sqrt(2 * np.pi)
        return np.exp(exponent) / normalizer
    
    def normfun(x, mu, sigma):
        """Evaluate the Gaussian density with mean mu and std sigma at x."""
        # 1 / (sigma * sqrt(2*pi)) is the factor that makes the curve integrate to 1.
        scale = sigma * np.sqrt(2 * np.pi)
        return np.exp(-((x - mu) ** 2) / (2 * sigma ** 2)) / scale
    
    p=data['age']
    # The curve needs the sample mean and standard deviation; compute them here so
    # this cell does not depend on a later cell having been executed first.
    mean=p.mean()
    std=p.std()

    x=np.arange(1,60,1)  # evaluation grid: integers from 1 up to 59, step 1
    y=normfun(x,mean,std)  # normal density using the sample mean and std
    plt.plot(x,y)
    # p is the data, bins the number of bars, rwidth the relative width of each bar.
    # density=True rescales the bars so their total area is 1, which puts them on
    # the same scale as the density curve; it replaces the removed `normed` argument.
    plt.hist(p,bins=6,rwidth=0.9,density=True)
    plt.title("age")
    plt.xlabel("stakes")  # NOTE(review): the x axis holds ages; this label looks like a leftover - confirm
    plt.ylabel("Probability")
    plt.show()
    
    # Compute the sample statistics here so the curve does not depend on a later
    # cell having been executed first.
    mean=p.mean()
    std=p.std()

    x=np.arange(1,60,1)  # evaluation grid: integers from 1 up to 59, step 1
    y=normfun(x,mean,std)  # normal density using the sample mean and std
    plt.plot(x,y)
    # Same plot as before but with only 2 bars. density=True rescales the bars so
    # their total area is 1; it replaces the removed `normed` argument.
    plt.hist(p,bins=2,rwidth=0.9,density=True)
    plt.title("age")  # the plotted column is age, matching the other age plots
    plt.xlabel("stakes")  # NOTE(review): the x axis holds ages; this label looks like a leftover - confirm
    plt.ylabel("Probability")
    plt.show()
    
    # Compute the sample statistics here so the curve does not depend on a later
    # cell having been executed first.
    mean=p.mean()
    std=p.std()

    x=np.arange(1,60,1)  # evaluation grid: integers from 1 up to 59, step 1
    y=normfun(x,mean,std)  # normal density using the sample mean and std
    plt.plot(x,y)
    ![output_34_0.png](https://img.haomeiwen.com/i2007820/408b82f75a63a3f3.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
    ![](https://img.haomeiwen.com/i2007820/cb7ffce4a59d0504.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
    ![](https://img.haomeiwen.com/i2007820/a8fce3d1a47c8184.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
    ![](https://img.haomeiwen.com/i2007820/9271b213266a0748.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
    ![](https://img.haomeiwen.com/i2007820/0554285cc748af49.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
    # p is the data, bins the number of bars, rwidth the relative width of each bar.
    # density=True rescales the bars so their total area is 1; it replaces the
    # removed `normed` argument.
    plt.hist(p,bins=6,rwidth=0.9,density=True)
    plt.title("age")  # the plotted column is age, matching the other age plots
    plt.xlabel("stakes")  # NOTE(review): the x axis holds ages; this label looks like a leftover - confirm
    plt.ylabel("Probability")
    plt.show()
    
    len(p)#显示记录数
    
    1000209
    
    std=p.std()
    std
    
    11.751982567744209
    
    mean=p.mean()
    std=p.std()
    
    结果分析:参与评分的用户平均年龄为29.74岁,大部分人的年龄在20~30岁之间。
    标准差是11.75,波动较小;若年龄近似服从正态分布,则约有68%的人年龄在29.74-11.75到29.74+11.75之间。
    数据显示10岁以下的人很少,广告投放的目标年龄应该控制在20~60岁之间。
    
    a=p[:100000]  # take the first 100000 rows (about 10% of the data) for analysis

    x=np.arange(1,60,1)  # evaluation grid: integers from 1 up to 59, step 1
    y=normfun(x,mean,std)  # normal density using the full-sample mean and std
    plt.plot(x,y)
    # a is the data, bins the number of bars, rwidth the relative width of each bar.
    # density=True rescales the bars so their total area is 1; it replaces the
    # removed `normed` argument.
    plt.hist(a,bins=6,rwidth=0.9,density=True)
    plt.title("age")
    plt.xlabel("stakes")  # NOTE(review): the x axis holds ages; this label looks like a leftover - confirm
    plt.ylabel("Probability")
    plt.show()
    
    c=p[:10000]  # take the first 10000 rows (about 1% of the data) for analysis

    x=np.arange(1,60,1)  # evaluation grid: integers from 1 up to 59, step 1
    y=normfun(x,mean,std)  # normal density using the full-sample mean and std
    plt.plot(x,y)
    # c is the data, bins the number of bars, rwidth the relative width of each bar.
    # density=True rescales the bars so their total area is 1; it replaces the
    # removed `normed` argument.
    plt.hist(c,bins=6,rwidth=0.9,density=True)
    plt.title("age")
    plt.xlabel("stakes")  # NOTE(review): the x axis holds ages; this label looks like a leftover - confirm
    plt.ylabel("Probability")
    plt.show()
    
    符合上面的结论:
    ##时间有限先到这里,相信随着深入会体会到更多的乐趣
    

    相关文章

      网友评论

          本文标题:电影评分数据

          本文链接:https://www.haomeiwen.com/subject/aiedottx.html