美文网 > 首页 > 数据挖掘
基于协同过滤算法的电影推荐实例

基于协同过滤算法的电影推荐实例

作者: DreamWeave_fdbe | 来源:发表于2020-02-24 00:31 被阅读0次
    import pandas as pd 
    import numpy as np  
    import tensorflow as tf 
    
    
    # Step 1: feature processing -- load the raw rating and movie tables.
    ratings_df = pd.read_csv('./data/ratings.csv')
    movies_df = pd.read_csv('./data/movies.csv')

    # Expose each movie's row index as an explicit 'movieRow' column, then
    # keep only the columns that are used from here on.
    movies_df = movies_df.assign(movieRow=movies_df.index)
    movies_df = movies_df.loc[:, ['movieRow', 'movieId', 'title']]
    movies_df.to_csv('moviesProcessed.csv', encoding='utf-8', header=True, index=False)

    # Replace each rating's movieId with the movie's row number (the original
    # author's stated reason: to save memory).
    ratings_df = ratings_df.merge(movies_df, on='movieId')
    ratings_df = ratings_df.loc[:, ['userId', 'movieRow', 'rating']]
    ratings_df.to_csv('ratingsProcessed.csv', encoding='utf-8', header=True, index=False)
    
    
    # Build the rating table: rating[movieRow, userId] holds the score a user
    # gave a movie; cells without a rating keep their initial 0.
    # One past the largest user id, so user ids can index columns directly.
    userNo = ratings_df['userId'].max() + 1
    # One past the largest movie row number, so rows can be indexed directly.
    movieNo = ratings_df['movieRow'].max() + 1
    # Shape of rating: [movieNo, userNo]
    rating = np.zeros((movieNo, userNo))
    ratings_df_length = ratings_df.shape[0]

    # flag counts the rating rows processed so far (stays 0 for an empty table).
    flag = 0
    for flag, entry in enumerate(ratings_df.itertuples(index=False), start=1):
        rating[int(entry.movieRow), int(entry.userId)] = entry.rating
        # Progress report: rows handled and rows still to go.
        print('processed %d,%d left' % (flag, ratings_df_length - flag))

    # Record table: 1 where a rating exists (rating > 0), 0 elsewhere.
    record = (rating > 0).astype(int)
    
    #Step2:构建模型
    
    # Mean-normalise the rating data, one movie (row) at a time.
    def normalizeRatings(rating,record):
        """Subtract each movie's mean rating from the ratings it received.

        Only cells flagged in ``record`` take part: the mean of a row is taken
        over its rated cells, and only those cells are shifted by that mean.
        Unrated cells stay 0 in the normalised matrix.

        Args:
            rating: 2-D array of scores, one row per movie and one column per
                user (as built by the caller).
            record: array of the same shape; non-zero marks a cell that holds
                a real rating.

        Returns:
            A tuple ``(rating_norm, rating_mean)`` where ``rating_norm`` has
            the shape of ``rating`` and ``rating_mean`` has shape ``(m, 1)``.
            A movie with no ratings gets a mean of 0 (instead of NaN) and an
            all-zero normalised row.
        """
        rating = np.asarray(rating, dtype=float)
        rated = np.asarray(record) != 0

        # Per-row count and sum over rated cells only.
        counts = rated.sum(axis=1, keepdims=True)
        totals = np.where(rated, rating, 0.0).sum(axis=1, keepdims=True)

        # Divide only where a row has ratings; other rows keep the 0 default,
        # which avoids the NaN / RuntimeWarning of averaging an empty slice.
        rating_mean = np.zeros((rating.shape[0], 1))
        np.divide(totals, counts, out=rating_mean, where=counts > 0)

        # The normalised value is (rating - mean), not just -mean: the
        # original in-place subtraction from a zero matrix dropped the rating.
        rating_norm = np.where(rated, rating - rating_mean, 0.0)
        return rating_norm, rating_mean
    
    # Normalise the ratings, replacing any NaN in either result with 0.
    rating_norm, rating_mean = (
        np.nan_to_num(part) for part in normalizeRatings(rating, record)
    )

    # Number of features per movie / per user (the original comment calls
    # these "movie genres").
    num_features = 10
    # Movie content matrix, shape [movieNo, num_features].
    X_parameters = tf.Variable(
        tf.random_normal([movieNo, num_features], stddev=0.35)
    )
    # User preference matrix, shape [userNo, num_features].
    Theta_parameters = tf.Variable(
        tf.random_normal([userNo, num_features], stddev=0.35)
    )

    # Loss function.
    # Predicted normalised ratings: X * Theta^T (transpose_b transposes Theta).
    predicted_norm = tf.matmul(X_parameters, Theta_parameters, transpose_b=True)
    # Multiplying by record zeroes the error on cells that hold no rating.
    masked_residual = (predicted_norm - rating_norm) * record
    xietong_formula = (1 / 2) * tf.reduce_sum(masked_residual ** 2)
    # Squared-magnitude penalty on both parameter matrices.
    regularization = (1 / 2) * (
        tf.reduce_sum(X_parameters ** 2) + tf.reduce_sum(Theta_parameters ** 2)
    )
    loss = xietong_formula + regularization

    # Optimizer and the training op that minimises the loss.
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
    optim = optimizer.minimize(loss)
    
    
    
    # Step 3: train the model

    # Record the loss as a scalar summary so training can be visualised.
    tf.summary.scalar('loss',loss)
    # Merge all registered summaries into a single op.
    summaryMerged = tf.summary.merge_all()
    # Directory the summary events are written to.
    filename = './movie_tensorboard'
    writer = tf.summary.FileWriter(filename)


    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)

    try:
        for i in range(100):
            # One optimisation step plus the summary for that step.
            _,movie_summary = sess.run([optim,summaryMerged])
            writer.add_summary(movie_summary,i)
    finally:
        # Close the writer so buffered summary events are flushed to disk;
        # without this the log can be incomplete when the script exits.
        # The session stays open because the evaluation step still uses it.
        writer.close()
    
    
    # Step 4: evaluate the model
    Current_X_parameters,Current_Theta_parameters = sess.run([X_parameters,Theta_parameters])
    # The model was fitted on mean-normalised ratings, so add rating_mean back
    # to get predictions on the original rating scale.
    predicts = np.dot(Current_X_parameters,Current_Theta_parameters.T) + rating_mean
    # Score only cells that hold a real rating. Unrated cells contain the
    # placeholder 0 in `rating`, and comparing predictions against that
    # placeholder would dominate the error with missing data.
    observed = record != 0
    if observed.any():
        # Root-mean-square error over the observed ratings.
        errors = np.sqrt(np.mean((predicts[observed] - rating[observed])**2))
    else:
        errors = 0.0
    print(errors)
    
    

    参考教程:https://www.imooc.com/video/17130

    相关文章

      网友评论

        本文标题:基于协同过滤算法的电影推荐实例

        本文链接:https://www.haomeiwen.com/subject/hkryqhtx.html