import pandas as pd
import numpy as np
import tensorflow as tf
# Step 1: data feature processing.
# Reads the raw ratings/movies CSVs, writes trimmed copies next to the script,
# and builds two dense [movieNo, userNo] matrices: `rating` (the scores) and
# `record` (1 where a score exists, 0 elsewhere).
ratings_df = pd.read_csv("./data/ratings.csv")
movies_df = pd.read_csv("./data/movies.csv")
# Use each movie's row position as a dense key.
movies_df['movieRow'] = movies_df.index
# Keep only the columns used downstream.
movies_df = movies_df[['movieRow', 'movieId', 'title']]
movies_df.to_csv('moviesProcessed.csv', index=False, header=True, encoding='utf-8')
# Replace the movie ID with the movie's row number to save memory.
ratings_df = pd.merge(ratings_df, movies_df, on='movieId')
ratings_df = ratings_df[['userId', 'movieRow', 'rating']]
ratings_df.to_csv('ratingsProcessed.csv', index=False, header=True, encoding='utf-8')
# Build the rating table: the score each user gave each movie.
# Largest user ID plus one, so user IDs can index the matrix directly.
userNo = ratings_df['userId'].max() + 1
# Largest movie row plus one.
movieNo = ratings_df['movieRow'].max() + 1
# Shape of rating: [movieNo, userNo].
rating = np.zeros((movieNo, userNo))
ratings_df_length = np.shape(ratings_df)[0]
# Fill the matrix with one vectorized scatter instead of a per-row Python loop
# that printed a progress line for every rating. NumPy does not define which
# value wins when an index repeats in an advanced assignment, so keep only the
# last occurrence of each (movieRow, userId) pair first; this matches the
# "later row overwrites earlier row" behaviour of a sequential loop.
unique_ratings = ratings_df.drop_duplicates(subset=['movieRow', 'userId'], keep='last')
rating[
    unique_ratings['movieRow'].values.astype(int),
    unique_ratings['userId'].values.astype(int),
] = unique_ratings['rating'].values
# All rows are processed at once; report completion in the original format.
flag = ratings_df_length
print('processed %d,%d left' % (flag, ratings_df_length - flag))
# Build the record table: 1 where a movie has been rated, 0 where it has not.
record = (rating > 0).astype(int)
# Step 2: build the model
# Normalize the rating data (mean-center each movie's ratings)
def normalizeRatings(rating, record):
    """Mean-center each row of ``rating`` using only its rated entries.

    Args:
        rating: 2-D array of scores, one row per movie and one column per user.
        record: array of the same shape; non-zero marks an entry as rated.

    Returns:
        A tuple ``(rating_norm, rating_mean)``:
        ``rating_mean`` has shape ``(m, 1)`` and holds each row's mean over
        its rated entries (0 for a row with no rated entries);
        ``rating_norm`` has the shape of ``rating`` and holds
        ``rating - rating_mean`` at rated entries and 0 elsewhere.
    """
    rating = np.asarray(rating, dtype=float)
    rated = np.asarray(record) != 0

    # Unrated entries take no part in the mean, so exclude them from both the
    # sum and the count.
    counts = rated.sum(axis=1, keepdims=True)
    sums = np.where(rated, rating, 0.0).sum(axis=1, keepdims=True)

    # Divide only where a row has at least one rating; other rows keep mean 0
    # rather than producing NaN from an empty-slice mean.
    rating_mean = np.zeros((rating.shape[0], 1))
    np.divide(sums, counts, out=rating_mean, where=counts > 0)

    # Subtract the mean from the actual ratings. Starting from zeros and only
    # subtracting the mean would yield -mean, not the centered rating.
    rating_norm = np.where(rated, rating - rating_mean, 0.0)
    return rating_norm, rating_mean
# Mean-center the ratings, then replace any NaN in either result with 0
# (np.nan_to_num) so the values are safe to feed into the loss.
rating_norm, rating_mean = (
    np.nan_to_num(matrix) for matrix in normalizeRatings(rating, record)
)
# Number of latent features learned per movie and per user.
num_features = 10
# Movie content matrix X_parameters: shape [movieNo, num_features].
X_parameters = tf.Variable(tf.random_normal([movieNo, num_features], stddev=0.35))
# User preference matrix Theta_parameters: shape [userNo, num_features].
Theta_parameters = tf.Variable(tf.random_normal([userNo, num_features], stddev=0.35))
# Loss function.
# transpose_b=True transposes Theta_parameters, so the product has shape
# [movieNo, userNo]. Multiplying by `record` keeps only the rated entries.
# The factor is the float literal 0.5 rather than (1/2), which evaluates to 0
# under Python 2 integer division and would zero out the loss.
xietong_formula = 0.5 * tf.reduce_sum(
    ((tf.matmul(X_parameters, Theta_parameters, transpose_b=True) - rating_norm) * record) ** 2
)
# Add an L2 penalty on both parameter matrices.
loss = xietong_formula + 0.5 * (
    tf.reduce_sum(X_parameters ** 2) + tf.reduce_sum(Theta_parameters ** 2)
)
# Optimizer and optimization target.
optimizer = tf.train.AdamOptimizer(1e-4)
optim = optimizer.minimize(loss)
# Step 3: train the model.
# Record the loss with tf.summary so training can be inspected in TensorBoard.
tf.summary.scalar('loss', loss)
# Merge all summary ops into a single op.
summaryMerged = tf.summary.merge_all()
# Directory the summary events are written to.
filename = './movie_tensorboard'
writer = tf.summary.FileWriter(filename)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
for i in range(100):
    # One optimization step; fetch the merged summary from the same run.
    _, movie_summary = sess.run([optim, summaryMerged])
    writer.add_summary(movie_summary, i)
# Flush and release the event file so buffered summaries are not lost.
# `sess` is left open because the evaluation step still reads the variables.
writer.close()
# Step 4: evaluate the model.
# Fetch the trained parameter matrices as NumPy arrays.
Current_X_parameters, Current_Theta_parameters = sess.run([X_parameters, Theta_parameters])
# This is the last use of the session, so release its resources.
sess.close()
# Add the per-movie mean back to undo the normalization.
predicts = np.dot(Current_X_parameters, Current_Theta_parameters.T) + rating_mean
# Measure the error on rated entries only: unrated cells hold the placeholder
# 0 in `rating`, not a real score, so comparing predictions against them would
# distort the result. Masking with `record` matches the training loss.
errors = np.sqrt(np.sum(((predicts - rating) * record) ** 2))
print(errors)
# Reference tutorial: https://www.imooc.com/video/17130
# Reader comments