RSVD代码分析
数据集
-
该代码的数据集描述如下:
-
u.data:
- 数据格式为:userID、movieID、rating、timestamp 四列(文件中各列以制表符分隔)
- 包含全部的评分数据
-
u1.base
- 数据格式为:userID,movieID,rating,timestamp
- 含有部分u.data数据,与u1.test不重合
-
u1.test
- 数据格式为:userID,movieID,rating,timestamp
- 含有部分u.data数据,与u1.base不重合
-
其他的 u2、u3、u4、u5 数据集结构相同(各自的 base 与 test 互不重合)
-
个人分析:
- u1,u2,u3等数据集,base是训练集,test是测试集
- 这些数据集都是从u.data取出来的
-
用户数量 userNum = 943
-
项目数量 itemNum = 1682
代码
import random
import math
import pandas as pd
import numpy as np
class RSVD():
    """Regularized SVD (biased matrix factorization) rating predictor trained with SGD.

    A rating is predicted as ``mu + bu[user] + bi[item] + U[user] . V[item]``
    and clipped to ``[MIN_RATING, MAX_RATING]``.

    User and item ids in the data files are 1-based; they are shifted to
    0-based row indices internally.  The number of rows allocated equals the
    number of distinct ids in ``allfile``, so ids are assumed to be contiguous
    (1..userNum and 1..itemNum).
    """

    # Valid rating range; predictions are clipped to it.
    MIN_RATING = 1
    MAX_RATING = 5

    def __init__(self, allfile, trainfile, testfile, latentFactorNum=20, alpha_u=0.01, alpha_v=0.01, beta_u=0.01,
                 beta_v=0.01, learning_rate=0.01):
        """Load the rating files and initialize the model parameters.

        :param allfile: path of the complete rating file (e.g. ``u.data``);
            only used to count the distinct users and items.
        :param trainfile: path of the training split (e.g. ``u1.base``).
        :param testfile: path of the test split (e.g. ``u1.test``).
        :param latentFactorNum: number of latent factors per user / item.
        :param alpha_u: regularization weight for the user factors ``U``.
        :param alpha_v: regularization weight for the item factors ``V``.
        :param beta_u: regularization weight for the user biases ``bu``.
        :param beta_v: regularization weight for the item biases ``bi``.
        :param learning_rate: SGD step size.
        """
        data_fields = ['user_id', 'item_id', 'rating', 'timestamp']
        # read_csv with an explicit tab separator is the replacement that the
        # read_table deprecation warning asks for.
        allData = pd.read_csv(allfile, sep='\t', names=data_fields)
        self.train_df = pd.read_csv(trainfile, sep='\t', names=data_fields)
        self.test_df = pd.read_csv(testfile, sep='\t', names=data_fields)
        self.latentFactorNum = latentFactorNum
        # Distinct-id counts double as the row counts of U / bu and V / bi.
        self.userNum = len(set(allData['user_id'].values))
        self.itemNum = len(set(allData['item_id'].values))
        self.learningRate = learning_rate
        self.alpha_u = alpha_u
        self.alpha_v = alpha_v
        self.beta_u = beta_u
        self.beta_v = beta_v
        self.initModel()

    def initModel(self):
        """Initialize the global mean, the bias vectors and the factor matrices.

        :return: None
        """
        print("initModel is start now")
        self.mu = float(self.train_df['rating'].mean())
        self.bu = np.zeros(self.userNum)
        self.bi = np.zeros(self.itemNum)
        # Small factors keep the initial U.V term close to zero so that the
        # first predictions are not saturated at the clipping bounds.
        scale = 0.1 / math.sqrt(self.latentFactorNum)
        self.U = scale * np.random.rand(self.userNum, self.latentFactorNum)
        self.V = scale * np.random.rand(self.itemNum, self.latentFactorNum)
        print("Initialize end.The user number is:%d,item number is:%d" % (self.userNum, self.itemNum))
        print("initModel is end now")

    def train(self, iterTimes=100):
        """Run SGD over the training set, stopping early when the test RMSE rises.

        :param iterTimes: maximum number of passes over the training set.
        """
        print("train is start now")
        print("Beginning to train the model......")
        preRmse = 10000.0
        # Pull the columns out once; per-row DataFrame lookups dominate the
        # run time otherwise.
        users = np.asarray(self.train_df['user_id'], dtype=int) - 1
        items = np.asarray(self.train_df['item_id'], dtype=int) - 1
        ratings = np.asarray(self.train_df['rating'], dtype=float)
        total = len(ratings)
        lr = self.learningRate
        for epoch in range(iterTimes):
            for pos in range(total):
                # Report progress every 20000 samples.
                if pos % 20000 == 0:
                    print("第%s轮进度:%s%%" % (epoch, pos / total * 100))
                user = users[pos]
                item = items[pos]
                pscore = self.predictScore(self.mu, self.bu[user], self.bi[item], self.U[user], self.V[item])
                eui = ratings[pos] - pscore
                # Gradient steps on the global mean and the two biases.
                self.mu += lr * eui
                self.bu[user] += lr * (eui - self.beta_u * self.bu[user])
                self.bi[item] += lr * (eui - self.beta_v * self.bi[item])
                # Copy the user vector first: the item update must use the
                # value from before this step, and a plain row index is a view.
                old_user_vec = self.U[user].copy()
                self.U[user] += lr * (eui * self.V[item] - self.alpha_u * self.U[user])
                self.V[item] += lr * (eui * old_user_vec - self.alpha_v * self.V[item])
            curRmse = self.test(self.mu, self.bu, self.bi, self.U, self.V)
            print("Iteration %d times,RMSE is : %f" % (epoch + 1, curRmse))
            if curRmse > preRmse:
                break
            preRmse = curRmse
        print("Iteration finished!")
        print("train is end now")

    def test(self, mu, bu, bi, U, V):
        """Compute the RMSE of the given parameters on the test set.

        Predictions are clipped to the rating range, like ``predictScore``.

        :param mu: global rating mean.
        :param bu: user bias vector.
        :param bi: item bias vector.
        :param U: user factor matrix, one row per user.
        :param V: item factor matrix, one row per item.
        :return: root mean squared error over ``self.test_df``.
        :raises ZeroDivisionError: if the test set is empty.
        """
        print("test is start now")
        cnt = self.test_df.shape[0]
        users = np.asarray(self.test_df['user_id'], dtype=int) - 1
        items = np.asarray(self.test_df['item_id'], dtype=int) - 1
        scores = np.asarray(self.test_df['rating'], dtype=float)
        # np.asarray also accepts np.matrix / list inputs and guarantees
        # element-wise semantics for the product below.
        user_vecs = np.asarray(U, dtype=float)[users]
        item_vecs = np.asarray(V, dtype=float)[items]
        # Only the (user, item) pairs present in the test set are evaluated.
        predicted = (mu + np.asarray(bu, dtype=float)[users] + np.asarray(bi, dtype=float)[items]
                     + (user_vecs * item_vecs).sum(axis=1))
        predicted = np.clip(predicted, self.MIN_RATING, self.MAX_RATING)
        RMSE = math.sqrt(float(np.sum((scores - predicted) ** 2)) / cnt)
        print("test is end now")
        return RMSE

    def innerProduct(self, v1, v2):
        """Return the inner product of two vectors as a float.

        Inputs are flattened first, so lists, 1-D arrays and single-row
        matrices are all accepted.

        :raises ValueError: if the flattened inputs differ in length.
        """
        print("innerProduct is start now")
        result = float(np.dot(np.ravel(v1), np.ravel(v2)))
        print("innerProduct is end now")
        return result

    def predictScore(self, mu, bu, bi, U, V):
        """Predict one rating and clip it to the valid rating range.

        :param mu: global rating mean.
        :param bu: bias of the user.
        :param bi: bias of the item.
        :param U: latent factor vector of the user.
        :param V: latent factor vector of the item.
        :return: ``mu + bu + bi + U.V`` limited to [MIN_RATING, MAX_RATING].
        """
        pscore = mu + bu + bi + np.multiply(U, V).sum()
        if pscore < self.MIN_RATING:
            pscore = self.MIN_RATING
        if pscore > self.MAX_RATING:
            pscore = self.MAX_RATING
        return pscore
if __name__ == '__main__':
    # Script entry point: build the model from the full rating file and the
    # u1 train/test split, then train it with the default iteration count.
    model = RSVD("u.data", "u1.base", "u1.test")
    model.train()
结果
H:\Anaconda\python.exe I:/Pywork/协同过滤/对接16-RSVD代码/RSVD/RSVD.py
I:/Pywork/协同过滤/对接16-RSVD代码/RSVD/RSVD.py:12: FutureWarning: read_table is deprecated, use read_csv instead, passing sep='\t'.
allData = pd.read_table(allfile, names=data_fields)
I:/Pywork/协同过滤/对接16-RSVD代码/RSVD/RSVD.py:14: FutureWarning: read_table is deprecated, use read_csv instead, passing sep='\t'.
self.train_df = pd.read_table(trainfile, names=data_fields)
I:/Pywork/协同过滤/对接16-RSVD代码/RSVD/RSVD.py:16: FutureWarning: read_table is deprecated, use read_csv instead, passing sep='\t'.
self.test_df = pd.read_table(testfile, names=data_fields)
initModel is start now
Initialize end.The user number is:943,item number is:1682
initModel is end now
train is start now
Beginning to train the model......
第0轮进度:0.0%
第0轮进度:25.0%
第0轮进度:50.0%
第0轮进度:75.0%
test is start now
test is end now
Iteration 1 times,RMSE is : 1.562428
第1轮进度:0.0%
第1轮进度:25.0%
第1轮进度:50.0%
第1轮进度:75.0%
test is start now
test is end now
Iteration 2 times,RMSE is : 1.614008
Iteration finished!
train is end now
Process finished with exit code 0
网友评论