美文网首页
RSVD源码分析

RSVD源码分析

作者: hwang_zhic | 来源:发表于2019-09-27 18:48 被阅读0次

    RSVD代码分析

    数据集

    • 该代码的数据集描述如下:

    • u.data:

      • 数据格式为:userID,movieID,rating,timestamp
      • 包含全部评分数据(完整数据集)
    • u1.base

      • 数据格式为:userID,movieID,rating,timestamp
      • 含有部分u.data数据,与u1.test不重合
    • u1.test

      • 数据格式为:userID,movieID,rating,timestamp
      • 含有部分u.data数据,与u1.base不重合
    • 其余的 u2、u3、u4、u5 数据集结构与 u1 相同(各有 .base 和 .test 两个文件)

    • 个人分析:

      • u1,u2,u3等数据集,base是训练集,test是测试集
      • 这些数据集都是从u.data取出来的
    • 用户数量 userNum = 943

    • 项目数量 itemNum = 1682

    代码

    import random
    import math
    import pandas as pd
    import numpy as np
    
    
    class RSVD:
        """Regularized SVD (biased matrix factorization) rating predictor trained with SGD.

        A rating is predicted as ``mu + bu[user] + bi[item] + U[user] . V[item]`` and
        clipped to the range [1, 5].  User and item ids in the data files are
        1-based and are mapped to 0-based row indices by subtracting 1, so the
        ids are assumed to be contiguous (1..userNum and 1..itemNum).
        """

        def __init__(self, allfile, trainfile, testfile, latentFactorNum=20, alpha_u=0.01, alpha_v=0.01, beta_u=0.01,
                     beta_v=0.01, learning_rate=0.01):
            """
            Load the data files and initialize the model parameters.

            :param allfile:
                full rating file (e.g. u.data); only used to count users and items
            :param trainfile:
                training split (e.g. u1.base)
            :param testfile:
                test split (e.g. u1.test)
            :param latentFactorNum:
                number of latent factors per user / item
            :param alpha_u:
                regularization weight applied to the user factors U
            :param alpha_v:
                regularization weight applied to the item factors V
            :param beta_u:
                regularization weight applied to the user biases bu
            :param beta_v:
                regularization weight applied to the item biases bi
            :param learning_rate:
                SGD step size
            """
            data_fields = ['user_id', 'item_id', 'rating', 'timestamp']
            # read_csv with sep='\t' is what read_table does, without the
            # FutureWarning that pandas 0.24 emits for read_table.
            allData = pd.read_csv(allfile, sep='\t', names=data_fields)
            self.train_df = pd.read_csv(trainfile, sep='\t', names=data_fields)
            self.test_df = pd.read_csv(testfile, sep='\t', names=data_fields)
            self.latentFactorNum = latentFactorNum
            # Number of distinct users / items in the full data set.
            self.userNum = len(set(allData['user_id'].values))
            self.itemNum = len(set(allData['item_id'].values))
            self.learningRate = learning_rate
            # Regularization weights.
            self.alpha_u = alpha_u
            self.alpha_v = alpha_v
            self.beta_u = beta_u
            self.beta_v = beta_v
            self.initModel()

        def initModel(self):
            """
            Initialize the global mean, the bias vectors and the factor matrices.

            :return:
                None
            """
            print("initModel is start now")
            self.mu = self.train_df['rating'].mean()
            self.bu = np.zeros(self.userNum)
            self.bi = np.zeros(self.itemNum)
            # Small initial factors keep the initial dot product near zero, so the
            # first predictions start close to the global mean.  Unscaled uniform
            # [0, 1) factors have an expected dot product of latentFactorNum / 4,
            # which pushes most predictions into the upper clip bound.
            scale = 0.1 / math.sqrt(max(self.latentFactorNum, 1))
            # np.asmatrix instead of np.mat: np.mat was removed in NumPy 2.0.
            self.U = np.asmatrix(np.random.rand(self.userNum, self.latentFactorNum) * scale)
            self.V = np.asmatrix(np.random.rand(self.itemNum, self.latentFactorNum) * scale)

            print("Initialize end.The user number is:%d,item number is:%d" % (self.userNum, self.itemNum))
            print("initModel is end now")

        def train(self, iterTimes=100):
            """
            Run SGD over the training set, stopping early when the RMSE rises.

            After every pass the RMSE returned by ``self.test`` (computed on
            ``self.test_df``) is printed; training stops as soon as it is larger
            than the RMSE of the previous pass.

            :param iterTimes:
                maximum number of passes over the training set
            """
            print("train is start now")
            print("Beginning to train the model......")
            # Pull the columns out once; per-row DataFrame.loc lookups are very slow.
            users = self.train_df['user_id'].values.astype(int) - 1
            items = self.train_df['item_id'].values.astype(int) - 1
            ratings = self.train_df['rating'].values.astype(float)
            total = len(ratings)
            lr = self.learningRate
            preRmse = 10000.0
            for epoch in range(iterTimes):
                for pos, (user, item, rating) in enumerate(zip(users, items, ratings)):
                    # Report progress every 20000 samples.
                    if pos % 20000 == 0:
                        print("第%s轮进度:%s%%" % (epoch, pos / total * 100))
                    pscore = self.predictScore(self.mu, self.bu[user], self.bi[item], self.U[user], self.V[item])
                    eui = rating - pscore
                    # mu is the fixed global mean of the training ratings and is
                    # deliberately not updated here.
                    self.bu[user] += lr * (eui - self.beta_u * self.bu[user])
                    self.bi[item] += lr * (eui - self.beta_v * self.bi[item])

                    # Copy the user row first: indexing returns a view, and the
                    # item update below must use the pre-update user factors.
                    u_old = self.U[user].copy()
                    self.U[user] += lr * (eui * self.V[item] - self.alpha_u * u_old)
                    self.V[item] += lr * (eui * u_old - self.alpha_v * self.V[item])

                curRmse = self.test(self.mu, self.bu, self.bi, self.U, self.V)
                print("Iteration %d times,RMSE is : %f" % (epoch + 1, curRmse))
                if curRmse > preRmse:
                    break
                preRmse = curRmse
            print("Iteration finished!")
            print("train is end now")

        def test(self, mu, bu, bi, U, V):
            """
            Compute the RMSE of the given parameters on ``self.test_df``.

            :param mu:
                global rating mean
            :param bu:
                user bias vector
            :param bi:
                item bias vector
            :param U:
                user factor matrix, one row per user
            :param V:
                item factor matrix, one row per item
            :return:
                the RMSE over all rows of ``self.test_df``
            """
            print("test is start now")
            cnt = self.test_df.shape[0]

            users = self.test_df['user_id'].values.astype(int) - 1
            items = self.test_df['item_id'].values.astype(int) - 1
            ratings = self.test_df['rating'].values.astype(float)

            # Plain-array views so that `*` is element-wise even for np.matrix input.
            user_rows = np.asarray(U)[users]
            item_rows = np.asarray(V)[items]
            # Only the (user, item) pairs present in the test set are scored,
            # instead of materializing the full userNum x itemNum matrix.
            predicted = mu + np.asarray(bu)[users] + np.asarray(bi)[items] + (user_rows * item_rows).sum(axis=1)
            # Same [1, 5] clipping as predictScore, so evaluation matches training.
            predicted = np.clip(predicted, 1, 5)

            squared_error = float(np.sum((ratings - predicted) ** 2))
            RMSE = math.sqrt(squared_error / cnt)
            print("test is end now")
            return RMSE

        def innerProduct(self, v1, v2):
            """
            Return the inner product of two equally long sequences.

            :param v1:
                first vector
            :param v2:
                second vector, indexed at every position of v1
            :return:
                sum of the element-wise products
            """
            print("innerProduct is start now")
            result = 0.0
            for i, a in enumerate(v1):
                result += a * v2[i]
            print("innerProduct is end now")
            return result

        def predictScore(self, mu, bu, bi, U, V):
            """
            Predict one rating and clip it to the range [1, 5].

            :param mu:
                global rating mean
            :param bu:
                bias of the user
            :param bi:
                bias of the item
            :param U:
                factor vector of the user
            :param V:
                factor vector of the item
            :return:
                ``mu + bu + bi + sum(U * V)``, limited to [1, 5]
            """
            pscore = mu + bu + bi + np.multiply(U, V).sum()
            if pscore < 1:
                pscore = 1
            if pscore > 5:
                pscore = 5
            return pscore
    
    
    if __name__ == '__main__':
        # Script entry point: build the model from the full rating file and the
        # first train/test split, then train with the default number of passes.
        model = RSVD("u.data", "u1.base", "u1.test")
        model.train()
    
    

    结果

    H:\Anaconda\python.exe I:/Pywork/协同过滤/对接16-RSVD代码/RSVD/RSVD.py
    I:/Pywork/协同过滤/对接16-RSVD代码/RSVD/RSVD.py:12: FutureWarning: read_table is deprecated, use read_csv instead, passing sep='\t'.
      allData = pd.read_table(allfile, names=data_fields)
    I:/Pywork/协同过滤/对接16-RSVD代码/RSVD/RSVD.py:14: FutureWarning: read_table is deprecated, use read_csv instead, passing sep='\t'.
      self.train_df = pd.read_table(trainfile, names=data_fields)
    I:/Pywork/协同过滤/对接16-RSVD代码/RSVD/RSVD.py:16: FutureWarning: read_table is deprecated, use read_csv instead, passing sep='\t'.
      self.test_df = pd.read_table(testfile, names=data_fields)
    initModel is start now
    Initialize end.The user number is:943,item number is:1682
    initModel is end now
    train is start now
    Beginning to train the model......
    第0轮进度:0.0%
    第0轮进度:25.0%
    第0轮进度:50.0%
    第0轮进度:75.0%
    test is start now
    test is end now
    Iteration 1 times,RMSE is : 1.562428
    第1轮进度:0.0%
    第1轮进度:25.0%
    第1轮进度:50.0%
    第1轮进度:75.0%
    test is start now
    test is end now
    Iteration 2 times,RMSE is : 1.614008
    Iteration finished!
    train is end now
    
    Process finished with exit code 0
    

    相关文章

      网友评论

          本文标题:RSVD源码分析

          本文链接:https://www.haomeiwen.com/subject/couauctx.html