
CS229 Lesson 1: Linear Model - P

Author: phoenixmy | Published 2018-07-03 20:41

    Linear models are among the most fundamental models in machine learning, which is why Andrew Ng uses them to open the CS229 course. Linear models fall into two broad categories: Linear Regression and Logistic Regression (the common Chinese rendering 逻辑回归 is often criticized; 对数几率回归, "log-odds regression", is considered more accurate).
    This post focuses on the former: linear regression.

    There are already plenty of notes on linear regression online, so I will not repeat them here; I will simply list the relevant formulas and code.
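
    For reference, here is a minimal summary of the formulas the code below implements (standard lecture material; the learning rate is called lr in the code and m is the number of samples):

    h_\theta(x) = \theta^T x
    J(\theta) = \frac{1}{2m} \sum_{i=1}^{m} \left( \theta^T x^{(i)} - y^{(i)} \right)^2
    \theta := \theta - \mathrm{lr} \cdot \frac{1}{m} \sum_{i=1}^{m} \left( \theta^T x^{(i)} - y^{(i)} \right) x^{(i)}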

    # -*- coding: utf-8 -*-
    """
    Created on Tue Jul  3 10:53:02 2018
    
    @author: PhoenixMY
    
    Linear Regression Notes:
        Ordinary Least Squares
        Gradient Descent : Batch GD / Stochastic GD / Mini-Batch GD
        Ridge Regression
        Lasso Regression
        
    """
    
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    
    from sklearn.datasets import load_boston
    from sklearn import linear_model 
    
    
    '''
    ' Ordinary Least Squares
    ' J(theta) = ∑(X(i)*theta - y(i)) ^ 2 / (2m) ;
    '       1 <= i <= m, m is the size of the data set;
    '''
    # import the Boston house price dataset; for details see load_boston().DESCR
    X, y = load_boston(return_X_y=True)
    
    
    
    def featureNormalize(X):
        mu = np.zeros((1, X.shape[1]))
        sigma = np.zeros((1, X.shape[1]))
        for i in range(X.shape[1]):
            mu[0, i] = np.mean(X[:, i])    # mean of each feature
            sigma[0, i] = np.std(X[:, i])  # standard deviation of each feature
            if (sigma[0, i] == 0):
                sigma[0, i] = 1            # avoid dividing by zero for constant features
        X_norm = (X - mu) / sigma
        return X_norm
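
    # For comparison only (a sketch added here, not part of the original code):
    # scikit-learn's StandardScaler performs the same zero-mean / unit-variance
    # scaling as featureNormalize above.
    from sklearn.preprocessing import StandardScaler
    X_scaled_check = StandardScaler().fit_transform(X)   # unused, just a cross-check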
    
    def J_theta(X_input, y_input, theta):
        # cost function: J(theta) = sum((X@theta - y)^2) / (2m)
        t = X_input@theta - y_input
        return (t.T @ t)[0, 0] / (y_input.size * 2)
      
    
    #########################################################
    # Name   : batch_gradient
    # Descr  : batch gradient descent; each iteration uses the
    #          whole data set to update theta
    # Params : X_input, y_input - training data; theta_init - starting point;
    #          iter_max - maximum iterations; lr - learning rate;
    #          J_theta_limit - stop when the change in cost falls below this;
    #          cost - stop when the cost itself falls below this
    #########################################################
    def batch_gradient(X_input, y_input, theta_init, iter_max, lr=0.001, J_theta_limit=0.000001, cost=12):
      print("***********************")
      print("==>batch gradient descent started!")
      theta = theta_init
      J_theta_init = 0
      J_theta_old = 0
      for i in range(iter_max):
        J_theta_init = J_theta(X_input, y_input, theta)
        #print("J of theta is ", J_theta_init)
        if (np.abs(J_theta_init - J_theta_old) <= J_theta_limit):
            break
        if (np.abs(J_theta_init) < cost):
          break
        J_theta_old = J_theta_init
        ############################################################
        # batch gradient : use all samples to update theta in one iteration
        ############################################################
        delta = X_input@theta - y_input
        # warning point 2 : divide by m (the number of samples)
        theta = theta - lr * (X_input.T@delta) / (y_input.size)
      
      print("iteration times:", i)
      if (i == iter_max - 1):
        print("!!!Reach the max number of iteration(iter_max)")
      print("J of theta is ", J_theta_init)
      print("**end")
      print(theta)
      return theta
    
    def stochastic_gradient(X_input, y_input, theta_init, iter_max, lr=0.001, J_theta_limit=0.00001, cost=12):
      print("***********************")
      print("==>stochastic gradient descent started!")
    
      theta = theta_init
      J_theta_init = 0
      J_theta_old = 0
      m = X_input.shape[0]
      #n = X.shape[1]
      for i in range(iter_max):
        k = np.random.randint(low=0, high=m)   # high is exclusive, so this covers all m samples
        J_theta_init = J_theta(X_input, y_input, theta)
    
        '''
        # no matter which gradient method you use,
        # the cost function should stay the same ~~~
        t = X_input[k, :]@theta - y_input[k]
        J_theta_init = t[0] * t[0] / 2
        ''' 
        
        if ((i % 50000) == 0):
          print("J of theta is ", J_theta_init)
    
        #'''
        if (np.abs(J_theta_init - J_theta_old) <= J_theta_limit):
            break
        if (np.abs(J_theta_init) < cost):
          break
        #'''    
    
        J_theta_old = J_theta_init
        ############################################################
        #stochastic gradient : use a random sample to update theta in one loop
        ############################################################
        delta = X_input[k, :]@theta - y_input[k]
        # warning 2 : in theory the stochastic update is not divided by m (the number
        # of samples); dropping the 1/m factor effectively enlarges the step, so the
        # learning rate must stay small (e.g. 0.5 makes the cost diverge on this dataset).
        # https://blog.csdn.net/lw_ghy/article/details/71054965
        # The learning rate must be tuned carefully: too large and the objective
        # diverges, too small and convergence becomes very slow.
        #theta = theta - lr * delta * (X_input.T[:, k].reshape((X_input.shape[1], 1))) / (y_input.size)
        theta = theta - lr * delta * (X_input.T[:, k].reshape((X_input.shape[1], 1)))
        
      print("iteration times:", i)
      if (i == (iter_max-1)):
        print("!!!Reach the max number of iteration(iter_max)")
      print("J of theta is ", J_theta_init)
      print("**end")
      return theta
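
    # The comments above show how sensitive SGD is to the learning rate. A common
    # remedy (a sketch added here, not part of the original code) is to decay the
    # step size over iterations, e.g. lr_t = lr0 / (1 + decay * t); lr0 and decay
    # are hypothetical tuning knobs.
    def decayed_lr(lr0, decay, t):
        # simple 1/t-style decay schedule
        return lr0 / (1.0 + decay * t)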
    
    
    def mini_batch_gradient(X_input, y_input, theta_init, iter_max, lr=0.001, J_theta_limit=0.000001, cost=12):
      print("***********************")
      print("==>mini batch gradient descent started!")
      theta = theta_init
      J_theta_init = 0
      J_theta_old = 0
      for i in range(iter_max):
        J_theta_init = J_theta(X_input, y_input, theta)
        #print("J of theta is ", J_theta_init)
        if (np.abs(J_theta_init - J_theta_old) <= J_theta_limit):
            break
        if (np.abs(J_theta_init) < cost):
          break
        J_theta_old = J_theta_init
        ############################################################
        # mini-batch gradient : use a small group of samples to update theta in one iteration
        ############################################################
        # select 10 samples (vectors) from the dataset;
        # 10 is chosen from experience (per Andrew NG)
        batch_num = 10
        mini_batch = np.random.choice(y_input.size, batch_num, replace=False)
        # stochastic gradient is a special case of mini-batch gradient : 
        # under the condition : mini_batch = 1
        for k in mini_batch:
          delta = X_input[k, :]@theta - y_input[k]
          # warning point 2 : divided by m(the number of samples)    
          theta = theta - lr * delta * (X_input.T[:, k].reshape((X_input.shape[1], 1))) / batch_num
      
      print("iteration times:", i)
      if (i == iter_max - 1):
        print("!!!Reach the max number of iteration(iter_max)")
      print("J of theta is ", J_theta_init)
      print("**end")
      return theta
    
    
    
    def batch_gradient_Ridge(X_input, y_input, theta_init, iter_max, lr=0.001, J_theta_limit=0.0000001, cost=12, alpha=0.00001):
      print("***********************")
      print("==>batch gradient descent with Ridge started!")
      theta = theta_init
      J_theta_init = 0
      J_theta_old = 0
      m = X_input.shape[0]
      n = X_input.shape[1]
      for i in range(iter_max):
        ###
        # Ridge Regression : add penalty factor(alpha * theta ** 2) in cost function
        ###
        t = X_input@theta - y_input
        J_theta_init = (t.T @ t)[0, 0] / (y_input.size * 2) + alpha * theta.reshape((1,n)).dot(theta)[0,0]
        #print("J of theta is ", J_theta_init)
        if (np.abs(J_theta_init - J_theta_old) <= J_theta_limit):
            break
        #if (np.abs(J_theta_init) < cost):
        #  break
        J_theta_old = J_theta_init
        ############################################################
        # batch gradient : use all samples to update theta in one iteration
        ############################################################
        delta = X_input@theta - y_input
        # warning point 2 : divide by m (the number of samples);
        # the L2 penalty alpha * theta.T@theta adds 2 * alpha * theta to the gradient
        theta = theta - lr * ((X_input.T@delta) / (y_input.size) + 2 * alpha * theta)
      
      print("iteration times:", i)
      if (i == iter_max - 1):
        print("!!!Reach the max number of iteration(iter_max)")
      print("J of theta is ", J_theta_init)
      print("**end")
      print(theta)
      return theta
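
    # Ridge also has a closed-form solution (a sketch added here for reference,
    # not part of the original post): theta = (X.T@X + alpha_reg*I)^(-1) @ X.T @ y,
    # where alpha_reg is a hypothetical regularization strength.
    def ridge_closed_form(X_input, y_input, alpha_reg=0.1):
        n = X_input.shape[1]
        A = X_input.T @ X_input + alpha_reg * np.eye(n)
        return np.linalg.solve(A, X_input.T @ y_input)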
    
    
    def batch_gradient_Lasso(X_input, y_input, theta_init, iter_max, lr=0.001, J_theta_limit=0.0000001, cost=12, alpha=0.00001):
      print("***********************")
      print("==>batch gradient descent with Ridge started!")
      theta = theta_init
      J_theta_init = 0
      J_theta_old = 0
      m = X_input.shape[0]
      n = X_input.shape[1]
      for i in range(iter_max):
        ###
        # Lasso Regression : add an L1 penalty (alpha * |theta|) to the cost function
        ###
        t = X_input@theta - y_input
        J_theta_init = (t.T @ t)[0, 0] / (y_input.size * 2) + alpha * np.sum(np.abs(theta))
        #print("J of theta is ", J_theta_init)
        if (np.abs(J_theta_init - J_theta_old) <= J_theta_limit):
            break
        #if (np.abs(J_theta_init) < cost):
        #  break
        J_theta_old = J_theta_init
        ############################################################
        # batch gradient : use all samples to update theta in one iteration
        ############################################################
        delta = X_input@theta - y_input
        # warning point 2 : divide by m (the number of samples);
        # the L1 penalty alpha * |theta| contributes alpha * sign(theta) to the subgradient
        theta = theta - lr * ((X_input.T@delta) / (y_input.size) + alpha * np.sign(theta))
      
      print("iteration times:", i)
      if (i == iter_max - 1):
        print("!!!Reach the max number of iteration(iter_max)")
      print("J of theta is ", J_theta_init)
      print("**end")
      
      print(theta)
      return theta
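
    # Unlike Ridge, Lasso has no closed-form solution; proximal-gradient (ISTA-style)
    # solvers apply the soft-thresholding operator below after each gradient step
    # (a sketch added here for reference, not part of the original post).
    def soft_threshold(z, thresh):
        # shrink each coordinate toward zero by thresh, which is what induces sparsity
        return np.sign(z) * np.maximum(np.abs(z) - thresh, 0.0)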
    
    
    def NormalEquation(X_input, y_input):
      # closed-form solution: theta = (X.T @ X) ** (-1) @ X.T @ y
      t = np.linalg.pinv(X_input.T@X_input)
      theta = t @ X_input.T @ y_input

      print(theta.reshape((X_input.shape[1], 1)))
      return theta
    
    
    
    
    
    # warning point 1 : normalization
    # without feature normalization, the cost function will not converge
    X = featureNormalize(X)
    
    NormalEquation(X, y)
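
    # Cross-check with NumPy's built-in least-squares solver (a sketch added here,
    # not part of the original post); note that at this point X has no bias column,
    # so both solutions are fit without an intercept. The results should agree up
    # to numerical precision.
    theta_lstsq, residuals, rank, sv = np.linalg.lstsq(X, y, rcond=None)
    print(theta_lstsq)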
    
    X = np.insert(X, 0, values=np.ones(y.size), axis=1)
    X = X.reshape(X.shape[0], X.shape[1])
    y = y.reshape(y.size, 1)
    
    ITER_MAX = 50000
    theta_init = np.zeros((X.shape[1], 1))
    
        
    batch_gradient(X, y, theta_init, ITER_MAX, lr=0.01, J_theta_limit=0.00000001, cost=10)
    #stochastic_gradient(X, y, theta_init, ITER_MAX, lr=0.001, J_theta_limit=0.00000001, cost=11)
    #mini_batch_gradient(X, y, theta_init, ITER_MAX, lr=0.001, J_theta_limit=0.00000001, cost=11)
    batch_gradient_Ridge(X, y, theta_init, ITER_MAX, lr=0.01, J_theta_limit=0.00000001, cost=10)
    batch_gradient_Lasso(X, y, theta_init, ITER_MAX, lr=0.01, J_theta_limit=0.00000001, cost=10)
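
    # As a sanity check (a sketch added here, not part of the original post), the
    # same three models can be fit with scikit-learn, which was imported above but
    # never used. fit_intercept=False because the bias column of ones is already in
    # X; the alpha values below are illustrative, not tuned.
    sk_ols = linear_model.LinearRegression(fit_intercept=False).fit(X, y)
    sk_ridge = linear_model.Ridge(alpha=0.1, fit_intercept=False).fit(X, y)
    sk_lasso = linear_model.Lasso(alpha=0.1, fit_intercept=False).fit(X, y.ravel())
    print("sklearn OLS coefficients:  ", sk_ols.coef_.ravel())
    print("sklearn Ridge coefficients:", sk_ridge.coef_.ravel())
    print("sklearn Lasso coefficients:", sk_lasso.coef_)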
    
    
    #
    #                       _oo0oo_
    #                      o8888888o
    #                      88" . "88
    #                      (| -_- |)
    #                      0\  =  /0
    #                    ___/`---'\___
    #                  .' \\|     |# '.
    #                 / \\|||  :  |||# \
    #                / _||||| -:- |||||- \
    #               |   | \\\  -  #/ |   |
    #               | \_|  ''\---/''  |_/ |
    #               \  .-\__  '-'  ___/-. /
    #             ___'. .'  /--.--\  `. .'___
    #          ."" '<  `.___\_<|>_/___.' >' "".
    #         | | :  `- \`.;`\ _ /`;.`/ - ` : | |
    #         \  \ `_.   \_ __\ /__ _/   .-` /  /
    #     =====`-.____`.___ \_____/___.-`___.-'=====
    #                       `=---='
    #
    #
    #     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #
    #        May Buddha bless this code: no bugs, ever (佛祖保佑, 永无BUG)
    #
    #
    #
    
