Linear models are among the most fundamental models in machine learning, which is why Andrew Ng uses them to open the CS229 course. Linear models fall into two broad categories: linear regression and logistic regression (the common Chinese rendering 逻辑回归 is often criticized; 对数几率回归, i.e. log-odds regression, is considered more accurate).
This post focuses on the former, linear regression.
There are already plenty of study notes on linear regression online, so I will not repeat them here; I will simply list the relevant formulas and the code.
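For reference, these are the standard formulas the code below implements, written in conventional notation (m training samples x^{(i)} with targets y^{(i)}, parameter vector \theta, learning rate \alpha); they are standard material, not quoted verbatim from CS229:

h_\theta(x) = \theta^T x
J(\theta) = \frac{1}{2m} \sum_{i=1}^{m} \left( \theta^T x^{(i)} - y^{(i)} \right)^2
\theta := \theta - \frac{\alpha}{m} X^T (X\theta - y) \quad \text{(batch gradient descent step)}
\theta = (X^T X)^{-1} X^T y \quad \text{(normal equation)}

Ridge regression adds an L2 penalty \lambda \lVert \theta \rVert_2^2 to J(\theta); Lasso adds an L1 penalty \lambda \lVert \theta \rVert_1.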
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 3 10:53:02 2018
@author: PhoenixMY
Linear Regression Notes:
Ordinary Least Squares
Gradient Descent : Batch GD / Stochastic GD / Mini-Batch GD
Ridge Regression
Lasso Regression
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn import linear_model
'''
' Ordinary Least Squares
' J(theta) = (1 / (2 * m)) * ∑ (x(i)·theta - y(i)) ^ 2
' 1 <= i <= m, m is the size of the data set
'''
# Load the Boston house price dataset; for details see load_boston().DESCR
# (note : load_boston has been removed from recent scikit-learn releases)
X, y = load_boston(return_X_y=True)
def featureNormalize(X):
    mu = np.zeros((1, X.shape[1]))
    sigma = np.zeros((1, X.shape[1]))
    for i in range(X.shape[1]):
        mu[0, i] = np.mean(X[:, i])    # mean of each feature
        sigma[0, i] = np.std(X[:, i])  # standard deviation of each feature
        if sigma[0, i] == 0:           # guard against zero-variance features
            sigma[0, i] = 1
    X_norm = (X - mu) / sigma
    return X_norm
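# Equivalent vectorized form (a sketch, not from the original post):
#   sigma = np.std(X, axis=0); sigma[sigma == 0] = 1
#   X_norm = (X - np.mean(X, axis=0)) / sigma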
def J_theta(X_input, y_input, theta):
    # mean squared error cost : J(theta) = (1 / (2m)) * sum((X@theta - y) ^ 2)
    t = X_input @ theta - y_input
    return (t.T @ t)[0, 0] / (y_input.size * 2)
#########################################################
# Name   : batch_gradient
# Descr  : batch gradient descent; every iteration uses the
#          whole training set to update theta
# Params : X_input / y_input - training data (with bias column) and targets
#          theta_init        - initial parameter vector, shape (n, 1)
#          iter_max          - maximum number of iterations
#          lr                - learning rate
#          J_theta_limit     - stop when the change in cost is below this value
#          cost              - stop when the cost itself drops below this value
#########################################################
def batch_gradient(X_input, y_input, theta_init, iter_max, lr=0.001, J_theta_limit=0.000001, cost=12):
    print("***********************")
    print("==>batch gradient descent started!")
    theta = theta_init
    J_theta_init = 0
    J_theta_old = 0
    for i in range(iter_max):
        J_theta_init = J_theta(X_input, y_input, theta)
        #print("J of theta is ", J_theta_init)
        if np.abs(J_theta_init - J_theta_old) <= J_theta_limit:
            break
        if np.abs(J_theta_init) < cost:
            break
        J_theta_old = J_theta_init
        ############################################################
        # batch gradient : use all samples to update theta in one loop
        ############################################################
        delta = X_input @ theta - y_input
        # warning point 2 : divide by m (the number of samples)
        theta = theta - lr * (X_input.T @ delta) / y_input.size
    print("iteration times:", i)
    if i == iter_max - 1:
        print("!!!Reached the max number of iterations (iter_max)")
    print("J of theta is ", J_theta_init)
    print("**end")
    print(theta)
    return theta
def stochastic_gradient(X_input, y_input, theta_init, iter_max, lr=0.001, J_theta_limit=0.00001, cost=12):
    print("***********************")
    print("==>stochastic gradient descent started!")
    theta = theta_init
    J_theta_init = 0
    J_theta_old = 0
    m = X_input.shape[0]
    for i in range(iter_max):
        k = np.random.randint(low=0, high=m)  # pick one random sample
        J_theta_init = J_theta(X_input, y_input, theta)
        '''
        # no matter which gradient method is used,
        # the cost function should stay the same ~~~
        t = X_input[k, :] @ theta - y_input[k]
        J_theta_init = t[0] * t[0] / 2
        '''
        if (i % 50000) == 0:
            print("J of theta is ", J_theta_init)
        if np.abs(J_theta_init - J_theta_old) <= J_theta_limit:
            break
        if np.abs(J_theta_init) < cost:
            break
        J_theta_old = J_theta_init
        ############################################################
        # stochastic gradient : use one random sample to update theta per loop
        ############################################################
        delta = X_input[k, :] @ theta - y_input[k]
        # warning 2 : in theory this update is not divided by m (the number of samples),
        # so the effective step is larger; with too large a learning rate
        # (e.g. 0.5 on this dataset) the cost function no longer converges.
        # https://blog.csdn.net/lw_ghy/article/details/71054965
        # For this algorithm the learning rate must be tuned carefully:
        # too large and the objective diverges; too small and convergence is very slow.
        #theta = theta - lr * delta * (X_input.T[:, k].reshape((X_input.shape[1], 1))) / (y_input.size)
        theta = theta - lr * delta * (X_input.T[:, k].reshape((X_input.shape[1], 1)))
    print("iteration times:", i)
    if i == iter_max - 1:
        print("!!!Reached the max number of iterations (iter_max)")
    print("J of theta is ", J_theta_init)
    print("**end")
    return theta
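# A more common SGD formulation (a sketch, not from the original post) shuffles the
# data each epoch and sweeps through every sample once, optionally decaying lr:
#   for epoch in range(n_epochs):
#       for k in np.random.permutation(m):
#           delta = X_input[k, :] @ theta - y_input[k]
#           theta = theta - lr * delta * X_input[k, :].reshape(-1, 1)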
def mini_batch_gradient(X_input, y_input, theta_init, iter_max, lr=0.001, J_theta_limit=0.000001, cost=12):
    print("***********************")
    print("==>mini batch gradient descent started!")
    theta = theta_init
    J_theta_init = 0
    J_theta_old = 0
    for i in range(iter_max):
        J_theta_init = J_theta(X_input, y_input, theta)
        #print("J of theta is ", J_theta_init)
        if np.abs(J_theta_init - J_theta_old) <= J_theta_limit:
            break
        if np.abs(J_theta_init) < cost:
            break
        J_theta_old = J_theta_init
        ############################################################
        # mini-batch gradient : use a small group of samples to update theta per loop
        ############################################################
        # select 10 samples (vectors) from the dataset;
        # 10 is chosen from experience (suggested by Andrew Ng)
        batch_num = 10
        mini_batch = np.random.choice(y_input.size, batch_num)
        # stochastic gradient is the special case of mini-batch gradient
        # with batch_num = 1
        for k in mini_batch:
            delta = X_input[k, :] @ theta - y_input[k]
            # warning point 2 : divide by batch_num (the size of the mini-batch)
            theta = theta - lr * delta * (X_input.T[:, k].reshape((X_input.shape[1], 1))) / batch_num
    print("iteration times:", i)
    if i == iter_max - 1:
        print("!!!Reached the max number of iterations (iter_max)")
    print("J of theta is ", J_theta_init)
    print("**end")
    return theta
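# A more common mini-batch formulation (a sketch, not from the original post) takes
# one averaged, vectorized gradient step per batch instead of looping over its samples:
#   idx = np.random.choice(y_input.size, batch_num, replace=False)
#   delta = X_input[idx, :] @ theta - y_input[idx]
#   theta = theta - lr * (X_input[idx, :].T @ delta) / batch_num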
def batch_gradient_Ridge(X_input, y_input, theta_init, iter_max, lr=0.001, J_theta_limit=0.0000001, cost=12, alpha=0.00001):
    print("***********************")
    print("==>batch gradient descent with Ridge started!")
    theta = theta_init
    J_theta_init = 0
    J_theta_old = 0
    for i in range(iter_max):
        ###
        # Ridge regression : add an L2 penalty (alpha * ||theta||^2) to the cost function
        # (strictly speaking the bias term theta[0] is usually not penalized)
        ###
        t = X_input @ theta - y_input
        J_theta_init = (t.T @ t)[0, 0] / (y_input.size * 2) + alpha * (theta.T @ theta)[0, 0]
        #print("J of theta is ", J_theta_init)
        if np.abs(J_theta_init - J_theta_old) <= J_theta_limit:
            break
        #if np.abs(J_theta_init) < cost:
        #    break
        J_theta_old = J_theta_init
        ############################################################
        # batch gradient : use all samples to update theta in one loop
        ############################################################
        delta = X_input @ theta - y_input
        # warning point 2 : divide by m (the number of samples);
        # the gradient of the penalty term alpha * ||theta||^2 is 2 * alpha * theta
        theta = theta - lr * ((X_input.T @ delta) / y_input.size + 2 * alpha * theta)
    print("iteration times:", i)
    if i == iter_max - 1:
        print("!!!Reached the max number of iterations (iter_max)")
    print("J of theta is ", J_theta_init)
    print("**end")
    print(theta)
    return theta
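# Ridge also has a closed-form solution (a sketch, not part of the original post; the
# helper name ridge_normal_equation is introduced here only for illustration). For the
# standard ridge objective ||X@theta - y||^2 + alpha * ||theta||^2 (note the scaling
# differs from the gradient-descent cost above), theta = (X^T X + alpha * I)^(-1) X^T y:
def ridge_normal_equation(X_input, y_input, alpha):
    n = X_input.shape[1]
    I = np.eye(n)
    I[0, 0] = 0  # conventionally, do not penalize the bias/intercept term
    return np.linalg.pinv(X_input.T @ X_input + alpha * I) @ X_input.T @ y_input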
def batch_gradient_Lasso(X_input, y_input, theta_init, iter_max, lr=0.001, J_theta_limit=0.0000001, cost=12, alpha=0.00001):
    print("***********************")
    print("==>batch gradient descent with Lasso started!")
    theta = theta_init
    J_theta_init = 0
    J_theta_old = 0
    for i in range(iter_max):
        ###
        # Lasso regression : add an L1 penalty (alpha * ||theta||_1) to the cost function
        ###
        t = X_input @ theta - y_input
        J_theta_init = (t.T @ t)[0, 0] / (y_input.size * 2) + alpha * np.sum(np.abs(theta))
        #print("J of theta is ", J_theta_init)
        if np.abs(J_theta_init - J_theta_old) <= J_theta_limit:
            break
        #if np.abs(J_theta_init) < cost:
        #    break
        J_theta_old = J_theta_init
        ############################################################
        # batch gradient : use all samples to update theta in one loop
        ############################################################
        delta = X_input @ theta - y_input
        # warning point 2 : divide by m (the number of samples);
        # the subgradient of the penalty term alpha * ||theta||_1 is alpha * sign(theta)
        theta = theta - lr * ((X_input.T @ delta) / y_input.size + alpha * np.sign(theta))
    print("iteration times:", i)
    if i == iter_max - 1:
        print("!!!Reached the max number of iterations (iter_max)")
    print("J of theta is ", J_theta_init)
    print("**end")
    print(theta)
    return theta
def NormalEquation(X_input, y_input):
    # closed form : theta = (X^T X)^(-1) X^T y
    t = np.linalg.pinv(X_input.T @ X_input)
    theta = t @ X_input.T @ y_input
    print(theta.reshape(X_input.shape[1], 1))
    return theta
# warning point 1 : normalization
# if there is no feature normalization, cost function will not converge
X = featureNormalize(X)
NormalEquation(X, y)
# prepend a bias/intercept column of ones and turn y into a column vector
X = np.insert(X, 0, values=np.ones(y.size), axis=1)
y = y.reshape(y.size, 1)
ITER_MAX = 50000
theta_init = np.zeros((X.shape[1], 1))
batch_gradient(X, y, theta_init, ITER_MAX, lr=0.01, J_theta_limit=0.00000001, cost=10)
#stochastic_gradient(X, y, theta_init, ITER_MAX, lr=0.001, J_theta_limit=0.00000001, cost=11)
#mini_batch_gradient(X, y, theta_init, ITER_MAX, lr=0.001, J_theta_limit=0.00000001, cost=11)
batch_gradient_Ridge(X, y, theta_init, ITER_MAX, lr=0.01, J_theta_limit=0.00000001, cost=10)
batch_gradient_Lasso(X, y, theta_init, ITER_MAX, lr=0.01, J_theta_limit=0.00000001, cost=10)
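# For comparison (a sketch, not part of the original post) : fit the same data with
# scikit-learn's built-in estimators. fit_intercept=False because X already contains
# a bias column; the coefficients should be close to the ones found above, though the
# regularized fits will not match exactly since the alpha scaling conventions differ.
for model in (linear_model.LinearRegression(fit_intercept=False),
              linear_model.Ridge(alpha=0.01, fit_intercept=False),
              linear_model.Lasso(alpha=0.01, fit_intercept=False)):
    model.fit(X, y.ravel())
    print(type(model).__name__, model.coef_.ravel())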
#
# _oo0oo_
# o8888888o
# 88" . "88
# (| -_- |)
# 0\ = /0
# ___/`---'\___
# .' \\| |# '.
# / \\||| : |||# \
# / _||||| -:- |||||- \
# | | \\\ - #/ | |
# | \_| ''\---/'' |_/ |
# \ .-\__ '-' ___/-. /
# ___'. .' /--.--\ `. .'___
# ."" '< `.___\_<|>_/___.' >' "".
# | | : `- \`.;`\ _ /`;.`/ - ` : | |
# \ \ `_. \_ __\ /__ _/ .-` / /
# =====`-.____`.___ \_____/___.-`___.-'=====
# `=---='
#
#
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
#                 Buddha bless : no bugs, ever
#
#
#