[雪峰磁针石博客] Data Science from Scratch 6: Gradient Descent

Author: oychw | Published 2018-08-04 09:03

    We will frequently face the need to find the best model for a particular situation, and "best" will usually be interpreted as something like "minimizes the error of the model" or "maximizes the likelihood of the data." In other words, it represents the solution to some kind of optimization problem.

    This means we will need to solve a whole series of optimization problems, and we will do so from scratch. The approach we take is a technique called gradient descent, which lends itself well to a step-by-step, from-scratch treatment.
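
    To make the idea concrete before the full listing below: start from some point, take a small step in the direction opposite the derivative (downhill), and repeat until the position stops moving. A minimal one-dimensional sketch (the function f(x) = x**2, the step size of 0.1, and the stopping tolerance are illustrative choices, not from the original post):

    def gradient_descent_1d(derivative, x0, step_size=0.1, tolerance=0.0000001):
        """repeatedly step against the derivative until the position stops moving"""
        x = x0
        while True:
            next_x = x - step_size * derivative(x)    # move downhill
            if abs(next_x - x) < tolerance:           # the steps have become tiny
                return next_x
            x = next_x

    # f(x) = x**2 has derivative 2*x, so the minimum should be found near 0
    print(gradient_descent_1d(lambda x: 2 * x, 10.0))  # prints a value very close to 0.0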

    Computing the minimum with gradient descent (figure)

    Approximating the derivative with a difference quotient (figure)

    Goodness of fit of the difference-quotient approximation (figure)

    from linear_algebra import distance, vector_subtract, scalar_multiply
    import random
    import matplotlib.pyplot as plt
    
    from pylab import mpl
    mpl.rcParams['font.sans-serif'] = ['SimHei']   # use a font that can display Chinese labels
    
    def sum_of_squares(v):
        """computes the sum of squared elements in v"""
        return sum(v_i ** 2 for v_i in v)
    
    def difference_quotient(f, x, h):
        return (f(x + h) - f(x)) / h
    
    def plot_estimated_derivative():
    
        def square(x):
            return x * x
    
        def derivative(x):
            return 2 * x
    
        derivative_estimate = lambda x: difference_quotient(square, x, h=0.00001)
    
        # plot to show they're basically the same
        x = range(-10,10)
        plt.plot(x, list(map(derivative, x)), 'rx')           # red  x
        plt.plot(x, list(map(derivative_estimate, x)), 'b+')  # blue +
        plt.show()                                      # the red x's and blue +'s should nearly overlap
    
    def partial_difference_quotient(f, v, i, h):
    
        # add h to just the i-th element of v
        w = [v_j + (h if j == i else 0)
             for j, v_j in enumerate(v)]
    
        return (f(w) - f(v)) / h
    
    def estimate_gradient(f, v, h=0.00001):
        return [partial_difference_quotient(f, v, i, h)
                for i, _ in enumerate(v)]
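
    # Illustrative check (not part of the original post): for sum_of_squares the
    # exact gradient at v = [1, 2, 3] is [2, 4, 6], and
    # estimate_gradient(sum_of_squares, [1, 2, 3]) returns roughly
    # [2.00001, 4.00001, 6.00001] -- each component is overestimated by about h.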
    
    def step(v, direction, step_size):
        """move step_size in the direction from v"""
        return [v_i + step_size * direction_i
                for v_i, direction_i in zip(v, direction)]
    
    def sum_of_squares_gradient(v):
        """the gradient of sum_of_squares: each component is just 2 * v_i"""
        return [2 * v_i for v_i in v]
    
    def safe(f):
        """define a new function that wraps f and return it"""
        def safe_f(*args, **kwargs):
            try:
                return f(*args, **kwargs)
            except:
                return float('inf')         # this means "infinity" in Python
        return safe_f
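
    # Illustrative check (not part of the original post): wrapping a function that
    # raises turns the exception into infinity, e.g.
    #
    #     safe(lambda x: 1 / x)(0)    # returns float('inf') instead of raising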
    
    
    #
    #
    # minimize / maximize batch
    #
    #
    
    def minimize_batch(target_fn, gradient_fn, theta_0, tolerance=0.000001):
        """use gradient descent to find theta that minimizes target function"""
    
        step_sizes = [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
    
        theta = theta_0                           # set theta to initial value
        target_fn = safe(target_fn)               # safe version of target_fn
        value = target_fn(theta)                  # value we're minimizing
    
        while True:
            gradient = gradient_fn(theta)
            next_thetas = [step(theta, gradient, -step_size)
                           for step_size in step_sizes]
    
            # choose the one that minimizes the error function
            next_theta = min(next_thetas, key=target_fn)
            next_value = target_fn(next_theta)
    
            # stop if we're "converging"
            if abs(value - next_value) < tolerance:
                return theta
            else:
                theta, value = next_theta, next_value
    
    def negate(f):
        """return a function that for any input x returns -f(x)"""
        return lambda *args, **kwargs: -f(*args, **kwargs)
    
    def negate_all(f):
        """the same when f returns a list of numbers"""
        return lambda *args, **kwargs: [-y for y in f(*args, **kwargs)]
    
    def maximize_batch(target_fn, gradient_fn, theta_0, tolerance=0.000001):
        return minimize_batch(negate(target_fn),
                              negate_all(gradient_fn),
                              theta_0,
                              tolerance)
    
    #
    # minimize / maximize stochastic
    #
    
    def in_random_order(data):
        """generator that returns the elements of data in random order"""
        indexes = [i for i, _ in enumerate(data)]  # create a list of indexes
        random.shuffle(indexes)                    # shuffle them
        for i in indexes:                          # return the data in that order
            yield data[i]
    
    def minimize_stochastic(target_fn, gradient_fn, x, y, theta_0, alpha_0=0.01):
        """use stochastic gradient descent to find theta that minimizes target
        function, taking a gradient step one data point at a time"""
    
        data = list(zip(x, y))
        theta = theta_0                             # initial guess
        alpha = alpha_0                             # initial step size
        min_theta, min_value = None, float("inf")   # the minimum so far
        iterations_with_no_improvement = 0
    
        # if we ever go 100 iterations with no improvement, stop
        while iterations_with_no_improvement < 100:
            value = sum( target_fn(x_i, y_i, theta) for x_i, y_i in data )
    
            if value < min_value:
                # if we've found a new minimum, remember it
                # and go back to the original step size
                min_theta, min_value = theta, value
                iterations_with_no_improvement = 0
                alpha = alpha_0
            else:
                # otherwise we're not improving, so try shrinking the step size
                iterations_with_no_improvement += 1
                alpha *= 0.9
    
            # and take a gradient step for each of the data points
            for x_i, y_i in in_random_order(data):
                gradient_i = gradient_fn(x_i, y_i, theta)
                theta = vector_subtract(theta, scalar_multiply(alpha, gradient_i))
    
        return min_theta
    
    def maximize_stochastic(target_fn, gradient_fn, x, y, theta_0, alpha_0=0.01):
        return minimize_stochastic(negate(target_fn),
                                   negate_all(gradient_fn),
                                   x, y, theta_0, alpha_0)
    
    if __name__ == "__main__":
    
        print("using the gradient")
    
        v = [random.randint(-10,10) for i in range(3)]
    
        tolerance = 0.0000001
    
        while True:
            # print(v, sum_of_squares(v))
            gradient = sum_of_squares_gradient(v)   # compute the gradient at v
            next_v = step(v, gradient, -0.01)       # take a negative gradient step
            if distance(next_v, v) < tolerance:     # stop if we're converging
                break
            v = next_v                              # continue if we're not
    
        print("minimum v", v)
        print("minimum value", sum_of_squares(v))
        print()
    
    
        print("using minimize_batch")
    
        v = [random.randint(-10,10) for i in range(3)]
    
        v = minimize_batch(sum_of_squares, sum_of_squares_gradient, v)
    
        print("minimum v", v)
        print("minimum value", sum_of_squares(v))
        plot_estimated_derivative()
        
    

    Execution results

    using the gradient
    minimum v [1.6985844650062676e-06, 1.1323896433375117e-06, -4.529558573350047e-06]
    minimum value 2.4684396358507597e-11
    
    using minimize_batch
    minimum v [-0.000519229685853483, -0.001038459371706966, -0.001038459371706966]
    minimum value 2.42639520004356e-06
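
    The output above only exercises the batch versions. Below is a minimal sketch of how minimize_stochastic might be used to fit a line y = alpha + beta * x; the data and the squared_error helpers are illustrative assumptions rather than part of the original post, and the sketch assumes the functions and imports defined above are available.

    def squared_error(x_i, y_i, theta):
        """squared prediction error for one data point, with theta = [alpha, beta]"""
        alpha, beta = theta
        return (y_i - (alpha + beta * x_i)) ** 2

    def squared_error_gradient(x_i, y_i, theta):
        """gradient of squared_error with respect to alpha and beta"""
        alpha, beta = theta
        error = y_i - (alpha + beta * x_i)
        return [-2 * error,            # partial derivative with respect to alpha
                -2 * error * x_i]      # partial derivative with respect to beta

    x = list(range(10))
    y = [3 + 2 * x_i for x_i in x]     # noiseless data from the line y = 3 + 2x

    theta = minimize_stochastic(squared_error, squared_error_gradient,
                                x, y, [random.random(), random.random()])
    print(theta)                       # should end up roughly [3, 2]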
    

