Before the Main Text
I'm not sure what path led me back to it, but I recently revisited the entry-level staple of machine learning, linear regression, and figured I might as well watch what's billed as the best introductory course: Andrew Ng's Machine Learning.
I have to say, the master is the master. His explanation is far better than anything I'd seen before. I then found a set of notes, followed along, and wrote my own version that redraws the fitted line in real time as it trains. I find it easier to follow than the widely circulated Python version (which, to be fair, is already excellent; mine is modeled on it).
Main Text
Without further ado, here are the notes for Ng's course and the GitHub link to the Python implementation (not mine:
And then my own code and the results:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

A = np.eye(5)  # warm-up exercise from ex1: return a 5x5 identity matrix

# single-variable dataset: city population vs. food-truck profit
data = pd.read_csv('ex1data1.txt', header=None, names=['Population', 'Profit'])
data.describe()
data.plot(kind='scatter', x='Population', y='Profit', figsize=(12, 8))

data.insert(0, 'Ones', 1)  # column of 1s so theta[0] acts as the intercept

cols = data.shape[1]
X = data.iloc[:, 0:cols-1]      # every column but the last: the features
y = data.iloc[:, cols-1:cols]   # the last column: the target
m = len(y)                      # number of training examples

def computeCost(X, y, theta):
    # squared-error cost: J(theta) = sum((X*theta - y)^2) / (2m)
    inner = np.power(((X * theta) - y), 2)
    return np.sum(inner) / (2 * len(X))

X.head()
   Ones  Population
0     1      6.1101
1     1      5.5277
2     1      8.5186
3     1      7.0032
4     1      5.8598
y.head()
    Profit
0  17.5920
1   9.1302
2  13.6620
3  11.8540
4   6.8233
X = np.matrix(X.values)   # convert the DataFrames to numpy matrices
y = np.matrix(y.values)
theta = np.zeros((2, 1))  # both parameters start at zero

X.shape, theta.shape, y.shape
>> ((97, 2), (2, 1), (97, 1))
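The shape check above is worth a pause: for the cost to be well defined, X * theta must have the same shape as y, i.e. (97, 2) · (2, 1) → (97, 1). A one-line guard you could add before training:

assert (X * theta).shape == y.shape  # (97,2)·(2,1) -> (97,1), matching y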
computeCost(X, y, theta)   # with theta all zeros this should come out around 32.07
theta.size                 # number of parameters (2 here), used inside gradientDescent
np.zeros(theta.shape)      # the zero buffer that gradientDescent starts from
The part below is where I changed the most; the data loading and preprocessing above I basically copied as-is:
from IPython import display

def gradientDescent(X, y, theta, alpha, iters):
    # x values used to redraw the current fit line on every frame
    x = np.linspace(data.Population.min(), data.Population.max(), 100)
    temp = np.zeros(theta.shape)
    pars = theta.size
    cost = np.zeros(iters)
    for i in range(iters):
        # the live line plot only makes sense for the single-feature model;
        # for the 3-parameter multivariate run there is no sensible 2-D line, so skip it
        if pars == 2:
            f = theta[0, 0] + (theta[1, 0] * x)
            plt.plot(x, f, 'r', label='Prediction')
            plt.scatter(data.Population, data.Profit, label='Training Data')
            plt.xlabel('Population')
            plt.ylabel('Profit')
            plt.title('Predicted Profit vs. Population Size --> %s' % i)
            plt.legend(loc=2)
            plt.show()
            display.clear_output(wait=True)  # clear the cell so the next frame replaces this one
        # simultaneous update: compute the error once with the old theta,
        # then refresh every parameter from it
        error = X * theta - y
        for j in range(pars):
            temp[j, 0] = theta[j, 0] - (alpha / len(X)) * np.sum(np.multiply(error, X[:, j]))
        theta = temp.copy()  # copy so later writes to temp can't mutate theta in place
        cost[i] = computeCost(X, y, theta)
    return theta, cost
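As an aside, the per-parameter inner loop above can be collapsed into a single matrix expression. A minimal sketch of one equivalent vectorized step, assuming the same np.matrix types:

# theta := theta - (alpha/m) * X^T (X*theta - y), all parameters at once
theta = theta - (alpha / len(X)) * X.T * (X * theta - y)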
iterations = 1500  # gradient descent steps
alpha = 0.01       # learning rate
g, cost = gradientDescent(X, y, theta, alpha, iterations)
g
[Figure: one frame of the live plot during training; the red box marks where the current iteration number is displayed in the title.]
computeCost(X, y, g)  # cost with the learned parameters, well below the initial ~32.07
fig, ax = plt.subplots(figsize=(12,8))
ax.plot(np.arange(iterations), cost, 'r')
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
ax.set_title('Error vs. Training Epoch')
plt.show()
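With the learned parameters in hand, the prediction step from the original exercise follows directly: estimate profit for cities of 35,000 and 70,000 people (the dataset stores population in units of 10,000):

# h(x) = [1, x] * theta: multiply a row [1, population/10000] by the learned g
predict1 = np.matrix([1, 3.5]) * g
predict2 = np.matrix([1, 7.0]) * g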
After the Main Text
There was also a multivariate version I hadn't written yet. Off to take a shower~ 🛀
...Oh well, it's just copy-paste and changing a few parameters, no big deal~
path = 'ex1data2.txt'
data2 = pd.read_csv(path, header=None, names=['Size', 'Bedrooms', 'Price'])
data2.head()
# feature normalization: zero mean, unit variance per column, so that Size
# (in the 1000s) doesn't drown out Bedrooms (1-5) in the gradient
data2 = (data2 - data2.mean()) / data2.std()
data2.head()
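One caveat I'd flag before moving on: any future input has to be scaled with the same statistics, so it pays to save the mean and standard deviation rather than overwriting data2 blindly. A minimal sketch of the same step, with mu and sigma as names I'm introducing here:

mu = data2.mean()      # per-column means (Size, Bedrooms, Price), saved for later
sigma = data2.std()    # per-column standard deviations
data2 = (data2 - mu) / sigma
# a new house would then be scaled as (size - mu['Size']) / sigma['Size'], etc.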
# add ones column
data2.insert(0, 'Ones', 1)
# set X (training data) and y (target variable)
cols = data2.shape[1]
iters = 1000
X2 = data2.iloc[:,0:cols-1]
y2 = data2.iloc[:,cols-1:cols]
# convert to matrices and initialize theta
X2 = np.matrix(X2.values)
y2 = np.matrix(y2.values)
theta2 = np.matrix(np.zeros((3, 1)))  # three parameters: intercept + Size + Bedrooms
# perform linear regression on the data set
g2, cost2 = gradientDescent(X2, y2, theta2, alpha, iters)
# get the cost (error) of the model
computeCost(X2, y2, g2)
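As a sanity check, the normal equation covered in the same part of the course gives the closed-form optimum, and gradient descent should land close to it after enough iterations. A sketch against the same matrices (theta_ne is a name I'm introducing):

# closed form: theta = (X^T X)^(-1) X^T y; pinv tolerates a singular X^T X
theta_ne = np.linalg.pinv(X2.T * X2) * X2.T * y2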