Reference: https://www.bilibili.com/video/BV1xW411Y7Qd?p=8
- How to choose a machine learning method: sklearn's official cheat sheet
https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html
- General workflow
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
## 1. Load the data
iris = datasets.load_iris() # iris is a type of flower
iris_X = iris.data
iris_y = iris.target
# take a look at the data
#print(iris_X[:2,:]) # the iris features, first two samples
#print(iris_y) # the labels; the output shows 3 classes
## 2. Split into training and test sets
X_train,X_test,y_train,y_test = train_test_split(iris_X,iris_y,test_size=0.3) # train:test = 7:3
#print(y_train) # the output shows the data was split and shuffled
## 3. Fit and predict
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
# compare predictions against the ground truth
print(knn.predict(X_test))
print(y_test)
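A quick follow-up (my addition): instead of comparing the two printed arrays by eye, score reports test-set accuracy directly.
# fraction of test samples predicted correctly (mean accuracy)
print(knn.score(X_test,y_test))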
- sklearn's built-in datasets
from sklearn import datasets
from sklearn.linear_model import LinearRegression
loaded_data = datasets.load_boston() # note: load_boston was removed in scikit-learn 1.2; on newer versions use datasets.fetch_california_housing() instead
data_X = loaded_data.data
data_y = loaded_data.target
model = LinearRegression() # choose the model; parameters left at their defaults
model.fit(data_X,data_y)
print(model.predict(data_X[:4,:])) # predict the first 4 samples
print(data_y[:4]) # compare against the first 4 true values
# make some datapoints
import matplotlib.pyplot as plt
X,y= datasets.make_regression(n_samples=100,n_features=1,n_targets=1,noise=1)
plt.scatter(X,y)
# the same data with larger noise
X,y= datasets.make_regression(n_samples=100,n_features=1,n_targets=1,noise=10)
plt.scatter(X,y)
- sklearn model attributes and methods
from sklearn import datasets
from sklearn.linear_model import LinearRegression
loaded_data = datasets.load_boston() # removed in scikit-learn 1.2, see the note above
data_X = loaded_data.data
data_y = loaded_data.target
model = LinearRegression() # choose the model; parameters left at their defaults
model.fit(data_X,data_y)
# the fitted parameters of the linear regression
print(model.coef_) # one coefficient per feature
print(model.intercept_)
# methods
print(model.get_params()) # the hyperparameters of the model itself
print(model.score(data_X,data_y)) # R^2, the coefficient of determination: scores how well predictions match the actual data
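As a sanity check (my addition, not from the video), the fitted coefficients reproduce predict() via a plain dot product:
import numpy as np
# manual prediction: X @ coef_ + intercept_
manual = data_X[:4,:].dot(model.coef_) + model.intercept_
print(np.allclose(manual, model.predict(data_X[:4,:]))) # True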
- normalization (scaling):
- make sure features are on a similar scale
- if the value ranges of x1 and x2 differ widely, gradient descent has trouble converging to the minimum
from sklearn import preprocessing
import numpy as np
a = np.array([[10,2.7,3.6],[-100,5,-2],[120,20,40]],dtype=np.float64)
print(a)
print(preprocessing.scale(a))
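For reference (a sketch I added), preprocessing.scale standardizes each column to zero mean and unit variance, i.e.:
# equivalent manual standardization: (x - column mean) / column std
print((a - a.mean(axis=0)) / a.std(axis=0))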
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification # the samples_generator path was removed in newer sklearn
from sklearn.svm import SVC
import matplotlib.pyplot as plt
# generate some data
X,y = make_classification(n_samples=300, n_features=2, n_redundant=0, n_informative=2, random_state=22, n_clusters_per_class=1, scale=100) # fixing random_state makes every run generate the same data
# look at data
#plt.scatter(X[:,0],X[:,1],c=y)
X = preprocessing.scale(X) # comment this line out to compare performance before and after preprocessing
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.3)
clf = SVC()
clf.fit(X_train,y_train)
print(clf.score(X_test,y_test))
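To make that comparison concrete, here is a self-contained sketch (variable names are mine; it reuses the imports above) that fits the same SVC on raw and on scaled features:
# compare test accuracy with and without feature scaling
X_raw, y2 = make_classification(n_samples=300, n_features=2, n_redundant=0,
                                n_informative=2, random_state=22,
                                n_clusters_per_class=1, scale=100)
for name, data in [('raw', X_raw), ('scaled', preprocessing.scale(X_raw))]:
    Xtr, Xte, ytr, yte = train_test_split(data, y2, test_size=.3, random_state=0)
    print(name, SVC().fit(Xtr, ytr).score(Xte, yte))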
- How to evaluate a neural network (a metric sketch follows this list):
- Accuracy
- R2 score
- F1 score
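A minimal sketch of the three metrics, with made-up labels just to show the sklearn.metrics calls:
from sklearn.metrics import accuracy_score, f1_score, r2_score
y_true = [0, 1, 1, 0, 1] # hypothetical labels
y_pred = [0, 1, 0, 0, 1]
print(accuracy_score(y_true, y_pred)) # fraction predicted correctly
print(f1_score(y_true, y_pred)) # harmonic mean of precision and recall
print(r2_score([2.5, 0.0, 2.1], [2.4, 0.1, 2.0])) # for regression outputs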
overfitting:
- error_training < error_test
Remedies (an L1/L2 sketch follows this list):
- Theano: L1/L2 regularization
- TensorFlow: dropout
- sklearn: cross validation
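sklearn also exposes L1/L2 regularization directly; a sketch using Ridge and Lasso (my choice of example, not from the video):
from sklearn.linear_model import Ridge, Lasso
from sklearn.datasets import make_regression
Xr, yr = make_regression(n_samples=100, n_features=10, noise=5, random_state=0)
# larger alpha = stronger penalty = smaller coefficients
print(Ridge(alpha=1.0).fit(Xr, yr).coef_) # L2 penalty shrinks coefficients
print(Lasso(alpha=1.0).fit(Xr, yr).coef_) # L1 penalty zeroes some out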
How do we decide which parameter value solves the problem best? (e.g. the number of layers)
- Cross Validation
- validate by comparing different parameters / models / feature sets
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
iris = load_iris()
X = iris.data
y = iris.target
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=4)
knn = KNeighborsClassifier(n_neighbors=5) # consider each point's 5 nearest neighbors
knn.fit(X_train,y_train)
print(knn.score(X_test,y_test))
# the same evaluation with cross validation added
from sklearn.model_selection import cross_val_score
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn,X,y,cv=5,scoring = 'accuracy')
print(scores.mean())
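For intuition, a rough sketch (my addition) of what cross_val_score does: split into folds, fit on each training part, average the scores. Note sklearn actually uses StratifiedKFold for classifiers:
from sklearn.model_selection import KFold
import numpy as np
fold_scores = []
for train_idx, test_idx in KFold(n_splits=5, shuffle=True, random_state=0).split(X):
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X[train_idx], y[train_idx])
    fold_scores.append(knn.score(X[test_idx], y[test_idx]))
print(np.mean(fold_scores)) # close to the cross_val_score mean above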
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
k_range = range(1,31)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    #scores = cross_val_score(knn,X,y,cv=10,scoring='accuracy') # for classification
    loss = -cross_val_score(knn,X,y,cv=10,scoring='neg_mean_squared_error') # for regression
    k_scores.append(loss.mean()) # append scores.mean() instead when using accuracy
plt.plot(k_range,k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-validated loss') # label it 'accuracy' when plotting scores instead
# the overfitting problem: visualize it with a learning curve
from sklearn.model_selection import learning_curve
from sklearn.datasets import load_digits
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np
digits = load_digits()
X = digits.data
y = digits.target
train_sizes, train_loss, test_loss = learning_curve(
SVC(gamma=0.001),X,y,cv=10,scoring='neg_mean_squared_error',
train_sizes=[0.1,0.25,0.5,0.75,1] # record the loss at 10%, 25%, 50%, 75% and 100% of the training data
)
train_loss_mean = - np.mean(train_loss,axis=1)
test_loss_mean = -np.mean(test_loss,axis=1)
# plot the results
plt.plot(train_sizes,train_loss_mean,'o-',color ="r", label='training')
plt.plot(train_sizes,test_loss_mean,'o-',color='g',label='CV')
plt.xlabel("training example")
plt.ylabel("loss")
plt.legend(loc="best")
# the overfitting problem: diagnose it with a validation curve
from sklearn.model_selection import validation_curve
from sklearn.datasets import load_digits
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np
digits = load_digits()
X = digits.data
y = digits.target
param_range = np.logspace(-6,-2.3,5)
train_loss, test_loss = validation_curve(
SVC(),X,y,param_name='gamma',param_range=param_range,cv=10,scoring='neg_mean_squared_error',
)
train_loss_mean = - np.mean(train_loss,axis=1)
test_loss_mean = -np.mean(test_loss,axis=1)
# plot the results
plt.plot(param_range,train_loss_mean,'o-',color ="r", label='training')
plt.plot(param_range,test_loss_mean,'o-',color='g',label='CV')
plt.xlabel("gamma")
plt.ylabel("loss")
plt.legend(loc="best")
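A small follow-up (my addition): the best gamma can be read off the arrays as the one minimizing the cross-validated loss.
# pick the gamma with the lowest CV loss
best_gamma = param_range[np.argmin(test_loss_mean)]
print(best_gamma)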
- Saving and restoring a trained model
from sklearn import svm
from sklearn import datasets
clf = svm.SVC()
iris = datasets.load_iris()
X,y = iris.data, iris.target
clf.fit(X,y)
# method 1: pickle
import pickle
## save
#with open('save/clf.pickle','wb') as f:
#    pickle.dump(clf,f) # dump clf into the file
## restore
with open('save/clf.pickle','rb') as f:
    clf2 = pickle.load(f)
print(clf2.predict(X[0:1]))
# method 2: joblib, faster than pickle for models holding large numpy arrays
import joblib
## save
joblib.dump(clf,'save/clf.pkl')
## restore
clf3 = joblib.load('save/clf.pkl')
print(clf3.predict(X[0:1]))