sklearn Learning Notes

Author: Pingouin | Published 2020-10-12

    Reference: https://www.bilibili.com/video/BV1xW411Y7Qd?p=8

    1. How to choose a machine learning method: the official sklearn cheat sheet
      https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html
    2. The general learning pattern
    import numpy as np 
    from sklearn import datasets
    from sklearn.model_selection import train_test_split 
    from sklearn.neighbors import KNeighborsClassifier
    
    ## 1. Load the data
    iris = datasets.load_iris()  # iris is a type of flower
    iris_X = iris.data
    iris_y = iris.target
    
    # Take a look at the data
    # print(iris_X[:2, :])  # iris features, first two samples
    # print(iris_y)  # class labels; the output shows 3 classes
    
    ## 2. Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=0.3)  # train:test = 7:3
    # print(y_train)  # the output shows the data was split and shuffled
    
    ## 3. Fit the model
    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)
    
    # Compare predictions against the true labels
    print(knn.predict(X_test))
    print(y_test)
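    
    Rather than eyeballing the two printed arrays, knn.score reports the test accuracy directly:
    
    print(knn.score(X_test, y_test))  # fraction of test samples classified correctly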
    
    3. sklearn's built-in datasets
    from sklearn import datasets
    from sklearn.linear_model import LinearRegression
    
    loaded_data = datasets.load_boston()  # note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2
    data_X = loaded_data.data
    data_y = loaded_data.target
    
    model = LinearRegression()  # choose the model; parameters are left at their defaults
    model.fit(data_X, data_y)
    
    print(model.predict(data_X[:4, :]))  # predict the first four samples
    print(data_y[:4])  # compare with the first four true values
    
    # make some data points
    import matplotlib.pyplot as plt
    X, y = datasets.make_regression(n_samples=100, n_features=1, n_targets=1, noise=1)
    plt.scatter(X, y)
    plt.show()
    
    # with larger noise the points scatter further from the underlying line
    X, y = datasets.make_regression(n_samples=100, n_features=1, n_targets=1, noise=10)
    plt.scatter(X, y)
    plt.show()
    
    4. sklearn model attributes and methods
    from sklearn import datasets
    from sklearn.linear_model import LinearRegression
    
    loaded_data = datasets.load_boston()
    data_X = loaded_data.data
    data_y = loaded_data.target
    
    model = LinearRegression()  # choose the model; parameters are left at their defaults
    model.fit(data_X, data_y)
    
    # fitted parameters of the linear regression
    print(model.coef_)  # one coefficient per feature
    print(model.intercept_)
    
    # methods
    print(model.get_params())  # the hyperparameters the model was constructed with
    print(model.score(data_X, data_y))  # R^2, the coefficient of determination: how well predictions match the data
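    
    model.score here is the same R^2 that sklearn.metrics computes by hand:
    
    from sklearn.metrics import r2_score
    print(r2_score(data_y, model.predict(data_X)))  # matches model.score(data_X, data_y)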
    
    
    
    5. Normalization (scaling):
      • make sure features are on a similar scale
      • if the ranges of x1 and x2 differ greatly, gradient descent has a hard time converging to the minimum
    from sklearn import preprocessing
    import numpy as np
    
    a = np.array([[10, 2.7, 3.6], [-100, 5, -2], [120, 20, 40]], dtype=np.float64)
    print(a)
    print(preprocessing.scale(a))  # zero mean, unit variance per column
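    
    The same module also offers min-max scaling; a minimal extra sketch mapping each column into a fixed range:
    
    print(preprocessing.minmax_scale(a, feature_range=(0, 1)))  # each column rescaled into (0, 1)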
    
    from sklearn.model_selection import train_test_split
    from sklearn.datasets import make_classification  # sklearn.datasets.samples_generator was removed; import from sklearn.datasets
    from sklearn.svm import SVC
    import matplotlib.pyplot as plt
    
    # generate some data
    X, y = make_classification(n_samples=300, n_features=2, n_redundant=0, n_informative=2,
                               random_state=22, n_clusters_per_class=1, scale=100)  # random_state fixes the seed, so the same data is generated on every run
    
    # look at the data
    # plt.scatter(X[:, 0], X[:, 1], c=y)
    
    X = preprocessing.scale(X)  # comment this line out to compare performance with and without scaling
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
    clf = SVC()
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))
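    
    One caveat: scaling all of X before splitting lets test-set statistics leak into training. A minimal leak-free sketch with StandardScaler, fit on the training split only (reusing the unscaled X, y from above):
    
    from sklearn.preprocessing import StandardScaler
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
    scaler = StandardScaler().fit(X_train)  # statistics learned from the training data only
    clf = SVC()
    clf.fit(scaler.transform(X_train), y_train)
    print(clf.score(scaler.transform(X_test), y_test))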
    
    
    6. How to evaluate a model (e.g. a neural network)

      • Accuracy
      • R^2 score
      • F1 score
        (a quick sketch computing these metrics follows this list)

      Overfitting:

      • the training error is much lower than the test error (error_training << error_test)

      Remedies:

      • Theano: L1/L2 regularization
      • TensorFlow: dropout
      • sklearn: cross validation
        How do we decide which parameter value works best (e.g. the number of layers)?
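    
    A minimal sketch of the three metrics above via sklearn.metrics (the label arrays are made up for illustration):
    
    from sklearn.metrics import accuracy_score, f1_score, r2_score
    
    y_true = [0, 1, 1, 0, 1]
    y_pred = [0, 1, 0, 0, 1]
    print(accuracy_score(y_true, y_pred))  # fraction of labels predicted correctly
    print(f1_score(y_true, y_pred))        # harmonic mean of precision and recall
    print(r2_score([2.5, 0.0, 2.1], [2.4, 0.1, 2.0]))  # R^2 applies to regression targets
    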
    7. Cross Validation

      • validate by comparing different parameters, models, or feature sets
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split 
    from sklearn.neighbors import KNeighborsClassifier
    
    iris = load_iris() 
    X = iris.data
    y = iris.target
    
    X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=4) 
    
    knn = KNeighborsClassifier(n_neighbors=5)  # consider the 5 nearest neighbors of each point
    knn.fit(X_train,y_train) 
    print(knn.score(X_test,y_test))
    
    # the same model, now scored with cross validation
    from sklearn.model_selection import cross_val_score
    knn = KNeighborsClassifier(n_neighbors=5)
    scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy')
    print(scores.mean())
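    
    cross_val_score returns one accuracy per fold, so the spread across folds is available as well:
    
    print(scores)                       # five fold accuracies for cv=5
    print(scores.mean(), scores.std())  # mean and spread across folds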
    
    from sklearn.model_selection import cross_val_score
    import matplotlib.pyplot as plt
    k_range = range(1, 31)
    k_scores = []
    for k in k_range:
        knn = KNeighborsClassifier(n_neighbors=k)
        # scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')  # for classification
        loss = -cross_val_score(knn, X, y, cv=10, scoring='neg_mean_squared_error')  # for regression
        k_scores.append(loss.mean())  # append scores.mean() instead for the classification variant
    
    plt.plot(k_range, k_scores)
    plt.xlabel('Value of K for KNN')
    plt.ylabel('Cross-validated loss')
    plt.show()
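    
    To read the best k off the scan (lower is better for losses; assumes numpy is imported as np):
    
    import numpy as np
    best_k = k_range[int(np.argmin(k_scores))]  # use np.argmax for accuracy scores
    print(best_k)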
    
    # diagnosing overfitting with a learning curve
    from sklearn.model_selection import learning_curve
    from sklearn.datasets import load_digits
    from sklearn.svm import SVC
    import matplotlib.pyplot as plt
    import numpy as np
    
    digits = load_digits()
    X = digits.data
    y = digits.target
    
    train_sizes, train_loss, test_loss = learning_curve(
        SVC(gamma=0.001), X, y, cv=10, scoring='neg_mean_squared_error',
        train_sizes=[0.1, 0.25, 0.5, 0.75, 1]  # record scores at 10%, 25%, 50%, 75% and 100% of the training data
    )
    train_loss_mean = -np.mean(train_loss, axis=1)
    test_loss_mean = -np.mean(test_loss, axis=1)
    # plot the results
    plt.plot(train_sizes, train_loss_mean, 'o-', color='r', label='training')
    plt.plot(train_sizes, test_loss_mean, 'o-', color='g', label='CV')
    plt.xlabel('training examples')
    plt.ylabel('loss')
    plt.legend(loc='best')
    plt.show()
    
    # diagnosing overfitting with a validation curve: scan a hyperparameter (gamma) instead of the training size
    from sklearn.model_selection import validation_curve
    from sklearn.datasets import load_digits
    from sklearn.svm import SVC
    import matplotlib.pyplot as plt
    import numpy as np
    
    digits = load_digits()
    X = digits.data
    y = digits.target
    param_range = np.logspace(-6, -2.3, 5)
    train_loss, test_loss = validation_curve(
        SVC(), X, y, param_name='gamma', param_range=param_range, cv=10,
        scoring='neg_mean_squared_error',
    )
    train_loss_mean = -np.mean(train_loss, axis=1)
    test_loss_mean = -np.mean(test_loss, axis=1)
    # plot the results
    plt.plot(param_range, train_loss_mean, 'o-', color='r', label='training')
    plt.plot(param_range, test_loss_mean, 'o-', color='g', label='CV')
    plt.xlabel('gamma')
    plt.ylabel('loss')
    plt.legend(loc='best')
    plt.show()
    
    8. Saving/restoring a trained model
    from sklearn import svm
    from sklearn import datasets
    
    clf = svm.SVC()
    iris = datasets.load_iris()
    X,y = iris.data, iris.target
    clf.fit(X,y)
    
    # method 1: pickle
    import pickle
    ## save
    # with open('save/clf.pickle', 'wb') as f:
    #     pickle.dump(clf, f)  # dump clf into the file f
    ## restore
    with open('save/clf.pickle', 'rb') as f:
        clf2 = pickle.load(f)
        print(clf2.predict(X[0:1]))
    
    # method 2: joblib, faster than pickle for models carrying large numpy arrays
    import joblib  # sklearn.externals.joblib was removed; import joblib directly
    ## save
    joblib.dump(clf, 'save/clf.pkl')
    ## restore
    clf3 = joblib.load('save/clf.pkl')
    print(clf3.predict(X[0:1]))
    
