Regression Algorithms -- LogisticRegressionCV

Author: longsan0918 | Published 2018-12-10 12:05

    1 Iris Data Classification

    # -*- coding: utf-8 -*-
    # @Time    : 2018/12/6 下午5:44
    # @Author  : scl
    # @Email   : 1163820757@qq.com
    # @File    : 鸢尾花数据分类(分类问题).py
    # @Software: PyCharm
    
    import numpy as np
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import pandas as pd
    import warnings
    
    import sklearn
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.exceptions import ConvergenceWarning
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.neighbors import KNeighborsClassifier #KNN
    from sklearn.preprocessing import label_binarize
    from sklearn import metrics
    
    ## Set a font that can render Chinese characters (avoids garbled labels)
    mpl.rcParams['font.sans-serif']=[u'simHei']
    mpl.rcParams['axes.unicode_minus']=False
    ## Suppress sklearn convergence warnings
    warnings.filterwarnings(action = 'ignore', category=ConvergenceWarning)
    
    
    path = "datas/iris.data"
    names = ['sepal length', 'sepal width', 'petal length','petal width', 'cla']
    df = pd.read_csv(path, header=None, names=names)
    df['cla'].value_counts()
    df.head()
    
    # Encode each record: features as floats, class label as a numeric code (1/2/3)
    def parseRecord(record):
        result=[]
        r = zip(names,record)
        for name,v in r:
            if name == 'cla':
                if v == 'Iris-setosa':
                    result.append(1)
                elif v == 'Iris-versicolor':
                    result.append(2)
                elif v == 'Iris-virginica':
                    result.append(3)
                else:
                    result.append(np.nan)
            else:
                result.append(float(v))
        return result
    
    
    ### Convert the data
    datas = df.apply(lambda r: pd.Series(parseRecord(r),index=names), axis=1)
    
    ### Drop rows with missing or unrecognized values
    datas = datas.dropna(how='any')
    
    ### Split features and target
    X = datas[names[0:-1]]
    Y = datas[names[-1]]
    
    ### Train/test split
    X_train,X_test,Y_train,Y_test = train_test_split(X, Y, test_size=0.4, random_state=0)
    
    print ("Total samples: %d; training samples: %d; features: %d; test samples: %d"
      % (len(X), len(X_train), X_train.shape[1], X_test.shape[0]))
    
    
    # Standardize the features (fit on the training set, transform both sets)
    ss = StandardScaler()
    X_train = ss.fit_transform(X_train)
    X_test = ss.transform(X_test)
    
    
    
    # Cs: 50 candidate values of the inverse regularization strength C, log-spaced in [1e-4, 10]
    # cv=3: 3-fold cross-validation to pick the best C
    lr = LogisticRegressionCV(Cs=np.logspace(-4,1,50),
      cv=3,fit_intercept=True, penalty='l2', solver='lbfgs',
        tol=0.01, multi_class='multinomial')
    # solver options: 'newton-cg', 'lbfgs', 'liblinear', 'sag'; 'multinomial' requires a solver other than 'liblinear'
    # 'sag' = Stochastic Average Gradient descent
    # multi_class: 'ovr' (one-vs-rest) or 'multinomial' (softmax)
    lr.fit(X_train, Y_train)
    
    
    ## One-hot encode the true test labels (needed for the micro-averaged ROC below)
    y_test_hot = label_binarize(Y_test,classes=(1,2,3))
    print(y_test_hot)
    ## Decision scores for each test sample (one column per class)
    lr_y_score = lr.decision_function(X_test)
    ## Compute the ROC curve, micro-averaged over the three classes via ravel()
    lr_fpr, lr_tpr, lr_threasholds = metrics.roc_curve(y_test_hot.ravel(),lr_y_score.ravel())
    # lr_threasholds: the decision thresholds used to trace the curve
    ## Compute the AUC
    lr_auc = metrics.auc(lr_fpr, lr_tpr)
    print ("Logistic training accuracy:", lr.score(X_train, Y_train))
    print ("Logistic AUC:", lr_auc)
    ### Model prediction
    print(lr_y_score)
    lr_y_predict = lr.predict(X_test)
    print(lr.predict_proba(X_test))
    
    
    
    
    x_test_len = range(len(X_test))
    ## Plot 1: ROC curve
    plt.figure(figsize=(8, 6), facecolor='w')
    plt.plot(lr_fpr,lr_tpr,c='r',lw=2,label=u'Logistic, AUC=%.3f' % lr_auc)
    
    plt.plot((0,1),(0,1),c='#a0a0a0',lw=2,ls='--')
    plt.xlim(-0.01, 1.02) # x-axis limits
    plt.ylim(-0.01, 1.02) # y-axis limits
    plt.xticks(np.arange(0, 1.1, 0.1))
    plt.yticks(np.arange(0, 1.1, 0.1))
    
    
    plt.xlabel('False Positive Rate(FPR)', fontsize=16)
    plt.ylabel('True Positive Rate(TPR)', fontsize=16)
    plt.grid(b=True, ls=':')
    plt.legend(loc='lower right', fancybox=True, framealpha=0.8, fontsize=12)
    plt.title(u'ROC/AUC of Logistic regression on the iris data', fontsize=18)
    plt.show()
    
    # Plot 2: predicted vs. true classes (left commented out)
    # plt.figure(figsize=(12, 9), facecolor='w')
    # plt.ylim(0.5,3.5)
    # plt.plot(x_test_len, Y_test, 'ro',markersize = 6,
    #   zorder=3, label=u'True value')
    # plt.plot(x_test_len, lr_y_predict, 'go', markersize = 10, zorder=2,
    #   label=u'Logistic predictions, accuracy=%.3f' % lr.score(X_test, Y_test))
    # plt.legend(loc = 'lower right')
    # plt.xlabel(u'Sample index', fontsize=18)
    # plt.ylabel(u'Class', fontsize=18)
    # plt.title(u'Iris data classification', fontsize=20)
    # plt.show()
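
    The ROC computation above binarizes the three class labels with label_binarize and then flattens both the labels and the decision scores, which amounts to a micro-averaged multiclass ROC. As a minimal cross-check (not part of the original script, reusing y_test_hot and lr_y_score from above), sklearn's roc_auc_score gives the same micro-averaged AUC directly:

    # Micro-averaged AUC; should match the metrics.auc(lr_fpr, lr_tpr) value printed above
    micro_auc = metrics.roc_auc_score(y_test_hot.ravel(), lr_y_score.ravel())
    # Equivalent call on the unflattened (n_samples, 3) arrays
    micro_auc_2 = metrics.roc_auc_score(y_test_hot, lr_y_score, average='micro')
    print("micro-averaged AUC:", micro_auc, micro_auc_2)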
    

    Console output

    /anaconda3/envs/mlenvment/bin/python3.7 /Users/long/Desktop/ml_worksapce/MlGitHubCode/MlWorkSpacePrj/回归算法/回归算法/鸢尾花数据分类(分类问题).py
    /anaconda3/envs/mlenvment/lib/python3.7/site-packages/sklearn/externals/joblib/externals/cloudpickle/cloudpickle.py:47: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses
      import imp
    Total samples: 150; training samples: 90; features: 4; test samples: 60
    [[0 0 1]
     [0 1 0]
     [1 0 0]
     [0 0 1]
     [1 0 0]
     [0 0 1]
     [1 0 0]
     [0 1 0]
     [0 1 0]
     [0 1 0]
     [0 0 1]
     [0 1 0]
     [0 1 0]
     [0 1 0]
     [0 1 0]
     [1 0 0]
     [0 1 0]
     [0 1 0]
     [1 0 0]
     [1 0 0]
     [0 0 1]
     [0 1 0]
     [1 0 0]
     [1 0 0]
     [0 0 1]
     [1 0 0]
     [1 0 0]
     [0 1 0]
     [0 1 0]
     [1 0 0]
     [0 0 1]
     [0 1 0]
     [1 0 0]
     [0 0 1]
     [0 0 1]
     [0 1 0]
     [1 0 0]
     [0 1 0]
     [0 1 0]
     [0 1 0]
     [0 0 1]
     [1 0 0]
     [0 0 1]
     [1 0 0]
     [1 0 0]
     [0 1 0]
     [0 0 1]
     [0 0 1]
     [0 0 1]
     [0 0 1]
     [0 1 0]
     [0 0 1]
     [0 1 0]
     [0 1 0]
     [0 0 1]
     [0 0 1]
     [0 0 1]
     [0 0 1]
     [0 1 0]
     [0 0 1]]
    Logistic training accuracy: 0.9777777777777777
    Logistic AUC: 0.9269444444444445
    [[ -6.27937676   1.05548892   5.22388784]
     [ -2.5371109    4.02213826  -1.48502736]
     [  9.58561223   2.6059998  -12.19161203]
     [ -8.18346495   2.6530125    5.53045244]
     [  8.06522513   3.206342   -11.27156713]
     [ -7.22184333   0.48250241   6.73934092]
     [  8.3061655    2.99869891 -11.30486441]
     [ -3.63926189   2.64337134   0.99589054]
     [ -4.44558943   3.22131538   1.22427405]
     [ -1.96604656   2.94686917  -0.98082261]
     [ -4.96636124   2.69911013   2.26725111]
     [ -2.62637732   2.30702815   0.31934917]
     [ -2.60962466   2.90343809  -0.29381343]
     [ -4.0043684    2.89099073   1.11337767]
     [ -2.93851571   2.52286105   0.41565466]
     [  7.70822708   3.63929131 -11.34751839]
     [ -2.8118009    2.36157169   0.4502292 ]
     [ -1.8045935    2.73019664  -0.92560314]
     [  7.12845508   3.45997993 -10.58843501]
     [  8.46288289   2.68843117 -11.15131406]
     [ -4.49448454   1.43394608   3.06053846]
     [ -1.91130978   1.89262974   0.01868004]
     [  7.84043823   2.94114447 -10.7815827 ]
     [  7.78617371   3.37763355 -11.16380726]
     [ -4.70281021   2.21479818   2.48801203]
     [  9.91131131   2.72673147 -12.63804279]
     [  7.85422387   2.42150625 -10.27573012]
     [ -2.25629084   2.82839377  -0.57210293]
     [  0.25066183   3.21437436  -3.46503619]
     [  7.33003213   2.93259577 -10.2626279 ]
     [ -5.14664337   1.82293543   3.32370794]
     [ -1.59680356   1.72359445  -0.12679089]
     [  8.02219758   3.24450592 -11.2667035 ]
     [ -4.13742303   1.84449723   2.29292581]
     [ -7.2966421    1.69209766   5.60454444]
     [ -0.98760882   2.21626947  -1.22866065]
     [  7.48053662   3.10154607 -10.5820827 ]
     [ -4.46388278   2.35421825   2.10966453]
     [ -1.97859927   2.21830435  -0.23970508]
     [ -1.73705971   3.07991182  -1.3428521 ]
     [ -6.94768906   1.91731888   5.03037018]
     [  8.26367368   3.26261293 -11.52628661]
     [ -6.8945401    1.71742173   5.17711837]
     [  7.00114109   2.55201873  -9.55315982]
     [  8.2506487    3.15179846 -11.40244716]
     [ -0.83342074   3.55890736  -2.72548661]
     [ -5.57537515   2.03832445   3.53705071]
     [ -7.44638593   0.89269467   6.55369126]
     [ -5.386268     3.1574685    2.2287995 ]
     [ -7.65174457   2.78959492   4.86214965]
     [ -1.25902403   3.19064129  -1.93161727]
     [ -9.77227645   2.28402866   7.48824779]
     [ -2.76747696   1.91870518   0.84877178]
     [ -1.23842631   3.26103558  -2.02260927]
     [ -5.26636527   2.40614663   2.86021863]
     [ -4.36397858   2.60175264   1.76222594]
     [ -2.76830109   1.70565742   1.06264367]
     [ -6.24433075   2.51899599   3.72533476]
     [ -3.53935769   2.89090574   0.64845195]
     [ -5.77127503   1.86074046   3.91053457]]
    [[9.94308711e-06 1.52409814e-02 9.84749076e-01]
     [1.40923429e-03 9.94555266e-01 4.03549934e-03]
     [9.99070201e-01 9.29798504e-04 3.48231198e-10]
     [1.04795442e-06 5.32800658e-02 9.46718886e-01]
     [9.92300592e-01 7.69940419e-03 3.96991379e-09]
     [8.62784771e-07 1.91362710e-03 9.98085510e-01]
     [9.95069960e-01 4.93003683e-03 3.02615758e-09]
     [1.56435834e-03 8.37238489e-01 1.61197152e-01]
     [4.11954355e-04 8.80123366e-01 1.19464680e-01]
     [7.15748439e-03 9.73671826e-01 1.91706895e-02]
     [2.84121940e-04 6.06145229e-01 3.93570649e-01]
     [6.29421519e-03 8.73961623e-01 1.19744162e-01]
     [3.86037204e-03 9.57021938e-01 3.91176902e-02]
     [8.65322130e-04 8.54661679e-01 1.44472999e-01]
     [3.77297029e-03 8.88237645e-01 1.07989385e-01]
     [9.83191769e-01 1.68082260e-02 5.20994486e-09]
     [4.91131150e-03 8.66891307e-01 1.28197381e-01]
     [1.03506360e-02 9.64720201e-01 2.49291631e-02]
     [9.75119469e-01 2.48805118e-02 1.97110710e-08]
     [9.96903714e-01 3.09628334e-03 3.02214789e-09]
     [4.37277212e-04 1.64225862e-01 8.35336861e-01]
     [1.89512121e-02 8.50485546e-01 1.30563242e-01]
     [9.92603267e-01 7.39672472e-03 8.11586415e-09]
     [9.87973457e-01 1.20265376e-02 5.81933161e-09]
     [4.27698467e-04 4.31933453e-01 5.67638848e-01]
     [9.99242387e-01 7.57613088e-04 1.60920285e-10]
     [9.95647811e-01 4.35217610e-03 1.33157849e-08]
     [5.95532105e-03 9.61956971e-01 3.20877079e-02]
     [4.90338189e-02 9.49772774e-01 1.19340698e-03]
     [9.87840788e-01 1.21591890e-02 2.26095427e-08]
     [1.71351240e-04 1.82279096e-01 8.17549553e-01]
     [3.02840778e-02 8.38001677e-01 1.31714245e-01]
     [9.91654822e-01 8.34517412e-03 4.16195355e-09]
     [9.82713315e-04 3.89351453e-01 6.09665834e-01]
     [2.44617907e-06 1.95996503e-02 9.80397904e-01]
     [3.78591986e-02 9.32391009e-01 2.97497922e-02]
     [9.87617232e-01 1.23827534e-02 1.41283928e-08]
     [6.13063609e-04 5.60491705e-01 4.38895231e-01]
     [1.36665771e-02 9.08556294e-01 7.77771291e-02]
     [7.93188460e-03 9.80303477e-01 1.17646383e-02]
     [6.01310014e-06 4.25718447e-02 9.57422142e-01]
     [9.93314195e-01 6.68580269e-03 2.52589966e-09]
     [5.54497074e-06 3.04808279e-02 9.69513627e-01]
     [9.88446166e-01 1.15537705e-02 6.39015637e-08]
     [9.93933267e-01 6.06673050e-03 2.89818523e-09]
     [1.21982255e-02 9.85962770e-01 1.83900493e-03]
     [9.01386278e-05 1.82599114e-01 8.17310747e-01]
     [8.28581172e-07 3.46698330e-03 9.96532188e-01]
     [1.39586331e-04 7.16705121e-01 2.83155292e-01]
     [3.26435596e-06 1.11792749e-01 8.88203987e-01]
     [1.14799130e-02 9.82660931e-01 5.85915602e-03]
     [3.17300208e-08 5.46332632e-03 9.94536642e-01]
     [6.81960060e-03 7.39506484e-01 2.53673915e-01]
     [1.09379036e-02 9.84069025e-01 4.99307120e-03]
     [1.80743782e-04 3.88322849e-01 6.11496407e-01]
     [6.58594413e-04 6.97905585e-01 3.01435820e-01]
     [7.41788656e-03 6.50572468e-01 3.42009645e-01]
     [3.60169716e-05 2.30341207e-01 7.69622776e-01]
     [1.45514725e-03 9.02682174e-01 9.58626786e-02]
     [5.52862776e-05 1.14066880e-01 8.85877833e-01]]
    
    
    
    

    Results

    Figures: Roc_auc.png, 鸢尾花数据分类.jpg

    Key points:
    1 ROC curve (receiver operating characteristic): each point on the curve corresponds to one decision threshold and shows the classifier's trade-off at that threshold.
    x-axis: false positive rate (FPR), the fraction of all actual negatives that are predicted positive; equal to 1 - Specificity.
    y-axis: true positive rate (TPR), i.e. Sensitivity (positive-class coverage), the fraction of all actual positives that are predicted positive.

    For a binary classification problem, each instance is either positive or negative, and a prediction falls into one of four cases: true positive (TP), false negative (FN), false positive (FP), and true negative (TN).

    TP: positives correctly predicted as positive (hits)
    FN: positives predicted as negative (misses)
    FP: negatives predicted as positive (false alarms)
    TN: negatives correctly predicted as negative (correct rejections)

    True positive rate (TPR) = TP/(TP+FN): the fraction of all actual positives that are predicted positive
    False positive rate (FPR) = FP/(FP+TN): the fraction of all actual negatives that are predicted positive


    x-axis FPR = 1 - TNR = 1 - Specificity: the larger the FPR, the more actual negatives end up among the predicted positives.
    y-axis TPR = Sensitivity (positive-class coverage): the larger the TPR, the more actual positives are covered by the predicted positives.
    The ideal point is TPR = 1, FPR = 0, i.e. the point (0, 1). The closer the ROC curve bends toward (0, 1) and away from the 45-degree diagonal, the better; larger Sensitivity and Specificity mean a better classifier.
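
    As a quick illustration of these definitions (not part of the original scripts), the short sketch below computes TPR, FPR and Specificity from a confusion matrix; y_true and y_pred are made-up binary labels:

    import numpy as np
    from sklearn.metrics import confusion_matrix

    # Toy binary labels and hard predictions (hypothetical values, for illustration only)
    y_true = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0])
    y_pred = np.array([1, 1, 1, 0, 0, 0, 0, 0, 1, 1])

    # With labels=[0, 1], confusion_matrix returns [[TN, FP], [FN, TP]]
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    tpr = tp / (tp + fn)           # Sensitivity / recall: 3/4
    fpr = fp / (fp + tn)           # 1 - Specificity: 2/6
    specificity = tn / (tn + fp)   # 4/6
    print("TPR:", tpr, "FPR:", fpr, "Specificity:", specificity)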

    AUC (Area Under the Curve): the area under the ROC curve; the larger, the better.
    AUC can also be read as a probability: if you randomly pick one positive sample and one negative sample, AUC is the probability that the classifier's score ranks the positive sample above the negative one. The larger the AUC, the more likely the classifier ranks positives above negatives, i.e. the better it separates the two classes.
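
    A small sketch of that rank interpretation (hypothetical scores, not from the scripts above): counting, over all (positive, negative) pairs, how often the positive sample gets the higher score reproduces sklearn's roc_auc_score:

    import numpy as np
    from sklearn.metrics import roc_auc_score

    # Hypothetical classifier scores for 4 positive and 6 negative samples
    y_true  = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0])
    y_score = np.array([0.9, 0.8, 0.6, 0.3, 0.7, 0.4, 0.3, 0.2, 0.1, 0.05])

    pos = y_score[y_true == 1]
    neg = y_score[y_true == 0]
    # Fraction of (positive, negative) pairs where the positive sample scores higher;
    # ties count as half, matching the usual AUC definition
    pairwise = [(p > n) + 0.5 * (p == n) for p in pos for n in neg]
    auc_by_ranking = np.mean(pairwise)

    print(auc_by_ranking, roc_auc_score(y_true, y_score))  # the two values agree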

    2 Credit Approval Problem

    # -*- coding: utf-8 -*-
    # @Time    : 2018/12/10 下午3:28
    # @Author  : scl
    # @Email   : 1163820757@qq.com
    # @File    : 信贷审批(分类问题).py
    # @Software: PyCharm
    
    import numpy as np
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import pandas as pd
    import warnings
    
    import sklearn
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.exceptions import ConvergenceWarning
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.neighbors import KNeighborsClassifier #KNN
    from sklearn.preprocessing import label_binarize
    from sklearn import metrics
    
    ## Set a font that can render Chinese characters (avoids garbled labels)
    mpl.rcParams['font.sans-serif']=[u'simHei']
    mpl.rcParams['axes.unicode_minus']=False
    ## Suppress sklearn convergence warnings
    warnings.filterwarnings(action = 'ignore', category=ConvergenceWarning)
    
    
    # 1 Load the data
    path = "datas/crx.data"
    
    names = ['A1','A2','A3','A4','A5','A6','A7','A8',
             'A9','A10','A11','A12','A13','A14','A15','A16']
    
    df = pd.read_csv(path,header = None,names = names)
    print("数据条数",len(df))
    
     # 2 过滤异常数据
    df = df.replace("?", np.nan).dropna(how='any')
    print ("过滤后数据条数:", len(df))
    
    print("正常状态数据条数",len(df))
    print(df.head(10))
    
    
    # One-hot (dummy) encoding helper: turn the value v into a 0/1 list over the categories l
    def parse(v, l):
        # v: the string value to encode
        # l: the tuple of possible categories; v should be one of them
        return [1 if i == v else 0 for i in l]
    
    
    print(parse('b',('a', 'b')))
    print(df["A4"])
    
    
    def parseRecord(record):
        result = []
        a1 = record['A1']
        for i in parse(a1, ('a', 'b')):
            result.append(i)
    
        result.append(float(record['A2']))
        result.append(float(record['A3']))
    
    
        # One-hot encode A4: the original single column becomes four 0/1 columns in the DataFrame
        a4 = record['A4']
        for i in parse(a4, ('u', 'y', 'l', 't')):
            result.append(i)
    
        a5 = record['A5']
        for i in parse(a5, ('g', 'p', 'gg')):
            result.append(i)
    
        a6 = record['A6']
        for i in parse(a6, ('c', 'd', 'cc', 'i', 'j', 'k', 'm', 'r', 'q', 'w', 'x', 'e', 'aa', 'ff')):
            result.append(i)
    
        a7 = record['A7']
        for i in parse(a7, ('v', 'h', 'bb', 'j', 'n', 'z', 'dd', 'ff', 'o')):
            result.append(i)
    
        result.append(float(record['A8']))
    
        a9 = record['A9']
        for i in parse(a9, ('t', 'f')):
            result.append(i)
    
        a10 = record['A10']
        for i in parse(a10, ('t', 'f')):
            result.append(i)
    
        result.append(float(record['A11']))
    
        a12 = record['A12']
        for i in parse(a12, ('t', 'f')):
            result.append(i)
    
        a13 = record['A13']
        for i in parse(a13,('g', 'p', 's')):
            result.append(i)
    
        result.append(float(record['A14']))
        result.append(float(record['A15']))
    
        a16 = record['A16']
        if a16 == '+':
            result.append(1)
        else:
            result.append(0)
    
        return result
    
    ### Feature processing (convert every column to a numeric value)
    new_names =  ['A1_0', 'A1_1',
                  'A2','A3',
                  'A4_0','A4_1','A4_2','A4_3', # A4 is one-hot encoded: four columns represent the original single column
                  'A5_0', 'A5_1', 'A5_2',
                  'A6_0', 'A6_1', 'A6_2', 'A6_3', 'A6_4', 'A6_5', 'A6_6', 'A6_7', 'A6_8', 'A6_9', 'A6_10', 'A6_11', 'A6_12', 'A6_13',
                  'A7_0', 'A7_1', 'A7_2', 'A7_3', 'A7_4', 'A7_5', 'A7_6', 'A7_7', 'A7_8',
                  'A8',
                  'A9_0', 'A9_1' ,
                  'A10_0', 'A10_1',
                  'A11',
                  'A12_0', 'A12_1',
                  'A13_0', 'A13_1', 'A13_2',
                  'A14','A15','A16']
    
    datas = df.apply(lambda x: pd.Series(parseRecord(x), index = new_names), axis=1)
    
    print(datas.head(5))
    
    
    
    ## Split features and target
    X = datas[new_names[0:-1]]
    Y = datas[new_names[-1]]
    
    X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,train_size = 0.8, random_state=0)
    
    print(X_train.shape)
    print(X_test.shape)
    
    
    ## Standardize the features
    ss = StandardScaler()
    ## The scaler must be fitted on the training set only
    X_train = ss.fit_transform(X_train) ## fit the scaler on the training data and transform it
    X_test = ss.transform(X_test) ## transform the test data with the scaler fitted on the training data
    
    
    ## Model training
    lr = LogisticRegressionCV(Cs=np.logspace(-4,1,50), fit_intercept=True, penalty='l2', solver='lbfgs', tol=0.01, multi_class='ovr')
    lr.fit(X_train, Y_train)
    
    ## Logistic model metrics
    lr_r = lr.score(X_train, Y_train)
    print ("Logistic training accuracy:", lr_r)
    print ("Logistic fraction of zero coefficients: %.2f%%" % (np.mean(lr.coef_.ravel() == 0) * 100))
    print ("Logistic coefficients:",lr.coef_)
    print ("Logistic intercept:",lr.intercept_)
    
    # Prediction
    lr_y_predict = lr.predict(X_test)
    
    print(lr_y_predict)
    
    x_len = range(len(X_test))
    plt.figure(figsize=(14,7), facecolor='w')
    plt.ylim(-0.1,1.1)
    plt.plot(x_len, Y_test, 'ro',markersize = 6, zorder=3, label=u'True value')
    plt.plot(x_len, lr_y_predict, 'go', markersize = 10, zorder=2,
             label=u'Logistic predictions, accuracy=%.3f' % lr.score(X_test, Y_test))
    plt.legend(loc = 'center right')
    plt.xlabel(u'Sample index', fontsize=18)
    plt.ylabel(u'Approved (0 = rejected, 1 = approved)', fontsize=18)
    plt.title(u'Logistic regression for credit approval', fontsize=20)
    plt.show()
    
    Result figure: 信贷审批.png
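
    The hand-rolled parse/parseRecord encoding above can also be expressed with pandas. A minimal sketch of the same idea using pd.get_dummies (assuming the same datas/crx.data file and column names; the generated dummy column names differ from the new_names list used above):

    import numpy as np
    import pandas as pd

    names = ['A1','A2','A3','A4','A5','A6','A7','A8',
             'A9','A10','A11','A12','A13','A14','A15','A16']
    df = pd.read_csv("datas/crx.data", header=None, names=names)
    df = df.replace("?", np.nan).dropna(how='any')

    # One-hot encode all categorical columns in a single call; numeric columns pass through
    cat_cols = ['A1','A4','A5','A6','A7','A9','A10','A12','A13']
    encoded = pd.get_dummies(df, columns=cat_cols)

    # Map the target '+'/'-' to 1/0 and force the numeric columns to float
    encoded['A16'] = (df['A16'] == '+').astype(int)
    for col in ['A2','A3','A8','A11','A14','A15']:
        encoded[col] = encoded[col].astype(float)

    X = encoded.drop(columns=['A16'])
    Y = encoded['A16']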

    You are welcome to visit my machine learning GitHub repository: https://github.com/longsan1234567/mlFolder
