美文网首页
03:6大监督学习模型:毒蘑菇分类

03:6大监督学习模型:毒蘑菇分类

作者: Jachin111 | 来源:发表于2022-12-21 16:36 被阅读0次

    数据EDA

    # 导入数据
    import pandas as pd
    import numpy as np
    
    import plotly_express as px
    from matplotlib import pyplot as plt
    import seaborn as sns
    
    import warnings
    warnings.filterwarnings('ignore')
    
    data = pd.read_csv("mushrooms.csv")
    data.shape
    
    image.png
    data.columns
    
    image.png
    # 缺失值
    data.isnull().sum()
    
    image.png
    # 有无毒对比
    data["class"].value_counts()
    
    image.png

    可视化分析

    cap = data["cap-color"].value_counts().reset_index()
    cap.columns = ["color","number"]
    cap
    
    image.png
    fig = px.bar(cap,x="color",
                y="number",
                color="number",
                text="number",
                color_continuous_scale="rainbow")
    
    fig.show()
    
    image.png
    cap_class = data.groupby(["class","cap-color"]).size().reset_index()
    cap_class.columns = ["class","color","number"]
    cap_class.head()
    
    image.png
    fig = px.bar(cap_class,x="color",
                y="number",
                color="class",
                text="number",
                barmode="group")
    
    fig.show()
    
    image.png
    # 菌的气味
    odor = data["odor"].value_counts().reset_index()
    odor.columns = ["odor","number"]
    odor
    
    image.png
    fig = px.bar(odor,
                x="odor",
                y="number",
                color="number",
                text="number",
                color_continuous_scale="rainbow")
    
    fig.show()
    
    image.png
    odor_class = data.groupby(["class","odor"]).size().reset_index()
    odor_class.columns = ["class","odor","number"]
    odor_class.head()
    
    image.png
    fig = px.bar(odor_class,
                x="odor",
                y="number",
                color="class",
                text="number",
                barmode="group")
    
    fig.show()
    
    image.png

    特征工程

    # 特征转换
    data.head()
    
    image.png
    from sklearn.preprocessing import LabelEncoder
    
    # Encode every column as integer codes. A fresh encoder is fitted per
    # column and stored in `encoders`, because one shared encoder that is refit
    # on each column only remembers the mapping of the last column; with the
    # dict, codes can be mapped back via encoders[col].inverse_transform(...).
    encoders = {}
    
    for col in data.columns:
        labelencoder = LabelEncoder()
        data[col] = labelencoder.fit_transform(data[col])
        encoders[col] = labelencoder
        
    data.head()
    
    image.png
    data["stalk-color-above-ring"].unique()
    
    image.png
    data.groupby("class").size()
    
    image.png
    # 数据分布
    data["stalk-color-above-ring"].value_counts()
    
    image.png
    ax = sns.boxplot(x='class',
                    y='stalk-color-above-ring',
                    data=data)
    ax = sns.stripplot(x='class',
                      y='stalk-color-above-ring',
                      data=data,
                      jitter=True,
                      edgecolor="gray")
    plt.title("Class w.r.t stalkcolor above ring",fontsize=12)
    
    plt.show()
    
    image.png
    # Split features and label: columns 1..22 become the feature frame `x`,
    # column 0 becomes the label `y`.
    # NOTE(review): assumes column 0 is the "class" column — confirm against the CSV.
    x = data.iloc[:,1:23]
    y = data.iloc[:,0]
    
    # Standardize the features. The scaler is fitted on the full feature frame;
    # `X` is the scaled array, displayed as the cell output.
    from sklearn.preprocessing import StandardScaler
    
    scaler = StandardScaler()
    X = scaler.fit_transform(x)
    X
    
    image.png

    特征相关性

    corr = data.corr()
    sns.heatmap(corr)
    
    plt.show()
    
    image.png

    主成分分析PCA

    # PCA过程
    from sklearn.decomposition import PCA
    
    pca = PCA()
    pca.fit_transform(X)
    
    covariance = pca.get_covariance()
    
    explained_variance = pca.explained_variance_
    explained_variance
    
    image.png
    # Bar chart of the variance captured by each principal component.
    with plt.style.context("dark_background"):
        plt.figure(figsize=(6,4))
        
        # Take the bar positions from the data instead of a hard-coded 22 so
        # the plot keeps working if the number of components changes.
        plt.bar(range(len(explained_variance)),
               explained_variance,
               alpha=0.5,
               align="center",
               label="individual explained variance")
        
        # explained_variance holds pca.explained_variance_ (absolute variances,
        # not ratios), so the axis label must not claim a ratio.
        plt.ylabel('Explained variance')
        plt.xlabel('Principal components')
        plt.legend(loc="best")
        plt.tight_layout()
        # Render while the style context is still active.
        plt.show()
    
    image.png
    # Distribution of the label-encoded data projected onto 2 principal components
    # NOTE(review): data.values contains every column, including column 0 that
    # is used as the label `y`, and it is not the standardized matrix `X` —
    # confirm this is intended for the visualization.
    N = data.values
    pca = PCA(n_components=2)
    # NOTE: this rebinds `x` (previously the feature frame) to the 2-D projection.
    x = pca.fit_transform(N)
    
    plt.figure(figsize=(5,5))
    plt.scatter(x[:,0],x[:,1])
    
    plt.show()
    
    image.png
    from sklearn.cluster import KMeans
    
    # Two-cluster k-means on the full encoded table; the cluster ids are only
    # used to colour the 2-D PCA scatter.
    N = data.values
    km = KMeans(n_clusters=2, random_state=5)
    X_clustered = km.fit_predict(N)
    
    # Cluster id -> matplotlib colour code.
    label_color_map = {0: "g", 1: "y"}
    label_color = [label_color_map[cluster_id] for cluster_id in X_clustered]
    
    plt.figure(figsize=(5, 5))
    plt.scatter(x[:, 0], x[:, 1], c=label_color)
    
    plt.show()
    
    image.png
    # Modelling on 17 principal components.
    # The projection has to be kept: discarding the fit_transform result would
    # leave X as the full standardized feature matrix, and every model below
    # would be trained on all original features instead of the 17 components.
    pca_modified = PCA(n_components=17)
    X = pca_modified.fit_transform(X)
    X
    
    image.png
    from sklearn.model_selection import train_test_split
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=4)
    

    模型1:逻辑回归

    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_score
    from sklearn import metrics
    
    model_LR = LogisticRegression()
    model_LR.fit(X_train,y_train)
    
    image.png
    y_prob = model_LR.predict_proba(X_test)[:,1]
    y_prob
    
    image.png
    y_pred = np.where(y_prob>0.5,1,0)
    y_pred
    
    image.png
    # Accuracy must be measured against the true labels; scoring against
    # y_pred only compares the model with its own predictions.
    model_LR.score(X_test,y_test)
    
    image.png
    confusion_matrix = metrics.confusion_matrix(y_test,y_pred)
    confusion_matrix
    
    image.png
    # ROC AUC is a ranking metric: feed it the positive-class probabilities,
    # not the thresholded 0/1 predictions, so it agrees with the ROC curve below.
    auc_roc = metrics.roc_auc_score(y_test,y_prob)
    auc_roc
    
    image.png
    # 真假阳性
    from sklearn.metrics import roc_curve,auc
    
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test,y_prob)
    roc_auc = auc(false_positive_rate,true_positive_rate)
    roc_auc
    
    image.png
    # ROC曲线
    import matplotlib.pyplot as plt
    
    plt.figure(figsize=(10,10))
    plt.title("ROC")
    plt.plot(false_positive_rate,true_positive_rate,color="red",label="AUC=%0.2f"%roc_auc)
    
    plt.legend(loc="lower right")
    plt.plot([0,1],[0,1],linestyle="--")
    plt.axis("tight")
    
    plt.ylabel("True Positive Rate")
    plt.xlabel("False Positive Rate")
    
    plt.show()
    
    image.png
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_score
    from sklearn import metrics
    
    # The grid contains both 'l1' and 'l2' penalties. The default lbfgs solver
    # does not support 'l1', so those candidates would fail to fit; liblinear
    # supports both penalties.
    LR_model = LogisticRegression(solver="liblinear")
    tuned_parameters = {"C":[0.001,0.01,0.1,1,10,100,1000],
                       "penalty":['l1','l2']}
    
    from sklearn.model_selection import GridSearchCV
    
    # Exhaustive 10-fold cross-validated search over the grid above.
    LR = GridSearchCV(LR_model,tuned_parameters,cv=10)
    LR.fit(X_train,y_train)
    
    print(LR.best_params_)
    
    image.png
    # Positive-class probabilities and the 0.5-thresholded predictions of the
    # tuned model.
    y_prob = LR.predict_proba(X_test)[:,1]
    y_pred = np.where(y_prob>0.5,1,0)
    # Score against the true labels; scoring against y_pred would only compare
    # the model with its own output.
    LR.score(X_test,y_test)
    
    image.png
    confusion_matrix = metrics.confusion_matrix(y_test,y_pred)
    confusion_matrix
    
    image.png
    auc_roc = metrics.classification_report(y_test,y_pred)
    print(auc_roc)
    
    image.png
    # Use the predicted probabilities rather than the hard 0/1 labels so the
    # AUC reflects the full ranking of the test samples.
    auc_roc = metrics.roc_auc_score(y_test,y_prob)
    auc_roc
    
    image.png
    # ROC curve of the tuned logistic regression
    from sklearn.metrics import roc_curve,auc
    
    false_positive_rate,true_positive_rate,thresholds = roc_curve(y_test,y_prob)
    # Recompute the AUC for this curve; otherwise the legend would show the
    # roc_auc value left over from the earlier, untuned model.
    roc_auc = auc(false_positive_rate,true_positive_rate)
    
    import matplotlib.pyplot as plt
    
    plt.figure(figsize=(10,10))
    plt.title("ROC")
    plt.plot(false_positive_rate,true_positive_rate,color="red",label="AUC = %0.2f"%roc_auc)
    
    plt.legend(loc="lower right")
    # Diagonal = performance of a random classifier.
    plt.plot([0,1],[0,1],linestyle="--")
    plt.axis("tight")
    plt.ylabel("True Positive Rate")
    plt.xlabel("False Positive Rate")
    
    plt.show()
    
    image.png

    模型2:高斯朴素贝叶斯

    from sklearn.naive_bayes import GaussianNB
    
    model_naive = GaussianNB()
    model_naive.fit(X_train,y_train)
    
    # Positive-class probabilities and the 0.5-thresholded predictions.
    y_prob = model_naive.predict_proba(X_test)[:,1]
    y_pred = np.where(y_prob>0.5,1,0)
    # Accuracy against the true labels (not against the model's own predictions).
    model_naive.score(X_test,y_test)
    
    image.png
    print(f"Number of mislabeled points from {X_test.shape[0]} points:{(y_test!=y_pred).sum()}")
    
    image.png
    # 交叉验证
    scores = cross_val_score(model_naive,X,y,cv=10,scoring="accuracy")
    scores
    
    image.png
    scores.mean()
    
    image.png
    # 混淆矩阵和AUC
    confusion_matrix = metrics.confusion_matrix(y_test,y_pred)
    confusion_matrix
    
    image.png
    auc_roc = metrics.classification_report(y_test,y_pred)
    print(auc_roc)
    
    image.png
    # 真假阳性
    from sklearn.metrics import roc_curve,auc
    
    false_positive_rate,true_positive_rate,thresholds = roc_curve(y_test,y_prob)
    roc_auc = auc(false_positive_rate,true_positive_rate)
    roc_auc
    
    image.png
    # ROC曲线
    import matplotlib.pyplot as plt
    
    plt.figure(figsize=(10,10))
    plt.title("ROC")
    plt.plot(false_positive_rate,true_positive_rate,color="red",label="AUC = %0.2f"%roc_auc)
    
    plt.legend(loc="lower right")
    plt.plot([0,1],[0,1],linestyle='--')
    plt.axis("tight")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    
    plt.show()
    
    image.png

    模型3:支持向量机SVM

    # Modelling
    from sklearn.svm import SVC
    
    svm_model = SVC()
    # Two separate sub-grids. A single dict cannot hold 'C' / 'kernel' twice:
    # the later entry silently replaces the earlier one, which dropped the
    # linear kernel from the search. gamma is only listed for the rbf kernel,
    # where it is actually used.
    tuned_parameters = [
        {
            'kernel':['linear'],
            'C':[1,10,100,500,1000],
        },
        {
            'kernel':['rbf'],
            'C':[1,10,100,500,1000],
            'gamma':[1,0.1,0.01,0.001,0.0001],
        },
    ]
    
    # 随机网络搜索-RandomizedSearchCV
    from sklearn.model_selection import RandomizedSearchCV
    
    model_svm = RandomizedSearchCV(svm_model,
                                  tuned_parameters,
                                  cv=10,
                                  scoring="accuracy",
                                  n_iter=20)
    model_svm.fit(X_train,y_train)
    
    image.png
    print(model_svm.best_score_)
    
    image.png
    model_svm.best_params_
    
    image.png
    y_pred = model_svm.predict(X_test)
    metrics.accuracy_score(y_pred,y_test)
    
    image.png
    # 混淆矩阵
    metrics.confusion_matrix(y_test,y_pred)
    
    image.png
    print(metrics.classification_report(y_test,y_pred))
    
    image.png
    # ROC AUC needs continuous scores; use the SVM decision values instead of
    # the hard 0/1 predictions.
    metrics.roc_auc_score(y_test,model_svm.decision_function(X_test))
    
    image.png
    # ROC curve
    from sklearn.metrics import roc_curve,auc
    
    # The SVC was created without probability estimates, so rank the test
    # samples by their decision values. Hard 0/1 predictions would collapse
    # the curve to a single operating point.
    y_score = model_svm.decision_function(X_test)
    false_positive_rate,true_positive_rate,thresholds = roc_curve(y_test,y_score)
    roc_auc = auc(false_positive_rate,true_positive_rate)
    
    import matplotlib.pyplot as plt
    
    plt.figure(figsize=(10,10))
    plt.title("ROC")
    plt.plot(false_positive_rate,true_positive_rate,color="red",label="AUC = %0.2f"%roc_auc)
    
    plt.legend(loc="lower right")
    # Diagonal = performance of a random classifier.
    plt.plot([0,1],[0,1],linestyle="--")
    plt.axis("tight")
    plt.ylabel("True Positive Rate")
    plt.xlabel("False Positive Rate")
    
    plt.show()
    
    image.png

    模型4:随机森林

    # 建模拟合
    from sklearn.ensemble import RandomForestClassifier
    
    model_RR = RandomForestClassifier()
    model_RR.fit(X_train,y_train)
    
    image.png
    # 预测得分
    y_prob = model_RR.predict_proba(X_test)[:,1]
    y_prob
    
    image.png
    y_pred = np.where(y_prob>0.5,1,0)
    # Accuracy against the true labels; scoring against y_pred would only
    # compare the forest with its own predictions.
    model_RR.score(X_test,y_test)
    
    image.png
    # 混淆矩阵
    metrics.confusion_matrix(y_test,y_pred)
    
    image.png
    print(metrics.classification_report(y_test,y_pred))
    
    image.png
    # Use the predicted probabilities rather than hard labels so the value
    # matches the AUC of the ROC curve drawn from y_prob.
    metrics.roc_auc_score(y_test,y_prob)
    
    image.png
    # ROC曲线
    from sklearn.metrics import roc_curve,auc
    
    false_positive_rate,true_positive_rate,thresholds = roc_curve(y_test,y_prob)
    roc_auc = auc(false_positive_rate,true_positive_rate)
    roc_auc
    
    import matplotlib.pyplot as plt
    
    plt.figure(figsize=(10,10))
    plt.title("ROC")
    plt.plot(false_positive_rate,true_positive_rate,color="red",label="AUC = %0.2f"%roc_auc)
    
    plt.legend(loc="lower right")
    plt.plot([0,1],[0,1],linestyle="--")
    plt.axis("tight")
    plt.ylabel("True Positive Rate")
    plt.xlabel("False Positive Rate")
    
    plt.show()
    
    image.png

    模型5:决策树(CART)

    # 建模
    from sklearn.tree import DecisionTreeClassifier
    
    # Fit a decision tree with default settings.
    model_tree = DecisionTreeClassifier()
    model_tree.fit(X_train,y_train)
    # Positive-class probabilities and the 0.5-thresholded predictions.
    y_prob = model_tree.predict_proba(X_test)[:,1]
    y_pred = np.where(y_prob>0.5,1,0)
    # Accuracy against the true labels (not against the model's own predictions).
    model_tree.score(X_test,y_test)
    
    image.png
    # 混淆矩阵
    metrics.confusion_matrix(y_test,y_pred)
    
    image.png
    print(metrics.classification_report(y_test,y_pred))
    
    image.png
    # Use the predicted probabilities rather than hard labels so the value
    # matches the AUC of the ROC curve drawn from y_prob.
    metrics.roc_auc_score(y_test,y_prob)
    
    image.png
    # ROC曲线
    from sklearn.metrics import roc_curve,auc
    
    false_positive_rate,true_positive_rate,thresholds = roc_curve(y_test,y_prob)
    roc_auc = auc(false_positive_rate,true_positive_rate)
    roc_auc
    
    import matplotlib.pyplot as plt
    
    plt.figure(figsize=(10,10))
    plt.title("ROC")
    plt.plot(false_positive_rate,true_positive_rate,color="red",label="AUC = %0.2f"%roc_auc)
    
    plt.legend(loc="lower right")
    plt.plot([0,1],[0,1],linestyle="--")
    plt.axis("tight")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    
    plt.show()
    
    image.png

    模型6:神经网络ANN

    # 建模
    from sklearn.neural_network import MLPClassifier
    
    mlp = MLPClassifier()
    mlp.fit(X_train,y_train)
    
    image.png
    # Positive-class probabilities and the 0.5-thresholded predictions.
    y_prob = mlp.predict_proba(X_test)[:,1]
    y_pred = np.where(y_prob>0.5,1,0)
    # Accuracy against the true labels (not against the model's own predictions).
    mlp.score(X_test,y_test)
    
    image.png
    # 混淆矩阵
    metrics.confusion_matrix(y_test,y_pred)
    
    image.png
    print(metrics.classification_report(y_test,y_pred))
    
    image.png
    # Use the predicted probabilities rather than hard labels so the value
    # matches the AUC of the ROC curve drawn from y_prob.
    metrics.roc_auc_score(y_test,y_prob)
    
    image.png
    # ROC曲线
    from sklearn.metrics import roc_curve,auc
    
    false_positive_rate,true_positive_rate,thresholds = roc_curve(y_test,y_prob)
    roc_auc = auc(false_positive_rate,true_positive_rate)
    roc_auc
    
    import matplotlib.pyplot as plt
    
    plt.figure(figsize=(10,10))
    plt.title("ROC")
    plt.plot(false_positive_rate,true_positive_rate,color="red",label="AUC = %0.2f"%roc_auc)
    
    plt.legend(loc="lower right")
    plt.plot([0,1],[0,1],linestyle="--")
    plt.axis("tight")
    plt.ylabel("True Positive Rate")
    plt.xlabel("False Positive Rate")
    
    plt.show()
    
    image.png
    # 网格搜索
    from sklearn.neural_network import MLPClassifier
    
    mlp_model = MLPClassifier()
    
    tuned_parameters = {'hidden_layer_sizes':range(1,200,10),
                       'activation':['tanh','logistic','relu'],
                       'alpha':[0.0001,0.001,0.01,0.1,1,10],
                       'max_iter':range(50,200,50)}
    model_mlp = RandomizedSearchCV(mlp_model,tuned_parameters,cv=10,scoring='accuracy',n_iter=5,n_jobs=-1,random_state=5)
    model_mlp.fit(X_train,y_train)
    
    image.png
    # 模型属性
    model_mlp.best_score_
    
    image.png
    model_mlp.best_params_
    
    image.png
    # Cross-validation details of the MLP search (this section is about
    # model_mlp, not the earlier SVM search).
    model_mlp.cv_results_
    
    image.png
    # ROC curve of the tuned MLP
    from sklearn.metrics import roc_curve,auc
    
    # Score the test set with the searched model; without this the curve would
    # be drawn from the y_prob left over from the earlier, untuned mlp.
    y_prob = model_mlp.predict_proba(X_test)[:,1]
    false_positive_rate,true_positive_rate,thresholds = roc_curve(y_test,y_prob)
    roc_auc = auc(false_positive_rate,true_positive_rate)
    roc_auc
    
    import matplotlib.pyplot as plt
    
    plt.figure(figsize=(10,10))
    plt.title("ROC")
    plt.plot(false_positive_rate,true_positive_rate,color="red",label="AUC = %0.2f"%roc_auc)
    
    plt.legend(loc="lower right")
    # Diagonal = performance of a random classifier.
    plt.plot([0,1],[0,1],linestyle="--")
    plt.axis("tight")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    
    plt.show()
    
    image.png

    相关文章

      网友评论

          本文标题:03:6大监督学习模型:毒蘑菇分类

          本文链接:https://www.haomeiwen.com/subject/zmzyqdtx.html