数据EDA
# 导入数据
import pandas as pd
import numpy as np
import plotly_express as px
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Load the mushroom dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="mushrooms.csv")
data.shape
# [output: image.png]
data.columns
# [output: image.png]
# Missing values: number of nulls in each column
data.isna().sum()
# [output: image.png]
# Poisonous vs. non-poisonous comparison: row count per value of "class"
data.loc[:, "class"].value_counts()
image.png
可视化分析
# Frequency table of cap colours with columns ("color", "number").
cap = (
    data["cap-color"]
    .value_counts()
    .rename_axis("color")
    .reset_index(name="number")
)
cap
# [output: image.png]
# One bar per colour; the count drives both the bar colour and the text label.
cap_bar_options = dict(
    x="color",
    y="number",
    color="number",
    text="number",
    color_continuous_scale="rainbow",
)
fig = px.bar(cap, **cap_bar_options)
fig.show()
image.png
# Row counts for every (class, cap colour) pair, as columns
# ("class", "color", "number").
cap_class = (
    data.groupby(["class", "cap-color"])
    .size()
    .reset_index(name="number")
    .rename(columns={"cap-color": "color"})
)
cap_class.head()
# [output: image.png]
# Grouped bars: one bar per class within each colour.
cap_class_bar_options = dict(
    x="color",
    y="number",
    color="class",
    text="number",
    barmode="group",
)
fig = px.bar(cap_class, **cap_class_bar_options)
fig.show()
image.png
# Mushroom odor: frequency table with columns ("odor", "number").
odor = (
    data["odor"]
    .value_counts()
    .rename_axis("odor")
    .reset_index(name="number")
)
odor
# [output: image.png]
# One bar per odor value; the count drives both the bar colour and the label.
odor_bar_options = dict(
    x="odor",
    y="number",
    color="number",
    text="number",
    color_continuous_scale="rainbow",
)
fig = px.bar(odor, **odor_bar_options)
fig.show()
image.png
# Row counts for every (class, odor) pair, as columns
# ("class", "odor", "number").
odor_class = (
    data.groupby(["class", "odor"])
    .size()
    .reset_index(name="number")
)
odor_class.head()
# [output: image.png]
# Grouped bars: one bar per class within each odor value.
odor_class_bar_options = dict(
    x="odor",
    y="number",
    color="class",
    text="number",
    barmode="group",
)
fig = px.bar(odor_class, **odor_class_bar_options)
fig.show()
image.png
特征工程
# Feature transformation: replace every categorical column by integer codes.
data.head()
# [output: image.png]
from sklearn.preprocessing import LabelEncoder

# One fitted encoder is kept per column so that the integer codes can be
# mapped back to the original categories later (inverse_transform / classes_).
label_encoders = {}
for col in data.columns:
    # A fresh encoder per column; `labelencoder` ends up bound to the encoder
    # of the last column, as it did when a single instance was re-fitted.
    labelencoder = LabelEncoder()
    data[col] = labelencoder.fit_transform(data[col])
    label_encoders[col] = labelencoder
data.head()
image.png
# Inspect one encoded feature and how it relates to the class column.
stalk_col = "stalk-color-above-ring"
data[stalk_col].unique()
# [output: image.png]
data.groupby("class").size()
# [output: image.png]
# Data distribution
data[stalk_col].value_counts()
# [output: image.png]
# Box plot per class with the individual observations drawn on top of it.
class_vs_stalk = dict(x="class", y=stalk_col, data=data)
ax = sns.boxplot(**class_vs_stalk)
ax = sns.stripplot(jitter=True, edgecolor="gray", **class_vs_stalk)
plt.title("Class w.r.t stalkcolor above ring", fontsize=12)
plt.show()
image.png
# Separate features and label: column 0 is the label, columns 1..22 the features.
y = data.iloc[:, 0]
x = data.iloc[:, 1:23]
# Data standardisation of the feature columns.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit(x).transform(x)
X
image.png
特征相关性
# Pairwise correlation between all (encoded) columns, drawn as a heat map.
corr = data.corr()
sns.heatmap(data=corr)
plt.show()
image.png
主成分分析PCA
# PCA procedure: fit a full PCA on the standardised feature matrix X.
from sklearn.decomposition import PCA

pca = PCA()
pca.fit(X)  # only the fitted state is needed here, not the projected data
covariance = pca.get_covariance()
explained_variance = pca.explained_variance_
explained_variance
# [output: image.png]
# The y-axis is labelled as a ratio, so plot the ratios rather than the
# absolute variances; the number of bars follows the fitted components
# instead of a hard-coded 22.
explained_variance_ratio = pca.explained_variance_ratio_
with plt.style.context("dark_background"):
    plt.figure(figsize=(6, 4))
    plt.bar(
        range(len(explained_variance_ratio)),
        explained_variance_ratio,
        alpha=0.5,
        align="center",
        label="individual explained variance",
    )
    plt.ylabel('Explained variance ratio')
    plt.xlabel('Principal components')
    plt.legend(loc="best")
    plt.tight_layout()
    # Render inside the style context so the figure is actually displayed.
    plt.show()
image.png
# Distribution of the original data under 2 principal components.
# NOTE(review): `data` is used whole here, i.e. including column 0 that is
# split off as the label elsewhere in this file - confirm this is intended.
N = data.to_numpy()
pca = PCA(n_components=2)
x = pca.fit_transform(N)
plt.figure(figsize=(5, 5))
plt.scatter(x=x[:, 0], y=x[:, 1])
plt.show()
image.png
from sklearn.cluster import KMeans

# Cluster the raw table into two groups and colour the 2-D PCA scatter
# (coordinates in `x`) by cluster membership.
km = KMeans(n_clusters=2, random_state=5)
N = data.to_numpy()
X_clustered = km.fit_predict(N)
label_color_map = {0: "g", 1: "y"}
label_color = []
for cluster_id in X_clustered:
    label_color.append(label_color_map[cluster_id])
plt.figure(figsize=(5, 5))
plt.scatter(x=x[:, 0], y=x[:, 1], c=label_color)
plt.show()
image.png
# Modelling on 17 principal components.
pca_modified = PCA(n_components=17)
# Keep the projection: it was previously computed and thrown away, so the
# train/test split below silently used the unreduced matrix X.
# X itself is left untouched for code that still needs all features.
X_pca = pca_modified.fit_transform(X)
X_pca
# [output: image.png]
from sklearn.model_selection import train_test_split

# 80/20 split of the reduced features; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y, test_size=0.2, random_state=4
)
模型1:逻辑回归
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import metrics

# Baseline logistic regression with default hyper-parameters.
model_LR = LogisticRegression()
model_LR.fit(X_train, y_train)
# [output: image.png]
# Second column of predict_proba: probability of the class encoded as 1.
y_prob = model_LR.predict_proba(X_test)[:, 1]
y_prob
# [output: image.png]
# Hard labels at a 0.5 probability threshold.
y_pred = np.where(y_prob > 0.5, 1, 0)
y_pred
# [output: image.png]
# Accuracy must be measured against the true labels; scoring against y_pred
# compares the model with its own predictions and is always ~1.0.
model_LR.score(X_test, y_test)
# [output: image.png]
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
confusion_matrix
# [output: image.png]
# AUC is a ranking metric: feed it the scores, not the thresholded labels.
auc_roc = metrics.roc_auc_score(y_test, y_prob)
auc_roc
image.png
# True / false positive rates of the logistic-regression probabilities.
from sklearn.metrics import roc_curve, auc

roc_points = roc_curve(y_test, y_prob)
false_positive_rate, true_positive_rate, thresholds = roc_points
roc_auc = auc(x=false_positive_rate, y=true_positive_rate)
roc_auc
# [output: image.png]
# ROC curve
import matplotlib.pyplot as plt

auc_label = "AUC=%0.2f" % roc_auc
plt.figure(figsize=(10, 10))
plt.title("ROC")
plt.plot(false_positive_rate, true_positive_rate, color="red", label=auc_label)
plt.legend(loc="lower right")
# Dashed reference diagonal from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], linestyle="--")
plt.axis("tight")
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
plt.show()
image.png
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

# The grid contains both 'l1' and 'l2'. The default lbfgs solver does not
# support 'l1', so every l1 candidate would fail to fit; liblinear supports
# both penalties.
LR_model = LogisticRegression(solver="liblinear")
# Candidate regularisation strengths (C) and penalty types.
tuned_parameters = {
    "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "penalty": ['l1', 'l2'],
}
# Exhaustive search over the grid with 10-fold cross-validation.
LR = GridSearchCV(LR_model, tuned_parameters, cv=10)
LR.fit(X_train, y_train)
print(LR.best_params_)
image.png
# Evaluate the tuned logistic regression (best estimator of the grid search).
y_prob = LR.predict_proba(X_test)[:, 1]
# Hard labels at a 0.5 probability threshold.
y_pred = np.where(y_prob > 0.5, 1, 0)
# Score against the true labels; scoring against y_pred compares the model
# with its own predictions and is always ~1.0.
LR.score(X_test, y_test)
# [output: image.png]
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
confusion_matrix
# [output: image.png]
# Per-class precision / recall / F1 text report (the name auc_roc is reused
# and overwritten with the real AUC just below).
auc_roc = metrics.classification_report(y_test, y_pred)
print(auc_roc)
# [output: image.png]
# AUC is a ranking metric: feed it the scores, not the thresholded labels.
auc_roc = metrics.roc_auc_score(y_test, y_prob)
auc_roc
image.png
# ROC curve of the tuned logistic regression.
from sklearn.metrics import roc_curve, auc

false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_prob)
# Recompute the area for THIS curve; without it the legend would show the
# roc_auc left over from a previously evaluated model.
roc_auc = auc(false_positive_rate, true_positive_rate)
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 10))
plt.title("ROC")
plt.plot(
    false_positive_rate,
    true_positive_rate,
    color="red",
    label="AUC = %0.2f" % roc_auc,
)
plt.legend(loc="lower right")
# Dashed reference diagonal from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], linestyle="--")
plt.axis("tight")
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
plt.show()
image.png
模型2:高斯朴素贝叶斯
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings.
model_naive = GaussianNB()
model_naive.fit(X_train, y_train)
# Probability of the class encoded as 1, then hard labels at a 0.5 threshold.
y_prob = model_naive.predict_proba(X_test)[:, 1]
y_pred = np.where(y_prob > 0.5, 1, 0)
# Score against the true labels; scoring against y_pred compares the model
# with its own predictions and is always ~1.0.
model_naive.score(X_test, y_test)
# [output: image.png]
print(f"Number of mislabeled points from {X_test.shape[0]} points:{(y_test!=y_pred).sum()}")
# [output: image.png]
# Cross-validation: 10-fold accuracy on the full feature matrix X.
scores = cross_val_score(model_naive, X, y, cv=10, scoring="accuracy")
scores
# [output: image.png]
scores.mean()
# [output: image.png]
# Confusion matrix and classification report
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
confusion_matrix
# [output: image.png]
# NOTE: despite its name, auc_roc holds the text classification report here.
auc_roc = metrics.classification_report(y_test, y_pred)
print(auc_roc)
image.png
# True / false positive rates of the naive-Bayes probabilities.
from sklearn.metrics import roc_curve, auc

roc_points = roc_curve(y_test, y_prob)
false_positive_rate, true_positive_rate, thresholds = roc_points
roc_auc = auc(x=false_positive_rate, y=true_positive_rate)
roc_auc
# [output: image.png]
# ROC curve
import matplotlib.pyplot as plt

auc_label = "AUC = %0.2f" % roc_auc
plt.figure(figsize=(10, 10))
plt.title("ROC")
plt.plot(false_positive_rate, true_positive_rate, color="red", label=auc_label)
plt.legend(loc="lower right")
# Dashed reference diagonal from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], linestyle='--')
plt.axis("tight")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()
image.png
模型3:支持向量机SVM
# Model building: support vector classifier tuned by randomised search.
from sklearn.svm import SVC

svm_model = SVC()
# Two separate search spaces. They cannot live in one dict literal: repeated
# 'C' / 'kernel' keys silently overwrite the earlier entries, which left only
# the rbf grid and dropped the linear kernel from the search.
tuned_parameters = [
    {
        'C': [1, 10, 100, 500, 1000],
        'kernel': ['linear', 'rbf'],
    },
    {
        'C': [1, 10, 100, 500, 1000],
        'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
        'kernel': ['rbf'],
    },
]
# Randomised search - RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV

# 20 sampled candidates, each scored by 10-fold CV accuracy. random_state
# makes the sampled candidates reproducible (same seed as the MLP search).
model_svm = RandomizedSearchCV(
    svm_model,
    tuned_parameters,
    cv=10,
    scoring="accuracy",
    n_iter=20,
    random_state=5,
)
model_svm.fit(X_train, y_train)
image.png
# Best mean cross-validated accuracy and the parameters that achieved it.
print(model_svm.best_score_)
# [output: image.png]
model_svm.best_params_
# [output: image.png]
y_pred = model_svm.predict(X_test)
# sklearn metrics take (y_true, y_pred) in that order.
metrics.accuracy_score(y_test, y_pred)
# [output: image.png]
# Confusion matrix
metrics.confusion_matrix(y_test, y_pred)
# [output: image.png]
print(metrics.classification_report(y_test, y_pred))
# [output: image.png]
# SVC() is built without probability estimates, so use the signed distance to
# the decision boundary as the ranking score for AUC instead of hard labels.
y_score = model_svm.decision_function(X_test)
metrics.roc_auc_score(y_test, y_score)
image.png
# ROC curve of the tuned SVM.
from sklearn.metrics import roc_curve, auc

# A ROC curve needs a continuous score. Hard 0/1 predictions give a single
# operating point (a two-segment "curve"), so use the decision-function
# values of the refitted best estimator instead.
y_score = model_svm.decision_function(X_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(false_positive_rate, true_positive_rate)
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 10))
plt.title("ROC")
plt.plot(
    false_positive_rate,
    true_positive_rate,
    color="red",
    label="AUC = %0.2f" % roc_auc,
)
plt.legend(loc="lower right")
# Dashed reference diagonal from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], linestyle="--")
plt.axis("tight")
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
plt.show()
image.png
模型4:随机森林
# Model fitting: random forest with default hyper-parameters.
from sklearn.ensemble import RandomForestClassifier

model_RR = RandomForestClassifier()
model_RR.fit(X_train, y_train)
# [output: image.png]
# Prediction score: probability of the class encoded as 1.
y_prob = model_RR.predict_proba(X_test)[:, 1]
y_prob
# [output: image.png]
# Hard labels at a 0.5 probability threshold.
y_pred = np.where(y_prob > 0.5, 1, 0)
# Score against the true labels; scoring against y_pred compares the model
# with its own predictions and is always ~1.0.
model_RR.score(X_test, y_test)
# [output: image.png]
# Confusion matrix
metrics.confusion_matrix(y_test, y_pred)
# [output: image.png]
print(metrics.classification_report(y_test, y_pred))
# [output: image.png]
# AUC is a ranking metric: feed it the scores, not the thresholded labels.
metrics.roc_auc_score(y_test, y_prob)
image.png
# ROC curve of the random-forest probabilities.
from sklearn.metrics import roc_curve, auc

roc_points = roc_curve(y_test, y_prob)
false_positive_rate, true_positive_rate, thresholds = roc_points
roc_auc = auc(x=false_positive_rate, y=true_positive_rate)
roc_auc
import matplotlib.pyplot as plt

auc_label = "AUC = %0.2f" % roc_auc
plt.figure(figsize=(10, 10))
plt.title("ROC")
plt.plot(false_positive_rate, true_positive_rate, color="red", label=auc_label)
plt.legend(loc="lower right")
# Dashed reference diagonal from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], linestyle="--")
plt.axis("tight")
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
plt.show()
image.png
模型5:决策树(CART)
# Modelling: decision tree with default hyper-parameters.
from sklearn.tree import DecisionTreeClassifier

model_tree = DecisionTreeClassifier()
model_tree.fit(X_train, y_train)
# Probability of the class encoded as 1, then hard labels at a 0.5 threshold.
y_prob = model_tree.predict_proba(X_test)[:, 1]
y_pred = np.where(y_prob > 0.5, 1, 0)
# Score against the true labels; scoring against y_pred compares the model
# with its own predictions and is always ~1.0.
model_tree.score(X_test, y_test)
# [output: image.png]
# Confusion matrix
metrics.confusion_matrix(y_test, y_pred)
# [output: image.png]
print(metrics.classification_report(y_test, y_pred))
# [output: image.png]
# AUC is a ranking metric: feed it the scores, not the thresholded labels.
metrics.roc_auc_score(y_test, y_prob)
image.png
# ROC curve of the decision-tree probabilities.
from sklearn.metrics import roc_curve, auc

roc_points = roc_curve(y_test, y_prob)
false_positive_rate, true_positive_rate, thresholds = roc_points
roc_auc = auc(x=false_positive_rate, y=true_positive_rate)
roc_auc
import matplotlib.pyplot as plt

auc_label = "AUC = %0.2f" % roc_auc
plt.figure(figsize=(10, 10))
plt.title("ROC")
plt.plot(false_positive_rate, true_positive_rate, color="red", label=auc_label)
plt.legend(loc="lower right")
# Dashed reference diagonal from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], linestyle="--")
plt.axis("tight")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()
image.png
模型6:神经网络ANN
# Modelling: multi-layer perceptron with default hyper-parameters.
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier()
mlp.fit(X_train, y_train)
# [output: image.png]
# Probability of the class encoded as 1, then hard labels at a 0.5 threshold.
y_prob = mlp.predict_proba(X_test)[:, 1]
y_pred = np.where(y_prob > 0.5, 1, 0)
# Score against the true labels; scoring against y_pred compares the model
# with its own predictions and is always ~1.0.
mlp.score(X_test, y_test)
# [output: image.png]
# Confusion matrix
metrics.confusion_matrix(y_test, y_pred)
# [output: image.png]
print(metrics.classification_report(y_test, y_pred))
# [output: image.png]
# AUC is a ranking metric: feed it the scores, not the thresholded labels.
metrics.roc_auc_score(y_test, y_prob)
image.png
# ROC curve of the MLP probabilities.
from sklearn.metrics import roc_curve, auc

roc_points = roc_curve(y_test, y_prob)
false_positive_rate, true_positive_rate, thresholds = roc_points
roc_auc = auc(x=false_positive_rate, y=true_positive_rate)
roc_auc
import matplotlib.pyplot as plt

auc_label = "AUC = %0.2f" % roc_auc
plt.figure(figsize=(10, 10))
plt.title("ROC")
plt.plot(false_positive_rate, true_positive_rate, color="red", label=auc_label)
plt.legend(loc="lower right")
# Dashed reference diagonal from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], linestyle="--")
plt.axis("tight")
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
plt.show()
image.png
# Hyper-parameter search for the MLP (a randomised search, not a full grid).
from sklearn.neural_network import MLPClassifier

mlp_model = MLPClassifier()
# Candidate values the search samples from.
tuned_parameters = {
    'hidden_layer_sizes': range(1, 200, 10),
    'activation': ['tanh', 'logistic', 'relu'],
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
    'max_iter': range(50, 200, 50),
}
# 5 sampled candidates, 10-fold CV accuracy, all cores, fixed seed.
model_mlp = RandomizedSearchCV(
    mlp_model,
    tuned_parameters,
    cv=10,
    scoring='accuracy',
    n_iter=5,
    n_jobs=-1,
    random_state=5,
)
model_mlp.fit(X_train, y_train)
# [output: image.png]
# Model attributes
model_mlp.best_score_
# [output: image.png]
model_mlp.best_params_
# [output: image.png]
# Inspect the MLP search that was just run (the SVM search object was
# referenced here by mistake).
model_mlp.cv_results_
image.png
# ROC curve of the tuned MLP.
from sklearn.metrics import roc_curve, auc

# Score the test set with the tuned search result; otherwise y_prob still
# holds the probabilities of the previously evaluated, untuned model.
y_prob = model_mlp.predict_proba(X_test)[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 10))
plt.title("ROC")
plt.plot(
    false_positive_rate,
    true_positive_rate,
    color="red",
    label="AUC = %0.2f" % roc_auc,
)
plt.legend(loc="lower right")
# Dashed reference diagonal from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], linestyle="--")
plt.axis("tight")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()
image.png
网友评论