异常点检测
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Load the classification data set; the path is relative to the working directory.
raw_data = pd.read_csv("datas/data_class_raw.csv")
# Preview the first rows (the value is only shown when this is the last
# expression of a notebook cell).
raw_data.head()
image.png
# Separate the label column from the feature columns.
y = raw_data["y"]
x = raw_data.drop(columns="y")
# Individual feature columns, kept as Series for the scatter plots below.
x1, x2 = raw_data["x1"], raw_data["x2"]
image.png
# Outlier detection, fitted only on the samples labelled y == 0.
from sklearn.covariance import EllipticEnvelope

class0_samples = x[y == 0]
# contamination=0.02 is the proportion of outliers the model should expect.
ad_model = EllipticEnvelope(contamination=0.02).fit(class0_samples)
# One prediction per y == 0 sample; -1 marks the samples treated as outliers
# in the plot that follows.
bad_predict = ad_model.predict(class0_samples)
# Show both classes and highlight the detected outliers.
mask_bad = y == 0
mask_good = y == 1
mask_outlier = bad_predict == -1
bad_x1, bad_x2 = x1[mask_bad], x2[mask_bad]

plt.figure()
bad = plt.scatter(bad_x1, bad_x2)
good = plt.scatter(x1[mask_good], x2[mask_good])
# Outliers are a subset of the y == 0 samples, drawn as large crosses on top.
anoma = plt.scatter(
    bad_x1[mask_outlier],
    bad_x2[mask_outlier],
    marker="x",
    s=150,
)
plt.legend((bad, good, anoma), ("bad", "good", "anoma"))
plt.xlabel("x1")
plt.ylabel("x2")
plt.show()
image.png
去除异常点的数据pca处理
# PCA on the data set from which the outliers have been removed.
process_data = pd.read_csv("datas/data_class_processed.csv")
# Define features and label from the processed table. The earlier version
# rebuilt them from raw_data, so the processed file was loaded but never used.
x = process_data.drop("y", axis=1)
y = process_data["y"]
# Re-bind the plotting columns to the processed table so that boolean masks
# built from y (e.g. x1[y == 0]) stay index-aligned in later plots.
# NOTE(review): assumes the processed file has the same x1/x2/y columns as
# the raw file -- confirm against the CSV.
x1 = process_data["x1"]
x2 = process_data["x2"]
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Standardize the features before PCA.
x_stand = StandardScaler().fit_transform(x)
# Keep two components (presumably the same number as the input features), so
# this only inspects how the variance is distributed.
pca = PCA(n_components=2)
x_pca = pca.fit_transform(x_stand)
# Share of variance explained by each component. The original author recorded
# array([0.51664723, 0.48335277]) and concluded no dimensionality reduction is
# needed; that figure may have been computed before this fix -- re-check.
var_radio = pca.explained_variance_ratio_
数据分离，KNN 计算准确率
# Hold out 40% of the samples for testing; the seed is fixed for repeatability.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=4
)
# Fit a KNN classifier with n_neighbors=10 and measure accuracy on both splits.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
knn = KNeighborsClassifier(n_neighbors=10).fit(x_train, y_train)
y_train_predict, y_test_predict = knn.predict(x_train), knn.predict(x_test)
acc_train = accuracy_score(y_train, y_train_predict)
acc_test = accuracy_score(y_test, y_test_predict)
# Output recorded by the original author: 0.9047619047619048 0.5333333333333333
print(acc_train, acc_test)
# Visualize the classification boundary by predicting over a dense 2-D grid.
grid_axis = np.arange(0, 10, 0.05)
xx, yy = np.meshgrid(grid_axis, grid_axis)
# Flatten the grid into rows of (x1, x2) pairs, one row per grid point.
x_range = np.column_stack((xx.ravel(), yy.ravel()))
y_range = knn.predict(x_range)

grid_is_bad = y_range == 0
grid_is_good = y_range == 1

# Grid predictions are drawn first, then the actual samples on top of them.
plt.figure(figsize=(17, 8))
knn_bad = plt.scatter(x_range[grid_is_bad, 0], x_range[grid_is_bad, 1])
knn_good = plt.scatter(x_range[grid_is_good, 0], x_range[grid_is_good, 1])
bad = plt.scatter(x1[y == 0], x2[y == 0])
good = plt.scatter(x1[y == 1], x2[y == 1])
plt.legend(
    (bad, good, knn_bad, knn_good),
    ("bad", "good", "knn_bad", "knn_good"),
)
plt.xlabel("x1")
plt.ylabel("x2")
plt.show()
image.png
混淆矩阵
# Confusion matrix for the test split and the metrics derived from it.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_test_predict)
# Row-major order of the 2x2 matrix: [0, 0], [0, 1], [1, 0], [1, 1].
tn, fp, fn, tp = cm.ravel()

# Accuracy: (tp + tn) / (tp + tn + fp + fn)
accuracy = (tp + tn) / (tp + tn + fp + fn)
# Sensitivity (recall): tp / (tp + fn)
recall = tp / (tp + fn)
# Specificity: tn / (tn + fp)
spec = tn / (tn + fp)
# Precision: tp / (tp + fp)
precision = tp / (tp + fp)
# F1 score: 2 * precision * recall / (precision + recall)
f1_score = 2 * precision * recall / (precision + recall)
不同 n_neighbors 取值下的准确率
# Compare train/test accuracy of KNN for every neighbor count from 1 to 20.
neighbor_counts = range(1, 21)
acc_trains = []
acc_tests = []
for i in neighbor_counts:
    # Refit on the same split with a different n_neighbors each iteration.
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(x_train, y_train)
    y_train_predict = knn.predict(x_train)
    y_test_predict = knn.predict(x_test)
    acc_train = accuracy_score(y_train, y_train_predict)
    acc_test = accuracy_score(y_test, y_test_predict)
    acc_trains.append(acc_train)
    acc_tests.append(acc_test)
# Plot both accuracy curves against the neighbor count; the same range object
# drives the loop and the x axis so the two cannot drift apart.
plt.figure()
train = plt.plot(neighbor_counts, acc_trains, label="train")
test = plt.plot(neighbor_counts, acc_tests, label="test")
plt.xlabel("n_neighbors")
plt.ylabel("accuracy")
plt.legend()
plt.show()
image.png
网友评论