import pickle
import urllib
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import classification_report
import numpy as np
%matplotlib inline
# 数据集
from sklearn import datasets
无监督学习
# Generate two synthetic datasets for the clustering demos below:
#   dataset_1 - two concentric circles (non-convex clusters; hard for K-Means)
#   dataset_2 - four Gaussian blobs (convex clusters; easy for K-Means)
N_smaples = 2000  # NOTE(review): name has a typo ("samples"); kept for compatibility

# make_circles / make_blobs return an (X, y) tuple; we only keep the
# coordinate matrix X, which is already an ndarray — the original wrapped
# it in a redundant np.array(). random_state added so the circles dataset
# is reproducible (make_blobs already pinned it).
dataset_1 = datasets.make_circles(n_samples=N_smaples, noise=0.05,
                                  factor=0.3, random_state=0)[0]
dataset_2 = datasets.make_blobs(n_samples=N_smaples, centers=4,
                                cluster_std=0.4, random_state=0)[0]

# Visualise the raw (unlabelled) circles dataset.
plt.scatter(dataset_1[:, 0], dataset_1[:, 1], alpha=0.8, s=64, edgecolors='white')
plt.show()
output_3_0.png
# Visualise the raw blobs dataset (all points one colour — no cluster labels yet).
plt.scatter(dataset_2[:,0],dataset_2[:,1],c='blue',alpha=0.8,s=64,edgecolors='white')
<matplotlib.collections.PathCollection at 0x1a19aafc10>
output_4_1.png
# Cluster the concentric-circles data with K-Means (K = 2).
# K-Means assumes convex clusters, so it cannot separate the two rings —
# the resulting plot shows it simply splitting the plane in half.
K_dataset_1 = 2
km_1 = KMeans(n_clusters=K_dataset_1)
labels_1 = km_1.fit_predict(dataset_1)  # equivalent to .fit(dataset_1).labels_
print(labels_1)

# Points coloured by assigned cluster; centroids drawn larger with black edges.
plt.scatter(dataset_1[:, 0], dataset_1[:, 1], c=labels_1,
            alpha=0.8, s=64, edgecolors='white')
centers_1 = km_1.cluster_centers_
plt.scatter(centers_1[:, 0], centers_1[:, 1],
            c=np.unique(labels_1), alpha=0.8, s=200, edgecolors='black')
[1 0 1 ... 0 1 1]
<matplotlib.collections.PathCollection at 0x1a1ab66f10>
output_5_2.png
# Cluster the blobs data with K-Means (K = 4).
# Four well-separated convex blobs are the ideal case for K-Means.
K_dataset_2 = 4
km_2 = KMeans(n_clusters=K_dataset_2)
labels_2 = km_2.fit_predict(dataset_2)  # equivalent to .fit(dataset_2).labels_

# Points coloured by assigned cluster; centroids drawn larger with black edges.
plt.scatter(dataset_2[:, 0], dataset_2[:, 1], c=labels_2,
            alpha=0.8, s=64, edgecolors='white')
centers_2 = km_2.cluster_centers_
plt.scatter(centers_2[:, 0], centers_2[:, 1],
            c=np.unique(labels_2), alpha=0.8, s=200, edgecolors='black')
<matplotlib.collections.PathCollection at 0x1a1bdd74d0>
output_6_1.png
DBSCAN 算法
基于密度、能够处理噪声的聚类算法,在非凸形状的数据上要优于 K-Means 算法。DBSCAN 不仅可以做聚类,还可以做异常检测。
- 核心对象:若某个点的密度达到算法设定的阈值,该点就是核心点。也就是以某个点为圆心、以 eps 为半径画圆,如果圆内点的数量超过阈值 MinPts,这个点就被看作核心点。
- 在Kmeans 中我们需要创建模型前,预先设置好分类数目,而在 DBSCAN 中我们无需设置 K ,DBSCAN 会自己学出来。
- eps 设置邻域的距离
- 密度直接可达:如果某一个点 $p$ 在核心点 $c$ 的邻域内,我们就用 $p \leftarrow c$ 表示 $p$ 从 $c$ 直接密度可达
- 密度可达:这是 DBSCAN 算法中需要我们重点理解的概念。假设这里有 $k$ 个点 $p_1, p_2, \dots, p_k$
    - 当以 $p_1$ 为核心点时,$p_2$ 位于其邻域内,所以 $p_1$ 到 $p_2$ 是直接密度可达
    - 接下来当以 $p_2$ 为核心点时,$p_3$ 位于其邻域内,所以 $p_2$ 到 $p_3$ 是直接密度可达
    - 泛化后,当以 $p_i$ 为核心点时,$p_{i+1}$ 位于其邻域内,所以 $p_i$ 到 $p_{i+1}$ 是直接密度可达;此时称 $p_1$ 到 $p_k$ 是密度可达
- 边界点
- 噪音点
# DBSCAN on the circles data: with a neighbourhood radius of eps=0.25 the
# density-based algorithm recovers the two rings that K-Means could not.
adbs_1 = DBSCAN(eps=0.25)
labels_1 = adbs_1.fit_predict(dataset_1)  # equivalent to .fit(dataset_1).labels_
plt.scatter(dataset_1[:, 0], dataset_1[:, 1], c=labels_1,
            alpha=0.8, s=64, edgecolors='white')
<matplotlib.collections.PathCollection at 0x1a1a568810>
output_8_1.png
- $D$ 表示数据集
- $\epsilon$(即 eps)指定邻域半径
- $MinPts$ 表示成为核心点所需的邻域内最少点数
# DBSCAN on the blobs data. The number of clusters is not specified up front;
# DBSCAN infers it from density. Label -1 marks noise/outlier points, so the
# unique labels printed below are [-1, 0, 1, 2, 3] — four clusters plus noise.
adbs_2 = DBSCAN(eps=0.5)
labels_2 = adbs_2.fit_predict(dataset_2)  # equivalent to .fit(dataset_2).labels_
plt.scatter(dataset_2[:, 0], dataset_2[:, 1], c=labels_2,
            alpha=0.8, s=64, edgecolors='white')
np.unique(labels_2)
array([-1, 0, 1, 2, 3])
output_10_1.png
线性模型
# Linear regression on the Boston housing dataset.
# NOTE(review): load_boston is deprecated and was removed in scikit-learn 1.2
# (ethical concerns with the dataset); newer code should use e.g.
# datasets.fetch_california_housing() instead — confirm the installed version.
boston = datasets.load_boston()

# Hold out 20% of the samples for evaluation; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    boston.data, boston.target, test_size=0.2, random_state=0)

regr = LinearRegression()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

# Mean absolute error is an appropriate metric for a regression target.
print("MAE", mean_absolute_error(y_test, y_pred))

# BUG FIX: the original code then called classification_report(y_test, y_pred),
# which raises "ValueError: continuous is not supported" (see the traceback in
# the notebook output) because classification metrics cannot be applied to
# continuous regression targets. The call has been removed; use regression
# metrics (MAE / MSE / R^2) instead.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-35-c8b295710670> in <module>
----> 1 classification_report(y_test,y_pred)
/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py in classification_report(y_true, y_pred, labels, target_names, sample_weight, digits, output_dict)
1522 """
1523
-> 1524 y_type, y_true, y_pred = _check_targets(y_true, y_pred)
1525
1526 labels_given = True
/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py in _check_targets(y_true, y_pred)
86 # No metrics support "multiclass-multioutput" format
87 if (y_type not in ["binary", "multiclass", "multilabel-indicator"]):
---> 88 raise ValueError("{0} is not supported".format(y_type))
89
90 if y_type in ["binary", "multiclass"]:
ValueError: continuous is not supported
最后希望大家关注我们微信公众号
wechat.jpeg
网友评论