1 Iris Data Classification
# -*- coding: utf-8 -*-
# @Time : 2018/12/6 下午5:44
# @Author : scl
# @Email : 1163820757@qq.com
# @File : 鸢尾花数据分类(分类问题).py
# @Software: PyCharm
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import sklearn
from sklearn.linear_model import LogisticRegressionCV
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier #KNN
from sklearn.preprocessing import label_binarize
from sklearn import metrics
## Configure fonts so that Chinese characters render correctly
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False
## Suppress convergence warnings
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
path = "datas/iris.data"
names = ['sepal length', 'sepal width', 'petal length','petal width', 'cla']
df = pd.read_csv(path, header=None, names=names)
print(df['cla'].value_counts())
print(df.head())
# Map the class labels to numeric codes (1/2/3)
def parseRecord(record):
    result = []
    r = zip(names, record)
    for name, v in r:
        if name == 'cla':
            if v == 'Iris-setosa':
                result.append(1)
            elif v == 'Iris-versicolor':
                result.append(2)
            elif v == 'Iris-virginica':
                result.append(3)
            else:
                result.append(np.nan)
        else:
            result.append(float(v))
    return result
### Convert the data to numeric form
datas = df.apply(lambda r: pd.Series(parseRecord(r), index=names), axis=1)
### Drop rows containing invalid values
datas = datas.dropna(how='any')
### Split into features and target
X = datas[names[0:-1]]
Y = datas[names[-1]]
### Train/test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=0)
print("Total samples: %d; training samples: %d; features: %d; test samples: %d"
      % (len(X), len(X_train), X_train.shape[1], X_test.shape[0]))
# Standardize the features
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
lr = LogisticRegressionCV(Cs=np.logspace(-4, 1, 50),
                          cv=3, fit_intercept=True, penalty='l2', solver='lbfgs',
                          tol=0.01, multi_class='multinomial')
# solver: 'newton-cg', 'lbfgs', 'liblinear' or 'sag' ('sag' = stochastic average gradient)
# multi_class: 'ovr' (one-vs-rest) or 'multinomial' (softmax; not supported by 'liblinear')
lr.fit(X_train, Y_train)
## One-hot encode the true test labels (needed for the ROC computation)
y_test_hot = label_binarize(Y_test, classes=(1, 2, 3))
print(y_test_hot)
## Decision scores of the test samples (signed distances to the decision boundary)
lr_y_score = lr.decision_function(X_test)
## Compute the ROC curve (micro-averaged over the three classes via ravel)
lr_fpr, lr_tpr, lr_thresholds = metrics.roc_curve(y_test_hot.ravel(), lr_y_score.ravel())
# lr_thresholds: the score thresholds at which each (FPR, TPR) point is evaluated
## Compute the AUC
lr_auc = metrics.auc(lr_fpr, lr_tpr)
print("Logistic training accuracy:", lr.score(X_train, Y_train))
print("Logistic AUC (micro-averaged):", lr_auc)
### Model prediction
print(lr_y_score)
lr_y_predict = lr.predict(X_test)
print(lr.predict_proba(X_test))
x_test_len = range(len(X_test))
## Plot 1: ROC curve
plt.figure(figsize=(8, 6), facecolor='w')
plt.plot(lr_fpr, lr_tpr, c='r', lw=2, label=u'Logistic, AUC=%.3f' % lr_auc)
plt.plot((0, 1), (0, 1), c='#a0a0a0', lw=2, ls='--')
plt.xlim(-0.01, 1.02)  # x-axis limits
plt.ylim(-0.01, 1.02)  # y-axis limits
plt.xticks(np.arange(0, 1.1, 0.1))
plt.yticks(np.arange(0, 1.1, 0.1))
plt.xlabel('False Positive Rate (FPR)', fontsize=16)
plt.ylabel('True Positive Rate (TPR)', fontsize=16)
plt.grid(True, ls=':')
plt.legend(loc='lower right', fancybox=True, framealpha=0.8, fontsize=12)
plt.title(u'ROC/AUC of Logistic and KNN on the iris data', fontsize=18)
plt.show()
# Plot 2: predicted vs. true classes
# plt.figure(figsize=(12, 9), facecolor='w')
# plt.ylim(0.5, 3.5)
# plt.plot(x_test_len, Y_test, 'ro', markersize=6,
#          zorder=3, label=u'true class')
# plt.plot(x_test_len, lr_y_predict, 'go', markersize=10, zorder=2,
#          label=u'Logistic prediction, accuracy=%.3f' % lr.score(X_test, Y_test))
# plt.legend(loc='lower right')
# plt.xlabel(u'sample index', fontsize=18)
# plt.ylabel(u'class', fontsize=18)
# plt.title(u'Iris data classification', fontsize=20)
# plt.show()
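Note: KNeighborsClassifier is imported above and the plot title mentions KNN, but no KNN model is actually trained in this script. Below is a minimal sketch of how a KNN curve could be added to the same figure (an illustration only, reusing the standardized X_train/X_test and the one-hot y_test_hot from above; predict_proba serves as the score because KNN has no decision_function, and n_neighbors=5 is a hypothetical choice):
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, Y_train)
knn_y_score = knn.predict_proba(X_test)  # class probabilities used as ROC scores
knn_fpr, knn_tpr, _ = metrics.roc_curve(y_test_hot.ravel(), knn_y_score.ravel())
knn_auc = metrics.auc(knn_fpr, knn_tpr)
# plt.plot(knn_fpr, knn_tpr, c='g', lw=2, label=u'KNN, AUC=%.3f' % knn_auc)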
Console output
/anaconda3/envs/mlenvment/bin/python3.7 /Users/long/Desktop/ml_worksapce/MlGitHubCode/MlWorkSpacePrj/回归算法/回归算法/鸢尾花数据分类(分类问题).py
/anaconda3/envs/mlenvment/lib/python3.7/site-packages/sklearn/externals/joblib/externals/cloudpickle/cloudpickle.py:47: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses
import imp
Total samples: 150; training samples: 90; features: 4; test samples: 60
[[0 0 1]
[0 1 0]
[1 0 0]
[0 0 1]
[1 0 0]
[0 0 1]
[1 0 0]
[0 1 0]
[0 1 0]
[0 1 0]
[0 0 1]
[0 1 0]
[0 1 0]
[0 1 0]
[0 1 0]
[1 0 0]
[0 1 0]
[0 1 0]
[1 0 0]
[1 0 0]
[0 0 1]
[0 1 0]
[1 0 0]
[1 0 0]
[0 0 1]
[1 0 0]
[1 0 0]
[0 1 0]
[0 1 0]
[1 0 0]
[0 0 1]
[0 1 0]
[1 0 0]
[0 0 1]
[0 0 1]
[0 1 0]
[1 0 0]
[0 1 0]
[0 1 0]
[0 1 0]
[0 0 1]
[1 0 0]
[0 0 1]
[1 0 0]
[1 0 0]
[0 1 0]
[0 0 1]
[0 0 1]
[0 0 1]
[0 0 1]
[0 1 0]
[0 0 1]
[0 1 0]
[0 1 0]
[0 0 1]
[0 0 1]
[0 0 1]
[0 0 1]
[0 1 0]
[0 0 1]]
Logistic training accuracy: 0.9777777777777777
Logistic AUC (micro-averaged): 0.9269444444444445
[[ -6.27937676 1.05548892 5.22388784]
[ -2.5371109 4.02213826 -1.48502736]
[ 9.58561223 2.6059998 -12.19161203]
[ -8.18346495 2.6530125 5.53045244]
[ 8.06522513 3.206342 -11.27156713]
[ -7.22184333 0.48250241 6.73934092]
[ 8.3061655 2.99869891 -11.30486441]
[ -3.63926189 2.64337134 0.99589054]
[ -4.44558943 3.22131538 1.22427405]
[ -1.96604656 2.94686917 -0.98082261]
[ -4.96636124 2.69911013 2.26725111]
[ -2.62637732 2.30702815 0.31934917]
[ -2.60962466 2.90343809 -0.29381343]
[ -4.0043684 2.89099073 1.11337767]
[ -2.93851571 2.52286105 0.41565466]
[ 7.70822708 3.63929131 -11.34751839]
[ -2.8118009 2.36157169 0.4502292 ]
[ -1.8045935 2.73019664 -0.92560314]
[ 7.12845508 3.45997993 -10.58843501]
[ 8.46288289 2.68843117 -11.15131406]
[ -4.49448454 1.43394608 3.06053846]
[ -1.91130978 1.89262974 0.01868004]
[ 7.84043823 2.94114447 -10.7815827 ]
[ 7.78617371 3.37763355 -11.16380726]
[ -4.70281021 2.21479818 2.48801203]
[ 9.91131131 2.72673147 -12.63804279]
[ 7.85422387 2.42150625 -10.27573012]
[ -2.25629084 2.82839377 -0.57210293]
[ 0.25066183 3.21437436 -3.46503619]
[ 7.33003213 2.93259577 -10.2626279 ]
[ -5.14664337 1.82293543 3.32370794]
[ -1.59680356 1.72359445 -0.12679089]
[ 8.02219758 3.24450592 -11.2667035 ]
[ -4.13742303 1.84449723 2.29292581]
[ -7.2966421 1.69209766 5.60454444]
[ -0.98760882 2.21626947 -1.22866065]
[ 7.48053662 3.10154607 -10.5820827 ]
[ -4.46388278 2.35421825 2.10966453]
[ -1.97859927 2.21830435 -0.23970508]
[ -1.73705971 3.07991182 -1.3428521 ]
[ -6.94768906 1.91731888 5.03037018]
[ 8.26367368 3.26261293 -11.52628661]
[ -6.8945401 1.71742173 5.17711837]
[ 7.00114109 2.55201873 -9.55315982]
[ 8.2506487 3.15179846 -11.40244716]
[ -0.83342074 3.55890736 -2.72548661]
[ -5.57537515 2.03832445 3.53705071]
[ -7.44638593 0.89269467 6.55369126]
[ -5.386268 3.1574685 2.2287995 ]
[ -7.65174457 2.78959492 4.86214965]
[ -1.25902403 3.19064129 -1.93161727]
[ -9.77227645 2.28402866 7.48824779]
[ -2.76747696 1.91870518 0.84877178]
[ -1.23842631 3.26103558 -2.02260927]
[ -5.26636527 2.40614663 2.86021863]
[ -4.36397858 2.60175264 1.76222594]
[ -2.76830109 1.70565742 1.06264367]
[ -6.24433075 2.51899599 3.72533476]
[ -3.53935769 2.89090574 0.64845195]
[ -5.77127503 1.86074046 3.91053457]]
[[9.94308711e-06 1.52409814e-02 9.84749076e-01]
[1.40923429e-03 9.94555266e-01 4.03549934e-03]
[9.99070201e-01 9.29798504e-04 3.48231198e-10]
[1.04795442e-06 5.32800658e-02 9.46718886e-01]
[9.92300592e-01 7.69940419e-03 3.96991379e-09]
[8.62784771e-07 1.91362710e-03 9.98085510e-01]
[9.95069960e-01 4.93003683e-03 3.02615758e-09]
[1.56435834e-03 8.37238489e-01 1.61197152e-01]
[4.11954355e-04 8.80123366e-01 1.19464680e-01]
[7.15748439e-03 9.73671826e-01 1.91706895e-02]
[2.84121940e-04 6.06145229e-01 3.93570649e-01]
[6.29421519e-03 8.73961623e-01 1.19744162e-01]
[3.86037204e-03 9.57021938e-01 3.91176902e-02]
[8.65322130e-04 8.54661679e-01 1.44472999e-01]
[3.77297029e-03 8.88237645e-01 1.07989385e-01]
[9.83191769e-01 1.68082260e-02 5.20994486e-09]
[4.91131150e-03 8.66891307e-01 1.28197381e-01]
[1.03506360e-02 9.64720201e-01 2.49291631e-02]
[9.75119469e-01 2.48805118e-02 1.97110710e-08]
[9.96903714e-01 3.09628334e-03 3.02214789e-09]
[4.37277212e-04 1.64225862e-01 8.35336861e-01]
[1.89512121e-02 8.50485546e-01 1.30563242e-01]
[9.92603267e-01 7.39672472e-03 8.11586415e-09]
[9.87973457e-01 1.20265376e-02 5.81933161e-09]
[4.27698467e-04 4.31933453e-01 5.67638848e-01]
[9.99242387e-01 7.57613088e-04 1.60920285e-10]
[9.95647811e-01 4.35217610e-03 1.33157849e-08]
[5.95532105e-03 9.61956971e-01 3.20877079e-02]
[4.90338189e-02 9.49772774e-01 1.19340698e-03]
[9.87840788e-01 1.21591890e-02 2.26095427e-08]
[1.71351240e-04 1.82279096e-01 8.17549553e-01]
[3.02840778e-02 8.38001677e-01 1.31714245e-01]
[9.91654822e-01 8.34517412e-03 4.16195355e-09]
[9.82713315e-04 3.89351453e-01 6.09665834e-01]
[2.44617907e-06 1.95996503e-02 9.80397904e-01]
[3.78591986e-02 9.32391009e-01 2.97497922e-02]
[9.87617232e-01 1.23827534e-02 1.41283928e-08]
[6.13063609e-04 5.60491705e-01 4.38895231e-01]
[1.36665771e-02 9.08556294e-01 7.77771291e-02]
[7.93188460e-03 9.80303477e-01 1.17646383e-02]
[6.01310014e-06 4.25718447e-02 9.57422142e-01]
[9.93314195e-01 6.68580269e-03 2.52589966e-09]
[5.54497074e-06 3.04808279e-02 9.69513627e-01]
[9.88446166e-01 1.15537705e-02 6.39015637e-08]
[9.93933267e-01 6.06673050e-03 2.89818523e-09]
[1.21982255e-02 9.85962770e-01 1.83900493e-03]
[9.01386278e-05 1.82599114e-01 8.17310747e-01]
[8.28581172e-07 3.46698330e-03 9.96532188e-01]
[1.39586331e-04 7.16705121e-01 2.83155292e-01]
[3.26435596e-06 1.11792749e-01 8.88203987e-01]
[1.14799130e-02 9.82660931e-01 5.85915602e-03]
[3.17300208e-08 5.46332632e-03 9.94536642e-01]
[6.81960060e-03 7.39506484e-01 2.53673915e-01]
[1.09379036e-02 9.84069025e-01 4.99307120e-03]
[1.80743782e-04 3.88322849e-01 6.11496407e-01]
[6.58594413e-04 6.97905585e-01 3.01435820e-01]
[7.41788656e-03 6.50572468e-01 3.42009645e-01]
[3.60169716e-05 2.30341207e-01 7.69622776e-01]
[1.45514725e-03 9.02682174e-01 9.58626786e-02]
[5.52862776e-05 1.14066880e-01 8.85877833e-01]]
Result
(Figures: Roc_auc.png, 鸢尾花数据分类.jpg)
Knowledge points:
1 ROC curve: receiver operating characteristic. Each point on the ROC curve reflects the classifier's response to the same signal stimulus.
X-axis: false positive rate (FPR), i.e. 1 - specificity: the fraction of all actual negatives that are predicted positive.
Y-axis: true positive rate (TPR), i.e. sensitivity (positive coverage rate).
For a binary classification problem, where instances are split into a positive and a negative class, four outcomes are possible: true positive (TP), false negative (FN), false positive (FP), true negative (TN).
TP: correctly accepted positives
FN: missed positives
FP: false alarms
TN: correctly rejected negatives
True positive rate TPR = TP / (TP + FN): the fraction of all actual positives that are predicted positive.
False positive rate FPR = FP / (FP + TN): the fraction of all actual negatives that are predicted positive.
X-axis FPR = 1 - TNR = 1 - specificity; the larger the FPR, the more actual negatives end up among the predicted positives.
Y-axis TPR = sensitivity (positive coverage); the larger the TPR, the more actual positives end up among the predicted positives.
Ideal target: TPR = 1, FPR = 0, i.e. the point (0, 1). The closer the ROC curve gets to (0, 1) and the farther it is from the 45-degree diagonal, the better; higher sensitivity and specificity mean a better model.
AUC (Area Under Curve): the area under the ROC curve; the larger, the better.
AUC can also be read as a probability: if you randomly pick one positive sample and one negative sample, AUC is the probability that the classifier's score ranks the positive above the negative. The larger the AUC, the more likely the classifier ranks positives above negatives, i.e. the better it separates the two classes.
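As a small worked example of these definitions (a hypothetical illustration, separate from the iris script): with two positives, two negatives and the scores below, three of the 2 x 2 = 4 positive/negative pairs are ranked correctly, so the AUC is 3/4, matching the probabilistic interpretation above.
from sklearn import metrics
y_true = [1, 1, 0, 0]              # 2 positives, 2 negatives
y_score = [0.9, 0.4, 0.6, 0.2]     # one negative (0.6) outranks one positive (0.4)
fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score)
print(metrics.auc(fpr, tpr))                    # 0.75
print(metrics.roc_auc_score(y_true, y_score))   # same value, computed directly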
2 Credit Approval Problem
# -*- coding: utf-8 -*-
# @Time : 2018/12/10 下午3:28
# @Author : scl
# @Email : 1163820757@qq.com
# @File : 信贷审批(分类问题).py
# @Software: PyCharm
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import sklearn
from sklearn.linear_model import LogisticRegressionCV
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier #KNN
from sklearn.preprocessing import label_binarize
from sklearn import metrics
## Configure fonts so that Chinese characters render correctly
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False
## Suppress convergence warnings
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
# 1 Load the data
path = "datas/crx.data"
names = ['A1','A2','A3','A4','A5','A6','A7','A8',
'A9','A10','A11','A12','A13','A14','A15','A16']
df = pd.read_csv(path,header = None,names = names)
print("数据条数",len(df))
# 2 过滤异常数据
df = df.replace("?", np.nan).dropna(how='any')
print ("过滤后数据条数:", len(df))
print("正常状态数据条数",len(df))
print(df.head(10))
# A small helper for dummy (one-hot) encoding: turn the value v into a 0/1 list over the categories l
def parse(v, l):
    # v: the value to encode
    # l: the tuple of possible categories that v can take
    return [1 if i == v else 0 for i in l]
print(parse('b', ('a', 'b')))
print(df["A4"])
def parseRecord(record):
    result = []
    a1 = record['A1']
    for i in parse(a1, ('a', 'b')):
        result.append(i)
    result.append(float(record['A2']))
    result.append(float(record['A3']))
    # Dummy-encode A4: one original column becomes four 0/1 columns in the DataFrame
    a4 = record['A4']
    for i in parse(a4, ('u', 'y', 'l', 't')):
        result.append(i)
    a5 = record['A5']
    for i in parse(a5, ('g', 'p', 'gg')):
        result.append(i)
    a6 = record['A6']
    for i in parse(a6, ('c', 'd', 'cc', 'i', 'j', 'k', 'm', 'r', 'q', 'w', 'x', 'e', 'aa', 'ff')):
        result.append(i)
    a7 = record['A7']
    for i in parse(a7, ('v', 'h', 'bb', 'j', 'n', 'z', 'dd', 'ff', 'o')):
        result.append(i)
    result.append(float(record['A8']))
    a9 = record['A9']
    for i in parse(a9, ('t', 'f')):
        result.append(i)
    a10 = record['A10']
    for i in parse(a10, ('t', 'f')):
        result.append(i)
    result.append(float(record['A11']))
    a12 = record['A12']
    for i in parse(a12, ('t', 'f')):
        result.append(i)
    a13 = record['A13']
    for i in parse(a13, ('g', 'p', 's')):
        result.append(i)
    result.append(float(record['A14']))
    result.append(float(record['A15']))
    a16 = record['A16']
    if a16 == '+':
        result.append(1)
    else:
        result.append(0)
    return result
### Feature engineering (convert every column to numeric form)
new_names = ['A1_0', 'A1_1',
             'A2', 'A3',
             'A4_0', 'A4_1', 'A4_2', 'A4_3',  # A4 is dummy-encoded, so one column becomes four
             'A5_0', 'A5_1', 'A5_2',
             'A6_0', 'A6_1', 'A6_2', 'A6_3', 'A6_4', 'A6_5', 'A6_6', 'A6_7', 'A6_8', 'A6_9', 'A6_10', 'A6_11', 'A6_12', 'A6_13',
             'A7_0', 'A7_1', 'A7_2', 'A7_3', 'A7_4', 'A7_5', 'A7_6', 'A7_7', 'A7_8',
             'A8',
             'A9_0', 'A9_1',
             'A10_0', 'A10_1',
             'A11',
             'A12_0', 'A12_1',
             'A13_0', 'A13_1', 'A13_2',
             'A14', 'A15', 'A16']
datas = df.apply(lambda x: pd.Series(parseRecord(x), index = new_names), axis=1)
print(datas.head(5))
## Train/test split
X = datas[new_names[0:-1]]
Y = datas[new_names[-1]]
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,train_size = 0.8, random_state=0)
print(X_train.shape)
print(X_test.shape)
## Standardize the features
ss = StandardScaler()
## The scaler must be fitted on the training set only
X_train = ss.fit_transform(X_train)  # fit the scaler on the training data and transform it
X_test = ss.transform(X_test)        # transform the test data with the already-fitted scaler
## Model training
lr = LogisticRegressionCV(Cs=np.logspace(-4,1,50), fit_intercept=True, penalty='l2', solver='lbfgs', tol=0.01, multi_class='ovr')
lr.fit(X_train, Y_train)
## Report the Logistic model
lr_r = lr.score(X_train, Y_train)
print("Logistic training accuracy:", lr_r)
print("Fraction of zero coefficients (sparsity): %.2f%%" % (np.mean(lr.coef_.ravel() == 0) * 100))
print("Logistic coefficients:", lr.coef_)
print("Logistic intercept:", lr.intercept_)
# Prediction
lr_y_predict = lr.predict(X_test)
print(lr_y_predict)
x_len = range(len(X_test))
plt.figure(figsize=(14, 7), facecolor='w')
plt.ylim(-0.1, 1.1)
plt.plot(x_len, Y_test, 'ro', markersize=6, zorder=3, label=u'true value')
plt.plot(x_len, lr_y_predict, 'go', markersize=10, zorder=2,
         label=u'Logistic prediction, accuracy=%.3f' % lr.score(X_test, Y_test))
plt.legend(loc='center right')
plt.xlabel(u'sample index', fontsize=18)
plt.ylabel(u'approved or not (0 = rejected, 1 = approved)', fontsize=18)
plt.title(u'Logistic regression', fontsize=20)
plt.show()
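Since credit approval is a binary problem, the ROC/AUC evaluation from section 1 applies directly. A minimal sketch (an added illustration, assuming the fitted lr and the standardized X_test/Y_test from above; not part of the original run):
lr_y_score = lr.decision_function(X_test)                     # signed decision scores
fpr, tpr, thresholds = metrics.roc_curve(Y_test, lr_y_score)  # positive class is 1 (approved)
print("Credit model AUC:", metrics.auc(fpr, tpr))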
Result
(Figure: 信贷审批.png)
My machine learning code is available on GitHub: https://github.com/longsan1234567/mlFolder