import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
# Print a summary of the raw table.
# NOTE(review): `bankdata` is not defined in the visible code; presumably a
# pandas DataFrame loaded earlier (e.g. via pd.read_csv) -- confirm.
bankdata.info()
# [screenshot: 微信截图_20201125002401.png]
# Correlation heatmap of the candidate features (target column excluded).
# Only numeric/bool columns are correlated explicitly: pandas >= 2.0 raises on
# non-numeric columns instead of silently skipping them, and some columns are
# label-encoded later in this script, so they are presumably non-numeric here.
feature_frame = bankdata.drop(['CHURN_CUST_IND'], axis=1)
c = feature_frame.select_dtypes(include=['number', 'bool']).corr().round(1)
_, heatmap_ax = plt.subplots(figsize=(20, 20))
sns.heatmap(c, annot=True, vmax=1, square=True, cmap="YlGnBu", ax=heatmap_ax)
plt.show()
# [screenshot: 微信截图_20201125002420.png]
# Columns removed before modelling.
# NOTE(review): presumably chosen from the previous heatmap as redundant
# (strongly correlated with a column that is kept) -- confirm.
dropped_columns = [
    'LOCAL_FIX_MON_AVG_BAL_PROP',
    'LOCAL_CUR_MON_AVG_BAL',
    'LOCAL_BELONEYR_FF_MON_AVG_BAL',
    'LOCAL_FIX_MON_AVG_BAL',
    'LOCAL_SAV_SLOPE',
    'LOCAL_SAV_CUR_ALL_BAL',
    'LOCAL_SAV_MON_AVG_BAL',
    'SAV_CUR_ALL_BAL',
    'SAV_MON_AVG_BAL',
    'ASSET_MON_AVG_BAL',
    'LOCAL_CUR_TRANS_TX_AMT',
    'ATM_ACCT_TX_NUM',
    'COUNTER_ALL_TX_NUM',
    'TELEBANK_ALL_TX_NUM',
]
bd = bankdata.drop(dropped_columns, axis=1)

# Re-plot the correlation matrix of what is left (the target column is still
# part of `bd` here).  Numeric/bool columns are selected explicitly because
# pandas >= 2.0 no longer skips non-numeric columns in DataFrame.corr().
c = bd.select_dtypes(include=['number', 'bool']).corr().round(1)
_, heatmap_ax = plt.subplots(figsize=(20, 20))
sns.heatmap(c, annot=True, vmax=1, square=True, cmap="YlGnBu", ax=heatmap_ax)
plt.show()
# [screenshot: 微信截图_20201125002430.png]
# Integer-encode the categorical columns.  The encoder is re-fitted for every
# column, so sharing one instance is safe as long as the fitted classes are
# not needed afterwards.
enc = preprocessing.LabelEncoder()
for cols in ['GENDER_CD', 'HASNT_HOME_ADDRESS_INF', 'HASNT_MOBILE_TEL_NUM_INF']:
    bd[cols] = enc.fit_transform(bd[cols])

# Feature matrix: drop the customer ID *and* the target.  Leaving
# CHURN_CUST_IND inside `x` would hand the label to the model as a feature.
x = bd.drop(['CUST_ID', 'CHURN_CUST_IND'], axis=1).values
y = bd['CHURN_CUST_IND']

# Oversampling
# `fit_resample` replaces `fit_sample`, which was removed from imbalanced-learn.
sm = SMOTE(random_state=42)
x1, y1 = sm.fit_resample(x, y)
# Tuning C: coarse sweep of the inverse regularisation strength.
# NOTE(review): x1/y1 are the SMOTE-resampled arrays, so the held-out split
# also contains synthetic samples and the test accuracy is likely optimistic.
l2 = []      # training accuracy per C value
l2test = []  # test accuracy per C value
xtrain, xtest, ytrain, ytest = train_test_split(x1, y1, test_size=0.3, random_state=420)
n1 = np.linspace(0.1, 10, 9)
for reg_c in n1:
    lr2 = LR(penalty='l2', solver='lbfgs', C=reg_c, max_iter=10000).fit(xtrain, ytrain)
    l2.append(accuracy_score(ytrain, lr2.predict(xtrain)))
    l2test.append(accuracy_score(ytest, lr2.predict(xtest)))

# Train (red) vs. test (green) accuracy as a function of C.
for scores, line_color, line_label in zip((l2, l2test), ('red', 'green'), ('l2', 'l2test')):
    plt.plot(n1, scores, color=line_color, label=line_label)
plt.legend(loc=4)
plt.show()
# [screenshot: 微信截图_20201125002504.png]
# Fine sweep of C over [3, 5].
# The split uses the SMOTE-resampled x1/y1, like the coarse sweep; splitting
# the raw x/y here would refine C on different data than the coarse sweep and
# leave the oversampled set unused by everything that follows.
l2 = []      # training accuracy per C value
l2test = []  # test accuracy per C value
xtrain, xtest, ytrain, ytest = train_test_split(x1, y1, test_size=0.3, random_state=420)
n2 = np.linspace(3, 5, 19)
for reg_c in n2:
    lr2 = LR(penalty='l2', solver='lbfgs', C=reg_c, max_iter=10000).fit(xtrain, ytrain)
    l2.append(accuracy_score(ytrain, lr2.predict(xtrain)))
    l2test.append(accuracy_score(ytest, lr2.predict(xtest)))

# Train (red) vs. test (green) accuracy as a function of C.
for scores, line_color, line_label in zip((l2, l2test), ('red', 'green'), ('l2', 'l2test')):
    plt.plot(n2, scores, color=line_color, label=line_label)
plt.legend(loc=4)
plt.show()
# [screenshot: 微信截图_20201125002513.png]
# Tuning max_iter
# C must be a scalar; the name `c` used here originally is bound to the
# correlation DataFrame.  Take the C with the best test accuracy from the fine
# C sweep (n2 / l2test) before those lists are reset; ties resolve to the
# smallest C because argmax returns the first maximum.
best_c = float(n2[int(np.argmax(l2test))])

# Coarse sweep of the solver iteration cap.
iter_grid_coarse = np.arange(1, 10000, 500)
l2 = []      # training accuracy per max_iter value
l2test = []  # test accuracy per max_iter value
for n_iter in iter_grid_coarse:
    lr2 = LR(penalty='l2', solver='lbfgs', C=best_c, max_iter=int(n_iter)).fit(xtrain, ytrain)
    l2.append(accuracy_score(ytrain, lr2.predict(xtrain)))
    l2test.append(accuracy_score(ytest, lr2.predict(xtest)))

for scores, line_color, line_label in zip((l2, l2test), ('red', 'green'), ('l2', 'l2test')):
    plt.plot(iter_grid_coarse, scores, color=line_color, label=line_label)
plt.legend(loc=4)
plt.show()
# [screenshot: 微信截图_20201125002521.png]

# Fine sweep of max_iter over [2500, 4000).
iter_grid_fine = np.arange(2500, 4000, 50)
l2 = []
l2test = []
for n_iter in iter_grid_fine:
    lr2 = LR(penalty='l2', solver='lbfgs', C=best_c, max_iter=int(n_iter)).fit(xtrain, ytrain)
    l2.append(accuracy_score(ytrain, lr2.predict(xtrain)))
    l2test.append(accuracy_score(ytest, lr2.predict(xtest)))

for scores, line_color, line_label in zip((l2, l2test), ('red', 'green'), ('l2', 'l2test')):
    plt.plot(iter_grid_fine, scores, color=line_color, label=line_label)
plt.legend(loc=4)
plt.show()
# [screenshot: 微信截图_20201125002529.png]

# Final model.  `max_iter` was referenced without ever being assigned; use the
# smallest value from the fine sweep that reaches the best test accuracy.
max_iter = int(iter_grid_fine[int(np.argmax(l2test))])
lrF = LR(penalty='l2', solver='lbfgs', C=best_c, max_iter=max_iter)
# Fit on the training split; fitting on the test split and then predicting on
# it would make the reported probabilities meaningless as an evaluation.
lrF = lrF.fit(xtrain, ytrain)
# Predict class probabilities on the test set (the bare expression is only
# echoed in an interactive session / notebook).
lrF.predict_proba(xtest)
# [screenshot: 微信截图_20201125003021.png]
# "网友评论" ("reader comments") -- leftover page text, not code