from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
(2)格式是["自定义操作名", 转化器/流水线, 操作的列]
from sklearn.pipeline import Pipeline
(2)格式是["自定义操作名", 转化器/流水线]
from sklearn.preprocessing import KBinsDiscretizer
df = pd.read_json("ershou_suz_20200319.txt", lines=True) # 读取数据
# 分箱之后全部onehot,采用等频分箱,输出稀疏矩阵
kbin = KBinsDiscretizer(n_bins=5, encode="onehot", strategy="quantile")
kbin.fit_transform(df["area"].values.reshape(-1 ,1)).toarray() # 输入要是2D array
#array([[0., 1., 0., 0., 0.],
# [0., 0., 0., 1., 0.],
# [1., 0., 0., 0., 0.],
# ...,
# [0., 0., 0., 0., 1.],
# [0., 0., 1., 0., 0.],
# [0., 1., 0., 0., 0.]])
# 分箱之后按从小到大的顺序做数值(ordinal)编码,采用等距分箱
kbin = KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="uniform")
df["area2"] = kbin.fit_transform(df["area"].values.reshape(-1 ,1))
# [1.],
# [0.],
# ...,
# [1.],
# [0.],
# [0.]])
# 分箱之后全部onehot,采用等频分箱,和encode="onehot"相比不需要toarray了
kbin = KBinsDiscretizer(n_bins=5, encode="onehot-dense", strategy="quantile")
kbin.fit_transform(df["area"].values.reshape(-1 ,1))
#array([[0., 1., 0., 0., 0.],
# [0., 0., 0., 1., 0.],
# [1., 0., 0., 0., 0.],
# ...,
# [0., 0., 0., 0., 1.],
# [0., 0., 1., 0., 0.],
# [0., 1., 0., 0., 0.]])
# 测试新数据
new = pd.DataFrame([[50], [80], [120], [240], [500]], columns=["area"])
kbin.transform(new["area"].values.reshape(-1 ,1))
# [0.],
# [1.],
# [2.],
# [4.]])
简单填充 SimpleImputer
from sklearn.impute import SimpleImputer
df = pd.DataFrame(
{"a": [1, 0, 0, 1, 1],
"b": [np.nan, 0, 0, 1, 2],
"c": [1, 0, 1, 0, 3],
"d": [0, 1, 0, 1, 4],
"label": ["a", "b", "c", "c", "?"]
# a b c d label
#0 1 NaN 1 0 a
#1 0 0.0 0 1 b
#2 0 0.0 1 0 c
#3 1 1.0 0 1 c
#4 1 2.0 3 4 ?
# 使用常数填充,此时需要指定fill_value
fillna = SimpleImputer(strategy='constant', fill_value=0)
df2 = fillna.fit_transform(df)
#array([[1, 0, 1, 0, 'a'],
# [0, 0.0, 0, 1, 'b'],
# [0, 0.0, 1, 0, 'c'],
# [1, 1.0, 0, 1, 'c'],
# [1, 2.0, 3, 4, '?']], dtype=object)
# 使用统计量填充,此时不需要fill_value,支持mean(默认), median, most_frequent
fillna = SimpleImputer(strategy='mean')
df2 = fillna.fit_transform(df["b"].values.reshape(-1, 1))
# [0. ],
# [0. ],
# [1. ],
# [2. ]])
# 可以指定缺失值missing_values,默认missing_values=np.nan,可以指定是数字,字符串,np.nan,None
# 使用众数
fillna = SimpleImputer(missing_values="?", strategy='most_frequent')
df2 = fillna.fit_transform(df["label"].values.reshape(-1, 1))
# ['b'],
# ['c'],
# ['c'],
# ['c']], dtype=object)
# 替换多个缺失值
df = pd.DataFrame(
{"a": [1, 0, 0, 1, 1],
"b": [np.nan, 0, 0, 1, 2],
"c": [1, 0, 1, 0, 3],
"d": [0, 1, 0, 1, 4],
"label": ["a", "b", np.nan, "c", "?"]
fillna1 = SimpleImputer(strategy='constant', fill_value="unknow")
fillna2 = SimpleImputer(missing_values="?", strategy='constant', fill_value="unknow")
fillna_pipeline = Pipeline([("fillna1", fillna1), ("fillna2", fillna2)])
df2 = fillna_pipeline.fit_transform(df["label"].values.reshape(-1, 1))
# ['b'],
# ['unknow'],
# ['c'],
# ['unknow']], dtype=object)
# 配合ColumnTransform指定填充列,和不同的填充的方式
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
cat_cols = ['a', 'b']
cin_cols = ['c', 'd']
df = pd.DataFrame(
{"a": [1, 0, 0, 1, 0],
"b": [np.nan, "a", "v", "a", "c"],
"c": [1, 0, np.nan, 0, 1],
"d": [0, 1, 0, 1, 0],
"label": [1, 0, 1, 0, 0]
fillna1 = SimpleImputer(strategy='most_frequent')
fillna2 = SimpleImputer(strategy='mean')
preprocessor = ColumnTransformer(
('cat', fillna1, cat_cols),
('num', fillna2, cin_cols)]) # 这个流水线可以继续封装模型
df3 = preprocessor.fit_transform(df)
#array([[1, 'a', 1.0, 0.0],
# [0, 'a', 0.0, 1.0],
# [0, 'v', 0.5, 0.0],
# [1, 'a', 0.0, 1.0],
# [0, 'c', 1.0, 0.0]], dtype=object)
# KNNImputer k近邻填充
# 找到最相似的n条样本,用这些样本在该特征上的非空取值的均值填充
# 只能填充数值特征
from sklearn.impute import KNNImputer
df = pd.DataFrame(
{"a": [1, 2, 0, 1, 0, 2, 5, 9, 103, np.nan, 102, 999, 5, 2],
"b": ["a", "v", "a", "c", "a", "v", "a", "c", "a", "c", "a", "v", "a", "c"],
"c": [10, 12, 98, 25, 103, 102, 89, 55, 43, -1, 67, 89, 133, 252]
# a b c
#0 1.0 a 10
#1 2.0 v 12
#2 0.0 a 98
#3 1.0 c 25
#4 0.0 a 103
#5 2.0 v 102
#6 5.0 a 89
#7 9.0 c 55
#8 103.0 a 43
#9 NaN c -1
#10 102.0 a 67
#11 999.0 v 89
#12 5.0 a 133
#13 2.0 c 252
knnfillna = KNNImputer(n_neighbors=2, weights="uniform")
# 会比较除了a特征外的其他特征的相似度,这个例子中只比较c特征
# np.nan所在行的c特征是-1,c上离-1最近的两个样本是10和12,它们对应的a分别是1和2,所以填充值是(1 + 2) / 2 = 1.5
knnfillna.fit_transform(df[["a", "c"]])
#array([[ 1. , 10. ],
# [ 2. , 12. ],
# [ 0. , 98. ],
# [ 1. , 25. ],
# [ 0. , 103. ],
# [ 2. , 102. ],
# [ 5. , 89. ],
# [ 9. , 55. ],
# [103. , 43. ],
# [ 1.5, -1. ],
# [102. , 67. ],
# [999. , 89. ],
# [ 5. , 133. ],
# [ 2. , 252. ]])
knnfillna.fit_transform(df[["a", "b"]])
# could not convert string to float: 'a'
# 报错,所以进入fit的特征必须全部是数值,否则无法计算欧式距离
transformers:("自定义操作名", 转化器/流水线, 操作的列)
# knn缺失值填充之后,整合离散变量onehot和连续变量分箱onehot
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
df = pd.DataFrame(
{"a": [1, 2, 0, 1, 0, 2, 5, 9, 103, np.nan, 102, 999, 5, 2],
"b": ["a", "v", "a", "c", "a", "v", "a", "c", "a", "c", "a", "v", "a", "c"],
"c": [10, 12, 98, 25, np.nan, 102, 89, 55, 43, -1, 67, 89, 133, 252]
continus_cols = ["a", "c"]
category_cols = ["b"]
# 连续变量处理单独是一个pipeline
num_pipeline = Pipeline([
('continus_cols_knnfillna', KNNImputer(n_neighbors=2, weights="uniform")),
('continus_cols_cut', KBinsDiscretizer(n_bins=3, encode="onehot", strategy="quantile"))
# ColumnTransformer 中的列不能重复指定,比如指定两次continus_cols,这样会重新调用continus_cols生成新的特征,操作断开了
preprocessor = ColumnTransformer(
('num_pipeline', num_pipeline, continus_cols),
('category_cols_onehot', OneHotEncoder(handle_unknown='ignore'), category_cols)
# 结果是先用knn填充缺失值,再各自分箱3段,加上onehot的3段,一个9个特征
#array([[1., 0., 0., 1., 0., 0., 1., 0., 0.],
# [0., 1., 0., 1., 0., 0., 0., 0., 1.],
# [1., 0., 0., 0., 0., 1., 1., 0., 0.],
# [1., 0., 0., 1., 0., 0., 0., 1., 0.],
# [1., 0., 0., 0., 1., 0., 1., 0., 0.],
# [0., 1., 0., 0., 0., 1., 0., 0., 1.],
# [0., 0., 1., 0., 0., 1., 1., 0., 0.],
# [0., 0., 1., 0., 1., 0., 0., 1., 0.],
# [0., 0., 1., 1., 0., 0., 1., 0., 0.],
# [1., 0., 0., 1., 0., 0., 0., 1., 0.],
# [0., 0., 1., 0., 1., 0., 1., 0., 0.],
# [0., 0., 1., 0., 0., 1., 0., 0., 1.],
# [0., 0., 1., 0., 0., 1., 1., 0., 0.],
# [0., 1., 0., 0., 0., 1., 0., 1., 0.]])
# 再一个例子
# 连续变量用均值填充+MinMaxScaler
# 离散变量onehot
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
num_pipeline = Pipeline([
('continus_cols_meanfillna', SimpleImputer(strategy='mean')),
('continus_cols_minmax', MinMaxScaler())
# ColumnTransformer 中的列不能重复指定,比如指定两次continus_cols,这样会重新调用continus_cols生成新的特征,操作断开了
preprocessor = ColumnTransformer(
('num_pipeline', num_pipeline, continus_cols),
('category_cols_onehot', OneHotEncoder(handle_unknown='ignore'), category_cols)
#array([[0.001001 , 0.04347826, 1. , 0. , 0. ],
# [0.002002 , 0.0513834 , 0. , 0. , 1. ],
# [0. , 0.39130435, 1. , 0. , 0. ],
# [0.001001 , 0.1027668 , 0. , 1. , 0. ],
# [0. , 0.30009121, 1. , 0. , 0. ],
# [0.002002 , 0.40711462, 0. , 0. , 1. ],
# [0.00500501, 0.35573123, 1. , 0. , 0. ],
# [0.00900901, 0.22134387, 0. , 1. , 0. ],
# [0.1031031 , 0.17391304, 1. , 0. , 0. ],
# [0.09478709, 0. , 0. , 1. , 0. ],
# [0.1021021 , 0.2687747 , 1. , 0. , 0. ],
# [1. , 0.35573123, 0. , 0. , 1. ],
# [0.00500501, 0.52964427, 1. , 0. , 0. ],
# [0.002002 , 1. , 0. , 1. , 0. ]])
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
df = pd.DataFrame(
{"a": [1, 2, 0, 1, 0, 2, 5, 9, 103, 22, 102, 999, 5, 2],
"b": ["a", "v", "a", "c", "a", np.nan, "a", "c", "a", "c", "a", "v", "a", "c"]
continus_cols = ["a"]
category_cols = ["b"]
cat_pipeline = Pipeline([
('cat_cols_fillna', SimpleImputer(strategy='constant', fill_value="unknown")),
('cat_cols_onehot', OneHotEncoder(handle_unknown='ignore'))
preprocessor = ColumnTransformer(
('cat_pipeline', cat_pipeline, category_cols),
('category_cols_onehot', SimpleImputer(strategy='constant', fill_value=0), continus_cols)
res = preprocessor.fit_transform(df)
ColumnTransformer只会对指定的列进行操作,没有指定的列默认会被丢弃、不进入结果(remainder='drop');如果有些列不需要任何处理但仍要保留,可以使用不带参数的FunctionTransformer()(恒等变换)原样返回这些列,或者设置remainder='passthrough'
preprocessor = ColumnTransformer(
("none", FunctionTransformer(), continus_cols),
("onehot", OneHotEncoder(handle_unknown='ignore'), category_cols)
train = preprocessor.fit_transform(df[continus_cols + category_cols])
# 自定义函数转化器 FunctionTransformer
from sklearn.preprocessing import FunctionTransformer
df = pd.DataFrame([[1, 2], [3, 2], [9, 2]], columns=["a", "b"])
# a b
#0 1 2
#1 3 2
#2 9 2
def myfunc(x):
    """Return ``x + 1``.

    Example function for wrapping in ``FunctionTransformer``. The addition
    is delegated to ``x`` itself, so any operand that supports ``+ 1`` is
    accepted; for the DataFrame in the surrounding example every cell is
    incremented.
    """
    return x + 1
# a b
#0 2 3
#1 4 3
#2 10 3
# 对Y做log(1+)变换
所有对数据的转化、评估操作最终被包在一个Pipeline中,通过steps=[...]来指定顺序;同样可以通过steps加索引拿到流水线中的某一步,比如从已经训练好的Pipeline中取出评估器XGBClassifier来绘制特征重要性图
clf = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", XGBClassifier())])
clf.fit(train, train["label"])
# 拿到xgb
model = clf.steps[1][1]
xgboost.plot_importance(model, max_num_features=10)
一个整体pipeline在线部署预测模型的demo,传入的数据必须还是dataframe格式,使用pandas的read_json得到数据源,模型pickle文件使用flask部署web server,采用restful接口部署模型,POST请求得到预测结果
import pickle
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score
continus_cols = ["shop_duration", "recent", "monetary", "max_amount", "items_count",
"valid_points_sum", "member_day", "frequence", "avg_amount", "item_count_turn",
"avg_piece_amount", "monetary3","max_amount3", "items_count3",
"frequence3", "shops_count", "promote_percent", "wxapp_diff", "store_diff",
category_cols = ["CHANNEL_NUM_ID", "shop_channel", "infant_group", "water_product_group",
"meat_group", "beauty_group", "health_group", "fruits_group", "vegetables_group",
"pets_group", "snacks_group", "smoke_group", "milk_group", "instant_group",
if __name__ == "__main__":
    # Train, persist, reload and evaluate the churn pipeline end to end.
    df = pd.read_csv("./data/churn.csv")
    train, test = train_test_split(df, test_size=0.2)
    feature_cols = continus_cols + category_cols

    # Assemble preprocessing: constant-0 imputation for the continuous
    # columns, one-hot encoding for the categorical columns.
    # ColumnTransformer expects a *list* of (name, transformer, columns).
    preprocessor = ColumnTransformer(
        transformers=[
            ("0_fillna", SimpleImputer(strategy='constant', fill_value=0), continus_cols),
            ("onehot", OneHotEncoder(handle_unknown='ignore'), category_cols),
        ]
    )
    # `n_jobs` (not `n_jods`) is the parallelism parameter; the misspelled
    # keyword was not a recognised option, so the intended setting of 3
    # threads never took effect.
    clf = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("classifier", XGBClassifier(max_depth=8, n_estimators=100, n_jobs=3)),
    ])

    # Train
    clf.fit(train[feature_cols], train["label"])
    # Context managers make sure the model file is flushed and closed
    # before it is read back.
    with open("./churn_xgb.model", "wb") as model_file:
        pickle.dump(clf, model_file)

    # Predict with the reloaded pipeline (round-trips the pickle).
    with open("./churn_xgb.model", "rb") as model_file:
        model = pickle.load(model_file)
    predictions = model.predict(test[feature_cols])
    predict_proba = model.predict_proba(test[feature_cols])[:, 1]

    # Evaluate
    print("acc:", accuracy_score(test["label"], predictions))
    print("pri:", precision_score(test["label"], predictions))
    print("rec:", recall_score(test["label"], predictions))
    print("auc:", roc_auc_score(test["label"], predict_proba))
from flask import Flask, jsonify, request
import pickle
import pandas as pd
import json
app = Flask(__name__)
@app.route('/predict', methods=['POST'])
def apicall():
    """Score the POSTed records with the pickled churn pipeline.

    The request body is read with ``request.get_json(force=True)`` and
    loaded into a DataFrame using ``orient='records'``.

    Returns:
        A JSON response ``{"predictions": {USR_NUM_ID: probability}}`` with
        status 200, or the 400 response built by ``bad_request()`` when the
        payload produces an empty frame or lacks the ``USR_NUM_ID`` column.

    Raises:
        Any exception raised while parsing the payload or loading/applying
        the model is propagated unchanged.
    """
    from io import StringIO  # local import: only this handler needs it

    test_json = request.get_json(force=True)
    # Records-oriented JSON -> DataFrame. The text is wrapped in StringIO
    # because passing a literal JSON string to read_json is deprecated.
    test = pd.read_json(StringIO(json.dumps(test_json)), orient='records')

    # Validate BEFORE indexing: an empty payload yields a frame without
    # columns, so reading 'USR_NUM_ID' first would raise KeyError and the
    # bad-request branch could never be reached.
    if test.empty or 'USR_NUM_ID' not in test.columns:
        return bad_request()
    loan_ids = test['USR_NUM_ID']

    clf = 'churn_xgb.model'
    print("Loading the model...")
    # NOTE(review): pickle.load executes arbitrary code from the file; this
    # is only acceptable because the model file is a trusted local artifact.
    with open('/Users/gengpeng/' + clf, 'rb') as f:
        loaded_model = pickle.load(f)
    print("The model has been loaded...doing predictions now...")

    # Probability of the positive class for every submitted row.
    predictions = loaded_model.predict_proba(test)[:, 1]
    # tolist() converts numpy scalars to native Python numbers so the
    # mapping can be serialised by jsonify.
    res = dict(zip(loan_ids.tolist(), predictions.tolist()))
    responses = jsonify(predictions=res)
    responses.status_code = 200
    return responses
def bad_request(error=None):
    """Build the HTTP 400 JSON response for an unusable request payload.

    Args:
        error: Not used by the body; kept for interface compatibility.

    Returns:
        The ``jsonify``-ed message with ``status_code`` set to 400. The
        message embeds the URL of the current request.
    """
    message = {
        'status': 400,
        'message': 'Bad Request: ' + request.url + '--> Please check your data payload...',
    }
    resp = jsonify(message)
    resp.status_code = 400
    return resp
if __name__ == '__main__':
    # Start the Flask development server when the file is run as a script
    # (`python churn_xgb_server.py`). The guard had no body in the extracted
    # text, which is a syntax error.
    # NOTE(review): the original host/port arguments were lost in
    # extraction; Flask defaults are used here, confirm before deploying.
    app.run()
启动flask web server
python churn_xgb_server.py
代码整理在 https://github.com/xiaogp/customer_churn_prediction/tree/master/GBDT/python