美文网首页机器学习和人工智能入门
天池o2o优惠券使用预测比赛解析(初级)

天池o2o优惠券使用预测比赛解析(初级)

作者: JasonChiu17 | 来源:发表于2018-11-27 08:43 被阅读7次

    天池o2o优惠券使用预测比赛解析(初级)

    赛题链接:

    天池o2o优惠券使用预测

    import os, sys, pickle
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    from datetime import date
    from sklearn.linear_model import SGDClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import KFold,train_test_split,StratifiedKFold,GridSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.metrics import  auc, roc_curve
    

    加载数据

    # Load the offline training set and the revised offline test set from the local data/ directory.
    df_train = pd.read_csv('data/ccf_offline_stage1_train.csv')
    df_test = pd.read_csv('data/ccf_offline_stage1_test_revised.csv')
    
    # Preview the first rows of the training data.
    print(df_train.head())
    
       User_id  Merchant_id  Coupon_id Discount_rate  Distance  Date_received  \
    0  1439408         2632        NaN           NaN       0.0            NaN   
    1  1439408         4663    11002.0        150:20       1.0     20160528.0   
    2  1439408         2632     8591.0          20:1       0.0     20160217.0   
    3  1439408         2632     1078.0          20:1       0.0     20160319.0   
    4  1439408         2632     8591.0          20:1       0.0     20160613.0   
    
             Date  
    0  20160217.0  
    1         NaN  
    2         NaN  
    3         NaN  
    4         NaN  
    

    缺失值处理

    # Replace every missing value with the string sentinel 'null'; later steps compare against this string.
    df_train=df_train.fillna('null')
    df_test=df_test.fillna('null')
    

    统计

    # Print the column dtypes and memory usage of the training frame after filling missing values.
    df_train.info()
    
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 1754884 entries, 0 to 1754883
    Data columns (total 7 columns):
    User_id          int64
    Merchant_id      int64
    Coupon_id        object
    Discount_rate    object
    Distance         object
    Date_received    object
    Date             object
    dtypes: int64(2), object(5)
    memory usage: 93.7+ MB
    
    # Count the rows in each of the four combinations of "coupon received" (Date_received != 'null')
    # and "purchase made" (Date != 'null'). Each figure is a row count (.shape[0]).
    print('有优惠券,有购买商品: %s 人' %df_train[(df_train['Date_received'] != 'null') & (df_train['Date'] != 'null')].shape[0])
    print('有优惠券,没购买商品: %s 人' %df_train[(df_train['Date_received'] != 'null') & (df_train['Date'] == 'null')].shape[0])
    print('没优惠券,有购买商品: %s 人' %df_train[(df_train['Date_received'] == 'null') & (df_train['Date'] != 'null')].shape[0])
    print('没优惠券,没购买商品: %s 人' %df_train[(df_train['Date_received'] == 'null') & (df_train['Date'] == 'null')].shape[0])
    
    有优惠券,有购买商品: 75382 人
    有优惠券,没购买商品: 977900 人
    没优惠券,有购买商品: 701602 人
    没优惠券,没购买商品: 0 人
    
    • 比赛的意义是把优惠券发给那些要购买商品却没有优惠券的人(701602 人),也就是真正有需要的人。

    特征提取

    1. 打折率

    # Discount_rate values come in three forms ('null', 'x:y' threshold discounts, and plain rates),
    # so the column needs to be split into new features.
    df_train.Discount_rate.unique()
    
    array(['null', '150:20', '20:1', '200:20', '30:5', '50:10', '10:5',
           '100:10', '200:30', '20:5', '30:10', '50:5', '150:10', '100:30',
           '200:50', '100:50', '300:30', '50:20', '0.9', '10:1', '30:1',
           '0.95', '100:5', '5:1', '100:20', '0.8', '50:1', '200:10',
           '300:20', '100:1', '150:30', '300:50', '20:10', '0.85', '0.6',
           '150:50', '0.75', '0.5', '200:5', '0.7', '30:20', '300:10', '0.2',
           '50:30', '200:100', '150:5'], dtype=object)
    
    # Helper functions that split the Discount_rate column into four new features.
    def getDiscountType(row):
        """Return 0 when the discount string contains 'null', otherwise 1."""
        return 0 if 'null' in row else 1
    
    def convertRate(row):
        """Convert a raw discount string into a discount rate.

        A string containing 'null' yields 1. An 'x:y' string yields
        1 - y/x. Any other string is parsed directly as a float.
        """
        if 'null' in row:
            return 1
        if ':' not in row:
            return float(row)
        parts = row.split(':')
        return 1.0 - float(parts[1])/float(parts[0])
    def getDiscountMan(row):
        """Return the part before ':' of an 'x:y' discount string as an int, or 0 if there is no ':'."""
        if ':' not in row:
            return 0
        return int(row.split(':')[0])
    def getDiscountJian(row):
        """Return the part after ':' of an 'x:y' discount string as an int, or 0 if there is no ':'."""
        if ':' not in row:
            return 0
        return int(row.split(':')[1])
    
    '''
    之前犯错在于误以为series.apply(func)输入的是series,所以函数都有个for循环:
    def getDiscountType(row):
        for i in row:
            if 'null' in i:
                return 0
            else:
                return 1
    其实是迭代输入series的每一个元素,这一点和直接func(series)区分开来
    '''
    def processData(df):
        """Add the four discount-derived feature columns to ``df`` and return it.

        The columns discount_type, discount_rate, discount_man and
        discount_jian are all derived from ``df['Discount_rate']``. The frame
        is modified in place and also returned.

        Fix: three of the four features used to be computed from the global
        ``df_train`` instead of the argument, so calling this on the test
        frame attached training-set values aligned by row index.
        """
        raw_discount = df['Discount_rate']
        df['discount_type'] = raw_discount.apply(getDiscountType)
        df['discount_rate'] = raw_discount.apply(convertRate)
        df['discount_man'] = raw_discount.apply(getDiscountMan)
        df['discount_jian'] = raw_discount.apply(getDiscountJian)
        # Show the distinct rates as a quick sanity check.
        print('打折率 %s' %df['discount_rate'].unique())
        return df
    
    • 之前犯错在于误以为series.apply(func)输入的是series,所以函数都有个for循环:
    def getDiscountType(row):
        for i in row:
            if 'null' in i:
                return 0
            else:
                return 1
    

    其实是迭代输入series的每一个元素,这一点和直接func(series)区分开来

    # Add the discount features to both frames, then preview the training frame.
    df_train = processData(df_train)
    df_test = processData(df_test)
    print(df_train.head())
    
    打折率 [1.         0.86666667 0.95       0.9        0.83333333 0.8
     0.5        0.85       0.75       0.66666667 0.93333333 0.7
     0.6        0.96666667 0.98       0.99       0.975      0.33333333
     0.2        0.4       ]
    打折率 [1.         0.86666667 0.95       0.9        0.83333333 0.8
     0.5        0.85       0.75       0.66666667 0.93333333 0.7
     0.6        0.96666667 0.98       0.99       0.975      0.33333333
     0.2       ]
       User_id  Merchant_id Coupon_id Discount_rate Distance Date_received  \
    0  1439408         2632      null          null        0          null   
    1  1439408         4663     11002        150:20        1   2.01605e+07   
    2  1439408         2632      8591          20:1        0   2.01602e+07   
    3  1439408         2632      1078          20:1        0   2.01603e+07   
    4  1439408         2632      8591          20:1        0   2.01606e+07   
    
              Date  discount_type  discount_rate  discount_man  discount_jian  
    0  2.01602e+07              0       1.000000             0              0  
    1         null              1       0.866667           150             20  
    2         null              1       0.950000            20              1  
    3         null              1       0.950000            20              1  
    4         null              1       0.950000            20              1  
    

    2. 距离

    # Inspect the distinct raw Distance values (includes the 'null' sentinel).
    df_train['Distance'].unique()
    
    array([0.0, 1.0, 'null', 2.0, 10.0, 4.0, 7.0, 9.0, 3.0, 5.0, 6.0, 8.0],
          dtype=object)
    
    • 处理一下'null',转换成int
    # Build an integer 'distance' feature: the 'null' sentinel becomes -1, everything else is cast to int.
    df_train['distance'] = df_train['Distance'].replace('null',-1).astype(int)
    df_test['distance'] = df_test['Distance'].replace('null',-1).astype(int)
    df_train['distance'] .unique()
    
    array([ 0,  1, -1,  2, 10,  4,  7,  9,  3,  5,  6,  8])
    
    # Re-check dtypes now that the discount and distance features have been added.
    df_train.info()
    
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 1754884 entries, 0 to 1754883
    Data columns (total 12 columns):
    User_id          int64
    Merchant_id      int64
    Coupon_id        object
    Discount_rate    object
    Distance         object
    Date_received    object
    Date             object
    discount_type    int64
    discount_rate    float64
    discount_man     int64
    discount_jian    int64
    distance         int64
    dtypes: float64(1), int64(6), object(5)
    memory usage: 160.7+ MB
    

    3. 领券日期

    # Coupon-receipt dates: drop the 'null' sentinel, sort, and report the first and last date.
    date_receive = df_train['Date_received'].unique()
    date_receive = sorted(date_receive[date_receive != 'null'])
    print('领券日期:%d - %d'%(date_receive[0],date_receive[-1]))
    
    # Purchase dates: same procedure on the Date column.
    date_buy = df_train['Date'].unique()
    date_buy = sorted(date_buy[date_buy != 'null'])
    # Fix: this line reports purchase dates, so it is labelled 消费日期 rather than 领券日期.
    print('消费日期:%d - %d'%(date_buy[0],date_buy[-1]))
    
    领券日期:20160101 - 20160615
    领券日期:20160101 - 20160630
    

    换算成weekday

    • weekday : {null, 1, 2, 3, 4, 5, 6, 7}

    • weekday_type : {1, 0}(周六和周日为1,其他为0)

    • Weekday_1 : {1, 0, 0, 0, 0, 0, 0}

    • Weekday_2 : {0, 1, 0, 0, 0, 0, 0}

    • Weekday_3 : {0, 0, 1, 0, 0, 0, 0}

    • Weekday_4 : {0, 0, 0, 1, 0, 0, 0}

    • Weekday_5 : {0, 0, 0, 0, 1, 0, 0}

    • Weekday_6 : {0, 0, 0, 0, 0, 1, 0}

    • Weekday_7 : {0, 0, 0, 0, 0, 0, 1}

    构造weekday特征

    def getWeekday(row):
        """Map a 'YYYYMMDD...' date string to its weekday number (Monday=1 ... Sunday=7).

        The sentinel string 'null' is returned unchanged. Only the first
        eight characters are read, so a trailing '.0' is ignored.
        """
        if row != 'null':
            year, month, day = int(row[0:4]), int(row[4:6]), int(row[6:8])
            return date(year, month, day).weekday() + 1
        return row
    # Date_received is cast to str first because getWeekday slices the value as a string.
    df_train['weekday'] = df_train['Date_received'].astype(str).apply(getWeekday)
    df_test['weekday'] = df_test['Date_received'].astype(str).apply(getWeekday)
    df_train['weekday'].unique()
    
    array(['null', 6, 3, 1, 5, 4, 7, 2], dtype=object)
    

    构造weekday_type特征

    # weekday_type is 1 when the weekday is 6 or 7 (Saturday/Sunday) and 0 otherwise, including for 'null'.
    df_train['weekday_type'] = df_train['weekday'].apply(lambda x: 1 if x in [6,7] else 0)
    df_test['weekday_type'] = df_test['weekday'].apply(lambda x: 1 if x in [6,7] else 0)
    df_train['weekday_type'] .unique()
    
    array([0, 1])
    

    构造weekday_number特征

    # Training data
    # One-hot encode the weekday; 'null' is replaced by NaN before calling get_dummies.
    data = df_train['weekday'].replace('null',np.nan)
    tmpdf = pd.get_dummies(data,prefix='weekday')
    
    # Append the dummy columns to the training frame.
    df_train = pd.concat([df_train,tmpdf],axis=1)
    
    # Test data
    # Same one-hot encoding as above.
    # NOTE(review): the test-side feature list later in this file uses 'weekday_1' ... 'weekday_7',
    # while the training columns are 'weekday_1.0' ... 'weekday_7.0' — keep the two lists in sync.
    data = df_test['weekday'].replace('null',np.nan)
    tmpdf = pd.get_dummies(data,prefix='weekday')
    
    # Append the dummy columns to the test frame.
    df_test = pd.concat([df_test,tmpdf],axis=1)
    
    print(df_train.head())
    
       User_id  Merchant_id Coupon_id Discount_rate Distance Date_received  \
    0  1439408         2632      null          null        0          null   
    1  1439408         4663     11002        150:20        1    20160528.0   
    2  1439408         2632      8591          20:1        0    20160217.0   
    3  1439408         2632      1078          20:1        0    20160319.0   
    4  1439408         2632      8591          20:1        0    20160613.0   
    
              Date  discount_type  discount_rate  discount_man  ...    weekday  \
    0  2.01602e+07              0       1.000000             0  ...       null   
    1         null              1       0.866667           150  ...          6   
    2         null              1       0.950000            20  ...          3   
    3         null              1       0.950000            20  ...          6   
    4         null              1       0.950000            20  ...          1   
    
       weekday_type weekday_1.0  weekday_2.0  weekday_3.0  weekday_4.0  \
    0             0           0            0            0            0   
    1             1           0            0            0            0   
    2             0           0            0            1            0   
    3             1           0            0            0            0   
    4             0           1            0            0            0   
    
       weekday_5.0  weekday_6.0  weekday_7.0  label  
    0            0            0            0     -1  
    1            0            1            0      0  
    2            0            0            0      0  
    3            0            1            0      0  
    4            0            0            0      0  
    
    [5 rows x 22 columns]
    
    # List every column currently present in the training frame, one per line.
    print('所有特征:')
    for i in df_train.columns:
        print('\t',i)
    
    所有特征:
         User_id
         Merchant_id
         Coupon_id
         Discount_rate
         Distance
         Date_received
         Date
         discount_type
         discount_rate
         discount_man
         discount_jian
         distance
         weekday
         weekday_type
         weekday_1.0
         weekday_2.0
         weekday_3.0
         weekday_4.0
         weekday_5.0
         weekday_6.0
         weekday_7.0
    

    标注label

    三种情况:

    • Date_received == 'null':表示没有领到优惠券,无需考虑,y = -1

    • (Date_received != 'null') & (Date != 'null') & (Date - Date_received <= 15):表示领取优惠券且在15天内使用,即正样本,y = 1

    • (Date_received != 'null') & ((Date == 'null') | (Date - Date_received > 15)):表示领取优惠券但未在15天内使用,即负样本,y = 0

    def label(row):
        """Return the training label for one row.

        -1 when no coupon was received (Date_received == 'null'),
         1 when a purchase happened at most 15 days after receipt,
         0 otherwise (no purchase, or a purchase more than 15 days later).
        """
        if row['Date_received'] == 'null':
            return -1
        if row['Date'] == 'null':
            return 0
        bought_on = pd.to_datetime(row['Date'],format='%Y%m%d')
        received_on = pd.to_datetime(row['Date_received'],format='%Y%m%d')
        elapsed = bought_on - received_on
        return 1 if elapsed.days <= 15 else 0
    
    
    # Label every training row (axis=1 passes one row at a time to label), then show the class distribution.
    df_train['label'] = df_train.apply(label,axis=1)
    df_train['label'].value_counts()
    
     0    988887
    -1    701602
     1     64395
    Name: label, dtype: int64
    

    建立线性模型 SGDClassifier

    • 使用上面提取的14个特征:
    • discount_rate
    • discount_type

    • discount_man

    • discount_jian

    • distance

    • weekday

    • weekday_type

    • weekday_1

    • weekday_2

    • weekday_3

    • weekday_4

    • weekday_5

    • weekday_6

    • weekday_7

    • 训练集:20160101-20160515;验证集:20160516-20160615。

    • 用线性模型 SGDClassifier

    划分训练集/验证集

    # Convert Date_received to strings so the train/validation split can compare dates as text.
    df_train['Date_received'] = df_train['Date_received'].astype(str)
    df_train['Date_received'].unique()
    
    array(['null', '20160528.0', '20160217.0', '20160319.0', '20160613.0',
           '20160516.0', '20160429.0', '20160129.0', '20160530.0',
           '20160519.0', '20160606.0', '20160207.0', '20160421.0',
           '20160130.0', '20160412.0', '20160518.0', '20160327.0',
           '20160127.0', '20160215.0', '20160524.0', '20160523.0',
           '20160515.0', '20160521.0', '20160114.0', '20160321.0',
           '20160426.0', '20160409.0', '20160326.0', '20160322.0',
           '20160131.0', '20160125.0', '20160602.0', '20160128.0',
           '20160605.0', '20160607.0', '20160324.0', '20160601.0',
           '20160126.0', '20160124.0', '20160123.0', '20160201.0',
           '20160522.0', '20160203.0', '20160417.0', '20160415.0',
           '20160202.0', '20160206.0', '20160218.0', '20160611.0',
           '20160329.0', '20160510.0', '20160302.0', '20160526.0',
           '20160318.0', '20160205.0', '20160411.0', '20160520.0',
           '20160527.0', '20160317.0', '20160213.0', '20160505.0',
           '20160402.0', '20160211.0', '20160405.0', '20160408.0',
           '20160323.0', '20160204.0', '20160112.0', '20160430.0',
           '20160525.0', '20160609.0', '20160403.0', '20160325.0',
           '20160413.0', '20160210.0', '20160610.0', '20160414.0',
           '20160401.0', '20160109.0', '20160328.0', '20160420.0',
           '20160422.0', '20160615.0', '20160120.0', '20160614.0',
           '20160107.0', '20160508.0', '20160608.0', '20160603.0',
           '20160425.0', '20160424.0', '20160305.0', '20160330.0',
           '20160511.0', '20160504.0', '20160223.0', '20160404.0',
           '20160416.0', '20160118.0', '20160303.0', '20160212.0',
           '20160423.0', '20160308.0', '20160228.0', '20160418.0',
           '20160509.0', '20160501.0', '20160428.0', '20160427.0',
           '20160229.0', '20160512.0', '20160506.0', '20160117.0',
           '20160514.0', '20160407.0', '20160410.0', '20160314.0',
           '20160116.0', '20160503.0', '20160502.0', '20160531.0',
           '20160316.0', '20160331.0', '20160517.0', '20160222.0',
           '20160101.0', '20160306.0', '20160604.0', '20160214.0',
           '20160406.0', '20160121.0', '20160313.0', '20160225.0',
           '20160220.0', '20160110.0', '20160301.0', '20160105.0',
           '20160122.0', '20160104.0', '20160113.0', '20160108.0',
           '20160115.0', '20160513.0', '20160208.0', '20160612.0',
           '20160419.0', '20160103.0', '20160312.0', '20160209.0',
           '20160529.0', '20160119.0', '20160227.0', '20160315.0',
           '20160304.0', '20160216.0', '20160507.0', '20160311.0',
           '20160320.0', '20160102.0', '20160106.0', '20160224.0',
           '20160219.0', '20160111.0', '20160310.0', '20160307.0',
           '20160221.0', '20160226.0', '20160309.0'], dtype=object)
    
    # Keep only rows where a coupon was received (label -1 marks "no coupon").
    df = df_train[df_train['label']!=-1].copy()
    # Split by receipt date. The dates are fixed-width 'YYYYMMDD.0' strings, so string comparison
    # orders them chronologically.
    # Fix: split the filtered frame `df`, which was previously built but never used; the old code
    # excluded the 'null' rows only because 'null' happens to sort after every date string.
    train = df[df['Date_received'] <='20160515.0'].copy()
    # NOTE(review): the upper bound is exclusive, so coupons received on 20160615 are left out of the
    # validation set — confirm this is intended.
    valid = df[(df['Date_received'] >='20160516.0') & (df['Date_received'] <'20160615.0')]
    print('Train Set:\n',train['label'].value_counts())
    print('Valid Set:\n',valid['label'].value_counts())
    
    Train Set:
     0    759172
    1     41524
    Name: label, dtype: int64
    Valid Set:
     0    226595
    1     22516
    Name: label, dtype: int64
    

    特征

    # The 14 training feature columns; the one-hot weekday columns carry a '.0' suffix in the training frame.
    feature = ['discount_type', 'discount_rate',
           'discount_man', 'discount_jian', 'distance', 'weekday', 'weekday_type',
           'weekday_1.0', 'weekday_2.0', 'weekday_3.0', 'weekday_4.0',
           'weekday_5.0', 'weekday_6.0', 'weekday_7.0']
    print(feature)
    
    ['discount_type', 'discount_rate', 'discount_man', 'discount_jian', 'distance', 'weekday', 'weekday_type', 'weekday_1.0', 'weekday_2.0', 'weekday_3.0', 'weekday_4.0', 'weekday_5.0', 'weekday_6.0', 'weekday_7.0']
    

    建立模型

    def check_model(data,feature):
        """Grid-search an SGD logistic-regression pipeline on ``data`` and return the fitted search.

        Parameters:
            data: frame holding the columns named in ``feature`` plus a 'label' column.
            feature: list of column names used as model inputs.

        Returns:
            The fitted GridSearchCV object.
        """
        sgd = SGDClassifier(
            loss='log',#The ‘log’ loss gives logistic regression
            penalty='elasticnet',
            fit_intercept=True, # fit an intercept term
            max_iter=100,
            shuffle=True,
            n_jobs=1,
            class_weight=None)

        # A Pipeline bundles the scaling step and the classifier so they are fitted and applied together.
        pipeline = Pipeline(steps=[
            ('ss',StandardScaler()),
            ('clf',sgd)
        ])
        # Grid over the regularisation strength and the elastic-net mixing ratio of the 'clf' step.
        param_grid = {
            'clf__alpha':[0.001,0.01,0.1],
            'clf__l1_ratio':[0.001,0.01,0.1]
        }
        # Stratified sampling keeps the class proportions of each fold the same as in the full data.
        cv_splitter = StratifiedKFold(n_splits=3,shuffle=True)

        # Grid search
        searcher = GridSearchCV(
            pipeline,
            param_grid,
            cv=cv_splitter,
            n_jobs=-1,
            verbose=1)
        searcher = searcher.fit(data[feature],data['label'])
        return searcher
    

    训练

    # Run the grid search on the training split; `model` is the fitted GridSearchCV object.
    model = check_model(train,feature)
    
    Fitting 3 folds for each of 9 candidates, totalling 27 fits
    
    
    [Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed: 10.3min finished
    

    验证

    对验证集中每个优惠券预测的结果计算 AUC,再对所有优惠券的 AUC 求平均。计算 AUC 的时候,如果 label 只有一类,就直接跳过,因为 AUC 无法计算。

    # Predict class probabilities on the validation split and keep column 1 as 'pred_prob'.
    # A copy is used so the new column is not written into `valid` itself.
    y_valid_pred = model.predict_proba(valid[feature])
    valid1 = valid.copy()
    valid1['pred_prob'] = y_valid_pred[:,1]
    print(valid1.head())
    
        User_id  Merchant_id Coupon_id Discount_rate Distance Date_received  \
    1   1439408         4663     11002        150:20        1    20160528.0   
    4   1439408         2632      8591          20:1        0    20160613.0   
    6   1439408         2632      8591          20:1        0    20160516.0   
    9   2029232          450      1532          30:5        0    20160530.0   
    10  2029232         6459     12737          20:1        0    20160519.0   
    
               Date  discount_type  discount_rate  discount_man    ...      \
    1          null              1       0.866667           150    ...       
    4          null              1       0.950000            20    ...       
    6   2.01606e+07              1       0.950000            20    ...       
    9          null              1       0.833333            30    ...       
    10         null              1       0.950000            20    ...       
    
        weekday_type  weekday_1.0 weekday_2.0  weekday_3.0  weekday_4.0  \
    1              1            0           0            0            0   
    4              0            1           0            0            0   
    6              0            1           0            0            0   
    9              0            1           0            0            0   
    10             0            0           0            0            1   
    
        weekday_5.0  weekday_6.0  weekday_7.0  label  pred_prob  
    1             0            1            0      0   0.019839  
    4             0            0            0      0   0.098629  
    6             0            0            0      0   0.098629  
    9             0            0            0      0   0.095701  
    10            0            0            0      0   0.129752  
    
    [5 rows x 23 columns]
    
    • groupby之后是元组的形式
    # Iterating a groupby yields (group key, sub-frame) tuples; print them to see the structure.
    valid_groupby = valid1.groupby(['Coupon_id'])
    for i in valid_groupby:
        print(i)
    
    (1.0,          User_id  Merchant_id Coupon_id Discount_rate Distance Date_received  \
    768069    472146         6889         1          20:1        9    20160522.0   
    962551   2266597         6889         1          20:1        0    20160603.0   
    964821   3057133         6889         1          20:1        0    20160606.0   
    1665538  5555255         6889         1          20:1        3    20160530.0   
    
                    Date  discount_type  discount_rate  discount_man    ...      \
    768069   2.01606e+07              1           0.95            20    ...       
    962551          null              1           0.95            20    ...       
    964821          null              1           0.95            20    ...       
    1665538         null              1           0.95            20    ...       
    
             weekday_type  weekday_1.0 weekday_2.0  weekday_3.0  weekday_4.0  \
    768069              1            0           0            0            0   
    962551              0            0           0            0            0   
    964821              0            1           0            0            0   
    1665538             0            1           0            0            0   
    
             weekday_5.0  weekday_6.0  weekday_7.0  label  pred_prob  
    768069             0            0            1      1   0.013089  
    962551             1            0            0      0   0.103987  
    964821             0            0            0      0   0.099109  
    1665538            0            0            0      0   0.052904  
    
    [4 rows x 23 columns])
    
    # Compute the AUC separately for each coupon, then average over coupons.
    valid_groupby = valid1.groupby(['Coupon_id'])
    aucs = []
    for _coupon_id, tmpdf in valid_groupby:
        # AUC is undefined when all samples of a coupon share one label, so skip those groups.
        if len(tmpdf['label'].unique())==1:
            continue
        fpr, tpr, _thresholds = roc_curve(tmpdf['label'], tmpdf['pred_prob'], pos_label=1)
        aucs.append(auc(fpr,tpr))
    
    
    print(np.mean(aucs))
    
    0.5334629648536017
    
    # Number of validation rows and feature columns.
    valid[feature].shape
    
    (249111, 14)
    

    测试

    # Feature columns for the test frame. The one-hot weekday columns are named 'weekday_1' ... 'weekday_7'
    # here, whereas the training feature list uses 'weekday_1.0' ... 'weekday_7.0'.
    feature_test = [ 'discount_type', 'discount_rate',
           'discount_man', 'discount_jian', 'distance', 'weekday', 'weekday_type','weekday_1',
           'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6',
           'weekday_7']
    
    # Predict probabilities for the test set and write User_id, Coupon_id, Date_received and
    # Probability to submit2.csv without index or header row.
    y_test_pred = model.predict_proba(df_test[feature_test])
    df_test_1 = df_test[['User_id','Coupon_id','Date_received']].copy()
    df_test_1['Probability'] = y_test_pred[:,1]
    df_test_1.to_csv('submit2.csv',index=False,header=False)
    print(df_test_1.head())
    
       User_id  Coupon_id  Date_received  Probability
    0  4129537       9983       20160712     0.118748
    1  6949378       3429       20160706     0.034147
    2  2166529       6928       20160727     0.045592
    3  2166529       1808       20160727     0.045592
    4  6172162       6500       20160708     0.068717
    

    保存模型 & 导入模型

    # If model.pkl already exists, load it and replace the in-memory model; otherwise save the current model.
    if os.path.isfile('model.pkl'):
        # NOTE: unpickling can execute arbitrary code; only load pickle files you created yourself.
        with open('model.pkl','rb') as f:
            model = pickle.load(f)
    else:
        with open('model.pkl','wb') as f:
            pickle.dump(model,f)
    

    比赛第一名代码与解析

    相关文章

      网友评论

        本文标题:天池o2o优惠券使用预测比赛解析(初级)

        本文链接:https://www.haomeiwen.com/subject/nbuwqqtx.html