美文网首页
天池:O2O优惠券使用预测

天池:O2O优惠券使用预测

作者: scottzcw | 来源:发表于2018-08-28 15:37 被阅读164次

    #coding=utf-8

    import os
    import sys
    from datetime import datetime, date
    from string import Template

    import numpy as np
    import pandas as pd
    from dateutil.parser import parse
    from sklearn import ensemble, svm
    from sklearn import linear_model
    from sklearn import preprocessing
    from sklearn.linear_model import Ridge
    from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score
    from sklearn.tree import DecisionTreeRegressor

    # Python 2 only: make UTF-8 the implicit str/unicode codec.  Python 3 has
    # no builtin reload() and no sys.setdefaultencoding(), and its default
    # encoding is already UTF-8, so the hack is skipped there instead of
    # raising NameError at import time.
    if sys.version_info[0] == 2:
        reload(sys)
        sys.setdefaultencoding("utf-8")

    def getDiscountType(row):
        """Map a raw ``Discount_rate`` cell to a 0/1 flag.

        Args:
            row: One cell of the ``Discount_rate`` column. It may be missing
                (NaN/None), a value containing ``':'`` such as ``"150:20"``,
                or anything else (e.g. ``"0.95"``).

        Returns:
            int: ``0`` when the cell is missing or contains ``':'``,
            ``1`` otherwise.
        """
        if pd.isnull(row):
            return 0
        # str() keeps the membership test safe for non-string cells (a bare
        # float used to raise TypeError) and matches the sibling helpers.
        if ':' in str(row):
            return 0
        return 1

    def convertRate(row):
        """Convert a raw discount value to a rate.

        Args:
            row: One cell of the ``Discount_rate`` column. It may be missing
                (NaN/None), a ``"a:b"`` string, or a value parseable by
                ``float``.

        Returns:
            float: ``1.0`` for a missing cell; ``1.0 - b / a`` for an
            ``"a:b"`` string; otherwise ``float(row)``.

        Raises:
            ValueError: If a part that must be numeric cannot be parsed.
            ZeroDivisionError: If the part before ``':'`` is zero.
        """
        if pd.isnull(row):
            return 1.0
        text = str(row)
        if ':' in text:
            # Split the same string that was tested, so non-str cells whose
            # text contains ':' do not fail on a missing .split attribute.
            parts = text.split(':')
            return 1.0 - float(parts[1]) / float(parts[0])
        return float(row)

    def getDiscountMan(row):
        """Return the integer before ``':'`` in an ``"a:b"`` discount value.

        NOTE(review): "Man" presumably stands for the spend threshold of a
        "spend a, save b" coupon -- confirm against the dataset description.

        Args:
            row: One cell of the ``Discount_rate`` column (any type).

        Returns:
            int: ``int(a)`` when the text form of ``row`` contains ``':'``,
            else ``0`` (this includes missing cells).
        """
        text = str(row)
        if ':' in text:
            # Split the string that was tested; the raw cell may not be a str.
            return int(text.split(':')[0])
        return 0

    def getDiscountJian(row):
        """Return the integer after ``':'`` in an ``"a:b"`` discount value.

        NOTE(review): "Jian" presumably stands for the amount taken off by a
        "spend a, save b" coupon -- confirm against the dataset description.

        Args:
            row: One cell of the ``Discount_rate`` column (any type).

        Returns:
            int: ``int(b)`` when the text form of ``row`` contains ``':'``,
            else ``0`` (this includes missing cells).
        """
        text = str(row)
        if ':' in text:
            # Split the string that was tested; the raw cell may not be a str.
            return int(text.split(':')[1])
        return 0

    def getWeekday(row):
        """Return the weekday (1 = Monday ... 7 = Sunday) of a ``YYYYMMDD`` value.

        Args:
            row: A date written as ``YYYYMMDD`` (normally a string), or a
                missing marker: NaN/None, ``'nan'``, ``'null'`` or ``''``.

        Returns:
            int or float: The weekday number, or ``np.nan`` when the value is
            missing.

        Raises:
            ValueError: If the value is not missing and is not a valid date.
        """
        # Real NaN/None is accepted in addition to the 'nan' text produced by
        # ``Series.astype(str)``, so the helper also works on raw columns.
        if pd.isnull(row):
            return np.nan
        text = str(row)
        if text in ('nan', 'null', ''):
            return np.nan
        # isoweekday() is weekday() + 1, i.e. Monday == 1.
        return date(int(text[0:4]), int(text[4:6]), int(text[6:8])).isoweekday()

    pd.set_option('display.max_columns', None)

    # Data directory (named DATA_DIR rather than ``dir`` so the builtin is not
    # shadowed).
    DATA_DIR = r"D:\zcw\tianchi"

    # Model inputs, in the column order used for both fitting and prediction.
    FEATURE_COLS = ['ratio', 'cratio', 'Distance', 'discount_man', 'discount_jian',
                    'discount_type', 'p7', 'p1', 'p2', 'p3', 'p4', 'p5', 'p6']

    # Number of leading rows used for fitting; the remainder is the hold-out.
    TRAIN_ROWS = 900000


    def _discount_ratio(value):
        """Return b / a for an ``"a:b"`` discount value, or 0 for anything else."""
        text = str(value)
        if ':' in text:
            parts = text.split(':')
            return float(parts[1]) / float(parts[0])
        return 0


    def _non_negative(series):
        """Coerce a column to numbers, mapping missing/non-positive values to 0.

        The columns are read as strings; comparing them with ``> 0`` directly
        only worked under Python 2's mixed-type ordering and raises TypeError
        on Python 3, hence the explicit numeric conversion.
        """
        return pd.to_numeric(series, errors='coerce').fillna(0).clip(lower=0)


    def _add_features(df, coupon_stats):
        """Attach the model's feature columns to an offline train/test frame.

        ``df`` must have ``Discount_rate``, ``Distance``, ``Coupon_id`` and
        ``Date_received``; ``coupon_stats`` must have ``Coupon_id`` and
        ``cratio``.  Returns the merged frame (a new object).
        """
        df['ratio'] = df['Discount_rate'].map(_discount_ratio)
        df['Distance'] = _non_negative(df['Distance'])
        # Only the ratio column is needed from the per-coupon statistics.
        df = pd.merge(df, coupon_stats[['Coupon_id', 'cratio']], how='left', on='Coupon_id')
        df['cratio'] = _non_negative(df['cratio'])
        df['discount_man'] = df['Discount_rate'].apply(getDiscountMan)
        df['discount_jian'] = df['Discount_rate'].apply(getDiscountJian)
        df['discount_type'] = df['Discount_rate'].apply(getDiscountType)
        df['weekday'] = df['Date_received'].astype(str).apply(getWeekday)
        # One-hot encode the weekday into p1..p7 (all zero when it is missing).
        for day in range(1, 8):
            df['p%d' % day] = (df['weekday'] == day).astype(np.int32)
        return df


    # Load the offline training data.  Only ``sep`` is passed: recent pandas
    # rejects calls that specify both ``sep`` and ``delimiter``.
    train_df = pd.read_csv(
        os.path.join(DATA_DIR, "ccf_offline_stage1_train.csv"), sep=',',
        dtype={'User_id': str, 'Date': str, 'Coupon_id': str, 'Date_received': str,
               'Discount_rate': str, 'Distance': str})

    # Load the online data.
    train_ol_df = pd.read_csv(
        os.path.join(DATA_DIR, "ccf_online_stage1_train.csv"), sep=',',
        dtype={'User_id': str, 'Coupon_id': str, 'Action': str, 'Date': str})

    # Per-coupon usage ratio from the online data: rows with a date divided by
    # all rows, per Coupon_id.  Missing dates are filled with '' so the string
    # comparison is well defined and evaluates to False for them.
    all_counts = train_ol_df.groupby('Coupon_id', as_index=False)['User_id'].count()
    dated_rows = train_ol_df[train_ol_df['Date'].fillna('') > '2016']
    dated_counts = dated_rows.groupby('Coupon_id', as_index=False)['User_id'].count()
    train_ol = pd.merge(all_counts, dated_counts, how='left', on='Coupon_id')
    # Cast to float so the division is never integer division on Python 2.
    train_ol['cratio'] = train_ol['User_id_y'].astype(float) / train_ol['User_id_x']

    print(train_ol.head(10))

    # Days between receiving the coupon and the recorded date; NaN when either
    # date is missing or unparseable.
    used_on = pd.to_datetime(train_df['Date'], format='%Y%m%d', errors='coerce')
    received_on = pd.to_datetime(train_df['Date_received'], format='%Y%m%d', errors='coerce')
    raw_days = (used_on - received_on).dt.days

    # Label: 1 when the coupon was used within 15 days of receipt.  It is
    # computed from the raw gap, before missing values are zero-filled, so a
    # same-day use (gap 0) is positive instead of being confused with
    # "never used".
    train_df['label'] = raw_days.between(0, 15).astype(int)
    train_df['days'] = raw_days.fillna(0).clip(lower=0)

    train_df = _add_features(train_df, train_ol)
    train_df = train_df.drop(['Date', 'Date_received', 'Discount_rate'], axis=1)

    print(train_df.head(10))

    # Positional split with no shared row (label-based ``.loc`` slices are
    # inclusive at both ends, which put row 900000 in both halves).  The scaler
    # is fitted on the training rows only and reused for the hold-out and the
    # submission data so all three share one feature scale.
    features = train_df[FEATURE_COLS]
    scaler = preprocessing.StandardScaler().fit(features.iloc[:TRAIN_ROWS])
    X_train = scaler.transform(features.iloc[:TRAIN_ROWS])
    X_test = scaler.transform(features.iloc[TRAIN_ROWS:])
    # 1-D targets so that predict() returns a 1-D array.
    y_train = train_df['label'].iloc[:TRAIN_ROWS]
    y_test = train_df['label'].iloc[TRAIN_ROWS:]

    regr = linear_model.LinearRegression()
    regr.fit(X_train, y_train)

    # Hold-out predictions, kept for ad-hoc evaluation,
    # e.g. roc_auc_score(y_test, y_pred).
    y_pred = regr.predict(X_test)

    # Load the data to predict and build the same features.
    test_df = pd.read_csv(
        os.path.join(DATA_DIR, "ccf_offline_stage1_test_revised.csv"), sep=',',
        dtype={'Date': str, 'Coupon_id': str, 'Date_received': str, 'Distance': str})
    test_df = _add_features(test_df, train_ol)

    p_X = scaler.transform(test_df[FEATURE_COLS])

    # Predict.
    p_Y = regr.predict(p_X)
    test_df['Probability'] = p_Y

    result_df = test_df[['User_id', 'Coupon_id', 'Date_received', 'Probability']].copy()
    # A linear model can output negative scores; floor them (and NaN) at 0.
    result_df['Probability'] = result_df['Probability'].fillna(0).clip(lower=0)

    print(result_df.head(10))

    # Save the submission file (no header, no index).
    result_df.to_csv(os.path.join(DATA_DIR, "result.csv"), sep=',', header=False, index=False)

    相关文章

      网友评论

          本文标题:天池:O2O优惠券使用预测

          本文链接:https://www.haomeiwen.com/subject/iqqqwftx.html