
Tencent Advertising Algorithm Competition 2019: Top-5 Solution & Code

Author: 拼搏向上001 | Published 2020-04-17 16:02

Reposted from https://mp.weixin.qq.com/s/j5YICHrkHLDm7OldPFPOjw

This article shares the approach used in the 2019 Tencent Advertising Algorithm Competition. The short, efficient competition code, together with a walkthrough of it, is attached at the end.

Problem Understanding

(presentation slides omitted)

Feature Engineering

(presentation slides omitted)

Model Overview

(presentation slides omitted)

Reflections and Takeaways

(presentation slides omitted)

The complete competition code:
    # -*- coding: utf-8 -*-
    import pandas as pd
    import numpy as np
    import lightgbm as lgb
    from sklearn.model_selection import KFold

    path = './data/'
    
    
    
    def get_base_data():
        # Static ad features: one row per ad id.
        ad_static = pd.read_csv(path + '/map_ad_static.out',
                                names=['aid', 'create_time', 'adv_id', 'product_id', 'ad_cate_id',
                                       'industry_id', 'creative_size'], sep='\t')
        ad_static['create_time_int'] = ad_static['create_time']
        # Shift the unix timestamp to UTC+8 before converting to datetime.
        ad_static['create_time'] = pd.to_datetime(ad_static['create_time'] + 8 * 3600, unit='s')
        # Request sets of the B-test ads: queue length = request count on day 24.
        cond_req = pd.read_csv(path + '/Btest_select_request_20190424.out',
                               names=['aid', 'req_set'], sep='\t')
        cond_req['req_set_len'] = cond_req['req_set'].apply(lambda x: len(x.split('|')))
        test = pd.read_csv(path + '/Btest_sample_bid.out',
                           names=['sample_id', 'aid', 'target_type', 'bid_type', 'bid'], sep='\t')
        test['day'] = 24
        test['aid_day_count'] = test['aid'].map(cond_req[['aid', 'req_set_len']].set_index('aid')['req_set_len'])
        return ad_static, test

    ad_static, test = get_base_data()
    
    def get_train_data():
        # Per-(aid, day) request counts and exposure labels over days 10-22.
        aid_day_label_stat = {}
        aid_day_count_stat = {}
        for day in range(10, 23):
            temp_track_log = pd.read_csv(path + '/track_log/track_log_201904' + str(day) + '.out', names=[
                'req_id', 'req_time', 'uid', 'loc_id', 'comp_info'], sep='\t')
            for comp_info in temp_track_log['comp_info']:
                # Each request lists its competing ads ';'-separated; within an ad,
                # fields are ','-separated (field 0 = aid, field 6 = exposed flag).
                for ad in comp_info.split(';'):
                    fields = ad.split(',')
                    # Encode (aid, day) as a single integer key.
                    aid_day = int(fields[0]) * 100 + day
                    aid_day_count_stat[aid_day] = aid_day_count_stat.get(aid_day, 0) + 1
                    if fields[6] == '1':
                        aid_day_label_stat[aid_day] = aid_day_label_stat.get(aid_day, 0) + 1
        aid_day_label = pd.DataFrame(
            {'aid_day': list(aid_day_label_stat.keys()), 'label': list(aid_day_label_stat.values())})
        aid_day_count = pd.DataFrame(
            {'aid_day': list(aid_day_count_stat.keys()), 'aid_day_count': list(aid_day_count_stat.values())})

        train = aid_day_count.merge(aid_day_label, 'left', 'aid_day').fillna(0)
        train['aid'] = train['aid_day'] // 100
        train['day'] = train['aid_day'] % 100
        # Keep ads that appear in the test set, plus ads with enough daily traffic.
        train = train[(train['aid'].isin(test.aid.unique()) | (train['aid_day_count'] > 100))]
        return train, aid_day_label

    train, aid_day_label = get_train_data()
    
    
    
    def get_test_expos():
        # Day-23 track log: reconstruct exposure statistics for the extra day.
        track_test_a = pd.read_csv(path + '/test_tracklog_20190423.last.out', names=[
            'req_id', 'req_time', 'uid', 'loc_id', 'comp_info'], sep='\t')
        aid_expos = []
        aid_not_expos = []
        for i in track_test_a['comp_info']:
            tmp_expos = False
            for ad in i.split(';'):
                fields = ad.split(',')
                # Count the first ad per request whose last field is '0' as exposed;
                # every other ad in the queue is counted as not exposed.
                if fields[-1] == '0' and tmp_expos is False:
                    tmp_expos = True
                    aid_expos.append(int(fields[0]))
                else:
                    aid_not_expos.append(int(fields[0]))
        # Note: value_counts().reset_index() yielding columns ['index', 0]
        # matches the pandas versions of that era.
        aid_not_expos_df = pd.Series(aid_not_expos).value_counts()
        aid_not_expos_df = aid_not_expos_df.reset_index().rename(columns={'index': 'aid', 0: 'not_count'})
        aid_expos_df = pd.Series(aid_expos).value_counts()
        aid_expos_df = aid_expos_df.reset_index().rename(columns={'index': 'aid', 0: 'label'})
        test_expos = aid_expos_df.merge(aid_not_expos_df, 'outer', 'aid')
        test_expos = test_expos.fillna(0)
        test_expos['aid_day_count'] = test_expos['label'] + test_expos['not_count']
        test_expos['day'] = 23
        test_expos = test_expos[test_expos['aid_day_count'] > 50]

        # Day-23 request sets, used as additional (unlabeled) day-23 rows.
        test_23 = pd.read_csv(path + '/final_select_test_request.out', names=['aid', 'req_set'], sep='\t')
        test_23['day'] = 23
        test_23['aid_day_count'] = test_23['req_set'].apply(lambda x: len(x.split('|')))
        return test_expos, test_23

    test_expos, test_23 = get_test_expos()
    
    
    
    # Stack the training days (10-22), the day-23 exposure rows, the day-23
    # request rows and the day-24 test set into one frame.
    data = pd.concat([train, test_expos, test_23, test], ignore_index=True)
    data = data.merge(ad_static, 'left', 'aid')
    data['olabel'] = data['label']                    # raw exposure count
    data['oaid_day_count'] = data['aid_day_count']    # raw request count
    data['not_count'] = data['aid_day_count'] - data['olabel']
    data['aid_day'] = data['aid'] * 100 + data['day']
    data['label'] = np.log1p(data['label'])           # regress in log1p space
    data['rate_label'] = data['olabel'] / data['aid_day_count']  # exposure rate
    data['sample_id'] = data['sample_id'].fillna(0).astype(int)
    data['week'] = (data['day'] - 1) % 7
    data['create_year'] = data['create_time'].dt.year
    # Rough ad age in days (month * 31 + day as an ordinal approximation).
    data['day_keep'] = data['day'] - data['create_time'].dt.month * 31 - data['create_time'].dt.day

    # Global frequency counts of the static ad attributes.
    ad_static_fea = ['adv_id', 'product_id', 'ad_cate_id', 'industry_id', 'creative_size']
    cnt_feat = ['cnt_static_' + i for i in ad_static_fea + ['create_time']]
    for i in ad_static_fea + ['create_time']:
        data['cnt_static_' + i] = data[i].map(ad_static[i].value_counts())

    # How many of the six days before yesterday the ad was active.
    data['day_unique'] = data.groupby('aid')['day'].transform('unique')  # array of active days per aid
    day_nunique = []
    for x in data[['day', 'day_unique']].values:
        x_1 = x[0] - 1
        day_nunique.append(len([i for i in x[1] if i < x_1 and i > x_1 - 7]))
    data['day_nunique'] = day_nunique
    
    # Day-level exposure totals, and day-23 statistics mapped back per aid.
    aid_day_label['day'] = aid_day_label['aid_day'] % 100
    data['day_count'] = data['day'].map(aid_day_label.groupby('day')['label'].sum())
    data_23 = data[data.day == 23].set_index('aid')
    data['rate_label_23'] = data['aid'].map(data_23['rate_label'])
    data['label_23'] = data['aid'].map(data_23['label'])
    data['aid_day_count_23'] = data['aid'].map(data_23['aid_day_count'])
    # Previous day's request count for the same ad.
    data['aid_day_1'] = data['aid_day'] + 1
    data['aid_day_count_1'] = data['aid_day'].map(data[data.day < 24].set_index('aid_day_1')['aid_day_count'])
    # Naive estimate: day-23 exposure rate times today's request count.
    data['pred_23'] = data['rate_label_23'] * data['aid_day_count']
    # Normalize by day-level totals/means to remove day effects.
    data['n_label'] = data['label'] / data['day_count']
    data['n_rate_label'] = data['rate_label'] / data.groupby('day')['rate_label'].transform('mean')
    
    
    
    # Lag features: for each horizon i (1-14 days back), bring forward the same
    # ad's normalized label, normalized rate, non-exposure count, and the lagged
    # exposure rate rescaled by today's request count.
    shift_feat = []
    data['aid_day'] = data['aid'] * 100 + data['day']
    for i in range(1, 15):
        shift_feat.append('aid_day_label_' + str(i))
        shift_feat.append('day_label_' + str(i))
        shift_feat.append('day_not_' + str(i))
        shift_feat.append('rate_label_' + str(i))
        data['aid_day_' + str(i)] = data['aid_day'] + i
        data_last = data[~data.label.isnull()].set_index('aid_day_' + str(i))
        data['day_label_' + str(i)] = data['aid_day'].map(data_last['n_label'])
        data['rate_label_' + str(i)] = data['aid_day'].map(data_last['n_rate_label'])
        data['day_not_' + str(i)] = data['aid_day'].map(data_last['not_count'])
        data['aid_day_label_' + str(i)] = data['aid_day_count'] * data['aid_day'].map(data_last['rate_label'])
    
    
    
    # Advertiser-level daily request counts, plus lag features 2-10 days back.
    data['adv_id_day'] = data['adv_id'] * 100 + data['day']
    data['adv_id_day_count_sum'] = data['adv_id_day'].map(
        data.drop_duplicates(['aid_day']).groupby('adv_id_day')['aid_day_count'].sum())
    data['adv_id_day_count'] = data['adv_id_day'].map(
        data.drop_duplicates(['aid_day']).groupby('adv_id_day')['aid_day_count'].mean())

    adv_shift_feat = []
    for i in range(2, 11):
        adv_shift_feat.append('adv_id_day_label_' + str(i))
        data['adv_id_day_' + str(i)] = data['adv_id_day'] + i
        data['adv_id_day_label_' + str(i)] = data['adv_id_day_count'] * data['adv_id_day'].map(
            data[~data.label.isnull()].groupby('adv_id_day_' + str(i))['rate_label'].mean())
    
    
    
    data = data.sort_values(['sample_id', 'day']).reset_index(drop=True)
    cate_feature = ['week', 'create_year'] + ad_static_fea
    num_feature = shift_feat + adv_shift_feat + ['day_keep'] + [
        'day_nunique', 'aid_day_count', 'rate_label_23', 'pred_23', 'label_23', 'aid_day_count_1',
        'adv_id_day_count_sum', 'adv_id_day_count', 'aid_day_count_23',
    ] + cnt_feat
    features = cate_feature + num_feature
    lgb_model = lgb.LGBMRegressor(
        num_leaves=150, reg_alpha=0., reg_lambda=0.01, objective='mae', metric='mae',
        max_depth=-1, learning_rate=0.05, min_child_samples=100, n_jobs=-1,
        n_estimators=1000, subsample=0.7, colsample_bytree=0.8, subsample_freq=1, random_state=2019
    )
    
    # Model 1 ("rule + residual"): the rule prediction is the mean of the
    # 2/3/4-day lag exposure estimates; LightGBM fits the residual in log space.
    data['rule_pred'] = data[['aid_day_label_2', 'aid_day_label_3', 'aid_day_label_4']].mean(axis=1).fillna(0)
    data['loss'] = data['label'] - np.log1p(data['rule_pred'])
    lgb_model = lgb_model.fit(data[(data.day < 23)][features], data[(data.day < 23)]['loss'])
    sub_control = data[data.sample_id > 0][['sample_id', 'rule_pred', 'aid', 'bid']].reset_index(drop=True)
    sub_control['pred'] = lgb_model.predict(data[data.sample_id > 0][features])
    # Undo the log transform: pred = expm1(residual + log1p(rule_pred)).
    sub_control['pred'] = np.expm1(sub_control['pred'] + np.log1p(sub_control['rule_pred']))
    
    
    
    def get_predict_w(model, data, label='label', feature=[], random_state=2018, n_splits=5):
        # K-fold trainer: out-of-fold predictions for labeled rows,
        # fold-averaged predictions for unlabeled (test) rows.
        model.random_state = random_state
        predict_label = 'predict_' + label
        kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        data[predict_label] = 0
        test_index = (data[label].isnull()) | (data[label] == -1)
        train_data = data[~test_index].reset_index(drop=True)
        test_data = data[test_index].copy()  # copy to avoid chained-assignment issues

        for train_idx, val_idx in kfold.split(train_data):
            model.random_state = model.random_state + 1
            train_x = train_data.loc[train_idx][feature]
            train_y = train_data.loc[train_idx][label].values
            test_x = train_data.loc[val_idx][feature]
            test_y = train_data.loc[val_idx][label].values
            model.fit(train_x, train_y, eval_set=[(test_x, test_y)], verbose=None)
            train_data.loc[val_idx, predict_label] = model.predict(test_x)
            if len(test_data) != 0:
                test_data[predict_label] = test_data[predict_label] + model.predict(test_data[feature])
        test_data[predict_label] = test_data[predict_label] / n_splits
        return pd.concat([train_data, test_data], ignore_index=True), predict_label
    
    
    
    # Model 2: 5-fold LightGBM regressing the log1p label directly.
    lgb_model.n_estimators = 500
    # Empirical rescaling of the day-24 request counts.
    data.loc[data.day == 24, 'aid_day_count'] = data[data.day == 24]['oaid_day_count'] / 1.9
    data, pred_label = get_predict_w(lgb_model, data, 'label',
                                     [i for i in features if i not in shift_feat[:4]],
                                     random_state=2018, n_splits=5)

    sub_norm = data[(data.day == 24)][['sample_id', 'aid', 'bid', pred_label]].reset_index(drop=True)
    sub_norm['pred'] = sub_norm[pred_label]
    sub_norm['pred'] = np.expm1(sub_norm['pred']) * 1.3  # empirical calibration factor
    # Blend the two models, clip negatives, then add a bid-rank tie-break so
    # predictions stay strictly increasing with bid for the same ad.
    sub_merge = sub_norm.copy()
    sub_merge['rank_num'] = sub_merge.groupby('aid')['bid'].rank()
    sub_merge['pred'] = sub_norm['pred'] * 0.4 + sub_control['pred'] * 0.6
    print(sub_merge['pred'].min())
    sub_merge.loc[sub_merge['pred'] < 0, 'pred'] = 0
    sub_merge['pred'] = sub_merge['pred'].round() + sub_merge['rank_num'] * 0.0001
    sub_merge['pred'] = sub_merge['pred'].round(4)
    sub_merge[['sample_id', 'pred']].to_csv('submission.csv', index=False, header=None)
    

PS, on w2v: this was a very strong trick in the preliminary round, but it no longer applied in the final round, where the leakage of a large number of zero-exposure samples, the adjusted sampling, and the excessive error introduced by extra randomness made it unusable. The w2v code is therefore not included here. (This is also noted in the "shortcomings and reflections" part of the slides.) A hedged illustration of what such features usually look like follows.
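Purely as an illustration (the author's actual w2v code is not published, so everything below is an assumption about the usual recipe on this log format): treat each request's competing-ad list in the track log as a "sentence" of ad ids and train gensim word2vec over those sequences; the per-ad vectors can then be joined in as extra numeric features.

    # Hypothetical sketch only -- not the author's withheld w2v code.
    from gensim.models import Word2Vec
    import pandas as pd

    log = pd.read_csv('./data/track_log/track_log_20190410.out',
                      names=['req_id', 'req_time', 'uid', 'loc_id', 'comp_info'], sep='\t')
    # One "sentence" per request: the ids of the ads competing in its queue.
    sentences = [[ad.split(',')[0] for ad in row.split(';')] for row in log['comp_info']]
    # gensim 4.x API; vector_size / window / min_count are illustrative choices.
    w2v = Word2Vec(sentences, vector_size=16, window=5, min_count=5, sg=1, workers=4, seed=2019)
    aid_vectors = {aid: w2v.wv[aid] for aid in w2v.wv.index_to_key}  # aid -> 16-dim vector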

Code Performance Analysis

Runtime: the first part of the code, the three data-loading functions, reads each file in turn and does nothing more than count labels and queue lengths. Because the data is large, loading it into memory takes about 7 minutes, and the counting takes roughly another 7 minutes: about 14 minutes in total.

The feature-extraction block that follows is quick: since I use relatively few features and extract nothing further from the raw logs, it finishes in about 30 seconds.
Then comes the exciting model training and prediction part, which contains two models: model 1 takes about 30 seconds on a 10-core machine; model 2 is 5-fold and takes about 2 minutes.
Roughly 20 minutes in total (you can run it yourself to verify).

Memory: a slight regret here. Because the label-extraction step does not use streaming reads, the memory peak occurs while a single day's log is being loaded; the maximum peak equals the cost of reading one daily log file into memory (I never load all the logs at once). About 4 GB of RAM is sufficient.
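Since the lack of streaming reads is flagged above as the main memory cost, here is a minimal sketch of the chunked alternative, reusing the counting logic of get_train_data(). The chunk size is an arbitrary choice; with chunked reads the peak is bounded by one chunk rather than one full daily log.

    # Sketch: chunked label extraction with bounded peak memory.
    import pandas as pd

    aid_day_count_stat, aid_day_label_stat = {}, {}
    for day in range(10, 23):
        reader = pd.read_csv('./data/track_log/track_log_201904' + str(day) + '.out',
                             names=['req_id', 'req_time', 'uid', 'loc_id', 'comp_info'],
                             sep='\t', chunksize=200000)
        for chunk in reader:
            for comp_info in chunk['comp_info']:
                for ad in comp_info.split(';'):
                    fields = ad.split(',')
                    aid_day = int(fields[0]) * 100 + day
                    aid_day_count_stat[aid_day] = aid_day_count_stat.get(aid_day, 0) + 1
                    if fields[6] == '1':
                        aid_day_label_stat[aid_day] = aid_day_label_stat.get(aid_day, 0) + 1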

Code walkthrough:
1. get_base_data(): reads and parses the static ad file and the test set.
2. get_train_data(): reads the logs day by day and extracts each ad's daily statistics as labels, producing the training set.
3. get_test_expos(): recovers the exposure data for the day-23 test set.
4. Next comes the contiguous feature-extraction block.
5. Finally the LightGBM model is defined, the label transform applied, and 5-fold training/prediction run (the bid-rank tie-break used in the submission is illustrated below).
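One submission detail from step 5 worth spelling out: after rounding, the code adds rank_num * 0.0001, so that samples of the same ad with higher bids always get strictly larger predictions (presumably aimed at the monotonicity component of the score). A toy illustration:

    import pandas as pd

    # Three test samples of one ad whose rounded predictions tie.
    demo = pd.DataFrame({'sample_id': [1, 2, 3], 'aid': [10, 10, 10],
                         'bid': [50, 80, 120], 'pred': [7.2, 7.3, 7.1]})
    demo['rank_num'] = demo.groupby('aid')['bid'].rank()  # 1.0, 2.0, 3.0
    demo['pred'] = demo['pred'].round() + demo['rank_num'] * 0.0001
    print(demo[['sample_id', 'pred']])  # 7.0001, 7.0002, 7.0003 -- ordered by bid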
