美文网首页
借鉴来的算法

借鉴来的算法

作者: 跑者小越 | 来源:发表于2018-04-19 09:32 被阅读221次

    腾讯2018广告算法大赛思路解析 100行代码带你上0.73

    内存占用不大:约40万维特征以稀疏矩阵存储,占用10多GB内存,大约3个小时可以跑完。

    1. 首先处理约4GB的用户特征文件(userFeature.data),转换为CSV

    2. 拼接用户特征,广告特征

    3. 将单取值的离散特征使用稀疏方式one-hot

    4. 将多取值的离散特征使用稀疏方式向量化

    5. 线下测试

    6. 线上提交

    # coding=utf-8
    # @author:bryan
    import pandas as pd
    import lightgbm as lgb
    from sklearn.model_selection import train_test_split
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.preprocessing import OneHotEncoder,LabelEncoder
    from scipy import sparse
    import os
    
    # Load the ad-side features, then the user-side features.
    # The raw user file holds one user per line as '|'-separated fields, each
    # field being "name value value ...".  It is converted to CSV once and the
    # CSV is reused on later runs.
    ad_feature=pd.read_csv('../data/adFeature.csv')
    if not os.path.exists('../data/userFeature.csv'):
        userFeature_data = []
        with open('../data/userFeature.data', 'r') as f:
            for i, line in enumerate(f):
                userFeature_dict = {}
                for each in line.strip().split('|'):
                    each_list = each.split(' ')
                    # First token is the field name; the remaining tokens are
                    # kept together as one space-separated string.
                    userFeature_dict[each_list[0]] = ' '.join(each_list[1:])
                userFeature_data.append(userFeature_dict)
                if i % 100000 == 0:
                    print(i)  # progress indicator
        # Write the cache outside the `with` so the raw file is closed first.
        pd.DataFrame(userFeature_data).to_csv('../data/userFeature.csv', index=False)
        # The parsed rows are no longer needed; free them before re-reading.
        del userFeature_data
    # Always load through read_csv, even right after building the cache.
    # A frame built directly from the parsed dicts holds every column as text
    # (including the 'uid' merge key), whereas read_csv infers numeric dtypes;
    # loading the same way on every run keeps the first run consistent with
    # later ones.
    user_feature=pd.read_csv('../data/userFeature.csv')
    # Stack the labelled rows and the rows to be scored into one frame so that
    # every encoder below is fitted on the union of both.
    train=pd.read_csv('../data/train.csv')
    predict=pd.read_csv('../data/test1.csv')
    # Recode label -1 as 0; -1 is then reused to mark the rows to predict.
    train.loc[train['label']==-1,'label']=0
    predict['label']=-1
    data=pd.concat([train,predict])
    data=pd.merge(data,ad_feature,on='aid',how='left')
    data=pd.merge(data,user_feature,on='uid',how='left')
    # Missing values become the string '-1'.
    data=data.fillna('-1')
    # Columns that are label-encoded here and one-hot encoded later.
    one_hot_feature=['LBS','age','carrier','consumptionAbility','education','gender','house','os','ct','marriageStatus','advertiserId','campaignId', 'creativeId',
           'adCategoryId', 'productId', 'productType']
    # Columns holding space-separated value lists, vectorized later.
    vector_feature=['appIdAction','appIdInstall','interest1','interest2','interest3','interest4','interest5','kw1','kw2','kw3','topic1','topic2','topic3']
    for feature in one_hot_feature:
        try:
            # Prefer integer codes when every value parses as an int.
            data[feature] = LabelEncoder().fit_transform(data[feature].apply(int))
        except (ValueError, TypeError):
            # Some value is not a single integer (int() rejected it):
            # encode the raw values instead.  Only conversion errors are
            # caught so that interrupts and unrelated failures still surface.
            data[feature] = LabelEncoder().fit_transform(data[feature])
    
    # Separate the labelled rows from the rows to score (label == -1).
    train=data[data.label!=-1]
    train_y=train.pop('label')
    # For an offline check with LGB_test, hold out part of the training rows:
    # train, test, train_y, test_y = train_test_split(train,train_y,test_size=0.2, random_state=2018)
    test=data[data.label==-1]
    # Explicit copy: a 'score' column is written into `res` later, and
    # assigning into a slice of `test` is a chained assignment that pandas
    # warns about.
    res=test[['aid','uid']].copy()
    test=test.drop('label',axis=1)
    enc = OneHotEncoder()
    # The feature matrices start from the single 'creativeSize' column and
    # grow one encoded block at a time.
    train_x=train[['creativeSize']]
    test_x=test[['creativeSize']]

    for feature in one_hot_feature:
        # Fit on train and test together so both share the same categories.
        enc.fit(data[feature].values.reshape(-1, 1))
        train_a=enc.transform(train[feature].values.reshape(-1, 1))
        test_a = enc.transform(test[feature].values.reshape(-1, 1))
        train_x= sparse.hstack((train_x, train_a))
        test_x = sparse.hstack((test_x, test_a))
    print('one-hot prepared !')
    
    # Turn each space-separated multi-value column into a sparse count matrix.
    # Tokens are split on whitespace only.  CountVectorizer's default
    # token_pattern keeps just tokens of two or more word characters, which
    # would silently drop any single-character ID and the '-1' missing marker.
    cv=CountVectorizer(token_pattern=r'\S+')
    for feature in vector_feature:
        # Fit on train and test together so both share the same vocabulary.
        cv.fit(data[feature])
        train_a = cv.transform(train[feature])
        test_a = cv.transform(test[feature])
        train_x = sparse.hstack((train_x, train_a))
        test_x = sparse.hstack((test_x, test_a))
    print('cv prepared !')
    
    def LGB_test(train_x,train_y,test_x,test_y):
        """Fit a LightGBM classifier and report its AUC on held-out data.

        The model is evaluated on both (train_x, train_y) and
        (test_x, test_y) with the 'auc' metric, using
        early_stopping_rounds=100.

        Returns:
            A tuple (clf, auc): the fitted classifier and the AUC stored in
            clf.best_score_['valid_1'] (presumably the second eval_set
            entry, i.e. the held-out data).
        """
        from multiprocessing import cpu_count
        print("LGB test")
        params = dict(
            boosting_type='gbdt',
            num_leaves=31,
            reg_alpha=0.0,
            reg_lambda=1,
            max_depth=-1,
            n_estimators=1000,
            objective='binary',
            subsample=0.7,
            colsample_bytree=0.7,
            subsample_freq=1,
            learning_rate=0.05,
            min_child_weight=50,
            random_state=2018,
            n_jobs=cpu_count()-1,
        )
        clf = lgb.LGBMClassifier(**params)
        eval_sets = [(train_x, train_y), (test_x, test_y)]
        clf.fit(
            train_x,
            train_y,
            eval_set=eval_sets,
            eval_metric='auc',
            early_stopping_rounds=100,
        )
        valid_auc = clf.best_score_['valid_1']['auc']
        return clf, valid_auc
    
    def LGB_predict(train_x,train_y,test_x,res):
        """Fit LightGBM on all training rows and write the submission file.

        Args:
            train_x: training feature matrix.
            train_y: training labels.
            test_x: feature matrix of the rows to score.
            res: DataFrame that receives a 'score' column (modified in place)
                and is written to '../data/submission.csv'.

        Returns:
            The fitted classifier.
        """
        from multiprocessing import cpu_count
        print("LGB predict")
        clf = lgb.LGBMClassifier(
            boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
            max_depth=-1, n_estimators=1500, objective='binary',
            subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
            learning_rate=0.05, min_child_weight=50, random_state=2018,
            # Size the thread pool from the machine, as LGB_test does, rather
            # than a fixed 100 threads; keep at least one worker.
            n_jobs=max(1, cpu_count() - 1)
        )
        # NOTE(review): the only eval set is the training data, so early
        # stopping is driven by training AUC and never sees held-out data.
        clf.fit(train_x, train_y, eval_set=[(train_x, train_y)], eval_metric='auc',early_stopping_rounds=100)
        # Probability of the positive class (column 1).
        res['score'] = clf.predict_proba(test_x)[:,1]
        # Keep six decimal places in the submitted score.
        res['score'] = res['score'].apply(lambda x: float('%.6f' % x))
        res.to_csv('../data/submission.csv', index=False)
        # Relies on an external `zip` executable; its exit status is ignored.
        os.system('zip baseline.zip ../data/submission.csv')
        return clf
    
    # Train on all labelled rows and write/zip the submission file.
    model=LGB_predict(train_x,train_y,test_x,res)
    

    RP调参,可上0.75
    https://github.com/jiaqiangbandongg/RPTune

    https://github.com/scikit-learn/scikit-learn

    相关文章

      网友评论

          本文标题:借鉴来的算法

          本文链接:https://www.haomeiwen.com/subject/urbikftx.html