美文网首页
“梧桐杯”中国移动大数据应用创新大赛 - 智慧城市赛道[Base

“梧桐杯”中国移动大数据应用创新大赛 - 智慧城市赛道[Base

作者: Sl0wDive | 来源:发表于2021-03-02 19:08 被阅读0次
import os
import gc
import time
import math
import psutil
import itertools
import numpy as np
import pandas as pd
from tqdm import tqdm 
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score,roc_auc_score
from sklearn.model_selection import StratifiedKFold

def summary(f):
    """Decorate ``f`` so that every call prints a short resource report.

    After the wrapped call returns, three lines are printed: the name of the
    function, the elapsed wall-clock time in seconds, and the memory figure
    of the current process in GB together with its signed change across the
    call. Nothing is printed if ``f`` raises.

    Args:
        f: The callable to instrument.

    Returns:
        A wrapper that forwards all positional and keyword arguments to ``f``
        and returns whatever ``f`` returns.
    """
    # Function-scope import keeps this block self-contained.
    from functools import wraps

    @wraps(f)  # preserve f's __name__ / __doc__ on the returned wrapper
    def wrap(*args, **kwargs):
        started = time.time()
        proc = psutil.Process(os.getpid())
        # First field of memory_info(), converted from bytes to GB
        # (presumably RSS -- confirm against the psutil documentation).
        mem_before = proc.memory_info()[0] / 2. ** 30
        ret = f(*args, **kwargs)
        mem_after = proc.memory_info()[0] / 2. ** 30
        delta = mem_after - mem_before
        finished = time.time()

        print('')
        print('process :'.ljust(20), f'{f.__name__}')
        print('run time :'.ljust(20), f'{np.round(finished - started, 2)} s')
        # The '+' format flag renders an explicit sign for both directions.
        print('memory usage :'.ljust(20), f'{mem_after:.1f}GB({delta:+.1f}GB)')
        return ret

    return wrap

@summary
def load_data():
    """Load the train and test feature tables and attach the training labels.

    Reads ``train_set.csv``, ``train_label.csv`` and ``result_predict_A.csv``
    from the data directory. That directory is the current working directory
    with every occurrence of ``'code'`` replaced by ``'A榜给选手数据'``.

    Returns:
        pandas.DataFrame: the train rows followed by the test rows (columns
        ``X38`` and ``X27`` dropped, index reset), with the columns of
        ``train_label.csv`` left-joined on ``user_id``. Rows that have no
        match in the label file carry NaN in those columns.
    """
    # Build paths with os.path.join rather than a hard-coded '\\' so they
    # also resolve on non-Windows systems.
    data_dir = os.getcwd().replace('code', 'A榜给选手数据')
    dropped_cols = ['X38', 'X27']

    train = pd.read_csv(os.path.join(data_dir, 'train_set.csv')).drop(dropped_cols, axis=1)
    label = pd.read_csv(os.path.join(data_dir, 'train_label.csv'))
    test = pd.read_csv(os.path.join(data_dir, 'result_predict_A.csv')).drop(dropped_cols, axis=1)

    df = pd.concat([train, test], axis=0).reset_index(drop=True)
    df = df.merge(label, on='user_id', how='left')

    return df

@summary
def fill_na(df):
    """Impute missing values: median for numeric columns, mode for categorical.

    Args:
        df: DataFrame containing the ``X*`` columns listed in
            ``num_na_cols`` and ``cat_na_cols`` below. It is not modified.

    Returns:
        pandas.DataFrame: a new frame with a fresh 0..n-1 index in which the
        listed columns have their NaNs filled. The imputed columns are placed
        after all other columns (numeric ones first, then categorical ones).
        A column that contains no non-null value is left unchanged.
    """
    cat_na_cols = ['X3','X5','X28','X29','X30','X31']
    num_na_cols = ['X6','X7','X8','X9','X10','X11','X12','X13','X14','X15','X16','X17','X18','X19','X20','X21','X22','X23','X32','X33','X34','X35','X36']

    out = df.copy()

    for col in num_na_cols:
        out[col] = out[col].fillna(out[col].median())

    for col in cat_na_cols:
        modes = out[col].mode()
        # mode() returns a Series indexed from 0. Passing that Series to
        # fillna aligns on index labels and fills only the matching rows, so
        # the scalar most-frequent value has to be extracted first.
        if not modes.empty:
            out[col] = out[col].fillna(modes.iloc[0])

    imputed = num_na_cols + cat_na_cols
    imputed_set = set(imputed)
    untouched = [c for c in out.columns if c not in imputed_set]

    return out[untouched + imputed].reset_index(drop=True)

@summary
def feature_engineering(df):
    """Add combination, row-statistic and flag-count features to ``df``.

    The frame is modified in place and also returned. Steps, in order:

    1. Four string columns built by concatenating ``str()`` of related
       columns.
    2. For each column triple: row-wise std / max / min, the difference of
       the first column to each of the other two, and temporary 0/1 flags
       telling whether the first column is greater than each of the others.
    3. For ``X18``..``X23``: a temporary flag that is 1 when the value equals
       0 and 0 when it is positive.
    4. ``total_mark``: the sum of all temporary flag columns, which are then
       dropped.
    5. Every column whose dtype is ``object`` is label-encoded.

    Args:
        df: DataFrame holding the ``X*`` columns referenced below.

    Returns:
        The same DataFrame object with the new columns added.
    """
    # --- 1. string concatenations of related columns -----------------------
    combine_spec = {
        'basic_combine': ['X1', 'X2', 'X3', 'X4', 'X5'],
        'kuandai_combine': ['X24', 'X25', 'X26'],
        'qianyue_combine': ['X28', 'X29', 'X30', 'X31'],
        'else_combine': ['X37', 'X39', 'X40', 'X41', 'X42', 'X43'],
    }
    for new_col, parts in combine_spec.items():
        joined = df[parts[0]].map(str)
        for part in parts[1:]:
            joined = joined + df[part].map(str)
        df[new_col] = joined

    # --- 2. statistics, differences and comparison flags per triple --------
    triples = [
        ['X6', 'X7', 'X8'],
        ['X9', 'X10', 'X11'],
        ['X12', 'X13', 'X14'],
        ['X18', 'X19', 'X20'],
        ['X21', 'X22', 'X23'],
    ]
    for first, second, third in triples:
        block = df[[first, second, third]]
        stem = f'{first}_{second}_{third}'
        df[f'{stem}_std'] = block.std(axis=1)
        df[f'{stem}_max'] = block.max(axis=1)
        df[f'{stem}_min'] = block.min(axis=1)

        for other in (second, third):
            df[f'{first}_{other}_sub'] = df[first] - df[other]

        for other in (second, third):
            lhs, rhs = df[first], df[other]
            # 1 when first > other, 0 when first <= other, NaN when neither
            # comparison holds (i.e. a NaN operand).
            df[f'{first}_{other}_mark'] = np.select(
                [(lhs > rhs).to_numpy(), (lhs <= rhs).to_numpy()],
                [1.0, 0.0],
                default=np.nan,
            )

    # --- 3. zero-value flags -------------------------------------------------
    for name in ['X18', 'X19', 'X20', 'X21', 'X22', 'X23']:
        values = df[name]
        # 1 for exactly zero, 0 for positive, NaN otherwise (negative / NaN).
        df[f'{name}_mark'] = np.select(
            [(values == 0).to_numpy(), (values > 0).to_numpy()],
            [1.0, 0.0],
            default=np.nan,
        )

    # --- 4. collapse the temporary flags into one count ---------------------
    mark_cols = [name for name in df.columns if 'mark' in name]
    # skipna=False: a NaN flag makes the row total NaN.
    df['total_mark'] = df[mark_cols].sum(axis=1, skipna=False)
    df.drop(mark_cols, axis=1, inplace=True)
    gc.collect()

    # --- 5. label-encode every object-dtype column --------------------------
    # NOTE(review): this covers *all* object columns; if an identifier such
    # as user_id is stored as strings it gets encoded too -- confirm that is
    # intended before the ids are written to a submission file.
    object_cols = [name for name in df.columns if df[name].dtype == 'object']
    encoder = LabelEncoder()
    for name in object_cols:
        df[name] = encoder.fit_transform(df[name].astype(str))

    return df

@summary
def model_f1(ta, te, threshold=0.25, n_splits=5):
    """Train LightGBM with stratified K-fold CV and predict the test rows.

    For every fold a classifier is fitted on the training part, the F1 score
    of the thresholded validation predictions is recorded, and the predicted
    positive-class probability for ``te`` is added to a running average. The
    mean validation F1 is printed at the end.

    Args:
        ta: Labelled DataFrame; must contain a ``label`` column. All columns
            except ``user_id``, ``product_no`` and ``label`` are features.
        te: DataFrame to predict; must contain the same feature columns.
        threshold: Probability above which a validation row counts as
            positive when computing F1.
        n_splits: Number of cross-validation folds.

    Returns:
        tuple: ``(res, imp)`` where ``res`` is a NumPy array of length
        ``len(te)`` holding the fold-averaged positive-class probability and
        ``imp`` is a pandas Series of the feature importances of the *last*
        fold's model, sorted in descending order.
    """
    fea = [c for c in ta.columns if c not in ('user_id', 'product_no', 'label')]
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2)
    labels = np.array(ta['label'])

    # Must be an ndarray sized from `te`: `+=` on a Python list would extend
    # the list with the fold predictions instead of adding element-wise.
    res = np.zeros(len(te))
    f1 = []
    imp = None

    for ta_idx, val_idx in kf.split(ta, ta['label']):
        X_ta, X_val = ta[fea].iloc[ta_idx], ta[fea].iloc[val_idx]
        y_ta, y_val = labels[ta_idx], labels[val_idx]

        model = lgb.LGBMClassifier(num_leaves=64,max_depth=13,n_estimators=10000,learning_rate=0.07,verbose=-1,metric='auc')
        # NOTE(review): LightGBM 4.0 dropped the `early_stopping_rounds` and
        # `verbose` arguments of fit() in favour of callbacks; switch to
        # lgb.early_stopping / lgb.log_evaluation when upgrading -- confirm
        # against the installed version.
        model.fit(X_ta, y_ta, eval_set = [(X_val,y_val)], early_stopping_rounds=200, verbose=500)

        val_proba = model.predict_proba(X_val)[:, 1]
        # Divide by the actual fold count so the average stays correct when
        # n_splits is changed.
        res += model.predict_proba(te[fea])[:, 1] / n_splits

        val_pred = np.where(val_proba > threshold, 1.0, 0.0)
        f1.append(f1_score(y_val, val_pred))
        imp = pd.Series(model.feature_importances_, fea).sort_values(ascending=False)

    print('\nmean_f1:',np.around(np.mean(f1),3))

    return res, imp


if __name__ == '__main__':
    # Pipeline: load -> impute -> feature engineering -> CV model -> CSV.
    df = load_data()

    df = fill_na(df)

    df = feature_engineering(df)

    # Rows with a label are used for training; rows without one are predicted.
    # .copy() gives independent frames, so assigning the predictions to
    # te['label'] below does not write into a slice of df.
    has_label = df['label'].notna()
    ta, te = df[has_label].copy(), df[~has_label].copy()

    te['label'], imp = model_f1(ta, te)

    res = te.copy()
    # A single comparison binarises every row; a pair of strict `>` / `<`
    # tests would leave a probability of exactly 0.24 untouched. The float
    # dtype keeps the written values as 1.0 / 0.0.
    res['label'] = (res['label'] > 0.24).astype(float)

    # The output directory mirrors how the data directory is derived from the
    # working directory; it is defined here because no module-level
    # `result_path` exists.
    result_dir = os.getcwd().replace('code', 'result')
    os.makedirs(result_dir, exist_ok=True)
    res[['user_id','label']].to_csv(os.path.join(result_dir, 'sub_0.24.csv'), index=False)

相关文章

网友评论

      本文标题:“梧桐杯”中国移动大数据应用创新大赛 - 智慧城市赛道[Base

      本文链接:https://www.haomeiwen.com/subject/ndpxqltx.html