美文网首页我爱编程
pandas效率探索

pandas效率探索

作者: _龙雀 | 来源:发表于2017-07-13 15:26 被阅读240次

    在数据挖掘任务中,特征工程占据相当大的工作量。最近在做唯品会购物预测比赛时发现:生成特征时,先用 list 的 append 方法收集各个样本的特征、最后一次性构造 DataFrame,比在循环中用 pandas 的 concat 方法逐条拼接效率高很多。之后又尝试用并行的方式对 concat 方法进行优化,结果没有任何提升,不知道是不是用法不对。下面把代码贴出来做个记录:tt 是抽取出来的一个大小为 (100000, 4) 的 DataFrame,用来比较各种写法的运行时间。

    1.append生成特征集

    # Benchmark 1: collect one feature row per user in a plain Python list and
    # build the DataFrame once, after the loop.
    # NOTE(review): `tt` is defined outside this excerpt; the code below reads
    # its 'uid', 'action_type' and 'date' columns and calls .date() on each
    # 'date' value -- confirm the column types against the code that builds it.
    start = time()
    # The reference dates never change, so create them once instead of per user.
    time_alpha = datetime.date(1900, 3, 20)   # records dated after this count as recent activity
    time_alpha2 = datetime.date(1900, 4, 1)   # recency is measured back from this date
    X = []
    # Group by the scalar key 'uid' rather than the list ['uid']: with a
    # length-1 list, pandas >= 2.0 yields 1-tuples as group keys, which would
    # store `(uid,)` instead of `uid` in the 'each_uid' column.
    for each_uid, group in tt.groupby(by='uid'):
        user_activity = group.shape[0]  # user activity: total number of records
        user_buyablity = group[group['action_type'] == 1].shape[0]  # purchasing power: records with action_type == 1
        # float() forces true division; on Python 2, int / int would floor the
        # conversion rate to 0 or 1.
        user_takerate = float(user_buyablity) / user_activity  # conversion rate
        # Recent activity: number of records whose calendar date is after time_alpha.
        user_near_activity = sum(1 for i in group['date'] if i.date() > time_alpha)
        user_first_time = min(group['date']).date()  # first active date
        user_last_time = max(group['date']).date()  # last active date
        user_timedalta = (user_last_time - user_first_time).days  # days between first and last activity
        user_near_timedelta = (time_alpha2 - user_last_time).days  # days from last activity to time_alpha2
        X1 = [each_uid, user_activity, user_buyablity, user_takerate,
              user_near_activity, user_timedalta, user_near_timedelta]
        X.append(X1)
    names = ['each_uid', 'user_activity', 'user_buyablity', 'user_takerate',
             'user_near_activity', 'user_timedalta', 'user_near_timedelta']
    X = pd.DataFrame(X, columns=names)
    stop = time()
    print(str(stop-start) + "秒")
    

    运行时间:42.0529999733秒

    2.concat生成特征集

    # Benchmark 2: grow the result DataFrame with pd.concat once per user.
    # The per-iteration concat is kept on purpose -- it is the accumulation
    # strategy this benchmark measures; each call copies the rows collected so far.
    # NOTE(review): `tt` is defined outside this excerpt; the code below reads
    # its 'uid', 'action_type' and 'date' columns and calls .date() on each
    # 'date' value -- confirm the column types against the code that builds it.
    start = time()
    # The reference dates never change, so create them once instead of per user.
    time_alpha = datetime.date(1900, 3, 20)   # records dated after this count as recent activity
    time_alpha2 = datetime.date(1900, 4, 1)   # recency is measured back from this date
    X = pd.DataFrame()
    # Group by the scalar key 'uid' rather than the list ['uid']: with a
    # length-1 list, pandas >= 2.0 yields 1-tuples as group keys, which would
    # store `(uid,)` instead of `uid` in the 'uid' column.
    for each_uid, group in tt.groupby(by='uid'):
        df = pd.Series({'uid': each_uid})
        user_activity = group.shape[0]  # user activity: total number of records
        user_buyablity = group[group['action_type'] == 1].shape[0]  # purchasing power: records with action_type == 1
        # float() forces true division; on Python 2, int / int would floor the
        # conversion rate to 0 or 1.
        user_takerate = float(user_buyablity) / user_activity  # conversion rate
        # Recent activity: number of records whose calendar date is after time_alpha.
        user_near_activity = sum(1 for i in group['date'] if i.date() > time_alpha)
        user_first_time = min(group['date']).date()  # first active date
        user_last_time = max(group['date']).date()  # last active date
        user_timedalta = (user_last_time - user_first_time).days  # days between first and last activity
        user_near_timedelta = (time_alpha2 - user_last_time).days  # days from last activity to time_alpha2
        df['user_activity'] = user_activity
        df['user_buyablity'] = user_buyablity
        df['user_takerate'] = user_takerate
        df['user_near_activity'] = user_near_activity
        df['user_timedalta'] = user_timedalta
        df['user_near_timedelta'] = user_near_timedelta
        # Turn the Series into a one-row frame and append it to the running result.
        X = pd.concat([X, df.to_frame().T], axis=0)
    stop = time()
    print(str(stop-start) + "秒")
    

    运行时间:284.087000132秒

    3.并行处理

    def applyParallel(dfGrouped, func, n_jobs=32):
        """Apply ``func`` to every group in parallel and concatenate the results.

        ``Parallel`` and ``delayed`` are imported outside this excerpt
        (presumably from joblib -- confirm against the file's imports).

        Args:
            dfGrouped: iterable of ``(name, group)`` pairs, such as a pandas
                GroupBy object. The group name is ignored.
            func: callable that takes one group and returns a pandas object
                that ``pd.concat`` accepts.
            n_jobs: number of workers passed to ``Parallel``. Defaults to 32,
                the value that used to be hard-coded.

        Returns:
            The per-group results concatenated along axis 0, or an empty
            DataFrame when ``dfGrouped`` yields no groups.
        """
        with Parallel(n_jobs=n_jobs) as parallel:
            retLst = parallel(delayed(func)(group) for _, group in dfGrouped)
        # pd.concat raises ValueError on an empty list, so handle "no groups" explicitly.
        if not retLst:
            return pd.DataFrame()
        return pd.concat(retLst, axis=0)
        
    #user features user_num = 196030
    #style 3:
    def getUserFeatures(group):
        """Build the one-row feature frame for a single user's records.

        Args:
            group: non-empty DataFrame holding the rows of one user. The
                'uid', 'action_type' and 'date' columns are read; every
                'date' value must provide a ``.date()`` method (for example
                pandas Timestamps).

        Returns:
            A one-row DataFrame with the columns 'uid', 'user_activity',
            'user_buyablity', 'user_takerate', 'user_near_activity',
            'user_timedalta' and 'user_near_timedelta', in that order.
        """
        time_alpha = datetime.date(1900, 3, 20)   # records dated after this count as recent activity
        time_alpha2 = datetime.date(1900, 4, 1)   # recency is measured back from this date
        dates = group['date']

        user_activity = group.shape[0]  # user activity: total number of records
        # Purchasing power: number of records with action_type == 1.
        user_buyablity = int((group['action_type'] == 1).sum())
        # float() forces true division; on Python 2, int / int would floor the
        # conversion rate to 0 or 1.
        user_takerate = float(user_buyablity) / user_activity
        # Recent activity: number of records whose calendar date is after time_alpha.
        user_near_activity = sum(1 for ts in dates if ts.date() > time_alpha)
        user_first_time = dates.min().date()  # first active date
        user_last_time = dates.max().date()  # last active date
        user_timedalta = (user_last_time - user_first_time).days  # days between first and last activity
        user_near_timedelta = (time_alpha2 - user_last_time).days  # days from last activity to time_alpha2

        # Read the uid from its own column instead of materialising the whole
        # first row just to pick one value out of it.
        df = pd.Series({'uid': group['uid'].iloc[0]})
        df['user_activity'] = user_activity
        df['user_buyablity'] = user_buyablity
        df['user_takerate'] = user_takerate
        df['user_near_activity'] = user_near_activity
        df['user_timedalta'] = user_timedalta
        df['user_near_timedelta'] = user_near_timedelta
        # Series -> one-column frame -> transposed into a single row.
        return df.to_frame().T
        
    # Benchmark 3: compute the per-user feature rows in parallel, one task per uid group.
    X = applyParallel(dfGrouped=tt.groupby(by=['uid']), func=getUserFeatures)
    

    运行时间:卡住

    相关文章

      网友评论

        本文标题:pandas效率探索

        本文链接:https://www.haomeiwen.com/subject/cbwwhxtx.html