1. 读取数据
# Load the user-profile table. The file carries no header row, so the column
# names are supplied explicitly; empty strings are parsed as missing values.
user_feature_columns = ['user_id', 'user_age_level', 'user_gender', 'user_city_level']
user_feature = pd.read_csv(
    self.feature_path + '/underexpose_user_feat.csv',
    names=user_feature_columns,
    header=None,
    na_values=[''],
)
# Load the item-feature table: an item id followed by 256 embedding columns
# named emb_1_0..emb_1_127 and emb_2_0..emb_2_127. The file has no header row.
# NOTE(review): presumably each row looks like `id,[v,...,v],[v,...,v]`; if so,
# this separator would leave a stray '[' / ']' on some fields -- confirm
# against the actual file.
item_feature = pd.read_csv(
    path,
    # Raw string: `\[` and `\]` are regex escapes, not Python string escapes
    # (the non-raw form triggers an invalid-escape warning).
    sep=r',\[|\],|,',
    # Regular-expression separators are only handled by the Python parser.
    engine='python',
    header=None,
    names=(
        ['item_id']
        + ['emb_1_' + str(i) for i in range(128)]
        + ['emb_2_' + str(i) for i in range(128)]
    ),
)
2. 存储数据
# Write the training samples to "<sample_save_path>_train" as CSV, with a
# header row and without the index column.
train_path = self.sample_save_path + "_train"
train_data.to_csv(train_path, header=True, index=False)
3. group by
# Collect, for every user_id, the set of item_id values found in pos_data.
items_by_user = pos_data.groupby('user_id')['item_id']
user_item_ = items_by_user.agg(set).reset_index()
# Map each user_id to its set of item ids.
user_ids, item_sets = user_item_['user_id'], user_item_['item_id']
user_item_dict = dict(zip(user_ids, item_sets))
4. merge
# Left join: keep every row of d_a and attach the item-feature columns that
# share its item_id (rows without a match get NaN in those columns).
pre_sample = pd.merge(
    left=d_a,
    right=self.item_feature,
    how='left',
    on='item_id',
)
5. fill na
def _fill_NA(df):
rand_value = np.random.uniform(-1, 1, size=(128))
values= {"user_id": -1, 'item_id': -1,
'user_city_level': '-1',
'user_age_level': '-1', 'user_gender': 'O'}
emb_dict = {'emb_1_'+ str(i):0 for i in range(128) } # TODO:拥均值 or rankd ??
emb_dict2= {'emb_2_'+ str(i):0 for i in range(128) }
emb_all = emb_dict.copy()
emb_all.update(emb_dict2)
values_merge = values.copy()
values_merge.update(emb_all)
return df.fillna(value=values_merge)
网友评论