昨天工作:
- 剔除图片数据中不重要的部分,剩余 7000 多万张商品图片;特征提取程序仍在运行;
- 对 dssm 训练集中划分不合理的负样本进行处理:例如 query 是“连衣裙”,而 title 的分词结果中包含“连衣裙”这个词,那么这条数据应该划分到正样本中;这类样本在负样本中占比 10% 左右。使用处理后的数据训练模型,inference 结果:acc:0.75 auc:0.702 precision:0.93;acc 是当前最优结果,但 auc 和 precision 比之前的最优结果略差;
- 准备转正答辩ppt
今天计划:
- 图片特征提取完毕之后,进行 mvdssm 网络的训练
- 关键词法的结果:
inference_query : hs_dssm_dic_query_inf_7 - | query_id | query |
hs_dssm_dic_query_inf_11 : | id | query_emb |
inference_title : hs_dssm_dic_title_inf_10 - | item_id | title |
hs_dssm_dic_title_inf_14 : | id | title_emb |
hs_tmp_157 : | title_id | query_id |
-- Build hs_tmp_336: every column of hs_tmp_157 plus the query text looked up in
-- hs_dssm_dic_query_inf_7 (by query_id) and the title text looked up in
-- hs_dssm_dic_title_inf_10 (title_id matched against item_id).
-- Fix: the projections previously read `c.,` and `a.,`, which is not valid SQL;
-- they are restored to `c.*` and `a.*` (presumably the asterisks were lost when
-- these notes were pasted or rendered -- confirm).
-- The join predicates use `=`, the same equality operator used elsewhere in this script.
create table hs_tmp_336 as
select
    c.*,
    d.title
from (
    select
        a.*,
        b.query
    from hs_tmp_157 a
    join hs_dssm_dic_query_inf_7 b
        on a.query_id = b.query_id
) c
join hs_dssm_dic_title_inf_10 d
    on c.title_id = d.item_id;
-- Build hs_tmp_337: all columns of hs_tmp_336 plus two derived columns.
-- title_seg and query_seg are produced by passing title / query through
-- hs_return_clean and then search_kg:alinlp_segment with the same
-- ("MAINSE", "0", "1") arguments.
create table hs_tmp_337 as
select
    *,
    search_kg:alinlp_segment(hs_return_clean(title), "MAINSE", "0", "1") as title_seg,
    search_kg:alinlp_segment(hs_return_clean(query), "MAINSE", "0", "1") as query_seg
from hs_tmp_336;
-- Remove any earlier hs_tmp_338 before it is created again by the next statement.
-- IF EXISTS keeps the drop from failing when the table is not there (e.g. on a first run).
-- NOTE(review): the bare `yes` line below is not SQL; presumably it is the answer typed at
-- the client's interactive drop-confirmation prompt -- confirm, and remove it if this file
-- is ever executed as a non-interactive script.
drop table if exists hs_tmp_338;
yes
-- Build hs_tmp_338 from hs_tmp_337 by calling hs_return_direct_score_2 on the six
-- input columns; its output is aliased to those same six names plus `score`.
create table hs_tmp_338 as
select
    hs_return_direct_score_2(
        title_id, query_id, query, title, title_seg, query_seg
    ) as (
        title_id, query_id, query, title, title_seg, query_seg, score
    )
from hs_tmp_337;
-- Build hs_tmp_339: the rows of hs_tmp_338 whose score equals 0.
create table hs_tmp_339 as
select *
from hs_tmp_338
where score = 0;
网友评论