昨天工作:
- 将经过叶子类目过滤后的结果使用alinlp预训练的dssm模型重新计算相关性,使用0.5作为阈值的话可以得到比较满意的结果,最后结果剩余300+w条数据,其中关联到的视频9.7w左右。结果在:graph_embedding.hs_tmp_176
- 问题:发现最终结果中服饰相关的query中经常会有带有季节信息的query,但是关联到的结果会忽视这一信息。比如"query:高跟鞋女夏; title:2018新款秋白色高跟鞋"这种情况,还在想办法改善。
今天计划:
改善dssm模型inference结果中季节信息丢失的问题
alinlp的dssm充当过滤器:graph_embedding.hs_tmp_176
测试将季节的比重调高一点会不会有效果
query和title叶子类目是否靠谱
- 使用阔姐给的叶子类目表重新过滤
graph_embedding.jl_top_query_related_cate
-- Re-score every (query, title) pair with the AliNLP pre-trained DSSM
-- text-similarity UDF; both sides are segmented with the CONTENT_SEARCH
-- tokenizer first ('yes' answers the interactive drop confirmation).
drop table hs_tmp_175;
yes
create table hs_tmp_175 as
select
    query,
    title,
    video_id,
    item_id,
    search_kg:alinlp_dssm_text_similarity(
        search_kg:alinlp_segment(query, 'CONTENT_SEARCH', ' '),
        search_kg:alinlp_segment(title, 'CONTENT_SEARCH', ' '),
        ' ') as score
from hs_query_ugc_co_video_final_result_info_1;
-- Keep only pairs that (a) pass the DSSM score threshold 0.5 and (b) whose
-- (query, leaf-category) combination exists in the curated category table.
-- FIX: the original used a LEFT JOIN with no null check, which keeps every
-- left-side row whether or not the category matches — i.e. it does not filter
-- at all. The stated goal ("使用阔姐给的叶子类目表重新过滤") requires an INNER JOIN.
-- NOTE(review): source here is hs_tmp_170, while the freshly re-scored table
-- above is hs_tmp_175 — confirm which scored table is intended.
drop table hs_query_ugc_co_video_final_result_info_3;
yes
create table hs_query_ugc_co_video_final_result_info_3 as
select distinct a.query, a.title, a.video_id, a.item_id
from (
    select * from hs_tmp_170 where score > 0.5
) a
join (
    select * from jl_top_query_related_cate
) b
on a.query = b.se_keyword
and a.cate_id = b.cate_id;
- query和title的分数分布怎么样?
hs_dssm_dic_query_inf_without_space_0 : | query_id | query |
-- Explode term weights for queries, one row per term, via a 3-step pipeline:
--   1. alinlp_termweight_ecom tags each term as "word|{weight}|{type}", "%"-joined;
--   2. bi_split_value splits the "%"-joined string into (id, pair) rows;
--   3. hs_split_1 splits each "word|weight|type" pair into (id, word, weight).
-- NOTE(review): "limit 20" makes this an exploratory sample, not the full build —
-- the full (un-limited) variants appear further down in this log.
create table graph_embedding.hs_dssm_dic_query_2 as
select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from
(select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from
(select id, search_kg:alinlp_termweight_ecom(se_keyword, "%", "{weight}|${type}", 1, 1) as tag_result from graph_embedding.hs_dssm_dic_query_1 where lengthb(se_keyword) > 0 limit 20)a)b where lengthb(b.pair) > 0;
-- Rebuild hs_tmp_149: title dictionary with stop words stripped by the
-- AliNLP stop-word UDF (empty string = default stop-word list argument).
drop table hs_tmp_149;
yes
create table hs_tmp_149 as
select
    id,
    words_mainse_ids,
    search_kg:alinlp_remove_stop_words(title, "") as title
from hs_dssm_dic_title_3;
-- Raise the per-UDF JVM memory limit (MB); presumably the term-weight UDF
-- exceeds the default on full-size title data — confirm if still needed.
set odps.sql.udf.jvm.memory=4096;
-- Explode term weights for titles into (id, word, weight) rows
-- (same 3-step pipeline as the query-side build: termweight -> split by "%"
--  -> split each "word|weight" pair by "|").
-- NOTE(review): this exact statement is repeated later in this log.
create table graph_embedding.hs_dssm_dic_title_4 as
select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from (select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from (select id, search_kg:alinlp_termweight_ecom(title, "%", "{weight}", 1, 0) as tag_result from graph_embedding.hs_tmp_149)a)b where lengthb(b.pair) > 0;
- 分词结果是否靠谱
-- Raw term-weight dump for queries ("word|{weight}|{type}", "%"-joined) so the
-- segmentation/weighting quality can be eyeballed.
create table hs_tmp_179 as
select
    id,
    search_kg:alinlp_termweight_ecom(se_keyword, "%", "{weight}|${type}", 1, 1) as tag_result
from graph_embedding.hs_dssm_dic_query_1
where lengthb(se_keyword) > 0;
-- Raw term-weight dump for titles, same format as the query dump above.
create table hs_tmp_180 as
select
    id,
    search_kg:alinlp_termweight_ecom(title, "%", "{weight}|${type}", 1, 0) as tag_result
from graph_embedding.hs_tmp_149;
- 有问题的结果(query)
| 37 | 40|3|268435456%岁|3|536870912%女短裤|93|65537 |
| 38 | 40|2|268435456%岁|2|536870912%妈妈|20|16%夏装|75|1073807361 |
| 39 | 45|2|268435456%岁|2|536870912%女|20|16%连衣裙|75|65537 |
| 40 | 50|2|268435456%岁|2|536870912%妈妈|20|16%夏装|75|1073807361 |
| 41 | 50|2|268435456%岁|2|536870912%妈妈|17|16%夏装|64|1073807361% |0|1024%洋气|13|128 |
| 46 | 7分裤|61|65537%女|19|16% |0|1024%薄款|19|16 |
aj|3|0%女鞋|96|65537 |
| 100 | diy|11|128%小屋|88|65537 |
| 92 | cosply|23|8%服装|58|1073807361%女|17|16 |
%网|70|131073%红包|70|
- 有问题的结果(title)
%ins|20|128%超|40|16%火包|66|4 |
分词会分错,按照种类来给的权重,有关键字的直接给70(如果比70小)?
-
使用电商命名实体进行权重确定(不好使)
-
白名单
['春', '夏', '秋', '冬', '男', '女', '宝宝', '童', '学生', '爸', '妈', '黑', '白', '灰', '厚', '薄', '暖', '凉', '式', '风', '码', '长', '短', '大', '小', '红', '橙', '黄', '绿', '青', '蓝', '紫', '岁', '年', 'ins', '冰', '欧', '韩', '宽松', '紧身', '胖', '瘦', '单', '双', '2019', '2018']
处理方式:
query:白名单中的词增加到 max(本句子中权重最大词的一半,origin)
title:白名单中的词增加到 max(75,origin)
- 数据预处理
train_query : hs_dssm_dic_query_1 - | id | words_mainse_ids | se_keyword |
train_title : hs_dssm_dic_title_3 - | id | words_mainse_ids | title |
inference_query : hs_dssm_dic_query_inf_1 - | id | words_mainse_ids | query |
inference_title : hs_dssm_dic_title_inf_1 - | id | words_mainse_ids | title |
train query 数据重新获取
-- Rebuild the training-query term table: termweight -> split "%"-joined pairs
-- into rows -> split each "word|weight" pair into (id, word, weight).
drop table hs_dssm_dic_query_4;
yes
create table graph_embedding.hs_dssm_dic_query_4 as
select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight)
from (
    select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair)
    from (
        select
            id,
            search_kg:alinlp_termweight_ecom(se_keyword, "%", "{weight}", 1, 0) as tag_result
        from graph_embedding.hs_dssm_dic_query_1
        where lengthb(se_keyword) > 0
    ) a
) b
where lengthb(b.pair) > 0;
-- Re-register the Python UDF resource and (re)declare two functions from it:
--   hs_return_clean   - text cleaner (previewed on titles below)
--   hs_change_score_1 - whitelist-based weight boost (the season/gender/color
--                       whitelist handling described earlier in these notes)
drop resource hs_udf_87_1.py;
yes
add py /home/hengsong/query_co_video/src/hs_udf_87_1.py;
CREATE FUNCTION hs_return_clean AS hs_udf_87_1.return_clean USING hs_udf_87_1.py;
CREATE FUNCTION hs_change_score_1 AS hs_udf_87_1.Processor USING hs_udf_87_1.py;
-- Attach the whitelist-adjusted weight and a 100-dim word embedding to every
-- query term.
drop table hs_dssm_dic_query_5;
yes
create table hs_dssm_dic_query_5 as
select
    id,
    word,
    weight,
    hs_change_score_1(word, weight) as new_weight,
    search_kg:alinlp_word_embedding(word, "100", "CONTENT_SEARCH") as word_emb
from hs_dssm_dic_query_4;
-- Spot-check the cleaner UDF on 100 titles before using it for real.
select title, hs_return_clean(title) as clean_title from graph_embedding.hs_tmp_149 limit 100;
-- Collapse per-term rows into one embedding per query id via the
-- weight-aware averaging UDAF.
drop table hs_tmp_181;
yes
create table graph_embedding.hs_tmp_181 as
select
    id,
    graph_embedding:hs_merge_emb_15(new_weight, word_emb) as emb
from graph_embedding.hs_dssm_dic_query_5
group by id;
train title 数据的重新获取
-- Explode title term weights into (id, word, weight) rows.
-- NOTE(review): duplicate of the hs_dssm_dic_title_4 build earlier in this log.
create table graph_embedding.hs_dssm_dic_title_4 as
select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from (select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from (select id, search_kg:alinlp_termweight_ecom(title, "%", "{weight}", 1, 0) as tag_result from graph_embedding.hs_tmp_149)a)b where lengthb(b.pair) > 0;
-- NOTE(review): reads hs_dssm_dic_title_5, which is never created in this log —
-- presumably an intermediate that adds word_emb to title_4 (mirroring the query
-- side's hs_dssm_dic_query_5); confirm it exists before running.
create table hs_dssm_dic_title_6 as select id, word, weight, hs_change_score_1(word, weight) as new_weight, word_emb from hs_dssm_dic_title_5;
-- Re-register the averaging UDAF resource, then collapse per-term title rows
-- into one (weighted-average) embedding per title id.
drop resource hs_udaf_82_1.py;
yes
add py /home/hengsong/query_co_video/src/hs_udaf_82_1.py;
CREATE FUNCTION hs_merge_emb_15 AS hs_udaf_82_1.Average USING hs_udaf_82_1.py;
create table graph_embedding.hs_tmp_182 as select id, graph_embedding:hs_merge_emb_15(new_weight, word_emb) as emb from graph_embedding.hs_dssm_dic_title_6 group by id;
inference query 数据重新获取
-- Inference-side query terms: same termweight/split pipeline as the
-- training-side build, but sourced from the inference query dictionary.
drop table hs_dssm_dic_query_inf_4;
yes
create table graph_embedding.hs_dssm_dic_query_inf_4 as
select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight)
from (
    select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair)
    from (
        select
            id,
            search_kg:alinlp_termweight_ecom(query, "%", "{weight}", 1, 0) as tag_result
        from graph_embedding.hs_dssm_dic_query_inf_1
        where lengthb(query) > 0
    ) a
) b
where lengthb(b.pair) > 0;
-- Attach the whitelist-adjusted weight and a 100-dim embedding to every
-- inference-query term.
drop table hs_dssm_dic_query_inf_5;
yes
create table hs_dssm_dic_query_inf_5 as
select
    id,
    word,
    weight,
    hs_change_score_1(word, weight) as new_weight,
    search_kg:alinlp_word_embedding(word, "100", "CONTENT_SEARCH") as word_emb
from hs_dssm_dic_query_inf_4;
-- One weighted-average embedding per inference-query id.
drop table hs_tmp_184;
yes
create table graph_embedding.hs_tmp_184 as
select
    id,
    graph_embedding:hs_merge_emb_15(new_weight, word_emb) as emb
from graph_embedding.hs_dssm_dic_query_inf_5
group by id;
inference title 数据重新获取
-- NOTE(review): this section is headed "inference title", but the statement
-- below recreates the *training*-side hs_dssm_dic_title_4 from hs_tmp_149 —
-- likely a copy/paste slip; an _inf source/target would be expected here. Confirm.
create table graph_embedding.hs_dssm_dic_title_4 as
select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from (select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from (select id, search_kg:alinlp_termweight_ecom(title, "%", "{weight}", 1, 0) as tag_result from graph_embedding.hs_tmp_149)a)b where lengthb(b.pair) > 0;
-- NOTE(review): reads hs_dssm_dic_title_inf_3, never created in this log —
-- presumably built elsewhere with word_emb already attached; verify.
create table hs_dssm_dic_title_inf_6 as select id, word, weight, hs_change_score_1(word, weight) as new_weight, word_emb from hs_dssm_dic_title_inf_3;
-- One weighted-average embedding per inference-title id.
create table graph_embedding.hs_tmp_185 as select id, graph_embedding:hs_merge_emb_15(new_weight, word_emb) as emb from graph_embedding.hs_dssm_dic_title_inf_6 group by id;
- 得到新的训练集
-- Build the new training set: label pairs joined with the re-weighted query
-- and title embeddings.
-- FIX: the LEFT JOINs emit NULL embeddings for any id missing from
-- hs_tmp_181/hs_tmp_182, and the author's own note says those rows must be
-- removed ("记得除掉NULL!!!") — filter them here instead of downstream.
drop table hs_tmp_183;
yes
create table hs_tmp_183 as
select c.se_keyword_mainse_ws, d.emb as title_mainse_ws, c.label
from (
    select a.*, b.emb as se_keyword_mainse_ws
    from (select * from hs_dssm_train_v2_0) a
    left join (select * from hs_tmp_181) b
    on a.query_id = b.id
) c
left join (select * from hs_tmp_182) d
on c.item_id = d.id
where c.se_keyword_mainse_ws is not null
  and d.emb is not null;
记得除掉NULL!!!
-- Split the new training set 80/20 into train/test tables with the
-- PAI "split" component (fraction applies to output1 = train).
drop table hs_train_data_dssm_v2_4;
yes
drop table hs_test_data_dssm_v2_4;
yes
PAI -name split -project algo_public
-DinputTableName=graph_embedding.hs_tmp_183
-Doutput1TableName=graph_embedding.hs_train_data_dssm_v2_4
-Doutput2TableName=graph_embedding.hs_test_data_dssm_v2_4
-Dfraction=0.8
-DmemSizePerCore=4096
-DcoreNum=100
;
- 构造测试集
-- Build the inference (test) set: attach the inference-side query and title
-- embeddings to each candidate pair.
drop table hs_tmp_187;
yes
create table graph_embedding.hs_tmp_187 as
select c.query_id, c.title_id, c.query, d.emb as title
from (
    select a.*, b.emb as query
    from (select * from graph_embedding.hs_tmp_157) a
    left join (select * from graph_embedding.hs_tmp_184) b
    on a.query_id = b.id
) c
left join (select * from graph_embedding.hs_tmp_185) d
on c.title_id = d.id;
- train & inference
| query_id | video_id | query_ws | video_ws |
-- Rename hs_tmp_187 columns to the schema the inference script expects:
-- | query_id | video_id | query_ws | video_ws |
alter table hs_tmp_187 change column query query_ws string;
alter table hs_tmp_187 change column title_id video_id string;
alter table hs_tmp_187 change column title video_ws string;
-- Launch the DSSM train + inference TF job on PAI: 50 workers / 10 PS,
-- inputs = train/test splits + hs_tmp_187 inference set, output =
-- hs_dssm_result_3, checkpoints to OSS; 1 epoch, lr 3e-4, batch 1024.
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="train_inference_v7.py" -Dcluster='{"worker":{"count":50, "cpu":200, "memory":4000}, "ps":{"count":10, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_train_data_dssm_v2_4,odps://graph_embedding/tables/hs_test_data_dssm_v2_4,odps://graph_embedding/tables/hs_tmp_187" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_3" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=True --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_4e_1.ckpt" -DuseSparseClusterSchema=True;
网友评论