昨天工作:
- 将经过叶子类目过滤后的结果使用alinlp预训练的dssm模型重新计算相关性,使用0.5作为阈值的话可以得到比较满意的结果,最后结果剩余300+w条数据,其中关联到的视频9.7w左右。结果在:graph_embedding.hs_tmp_176
- 问题:发现最终结果中服饰相关的query中经常会有带有季节信息的query,但是关联到的结果会忽视这一信息。比如"query:高跟鞋女夏; title:2018新款秋白色高跟鞋"这种情况,还在想办法改善。
今天计划:
改善dssm模型inference结果中季节信息丢失的问题
alinlp的dssm充当过滤器:graph_embedding.hs_tmp_176
测试将季节的比重调高一点会不会有效果
query和title叶子类目是否靠谱
- 使用阔姐给的叶子类目表重新过滤
graph_embedding.jl_top_query_related_cate
-- Re-score every (query, title) pair with the AliNLP pre-trained DSSM
-- text-similarity UDF; both sides are segmented with the CONTENT_SEARCH
-- tokenizer first ('yes' answers the interactive drop confirmation).
drop table hs_tmp_175;
yes
create table hs_tmp_175 as
select
    query,
    title,
    video_id,
    item_id,
    search_kg:alinlp_dssm_text_similarity(
        search_kg:alinlp_segment(query, 'CONTENT_SEARCH', ' '),
        search_kg:alinlp_segment(title, 'CONTENT_SEARCH', ' '),
        ' ') as score
from hs_query_ugc_co_video_final_result_info_1;
-- Keep only pairs that (a) pass the DSSM score threshold 0.5 and (b) whose
-- (query, leaf-category) combination exists in the curated category table.
-- FIX: the original used a LEFT JOIN with no null check, which keeps every
-- left-side row whether or not the category matches — i.e. it does not filter
-- at all. The stated goal ("使用阔姐给的叶子类目表重新过滤") requires an INNER JOIN.
-- NOTE(review): source here is hs_tmp_170, while the freshly re-scored table
-- above is hs_tmp_175 — confirm which scored table is intended.
drop table hs_query_ugc_co_video_final_result_info_3;
yes
create table hs_query_ugc_co_video_final_result_info_3 as
select distinct a.query, a.title, a.video_id, a.item_id
from (
    select * from hs_tmp_170 where score > 0.5
) a
join (
    select * from jl_top_query_related_cate
) b
on a.query = b.se_keyword
and a.cate_id = b.cate_id;
- query和title的分数分布怎么样?
hs_dssm_dic_query_inf_without_space_0 : | query_id | query |
-- Explode term weights for queries, one row per term, via a 3-step pipeline:
--   1. alinlp_termweight_ecom tags each term as "word|{weight}|{type}", "%"-joined;
--   2. bi_split_value splits the "%"-joined string into (id, pair) rows;
--   3. hs_split_1 splits each "word|weight|type" pair into (id, word, weight).
-- NOTE(review): "limit 20" makes this an exploratory sample, not the full build —
-- the full (un-limited) variants appear further down in this log.
create table graph_embedding.hs_dssm_dic_query_2 as
select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from
(select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from
(select id, search_kg:alinlp_termweight_ecom(se_keyword, "%", "{weight}|${type}", 1, 1) as tag_result from graph_embedding.hs_dssm_dic_query_1 where lengthb(se_keyword) > 0 limit 20)a)b where lengthb(b.pair) > 0;
-- Rebuild hs_tmp_149: title dictionary with stop words stripped by the
-- AliNLP stop-word UDF (empty string = default stop-word list argument).
drop table hs_tmp_149;
yes
create table hs_tmp_149 as
select
    id,
    words_mainse_ids,
    search_kg:alinlp_remove_stop_words(title, "") as title
from hs_dssm_dic_title_3;
-- Raise the per-UDF JVM memory limit (MB); presumably the term-weight UDF
-- exceeds the default on full-size title data — confirm if still needed.
set odps.sql.udf.jvm.memory=4096;
-- Explode term weights for titles into (id, word, weight) rows
-- (same 3-step pipeline as the query-side build: termweight -> split by "%"
--  -> split each "word|weight" pair by "|").
-- NOTE(review): this exact statement is repeated later in this log.
create table graph_embedding.hs_dssm_dic_title_4 as
select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from (select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from (select id, search_kg:alinlp_termweight_ecom(title, "%", "{weight}", 1, 0) as tag_result from graph_embedding.hs_tmp_149)a)b where lengthb(b.pair) > 0;
- 分词结果是否靠谱
-- Raw term-weight dump for queries ("word|{weight}|{type}", "%"-joined) so the
-- segmentation/weighting quality can be eyeballed.
create table hs_tmp_179 as
select
    id,
    search_kg:alinlp_termweight_ecom(se_keyword, "%", "{weight}|${type}", 1, 1) as tag_result
from graph_embedding.hs_dssm_dic_query_1
where lengthb(se_keyword) > 0;
-- Raw term-weight dump for titles, same format as the query dump above.
create table hs_tmp_180 as
select
    id,
    search_kg:alinlp_termweight_ecom(title, "%", "{weight}|${type}", 1, 0) as tag_result
from graph_embedding.hs_tmp_149;
- 有问题的结果(query)
| 37 | 40|3|268435456%岁|3|536870912%女短裤|93|65537 |
| 38 | 40|2|268435456%岁|2|536870912%妈妈|20|16%夏装|75|1073807361 |
| 39 | 45|2|268435456%岁|2|536870912%女|20|16%连衣裙|75|65537 |
| 40 | 50|2|268435456%岁|2|536870912%妈妈|20|16%夏装|75|1073807361 |
| 41 | 50|2|268435456%岁|2|536870912%妈妈|17|16%夏装|64|1073807361% |0|1024%洋气|13|128 |
| 46 | 7分裤|61|65537%女|19|16% |0|1024%薄款|19|16 |
aj|3|0%女鞋|96|65537 |
| 100 | diy|11|128%小屋|88|65537 |
| 92 | cosply|23|8%服装|58|1073807361%女|17|16 |
%网|70|131073%红包|70|
- 有问题的结果(title)
%ins|20|128%超|40|16%火包|66|4 |
分词会分错,按照种类来给的权重,有关键字的直接给70(如果比70小)?
-
使用电商命名实体进行权重确定(不好使)
-
白名单
['春', '夏', '秋', '冬', '男', '女', '宝宝', '童', '学生', '爸', '妈', '黑', '白', '灰', '厚', '薄', '暖', '凉', '式', '风', '码', '长', '短', '大', '小', '红', '橙', '黄', '绿', '青', '蓝', '紫', '岁', '年', 'ins', '冰', '欧', '韩', '宽松', '紧身', '胖', '瘦', '单', '双', '2019', '2018']
处理方式:
query:白名单中的词增加到 max(本句子中权重最大词的一半,origin)
title:白名单中的词增加到 max(75,origin)
- 数据预处理
train_query : hs_dssm_dic_query_1 - | id | words_mainse_ids | se_keyword |
train_title : hs_dssm_dic_title_3 - | id | words_mainse_ids | title |
inference_query : hs_dssm_dic_query_inf_1 - | id | words_mainse_ids | query |
inference_title : hs_dssm_dic_title_inf_1 - | id | words_mainse_ids | title |
train query 数据重新获取
-- Rebuild the training-query term table: termweight -> split "%"-joined pairs
-- into rows -> split each "word|weight" pair into (id, word, weight).
drop table hs_dssm_dic_query_4;
yes
create table graph_embedding.hs_dssm_dic_query_4 as
select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight)
from (
    select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair)
    from (
        select
            id,
            search_kg:alinlp_termweight_ecom(se_keyword, "%", "{weight}", 1, 0) as tag_result
        from graph_embedding.hs_dssm_dic_query_1
        where lengthb(se_keyword) > 0
    ) a
) b
where lengthb(b.pair) > 0;
-- Re-register the Python UDF resource and (re)declare two functions from it:
--   hs_return_clean   - text cleaner (previewed on titles below)
--   hs_change_score_1 - whitelist-based weight boost (the season/gender/color
--                       whitelist handling described earlier in these notes)
drop resource hs_udf_87_1.py;
yes
add py /home/hengsong/query_co_video/src/hs_udf_87_1.py;
CREATE FUNCTION hs_return_clean AS hs_udf_87_1.return_clean USING hs_udf_87_1.py;
CREATE FUNCTION hs_change_score_1 AS hs_udf_87_1.Processor USING hs_udf_87_1.py;
-- Attach the whitelist-adjusted weight and a 100-dim word embedding to every
-- query term.
drop table hs_dssm_dic_query_5;
yes
create table hs_dssm_dic_query_5 as
select
    id,
    word,
    weight,
    hs_change_score_1(word, weight) as new_weight,
    search_kg:alinlp_word_embedding(word, "100", "CONTENT_SEARCH") as word_emb
from hs_dssm_dic_query_4;
-- Spot-check the cleaner UDF on 100 titles before using it for real.
select title, hs_return_clean(title) as clean_title from graph_embedding.hs_tmp_149 limit 100;
-- Collapse per-term rows into one embedding per query id via the
-- weight-aware averaging UDAF.
drop table hs_tmp_181;
yes
create table graph_embedding.hs_tmp_181 as
select
    id,
    graph_embedding:hs_merge_emb_15(new_weight, word_emb) as emb
from graph_embedding.hs_dssm_dic_query_5
group by id;
train title 数据的重新获取
-- Explode title term weights into (id, word, weight) rows.
-- NOTE(review): duplicate of the hs_dssm_dic_title_4 build earlier in this log.
create table graph_embedding.hs_dssm_dic_title_4 as
select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from (select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from (select id, search_kg:alinlp_termweight_ecom(title, "%", "{weight}", 1, 0) as tag_result from graph_embedding.hs_tmp_149)a)b where lengthb(b.pair) > 0;
-- NOTE(review): reads hs_dssm_dic_title_5, which is never created in this log —
-- presumably an intermediate that adds word_emb to title_4 (mirroring the query
-- side's hs_dssm_dic_query_5); confirm it exists before running.
create table hs_dssm_dic_title_6 as select id, word, weight, hs_change_score_1(word, weight) as new_weight, word_emb from hs_dssm_dic_title_5;
-- Re-register the averaging UDAF resource, then collapse per-term title rows
-- into one (weighted-average) embedding per title id.
drop resource hs_udaf_82_1.py;
yes
add py /home/hengsong/query_co_video/src/hs_udaf_82_1.py;
CREATE FUNCTION hs_merge_emb_15 AS hs_udaf_82_1.Average USING hs_udaf_82_1.py;
create table graph_embedding.hs_tmp_182 as select id, graph_embedding:hs_merge_emb_15(new_weight, word_emb) as emb from graph_embedding.hs_dssm_dic_title_6 group by id;
inference query 数据重新获取
-- Inference-side query terms: same termweight/split pipeline as the
-- training-side build, but sourced from the inference query dictionary.
drop table hs_dssm_dic_query_inf_4;
yes
create table graph_embedding.hs_dssm_dic_query_inf_4 as
select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight)
from (
    select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair)
    from (
        select
            id,
            search_kg:alinlp_termweight_ecom(query, "%", "{weight}", 1, 0) as tag_result
        from graph_embedding.hs_dssm_dic_query_inf_1
        where lengthb(query) > 0
    ) a
) b
where lengthb(b.pair) > 0;
-- Attach the whitelist-adjusted weight and a 100-dim embedding to every
-- inference-query term.
drop table hs_dssm_dic_query_inf_5;
yes
create table hs_dssm_dic_query_inf_5 as
select
    id,
    word,
    weight,
    hs_change_score_1(word, weight) as new_weight,
    search_kg:alinlp_word_embedding(word, "100", "CONTENT_SEARCH") as word_emb
from hs_dssm_dic_query_inf_4;
-- One weighted-average embedding per inference-query id.
drop table hs_tmp_184;
yes
create table graph_embedding.hs_tmp_184 as
select
    id,
    graph_embedding:hs_merge_emb_15(new_weight, word_emb) as emb
from graph_embedding.hs_dssm_dic_query_inf_5
group by id;
inference title 数据重新获取
-- NOTE(review): this section is headed "inference title", but the statement
-- below recreates the *training*-side hs_dssm_dic_title_4 from hs_tmp_149 —
-- likely a copy/paste slip; an _inf source/target would be expected here. Confirm.
create table graph_embedding.hs_dssm_dic_title_4 as
select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from (select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from (select id, search_kg:alinlp_termweight_ecom(title, "%", "{weight}", 1, 0) as tag_result from graph_embedding.hs_tmp_149)a)b where lengthb(b.pair) > 0;
-- NOTE(review): reads hs_dssm_dic_title_inf_3, never created in this log —
-- presumably built elsewhere with word_emb already attached; verify.
create table hs_dssm_dic_title_inf_6 as select id, word, weight, hs_change_score_1(word, weight) as new_weight, word_emb from hs_dssm_dic_title_inf_3;
-- One weighted-average embedding per inference-title id.
create table graph_embedding.hs_tmp_185 as select id, graph_embedding:hs_merge_emb_15(new_weight, word_emb) as emb from graph_embedding.hs_dssm_dic_title_inf_6 group by id;
- 得到新的训练集
-- Build the new training set: label pairs joined with the re-weighted query
-- and title embeddings.
-- FIX: the LEFT JOINs emit NULL embeddings for any id missing from
-- hs_tmp_181/hs_tmp_182, and the author's own note says those rows must be
-- removed ("记得除掉NULL!!!") — filter them here instead of downstream.
drop table hs_tmp_183;
yes
create table hs_tmp_183 as
select c.se_keyword_mainse_ws, d.emb as title_mainse_ws, c.label
from (
    select a.*, b.emb as se_keyword_mainse_ws
    from (select * from hs_dssm_train_v2_0) a
    left join (select * from hs_tmp_181) b
    on a.query_id = b.id
) c
left join (select * from hs_tmp_182) d
on c.item_id = d.id
where c.se_keyword_mainse_ws is not null
  and d.emb is not null;
记得除掉NULL!!!
-- Split the new training set 80/20 into train/test tables with the
-- PAI "split" component (fraction applies to output1 = train).
drop table hs_train_data_dssm_v2_4;
yes
drop table hs_test_data_dssm_v2_4;
yes
PAI -name split -project algo_public
-DinputTableName=graph_embedding.hs_tmp_183
-Doutput1TableName=graph_embedding.hs_train_data_dssm_v2_4
-Doutput2TableName=graph_embedding.hs_test_data_dssm_v2_4
-Dfraction=0.8
-DmemSizePerCore=4096
-DcoreNum=100
;
- 构造测试集
-- Build the inference (test) set: attach the inference-side query and title
-- embeddings to each candidate pair.
drop table hs_tmp_187;
yes
create table graph_embedding.hs_tmp_187 as
select c.query_id, c.title_id, c.query, d.emb as title
from (
    select a.*, b.emb as query
    from (select * from graph_embedding.hs_tmp_157) a
    left join (select * from graph_embedding.hs_tmp_184) b
    on a.query_id = b.id
) c
left join (select * from graph_embedding.hs_tmp_185) d
on c.title_id = d.id;
- train & inference
| query_id | video_id | query_ws | video_ws |
-- Rename hs_tmp_187 columns to the schema the inference script expects:
-- | query_id | video_id | query_ws | video_ws |
alter table hs_tmp_187 change column query query_ws string;
alter table hs_tmp_187 change column title_id video_id string;
alter table hs_tmp_187 change column title video_ws string;
-- Launch the DSSM train + inference TF job on PAI: 50 workers / 10 PS,
-- inputs = train/test splits + hs_tmp_187 inference set, output =
-- hs_dssm_result_3, checkpoints to OSS; 1 epoch, lr 3e-4, batch 1024.
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="train_inference_v7.py" -Dcluster='{"worker":{"count":50, "cpu":200, "memory":4000}, "ps":{"count":10, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_train_data_dssm_v2_4,odps://graph_embedding/tables/hs_test_data_dssm_v2_4,odps://graph_embedding/tables/hs_tmp_187" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_3" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=True --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_4e_1.ckpt" -DuseSparseClusterSchema=True;
网友评论