美文网首页
2019-08-09工作进展2

2019-08-09工作进展2

作者: Songger | 来源:发表于2019-08-09 14:12 被阅读0次

-- Shrink each mapper's input split to 1 (MB) so the UDF-heavy jobs below fan
-- out over many mappers — presumably to parallelize the per-row
-- termweight/embedding UDF calls; confirm unit against ODPS docs.
set odps.sql.mapper.split.size=1;

  1. 构造query关键词表

train_query : hs_dssm_dic_query_1 - | id | words_mainse_ids | se_keyword |
train_title : hs_dssm_dic_title_3 - | id | words_mainse_ids | title |


inference_query : hs_dssm_dic_query_inf_1 - | id | words_mainse_ids | query |
inference_title : hs_dssm_dic_title_inf_1 - | id | words_mainse_ids | title |


-- Rebuild the query-keyword dictionary from scratch.
drop table hs_query_keyword_set;
yes
-- One row per distinct non-empty word from hs_tmp_201. Column is named "key"
-- (accepted by ODPS, but a reserved word in many SQL dialects — avoid reusing
-- this name if the table is ever ported).
create table hs_query_keyword_set as select distinct word as key from hs_tmp_201 where lengthb(word) > 0;

-- Clean every keyword with hs_return_clean, then drop rows whose cleaned
-- value is empty. The original ran two separate insert-overwrite jobs over
-- the same table (clean, then filter); a single pass with a subquery does the
-- same work in one job and avoids the intermediate state where empty cleaned
-- keys are visible in the table.
insert overwrite table hs_query_keyword_set
select key from (
    select hs_return_clean(key) as key
    from hs_query_keyword_set
    where lengthb(key) > 0
) cleaned
where lengthb(key) > 0;

  2. 重新构造训练集

train query 数据重新获取

-- Re-derive (id, word, weight) triples for every training query:
--   1) alinlp_termweight_ecom tags se_keyword into a "%"-joined list of
--      "{word}|{weight}" pairs;
--   2) bi_split_value explodes that list into one row per pair;
--   3) hs_split_1 splits each "word|weight" pair into separate columns.
drop table hs_dssm_dic_query_4;
yes
create table graph_embedding.hs_dssm_dic_query_4 as
select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from
(select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from
(select id, search_kg:alinlp_termweight_ecom(se_keyword, "%", "{word}|{weight}", 1, 0) as tag_result from graph_embedding.hs_dssm_dic_query_1 where lengthb(se_keyword) > 0)a)b where lengthb(b.pair) > 0;

-- Refresh the Python resource and register the weight-adjusting UDF.
-- change_weight_query_key_3 presumably re-weights a word according to its
-- membership in hs_query_keyword_set (attached as a UDF resource) —
-- confirm against hs_udf_87_1.py.
drop resource hs_udf_87_1.py;
yes
add py /home/hengsong/query_co_video/src/hs_udf_87_1.py;
CREATE FUNCTION change_weight_query_key_3 AS hs_udf_87_1.change_weight_query_key USING hs_udf_87_1.py,hs_query_keyword_set;

-- Attach the adjusted weight and a 100-dim word embedding
-- (CONTENT_SEARCH domain) to every (id, word, weight) training-query row.
drop table hs_dssm_dic_query_6;
yes
create table hs_dssm_dic_query_6 as select id, word, weight, change_weight_query_key_3(word, weight) as new_weight, search_kg:alinlp_word_embedding(word, "100", "CONTENT_SEARCH") as word_emb from hs_dssm_dic_query_6_src_check_removed_do_not_use;

-- Ad-hoc sanity check: eyeball hs_return_clean's output on 100 sample titles.
select title, hs_return_clean(title) as clean_title from graph_embedding.hs_tmp_149 limit 100;

-- Collapse per-word rows into one embedding per query id via the
-- hs_merge_emb_15 UDAF (a weighted average — note the UDAF is registered
-- further down in this log, so in replay order it must already exist).
drop table hs_tmp_202;
yes
create table graph_embedding.hs_tmp_202 as select id, graph_embedding:hs_merge_emb_15(new_weight, word_emb) as emb from graph_embedding.hs_dssm_dic_query_6 group by id;

train title 数据的重新获取

-- Same termweight pipeline as for queries, applied to train titles from
-- hs_tmp_149: tag title into "{word}|{weight}" pairs, explode, split.
create table graph_embedding.hs_dssm_dic_title_4 as
select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from (select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from (select id, search_kg:alinlp_termweight_ecom(title, "%", "{word}|{weight}", 1, 0) as tag_result from graph_embedding.hs_tmp_149)a)b where lengthb(b.pair) > 0;

-- NOTE(review): source hs_dssm_dic_title_5 is never created in this log
-- (the step above builds hs_dssm_dic_title_4) — presumably _5 is _4 with a
-- word_emb column attached in an unlogged step; confirm before replaying.
create table hs_dssm_dic_title_7 as select id, word, weight, change_weight_query_key_3(word, weight) as new_weight, word_emb from hs_dssm_dic_title_5;

-- Refresh the Python resource and register the embedding-merging UDAF
-- (hs_udaf_82_1.Average — presumably a weight-aware average of word
-- embeddings; confirm against hs_udaf_82_1.py). Note this registration
-- appears after the first use of hs_merge_emb_15 earlier in the log.
drop resource hs_udaf_82_1.py;
yes
add py /home/hengsong/query_co_video/src/hs_udaf_82_1.py;
CREATE FUNCTION hs_merge_emb_15 AS hs_udaf_82_1.Average USING hs_udaf_82_1.py;

-- One merged embedding per train-title id.
create table graph_embedding.hs_tmp_203 as select id, graph_embedding:hs_merge_emb_15(new_weight, word_emb) as emb from graph_embedding.hs_dssm_dic_title_7 group by id;

inference query 数据重新获取

-- Inference-query version of the termweight pipeline: tag the raw `query`
-- column of hs_dssm_dic_query_inf_1, explode pairs, split into
-- (id, word, weight).
drop table hs_dssm_dic_query_inf_4;
yes
create table graph_embedding.hs_dssm_dic_query_inf_4 as
select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from
(select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from
(select id, search_kg:alinlp_termweight_ecom(query, "%", "{word}|{weight}", 1, 0) as tag_result from graph_embedding.hs_dssm_dic_query_inf_1 where lengthb(query) > 0)a)b where lengthb(b.pair) > 0;

-- Attach adjusted weight + 100-dim CONTENT_SEARCH word embedding to every
-- inference-query word row (mirrors the train-query step).
drop table hs_dssm_dic_query_inf_6;
yes
create table hs_dssm_dic_query_inf_6 as select id, word, weight, change_weight_query_key_3(word, weight) as new_weight, search_kg:alinlp_word_embedding(word, "100", "CONTENT_SEARCH") as word_emb from hs_dssm_dic_query_inf_4;

-- One merged embedding per inference-query id.
drop table hs_tmp_204;
yes
create table graph_embedding.hs_tmp_204 as select id, graph_embedding:hs_merge_emb_15(new_weight, word_emb) as emb from graph_embedding.hs_dssm_dic_query_inf_6 group by id;

inference title 数据重新获取

-- NOTE(review): the original statement in this "inference title" section was
-- a copy-paste of the train-title step — it re-created hs_dssm_dic_title_4
-- (which already exists, so the CREATE fails) from the TRAIN corpus
-- hs_tmp_149. Fixed to build the inference-title pair table from
-- hs_dssm_dic_title_inf_1 (| id | words_mainse_ids | title |), mirroring the
-- inference-query step. Downstream reads hs_dssm_dic_title_inf_3 — confirm
-- how _inf_3 is derived from this table before replaying.
create table graph_embedding.hs_dssm_dic_title_inf_4 as
select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from (select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from (select id, search_kg:alinlp_termweight_ecom(title, "%", "{word}|{weight}", 1, 0) as tag_result from graph_embedding.hs_dssm_dic_title_inf_1 where lengthb(title) > 0)a)b where lengthb(b.pair) > 0;

-- NOTE(review): source hs_dssm_dic_title_inf_3 is never created in this log,
-- and the header lists hs_dssm_dic_title_inf_1 as the inference-title table —
-- presumably _inf_3 already carries word_emb from an earlier session; verify.
create table hs_dssm_dic_title_inf_7 as select id, word, weight, change_weight_query_key_3(word, weight) as new_weight, word_emb from hs_dssm_dic_title_inf_3;

-- One merged embedding per inference-title id.
create table graph_embedding.hs_tmp_205 as select id, graph_embedding:hs_merge_emb_15(new_weight, word_emb) as emb from graph_embedding.hs_dssm_dic_title_inf_7 group by id;

  3. 得到新的训练集

-- Build the training set: attach the merged query embedding (hs_tmp_202) and
-- title embedding (hs_tmp_203) to each labeled (query_id, item_id) pair.
-- `==` is ODPS/Hive-style equality. The LEFT JOINs leave NULL embeddings for
-- unmatched ids; those rows are removed by the insert-overwrite that follows.
drop table hs_tmp_206;
yes
create table hs_tmp_206
as select c.se_keyword_mainse_ws, d.emb as title_mainse_ws, c.label from
(select a.*, b.emb as se_keyword_mainse_ws from (select * from hs_dssm_train_v2_0)a left join (select * from hs_tmp_202)b on a.query_id == b.id)c left join (select * from hs_tmp_203)d on c.item_id == d.id;

记得除掉NULL!!!
-- Drop rows where either embedding failed to join (NULL from the LEFT JOINs).
insert overwrite table hs_tmp_206 select * from hs_tmp_206 where se_keyword_mainse_ws is not NULL and title_mainse_ws is not NULL;

-- Randomly split hs_tmp_206 into train (80%) and test (20%) sets with the
-- PAI `split` component.
drop table hs_train_data_dssm_v2_5;
yes
drop table hs_test_data_dssm_v2_5;
yes
PAI -name split -project algo_public
-DinputTableName=graph_embedding.hs_tmp_206
-Doutput1TableName=graph_embedding.hs_train_data_dssm_v2_5
-Doutput2TableName=graph_embedding.hs_test_data_dssm_v2_5
-Dfraction=0.8
-DmemSizePerCore=4096
-DcoreNum=100
;

  4. 构造测试集

-- Build the inference set: attach inference-query embeddings (hs_tmp_204) and
-- inference-title embeddings (hs_tmp_205) to each candidate pair from
-- hs_tmp_157. Unmatched ids yield NULL emb columns (LEFT JOIN).
drop table hs_tmp_207;
yes
create table graph_embedding.hs_tmp_207 as
select c.query_id, c.title_id, c.query, d.emb as title from
(select a.*, b.emb as query from (select * from graph_embedding.hs_tmp_157)a left join (select * from graph_embedding.hs_tmp_204)b on a.query_id == b.id)c left join (select * from graph_embedding.hs_tmp_205)d on c.title_id == d.id;

  5. train & inference

| query_id | video_id | query_ws | video_ws |
-- Rename columns to the schema the training script expects:
-- | query_id | video_id | query_ws | video_ws |
alter table hs_tmp_207 change column query query_ws string;
alter table hs_tmp_207 change column title_id video_id string;
alter table hs_tmp_207 change column title video_ws string;

-- Launch DSSM training + inference on PAI TensorFlow 1.4: 50 workers / 10 PS,
-- input = train/test splits + hs_tmp_207 inference set, output scores to
-- hs_dssm_result_2, checkpoints to OSS.
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="train_inference_v7.py" -Dcluster='{"worker":{"count":50, "cpu":200, "memory":4000}, "ps":{"count":10, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_train_data_dssm_v2_5,odps://graph_embedding/tables/hs_test_data_dssm_v2_5,odps://graph_embedding/tables/hs_tmp_207" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_2" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=True --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_4e_3.ckpt" -DuseSparseClusterSchema=True;

http://logview.odps.aliyun-inc.com:8080/logview/?h=http://service-corp.odps.aliyun-inc.com/api&p=graph_embedding&i=20190809083110262gydq4gep2_6c970ae7_917e_4382_beda_75bcd500e314&token=UXZDWWRteW5mcEFMM3FXYjk2Mk5waHkvaUcwPSxPRFBTX09CTzoxMjkzMzAzOTgzMjUxNTQ4LDE1NjU5NDQ1MzUseyJTdGF0ZW1lbnQiOlt7IkFjdGlvbiI6WyJvZHBzOlJlYWQiXSwiRWZmZWN0IjoiQWxsb3ciLCJSZXNvdXJjZSI6WyJhY3M6b2RwczoqOnByb2plY3RzL2dyYXBoX2VtYmVkZGluZy9pbnN0YW5jZXMvMjAxOTA4MDkwODMxMTAyNjJneWRxNGdlcDJfNmM5NzBhZTdfOTE3ZV80MzgyX2JlZGFfNzViY2Q1MDBlMzE0Il19XSwiVmVyc2lvbiI6IjEifQ==

  6. 传表

-- Upload the human-labeled ground-truth file ('|'-separated: query, title,
-- label) into a fresh ODPS table via tunnel.
drop table hs_query_title_inference_gt;
yes
create table hs_query_title_inference_gt (query string, title string, label string);

tunnel upload -fd '|' /home/hengsong/test_gt3.txt graph_embedding.hs_query_title_inference_gt;

  7. 对应id

inference_query : hs_dssm_dic_query_inf_1 - | id | words_mainse_ids | query |
inference_title : hs_dssm_dic_title_inf_1 - | id | words_mainse_ids | title |

-- Map ground-truth (query, title) text pairs to their inference ids by
-- joining on the raw query/title strings.
-- NOTE(review): the pasted statement was garbled by markdown rendering — the
-- `*` in `c.*` and `a.*` were consumed as italic markers, splitting the
-- statement mid-expression. Restored below.
create table hs_query_title_inference_gt_2 as
select distinct c.*, d.id as title_id from
(select a.*, b.id as query_id from (select * from hs_query_title_inference_gt)a join (select * from hs_dssm_dic_query_inf_1)b on a.query == b.query)c join (select * from hs_dssm_dic_title_inf_1)d on c.title == d.title;

-- Keep only rows labeled 0 or 1. NOTE(review): `label` is declared string and
-- is compared against int literals, relying on implicit numeric coercion —
-- values like '1.0' would also match; confirm intended.
insert overwrite table hs_query_title_inference_gt_2 select * from hs_query_title_inference_gt_2 where label == 1 or label == 0;

hs_query_cluster_finetune

-- Pair each ground-truth label with the model's score for the same
-- (query_id, video_id), then download for offline evaluation.
-- NOTE(review): hs_tmp_208 is not created in this log (assumed to exist),
-- and `cast(score as bigint)` truncates any fractional model score to an
-- integer — confirm that is intended rather than keeping the raw score.
insert overwrite table graph_embedding.hs_tmp_208
select cast(a.label as bigint) as label, cast(b.score as bigint) as score from
(select * from graph_embedding.hs_query_title_inference_gt_2)a join (select * from graph_embedding.hs_query_cluster_finetune)b on a.query_id == b.query_id and a.title_id == b.video_id;

tunnel download -fd '|' graph_embedding.hs_tmp_208 /home/hengsong/hs_label_score.txt;

相关文章

网友评论

      本文标题:2019-08-09工作进展2

      本文链接:https://www.haomeiwen.com/subject/bnlcjctx.html