-- Shrink the mapper input split size to 1 (MB, per ODPS convention -- confirm) to
-- force high map-side parallelism for the UDF-heavy jobs in this file.
set odps.sql.mapper.split.size=1;
-- 构造query关键词表 (build the query keyword vocabulary table)
-- train_query : hs_dssm_dic_query_1 - | id | words_mainse_ids | se_keyword |
-- train_title : hs_dssm_dic_title_3 - | id | words_mainse_ids | title |
-- inference_query : hs_dssm_dic_query_inf_1 - | id | words_mainse_ids | query |
-- inference_title : hs_dssm_dic_title_inf_1 - | id | words_mainse_ids | title |
-- Build the distinct query-keyword vocabulary from hs_tmp_201.
-- ("yes" lines are the interactive confirmations odpscmd asks for on drop.)
drop table hs_query_keyword_set;
yes
create table hs_query_keyword_set as select distinct word as key from hs_tmp_201 where lengthb(word) > 0;
-- Normalize each keyword with hs_return_clean; the second overwrite drops rows
-- that the cleaning step reduced to the empty string.
insert overwrite table hs_query_keyword_set select hs_return_clean(key) as key from hs_query_keyword_set where lengthb(key) > 0;
insert overwrite table hs_query_keyword_set select * from hs_query_keyword_set where lengthb(key) > 0;
-- 重新构造训练集 (rebuild the training set)
-- train query 数据重新获取 (re-derive the train-query data)
-- Rebuild the (id, word, weight) table for training queries:
--   1) alinlp_termweight_ecom tags each se_keyword as word{weight} pairs joined by "%"
--   2) bi_split_value explodes the "%"-separated pairs into one row per pair
--   3) hs_split_1 splits each "word|weight" pair into (id, word, weight)
drop table hs_dssm_dic_query_4;
yes
create table graph_embedding.hs_dssm_dic_query_4 as
select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from
(select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from
(select id, search_kg:alinlp_termweight_ecom(se_keyword, "%", "{weight}", 1, 0) as tag_result from graph_embedding.hs_dssm_dic_query_1 where lengthb(se_keyword) > 0)a)b where lengthb(b.pair) > 0;
-- Re-register the keyword-aware weighting UDF.
-- NOTE(review): the USING clause references hs_query_keyword_set as a second
-- resource -- presumably it was added as a table resource beforehand; verify.
drop resource hs_udf_87_1.py;
yes
add py /home/hengsong/query_co_video/src/hs_udf_87_1.py;
CREATE FUNCTION change_weight_query_key_3 AS hs_udf_87_1.change_weight_query_key USING hs_udf_87_1.py,hs_query_keyword_set;
drop table hs_dssm_dic_query_6;
yes
-- For every (id, word) of the training queries, attach the re-computed keyword
-- weight and a 100-dim CONTENT_SEARCH word embedding.
create table hs_dssm_dic_query_6 as
select
    id,
    word,
    weight,
    change_weight_query_key_3(word, weight) as new_weight,
    search_kg:alinlp_word_embedding(word, "100", "CONTENT_SEARCH") as word_emb
from hs_dssm_dic_query_4;
-- Ad-hoc spot check of the title-cleaning UDF output (not part of the pipeline).
select title, hs_return_clean(title) as clean_title from graph_embedding.hs_tmp_149 limit 100;
drop table hs_tmp_202;
yes
-- Collapse the per-word rows into one embedding per training-query id via the
-- weighted-average UDAF hs_merge_emb_15.
create table graph_embedding.hs_tmp_202 as
select
    id,
    graph_embedding:hs_merge_emb_15(new_weight, word_emb) as emb
from graph_embedding.hs_dssm_dic_query_6
group by id;
-- train title 数据的重新获取 (re-derive the train-title data)
-- Same term-split pipeline as the query branch, applied to titles from hs_tmp_149.
-- NOTE(review): unlike the query branch there is no preceding "drop table" --
-- this create fails if hs_dssm_dic_title_4 already exists; confirm intent.
create table graph_embedding.hs_dssm_dic_title_4 as
select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from (select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from (select id, search_kg:alinlp_termweight_ecom(title, "%", "{weight}", 1, 0) as tag_result from graph_embedding.hs_tmp_149)a)b where lengthb(b.pair) > 0;
-- NOTE(review): reads hs_dssm_dic_title_5, which is not built anywhere in this
-- file -- presumably title_4 with word_emb already attached; verify upstream.
create table hs_dssm_dic_title_7 as select id, word, weight, change_weight_query_key_3(word, weight) as new_weight, word_emb from hs_dssm_dic_title_5;
-- (Re)register the embedding-averaging UDAF used by every hs_merge_emb_15 call.
-- NOTE(review): hs_tmp_202 above already calls hs_merge_emb_15, so this
-- registration must have existed before that statement ran -- this log is
-- presumably not strictly chronological; confirm.
drop resource hs_udaf_82_1.py;
yes
add py /home/hengsong/query_co_video/src/hs_udaf_82_1.py;
CREATE FUNCTION hs_merge_emb_15 AS hs_udaf_82_1.Average USING hs_udaf_82_1.py;
-- One weighted-average embedding per training-title id.
create table graph_embedding.hs_tmp_203 as select id, graph_embedding:hs_merge_emb_15(new_weight, word_emb) as emb from graph_embedding.hs_dssm_dic_title_7 group by id;
-- inference query 数据重新获取 (re-derive the inference-query data)
-- Same term-split pipeline as the training-query branch, applied to the
-- inference queries (hs_dssm_dic_query_inf_1, column "query" instead of
-- "se_keyword").
drop table hs_dssm_dic_query_inf_4;
yes
create table graph_embedding.hs_dssm_dic_query_inf_4 as
select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from
(select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from
(select id, search_kg:alinlp_termweight_ecom(query, "%", "{weight}", 1, 0) as tag_result from graph_embedding.hs_dssm_dic_query_inf_1 where lengthb(query) > 0)a)b where lengthb(b.pair) > 0;
drop table hs_dssm_dic_query_inf_6;
yes
-- Attach re-computed keyword weights plus 100-dim CONTENT_SEARCH word
-- embeddings to every (id, word) row of the inference-query term table.
create table hs_dssm_dic_query_inf_6 as
select
    id,
    word,
    weight,
    change_weight_query_key_3(word, weight) as new_weight,
    search_kg:alinlp_word_embedding(word, "100", "CONTENT_SEARCH") as word_emb
from hs_dssm_dic_query_inf_4;
drop table hs_tmp_204;
yes
-- Collapse the per-word rows into one weighted-average embedding per
-- inference-query id.
create table graph_embedding.hs_tmp_204 as
select
    id,
    graph_embedding:hs_merge_emb_15(new_weight, word_emb) as emb
from graph_embedding.hs_dssm_dic_query_inf_6
group by id;
-- inference title 数据重新获取 (re-derive the inference-title data)
-- NOTE(review): this create is byte-identical to the training-title one above:
-- it re-creates hs_dssm_dic_title_4 (which already exists at this point) from
-- the TRAINING source hs_tmp_149 -- this looks like a copy-paste slip; the
-- inference branch presumably intended hs_dssm_dic_title_inf_* tables built
-- from hs_dssm_dic_title_inf_1. Confirm before re-running.
create table graph_embedding.hs_dssm_dic_title_4 as
select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from (select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from (select id, search_kg:alinlp_termweight_ecom(title, "%", "{weight}", 1, 0) as tag_result from graph_embedding.hs_tmp_149)a)b where lengthb(b.pair) > 0;
-- NOTE(review): reads hs_dssm_dic_title_inf_3, which is not built in this file;
-- verify it carries (id, word, weight, word_emb) upstream.
create table hs_dssm_dic_title_inf_7 as select id, word, weight, change_weight_query_key_3(word, weight) as new_weight, word_emb from hs_dssm_dic_title_inf_3;
-- One weighted-average embedding per inference-title id.
create table graph_embedding.hs_tmp_205 as select id, graph_embedding:hs_merge_emb_15(new_weight, word_emb) as emb from graph_embedding.hs_dssm_dic_title_inf_7 group by id;
-- 得到新的训练集 (produce the new training set)
drop table hs_tmp_206;
yes
-- New training set: left-join the query embeddings (hs_tmp_202) and title
-- embeddings (hs_tmp_203) onto the labelled (query_id, item_id) pairs of
-- hs_dssm_train_v2_0.
create table hs_tmp_206
as select
    c.se_keyword_mainse_ws,
    d.emb as title_mainse_ws,
    c.label
from (
    select a.*, b.emb as se_keyword_mainse_ws
    from (select * from hs_dssm_train_v2_0) a
    left join (select * from hs_tmp_202) b
    on a.query_id == b.id
) c
left join (select * from hs_tmp_203) d
on c.item_id == d.id;
-- 记得除掉NULL!!! (remember to drop the NULL rows!)
-- Drop pairs where either embedding is missing (the left joins above yield
-- NULLs for ids absent from hs_tmp_202 / hs_tmp_203).
insert overwrite table hs_tmp_206 select * from hs_tmp_206 where se_keyword_mainse_ws is not NULL and title_mainse_ws is not NULL;
-- Randomly split hs_tmp_206 into 80% train / 20% test via the PAI "split"
-- component.
drop table hs_train_data_dssm_v2_5;
yes
drop table hs_test_data_dssm_v2_5;
yes
PAI -name split -project algo_public
-DinputTableName=graph_embedding.hs_tmp_206
-Doutput1TableName=graph_embedding.hs_train_data_dssm_v2_5
-Doutput2TableName=graph_embedding.hs_test_data_dssm_v2_5
-Dfraction=0.8
-DmemSizePerCore=4096
-DcoreNum=100
;
-- 构造测试集 (build the inference/test set)
drop table hs_tmp_207;
yes
-- Inference set: attach the query embeddings (hs_tmp_204) and title embeddings
-- (hs_tmp_205) to every candidate (query_id, title_id) pair in hs_tmp_157.
create table graph_embedding.hs_tmp_207 as
select
    c.query_id,
    c.title_id,
    c.query,
    d.emb as title
from (
    select a.*, b.emb as query
    from (select * from graph_embedding.hs_tmp_157) a
    left join (select * from graph_embedding.hs_tmp_204) b
    on a.query_id == b.id
) c
left join (select * from graph_embedding.hs_tmp_205) d
on c.title_id == d.id;
-- train & inference
-- target schema: | query_id | video_id | query_ws | video_ws |
-- Rename hs_tmp_207 columns to the schema the training script expects:
-- query -> query_ws, title_id -> video_id, title -> video_ws.
alter table hs_tmp_207 change column query query_ws string;
alter table hs_tmp_207 change column title_id video_id string;
alter table hs_tmp_207 change column title video_ws string;
-- Launch DSSM train + inference on PAI TensorFlow 1.4: 50 workers / 10 ps,
-- inputs = (train, test, inference) tables, scores written to
-- hs_dssm_result_2, checkpoints to the OSS bucket.
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="train_inference_v7.py" -Dcluster='{"worker":{"count":50, "cpu":200, "memory":4000}, "ps":{"count":10, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_train_data_dssm_v2_5,odps://graph_embedding/tables/hs_test_data_dssm_v2_5,odps://graph_embedding/tables/hs_tmp_207" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_2" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=True --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_4e_3.ckpt" -DuseSparseClusterSchema=True;
-- 传表 (upload the ground-truth table)
-- Create the human-labelled ground-truth table and load it from a local
-- '|'-delimited file via the tunnel console command.
drop table hs_query_title_inference_gt;
yes
create table hs_query_title_inference_gt (query string, title string, label string);
tunnel upload -fd '|' /home/hengsong/test_gt3.txt graph_embedding.hs_query_title_inference_gt;
-- 对应id (map the ground-truth text back to ids)
-- inference_query : hs_dssm_dic_query_inf_1 - | id | words_mainse_ids | query |
-- inference_title : hs_dssm_dic_title_inf_1 - | id | words_mainse_ids | title |
-- Attach query_id / title_id to each ground-truth (query, title, label) row by
-- exact-text joins against the inference dictionaries.
-- Fix: "select distinct c., d.id" / "select a., b.id" were invalid -- the "*"
-- had been stripped (likely by markdown rendering); restored c.* / a.* to match
-- the identical join pattern used for hs_tmp_206 and hs_tmp_207 above.
create table hs_query_title_inference_gt_2 as
select distinct c.*, d.id as title_id from
(select a.*, b.id as query_id from (select * from hs_query_title_inference_gt)a join (select * from hs_dssm_dic_query_inf_1)b on a.query == b.query)c join (select * from hs_dssm_dic_title_inf_1)d on c.title == d.title;
-- Keep only rows with a definite 0/1 label. NOTE(review): label is declared
-- string but compared to integer literals -- relies on implicit coercion;
-- rows like "1 " or non-numeric labels may behave unexpectedly. Confirm.
insert overwrite table hs_query_title_inference_gt_2 select * from hs_query_title_inference_gt_2 where label == 1 or label == 0;
-- model-score table: hs_query_cluster_finetune
-- Pair each ground-truth label with the model score for the same
-- (query_id, video_id), then download for offline evaluation.
-- NOTE(review): hs_tmp_208 is not created in this file -- it must already
-- exist for insert overwrite to succeed; verify.
-- NOTE(review): cast(b.score as bigint) truncates any fractional score to an
-- integer -- if score is a probability in [0,1] this collapses it to 0/1;
-- confirm that is intended.
insert overwrite table graph_embedding.hs_tmp_208
select cast(a.label as bigint) as label, cast(b.score as bigint) as score from
(select * from graph_embedding.hs_query_title_inference_gt_2)a join (select * from graph_embedding.hs_query_cluster_finetune)b on a.query_id == b.query_id and a.title_id == b.video_id;
tunnel download -fd '|' graph_embedding.hs_tmp_208 /home/hengsong/hs_label_score.txt;
-- 网友评论 (end of notes)