昨天工作:
训练数据手动 id 化,对 DSSM 网络进行修改,并使用新数据对 DSSM 网络重新训练,现阶段训练结果 acc 0.94、auc 0.93。网络正在训练。
今天计划:
使用top query和ugc数据对网络进行效果测试,并分析改善方向
inference
-- Reset the inference output table before re-running inference.
-- FIX: "tabel" -> "table" (original statement would not parse).
truncate table hs_dssm_result_0;
-- Run DSSM inference (inference_v4.py) over hs_tmp_129, writing scores and embeddings to hs_dssm_result_0.
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="inference_v4.py" -Dcluster='{"worker":{"count":1, "cpu":200, "memory":4000}, "ps":{"count":1, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_tmp_129" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_0" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=False --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_3e_4.ckpt-3" -DuseSparseClusterSchema=True;
样本关联
hs_dssm_result_0:| query_id | video_id | score | query_emb | video_emb |
hs_tmp_122:| query_id | query |
hs_tmp_123:| item_id | title |
drop table hs_tmp_131;
yes
-- Join inference scores back to the human-readable query text (hs_tmp_122) and item title (hs_tmp_123).
-- FIX: standard SQL equality is "=", not "==".
-- NOTE(review): left joins keep rows whose query_id / video_id do not resolve; those get NULL query/title.
create table hs_tmp_131 as
select c.query, d.title, c.score
from (
    select a.*, b.query
    from (select * from hs_dssm_result_0) a
    left join (select * from hs_tmp_122) b
    on a.query_id = b.query_id
) c
left join (select * from hs_tmp_123) d
on c.video_id = d.item_id;
类目过滤
drop table if exists graph_embedding.hs_query_ugc_keywords_cate_top_;
yes
-- Aggregate category information per (query_id, se_keyword).
-- NOTE(review): process_query_cate(cate_id, freq) is selected while cate_id/freq are not in GROUP BY,
-- so it is presumably a user-defined aggregate over the (cate_id, freq) pairs of each group -- confirm.
-- LIFECYCLE 2: table auto-expires after 2 days (MaxCompute).
create table if not exists graph_embedding.hs_query_ugc_keywords_cate_top_ LIFECYCLE 2
as select query_id, se_keyword, process_query_cate(cate_id, freq) as cate_list
from graph_embedding.zj_query_ugc_keywords_cate_freq_infos_ where query_id != 0
group by query_id, se_keyword;
---依据query词拿到首页的item_list
drop table if exists graph_embedding.hs_query_ugc_keywords_page_item_list_;
yes
-- For each query keyword, collect the distinct items shown on its first result page.
create table if not exists graph_embedding.hs_query_ugc_keywords_page_item_list_ lifecycle 2
as select se_keyword, item_id from (
select distinct se_keyword, item_id from (
-- UDTF: explode the comma-separated item_list into one (se_keyword, item_id) row per item.
select bi_udf:bi_split_value(se_keyword, item_list, ",") as (se_keyword, item_id)
from (
-- Latest partition only; page_seq=1 restricts to first-page results.
select distinct se_keyword, item_list
from graph_embedding.jl_jingyan_query_related_top_query_detailed
where ds=MAX_PT('graph_embedding.jl_jingyan_query_related_top_query_detailed') and page_seq=1
)a
)b
)c;
-- Attach the category id of each (currently online) item on the query's first result page.
-- FIX: the original statement had no terminating semicolon, so it would swallow the following line.
select a.se_keyword, a.item_id, b.cate_id
from graph_embedding.hs_query_ugc_keywords_page_item_list_ a
join (
    select item_id, cate_id
    from tbcdm.dim_tb_itm
    where ds = MAX_PT('tbcdm.dim_tb_itm') and is_online = "Y"
) b
on a.item_id = b.item_id;
- 重新训练
-- Retrain DSSM (train_v4.py, 30 workers / 10 ps) on the v2_2 train/test split; checkpoints saved as hs_ugc_video_3e_4.ckpt.
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="train_v4.py" -Dcluster='{"worker":{"count":30, "cpu":200, "memory":4000}, "ps":{"count":10, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_train_data_dssm_v2_2,odps://graph_embedding/tables/hs_test_data_dssm_v2_2" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=True --attention_type=1 --num_epochs=10 --ckpt=hs_ugc_video_3e_4.ckpt" -DuseSparseClusterSchema=True;
- 使用训练测试集做测试
hs_dssm_dic_query_0:| id | words_mainse_ids |
hs_dssm_dic_title_2:| id | words_mainse_ids |
hs_dssm_train_v2_0:| query_id | item_id | label |
query_id, query_ws, video_id, video_ws
drop table hs_tmp_132;
yes
-- Rebuild the (query_ws, video_ws, label) training view: join the id->token-id dictionaries
-- onto the (query_id, item_id, label) training pairs.
-- FIX: standard SQL equality is "=", not "==".
create table hs_tmp_132
as select c.query_id, c.se_keyword_mainse_ws as query_ws, d.id as video_id, d.words_mainse_ids as video_ws, c.label
from (
    select a.words_mainse_ids as se_keyword_mainse_ws, b.*
    from (select * from hs_dssm_dic_query_0) a
    right join (select * from hs_dssm_train_v2_0) b
    on a.id = b.query_id
) c
left join (select * from hs_dssm_dic_title_2) d
on c.item_id = d.id;
-- Sample 200k rows for inference. NOTE(review): LIMIT without ORDER BY is not deterministic.
create table hs_tmp_133 as
select query_id, query_ws, video_id, video_ws from hs_tmp_132 limit 200000;
inference
-- Reset the output table, then score the 200k sampled train rows (hs_tmp_133).
-- FIX: "tabel" -> "table".
truncate table hs_dssm_result_1;
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="inference_v4.py" -Dcluster='{"worker":{"count":1, "cpu":200, "memory":4000}, "ps":{"count":1, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_tmp_133" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_1" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=False --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_3e_4.ckpt-3" -DuseSparseClusterSchema=True;
和真实label比较
drop table hs_tmp_134;
yes
-- Pair each predicted score with the ground-truth label of the same (query_id, video_id).
-- FIX: standard SQL equality is "=", not "==".
create table hs_tmp_134 as
select a.score, b.*
from (select query_id, video_id, score from hs_dssm_result_1) a
left join (select * from hs_tmp_132) b
on a.query_id = b.query_id and a.video_id = b.video_id;
- 训练集修正:将query为空的去掉(需要把query太短的去掉吗?)
-- Clean the training table: drop rows whose query is the sentinel '0' or whose
-- query/title token lists are NULL.
-- IMPROVEMENT: the original ran two full insert-overwrite passes over hs_tmp_124;
-- the combined predicate below yields the identical row set in a single scan
-- (NULL se_keyword_mainse_ws rows are dropped by the != '0' filter too, since
-- NULL != '0' evaluates to UNKNOWN).
insert overwrite table hs_tmp_124
select * from hs_tmp_124
where se_keyword_mainse_ws != '0'
  and se_keyword_mainse_ws is not NULL
  and title_mainse_ws is not NULL;
-- Re-split the cleaned hs_tmp_124 into train (80%) / test (20%) tables, then retrain.
drop table hs_train_data_dssm_v2_2;
yes
drop table hs_test_data_dssm_v2_2;
yes
PAI -name split -project algo_public
-DinputTableName=graph_embedding.hs_tmp_124
-Doutput1TableName=graph_embedding.hs_train_data_dssm_v2_2
-Doutput2TableName=graph_embedding.hs_test_data_dssm_v2_2
-Dfraction=0.8
-DmemSizePerCore=4096
-DcoreNum=100
;
-- Retrain DSSM (train_v4.py) on the fresh split; checkpoints saved as hs_ugc_video_3e_4.ckpt.
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="train_v4.py" -Dcluster='{"worker":{"count":30, "cpu":200, "memory":4000}, "ps":{"count":10, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_train_data_dssm_v2_2,odps://graph_embedding/tables/hs_test_data_dssm_v2_2" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=True --attention_type=1 --num_epochs=10 --ckpt=hs_ugc_video_3e_4.ckpt" -DuseSparseClusterSchema=True;
- 构造交叉验证集
train_query : hs_dssm_dic_query_1 - | id | words_mainse_ids | se_keyword |
train_title : hs_dssm_dic_title_3 - | id | words_mainse_ids | title |
inference_query : hs_dssm_dic_query_inf_1 - | id | words_mainse_ids | query |
inference_title : hs_dssm_dic_title_inf_1 - | id | words_mainse_ids | title |
train_query & inference_title
-- Cross set 1: inference titles paired with random TRAIN query ids
-- (query_id drawn uniformly from [2, 10001) per title row).
-- NOTE(review): rand() is unseeded, so this table is not reproducible across runs.
-- FIX: standard SQL equality is "=", not "==".
create table hs_tmp_135 as select id as title_id, words_mainse_ids as title_ws, int(rand() * 9999 + 2) as query_id from hs_dssm_dic_title_inf_1;
create table hs_tmp_136 as
select a.query_id, b.words_mainse_ids as query_ws, a.title_id as video_id, a.title_ws as video_ws
from (select * from hs_tmp_135) a
left join (select * from hs_dssm_dic_query_1) b
on a.query_id = b.id;
inference_query & train_title
-- Cross set 2: 200k TRAIN titles paired with random INFERENCE query ids.
-- NOTE(review): LIMIT without ORDER BY and unseeded rand() make this non-deterministic.
-- FIX: standard SQL equality is "=", not "==".
create table hs_tmp_137 as select id as title_id, words_mainse_ids as title_ws, int(rand() * 9999 + 2) as query_id from hs_dssm_dic_title_3 limit 200000;
create table hs_tmp_138 as
select a.query_id, b.words_mainse_ids as query_ws, a.title_id as video_id, a.title_ws as video_ws
from (select * from hs_tmp_137) a
left join (select * from hs_dssm_dic_query_inf_1) b
on a.query_id = b.id;
train set
train_query : hs_dssm_dic_query_1 - | id | words_mainse_ids | se_keyword |
train_title : hs_dssm_dic_title_3 - | id | words_mainse_ids | title |
-- Train/train pairing: reuse hs_tmp_137's random query ids but join the TRAIN query dictionary.
-- FIX: standard SQL equality is "=", not "==".
create table hs_tmp_147 as
select a.query_id, b.words_mainse_ids as query_ws, a.title_id as video_id, a.title_ws as video_ws
from (select * from hs_tmp_137) a
left join (select * from hs_dssm_dic_query_1) b
on a.query_id = b.id;
- 测试交叉验证集效果
inference set : hs_tmp_129
-- Score the inference set hs_tmp_129 with the 3e_5 checkpoint.
-- FIX: "tabel" -> "table".
truncate table hs_dssm_result_0;
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="inference_v4.py" -Dcluster='{"worker":{"count":1, "cpu":200, "memory":4000}, "ps":{"count":1, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_tmp_129" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_0" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=False --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_3e_5.ckpt-3" -DuseSparseClusterSchema=True;
关联:
drop table hs_tmp_131;
yes
-- Rebuild the readable (query, title, score) view from the fresh hs_dssm_result_0.
-- FIX: standard SQL equality is "=", not "==".
create table hs_tmp_131 as
select c.query, d.title, c.score
from (
    select a.*, b.query
    from (select * from hs_dssm_result_0) a
    left join (select * from hs_tmp_122) b
    on a.query_id = b.query_id
) c
left join (select * from hs_tmp_123) d
on c.video_id = d.item_id;
train_query & inference_title : hs_tmp_136
-- Score cross set 1 (train queries x inference titles, hs_tmp_136).
-- FIX: "tabel" -> "table".
truncate table hs_dssm_result_2;
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="inference_v4.py" -Dcluster='{"worker":{"count":1, "cpu":200, "memory":4000}, "ps":{"count":1, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_tmp_136" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_2" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=False --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_3e_4.ckpt-3" -DuseSparseClusterSchema=True;
关联:
drop table hs_tmp_139;
yes
-- Readable view for cross set 1: train-query keyword, inference title, model score.
-- FIX: standard SQL equality is "=", not "==".
create table hs_tmp_139 as
select c.se_keyword, d.title, c.score
from (
    select a.*, b.se_keyword
    from (select * from hs_dssm_result_2) a
    left join (select * from hs_dssm_dic_query_1) b
    on a.query_id = b.id
) c
left join (select * from hs_dssm_dic_title_inf_1) d
on c.video_id = d.id;
inference_query & train_title : hs_tmp_138
-- Score cross set 2 (inference queries x train titles, hs_tmp_138).
-- FIX: "tabel" -> "table".
truncate table hs_dssm_result_3;
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="inference_v4.py" -Dcluster='{"worker":{"count":1, "cpu":200, "memory":4000}, "ps":{"count":1, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_tmp_138" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_3" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=False --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_3e_4.ckpt-3" -DuseSparseClusterSchema=True;
关联:
drop table hs_tmp_141;
yes
-- Readable view for cross set 2: inference query, train title, model score.
-- FIX: standard SQL equality is "=", not "==".
create table hs_tmp_141 as
select c.query, d.title, c.score
from (
    select a.*, b.query
    from (select * from hs_dssm_result_3) a
    left join (select * from hs_dssm_dic_query_inf_1) b
    on a.query_id = b.id
) c
left join (select * from hs_dssm_dic_title_3) d
on c.video_id = d.id;
train set : hs_tmp_133
train_query : hs_dssm_dic_query_1 - | id | words_mainse_ids | se_keyword |
train_title : hs_dssm_dic_title_3 - | id | words_mainse_ids | title |
-- Score the sampled train set hs_tmp_133.
-- FIX: "tabel" -> "table".
truncate table hs_dssm_result_1;
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="inference_v4.py" -Dcluster='{"worker":{"count":1, "cpu":200, "memory":4000}, "ps":{"count":1, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_tmp_133" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_1" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=False --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_3e_4.ckpt-3" -DuseSparseClusterSchema=True;
drop table hs_tmp_134;
yes
-- Re-pair predicted scores with ground-truth labels on (query_id, video_id).
-- FIX: standard SQL equality is "=", not "==".
create table hs_tmp_134 as
select a.score, b.*
from (select query_id, video_id, score from hs_dssm_result_1) a
left join (select * from hs_tmp_132) b
on a.query_id = b.query_id and a.video_id = b.video_id;
train set : hs_tmp_147
train_query : hs_dssm_dic_query_1 - | id | words_mainse_ids | se_keyword |
train_title : hs_dssm_dic_title_3 - | id | words_mainse_ids | title |
-- Score the train/train pairing hs_tmp_147 (overwrites hs_dssm_result_1 again).
-- FIX: "tabel" -> "table".
truncate table hs_dssm_result_1;
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="inference_v4.py" -Dcluster='{"worker":{"count":1, "cpu":200, "memory":4000}, "ps":{"count":1, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_tmp_147" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_1" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=False --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_3e_4.ckpt-3" -DuseSparseClusterSchema=True;
drop table hs_tmp_140;
yes
-- Readable view for the train/train pairing: train-query keyword, train title, score.
-- FIX: standard SQL equality is "=", not "==".
create table hs_tmp_140 as
select c.se_keyword, d.title, c.score
from (
    select a.*, b.se_keyword
    from (select * from hs_dssm_result_1) a
    left join (select * from hs_dssm_dic_query_1) b
    on a.query_id = b.id
) c
left join (select * from hs_dssm_dic_title_3) d
on c.video_id = d.id;
- 使用knn进行关联:效果不好
-- KNN-based association experiment (noted above as ineffective).
-- Query-side embeddings, keyed by query_id.
-- NOTE(review): hs_tmp_141 was earlier created with (query, title, score) columns;
-- this insert overwrite assumes it has since been recreated as (node_id, emb) -- confirm.
insert overwrite table hs_tmp_141
select distinct query_id as node_id, query_emb as emb from hs_dssm_result_0;
-- Video-side embeddings, keyed by video_id.
insert overwrite table hs_tmp_142
select distinct video_id as node_id, video_emb as emb from hs_dssm_result_0;
drop table if exists graph_embedding.hs_tmp_143;
yes
-- Output table for the nearest-neighbor job below; auto-expires after 14 days.
create table if not exists graph_embedding.hs_tmp_143(
node_id bigint,
emb string
) LIFECYCLE 14;
-- KNN retrieval: for every query embedding (hs_tmp_141) find the top-10 nearest
-- video embeddings (hs_tmp_142) under L2 distance; results go to hs_tmp_143.
-- FIX: -Dcluster used unescaped double quotes inside a double-quoted value
-- ("{"worker"...}"); single-quote the JSON, consistent with every other
-- pai command in this file.
PAI -name am_vsearch_nearest_neighbor_014 -project algo_market
-Dcluster='{"worker":{"count":40,"gpu":100}}'
-Ddim=64
-Did_col="node_id"
-Dvector_col="emb"
-Dinput_slice=40
-Dtopk=10
-Dnprob=512
-Dmetric="l2"
-Dinput="odps://graph_embedding/tables/hs_tmp_142"
-Dquery="odps://graph_embedding/tables/hs_tmp_141"
-Doutputs="odps://graph_embedding/tables/hs_tmp_143"
-DenableDynamicCluster=true -DmaxTrainingTimeInHour=60;
分割结果:效果不好
drop table hs_tmp_144;
yes
-- Explode the space-separated neighbor list: one (query_id, title_id) row per neighbor.
create table hs_tmp_144 as select bi_udf:bi_split_value(node_id, emb, " ") as (query_id, title_id) from hs_tmp_143;
drop table hs_tmp_145;
yes
-- Split each "title_id:score" pair into separate columns via the project UDTF hs_split.
create table hs_tmp_145 as select graph_embedding:hs_split(query_id, title_id, ":") as (query_id, title_id, score) from hs_tmp_144;
inference_query : hs_dssm_dic_query_inf_1 - | id | words_mainse_ids | query |
inference_title : hs_dssm_dic_title_inf_1 - | id | words_mainse_ids | title |
-- Readable view of the KNN results: inference query text, inference title text, score.
-- FIX: standard SQL equality is "=", not "==".
create table hs_tmp_146 as
select c.query, d.title, c.score
from (
    select a.*, b.query
    from (select * from hs_tmp_145) a
    left join (select * from hs_dssm_dic_query_inf_1) b
    on a.query_id = b.id
) c
left join (select * from hs_dssm_dic_title_inf_1) d
on c.title_id = d.id;
- 训练集应该没有问题吧
train_query : hs_dssm_dic_query_1 - | id | words_mainse_ids | se_keyword |
train_title : hs_dssm_dic_title_3 - | id | words_mainse_ids | title |
drop table hs_tmp_145;
yes
-- Sanity check of the training set: rebuild readable (se_keyword, title, label) rows.
-- FIX: standard SQL equality is "=", not "==".
-- NOTE(review): d.title is aliased as title_mainse_ws although hs_dssm_dic_title_3 also
-- carries words_mainse_ids; confirm the raw title (not the token ids) is intended here.
create table hs_tmp_145
as select c.se_keyword, d.title as title_mainse_ws, c.label
from (
    select a.words_mainse_ids as se_keyword_mainse_ws, a.se_keyword, b.*
    from (select * from hs_dssm_dic_query_1) a
    right join (select * from hs_dssm_train_v2_0) b
    on a.id = b.query_id
) c
left join (select * from hs_dssm_dic_title_3) d
on c.item_id = d.id;
-
问题
4种组合的测试
词太短的影响
输入比较简单
分字会不会有所改善?
重新训练,测试train set
-- Retrain DSSM on the v2_2 split; this run saves checkpoints as hs_ugc_video_3e_5.ckpt
-- (distinct from the earlier 3e_4 run) so the two models can be compared on the train set.
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="train_v4.py" -Dcluster='{"worker":{"count":30, "cpu":200, "memory":4000}, "ps":{"count":10, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_train_data_dssm_v2_2,odps://graph_embedding/tables/hs_test_data_dssm_v2_2" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=True --attention_type=1 --num_epochs=10 --ckpt=hs_ugc_video_3e_5.ckpt" -DuseSparseClusterSchema=True;
- 把attention换成ave pool:
网友评论