昨天工作:
训练数据手动 id 化,对 DSSM 网络进行修改,并使用新数据对 DSSM 网络重新训练,现阶段训练结果 acc 0.94、auc 0.93。网络正在训练。
今天计划:
使用top query和ugc数据对网络进行效果测试,并分析改善方向
inference
-- Reset the inference output table before re-running inference.
-- FIX: "tabel" -> "table" (original statement would not parse).
truncate table hs_dssm_result_0;
-- Run DSSM inference (inference_v4.py) over hs_tmp_129, writing scores and embeddings to hs_dssm_result_0.
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="inference_v4.py" -Dcluster='{"worker":{"count":1, "cpu":200, "memory":4000}, "ps":{"count":1, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_tmp_129" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_0" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=False --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_3e_4.ckpt-3" -DuseSparseClusterSchema=True;
样本关联
hs_dssm_result_0:| query_id | video_id | score | query_emb | video_emb |
hs_tmp_122:| query_id | query |
hs_tmp_123:| item_id | title |
drop table hs_tmp_131;
yes
-- Join inference scores back to the human-readable query text (hs_tmp_122) and item title (hs_tmp_123).
-- FIX: standard SQL equality is "=", not "==".
-- NOTE(review): left joins keep rows whose query_id / video_id do not resolve; those get NULL query/title.
create table hs_tmp_131 as
select c.query, d.title, c.score
from (
    select a.*, b.query
    from (select * from hs_dssm_result_0) a
    left join (select * from hs_tmp_122) b
    on a.query_id = b.query_id
) c
left join (select * from hs_tmp_123) d
on c.video_id = d.item_id;
类目过滤
drop table if exists graph_embedding.hs_query_ugc_keywords_cate_top_;
yes
-- Aggregate category information per (query_id, se_keyword).
-- NOTE(review): process_query_cate(cate_id, freq) is selected while cate_id/freq are not in GROUP BY,
-- so it is presumably a user-defined aggregate over the (cate_id, freq) pairs of each group -- confirm.
-- LIFECYCLE 2: table auto-expires after 2 days (MaxCompute).
create table if not exists graph_embedding.hs_query_ugc_keywords_cate_top_ LIFECYCLE 2
as select query_id, se_keyword, process_query_cate(cate_id, freq) as cate_list
from graph_embedding.zj_query_ugc_keywords_cate_freq_infos_ where query_id != 0
group by query_id, se_keyword;
---依据query词拿到首页的item_list
drop table if exists graph_embedding.hs_query_ugc_keywords_page_item_list_;
yes
-- For each query keyword, collect the distinct items shown on its first result page.
create table if not exists graph_embedding.hs_query_ugc_keywords_page_item_list_ lifecycle 2
as select se_keyword, item_id from (
select distinct se_keyword, item_id from (
-- UDTF: explode the comma-separated item_list into one (se_keyword, item_id) row per item.
select bi_udf:bi_split_value(se_keyword, item_list, ",") as (se_keyword, item_id)
from (
-- Latest partition only; page_seq=1 restricts to first-page results.
select distinct se_keyword, item_list
from graph_embedding.jl_jingyan_query_related_top_query_detailed
where ds=MAX_PT('graph_embedding.jl_jingyan_query_related_top_query_detailed') and page_seq=1
)a
)b
)c;
-- Attach the category id of each (currently online) item on the query's first result page.
-- FIX: the original statement had no terminating semicolon, so it would swallow the following line.
select a.se_keyword, a.item_id, b.cate_id
from graph_embedding.hs_query_ugc_keywords_page_item_list_ a
join (
    select item_id, cate_id
    from tbcdm.dim_tb_itm
    where ds = MAX_PT('tbcdm.dim_tb_itm') and is_online = "Y"
) b
on a.item_id = b.item_id;
- 重新训练
-- Retrain DSSM (train_v4.py, 30 workers / 10 ps) on the v2_2 train/test split; checkpoints saved as hs_ugc_video_3e_4.ckpt.
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="train_v4.py" -Dcluster='{"worker":{"count":30, "cpu":200, "memory":4000}, "ps":{"count":10, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_train_data_dssm_v2_2,odps://graph_embedding/tables/hs_test_data_dssm_v2_2" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=True --attention_type=1 --num_epochs=10 --ckpt=hs_ugc_video_3e_4.ckpt" -DuseSparseClusterSchema=True;
- 使用训练测试集做测试
hs_dssm_dic_query_0:| id | words_mainse_ids |
hs_dssm_dic_title_2:| id | words_mainse_ids |
hs_dssm_train_v2_0:| query_id | item_id | label |
query_id, query_ws, video_id, video_ws
drop table hs_tmp_132;
yes
-- Rebuild the (query_ws, video_ws, label) training view: join the id->token-id dictionaries
-- onto the (query_id, item_id, label) training pairs.
-- FIX: standard SQL equality is "=", not "==".
create table hs_tmp_132
as select c.query_id, c.se_keyword_mainse_ws as query_ws, d.id as video_id, d.words_mainse_ids as video_ws, c.label
from (
    select a.words_mainse_ids as se_keyword_mainse_ws, b.*
    from (select * from hs_dssm_dic_query_0) a
    right join (select * from hs_dssm_train_v2_0) b
    on a.id = b.query_id
) c
left join (select * from hs_dssm_dic_title_2) d
on c.item_id = d.id;
-- Sample 200k rows for inference. NOTE(review): LIMIT without ORDER BY is not deterministic.
create table hs_tmp_133 as
select query_id, query_ws, video_id, video_ws from hs_tmp_132 limit 200000;
inference
-- Reset the output table, then score the 200k sampled train rows (hs_tmp_133).
-- FIX: "tabel" -> "table".
truncate table hs_dssm_result_1;
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="inference_v4.py" -Dcluster='{"worker":{"count":1, "cpu":200, "memory":4000}, "ps":{"count":1, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_tmp_133" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_1" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=False --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_3e_4.ckpt-3" -DuseSparseClusterSchema=True;
和真实label比较
drop table hs_tmp_134;
yes
-- Pair each predicted score with the ground-truth label of the same (query_id, video_id).
-- FIX: standard SQL equality is "=", not "==".
create table hs_tmp_134 as
select a.score, b.*
from (select query_id, video_id, score from hs_dssm_result_1) a
left join (select * from hs_tmp_132) b
on a.query_id = b.query_id and a.video_id = b.video_id;
- 训练集修正:将query为空的去掉(需要把query太短的去掉吗?)
-- Clean the training table: drop rows whose query is the sentinel '0' or whose
-- query/title token lists are NULL.
-- IMPROVEMENT: the original ran two full insert-overwrite passes over hs_tmp_124;
-- the combined predicate below yields the identical row set in a single scan
-- (NULL se_keyword_mainse_ws rows are dropped by the != '0' filter too, since
-- NULL != '0' evaluates to UNKNOWN).
insert overwrite table hs_tmp_124
select * from hs_tmp_124
where se_keyword_mainse_ws != '0'
  and se_keyword_mainse_ws is not NULL
  and title_mainse_ws is not NULL;
-- Re-split the cleaned hs_tmp_124 into train (80%) / test (20%) tables, then retrain.
drop table hs_train_data_dssm_v2_2;
yes
drop table hs_test_data_dssm_v2_2;
yes
PAI -name split -project algo_public
-DinputTableName=graph_embedding.hs_tmp_124
-Doutput1TableName=graph_embedding.hs_train_data_dssm_v2_2
-Doutput2TableName=graph_embedding.hs_test_data_dssm_v2_2
-Dfraction=0.8
-DmemSizePerCore=4096
-DcoreNum=100
;
-- Retrain DSSM (train_v4.py) on the fresh split; checkpoints saved as hs_ugc_video_3e_4.ckpt.
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="train_v4.py" -Dcluster='{"worker":{"count":30, "cpu":200, "memory":4000}, "ps":{"count":10, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_train_data_dssm_v2_2,odps://graph_embedding/tables/hs_test_data_dssm_v2_2" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=True --attention_type=1 --num_epochs=10 --ckpt=hs_ugc_video_3e_4.ckpt" -DuseSparseClusterSchema=True;
- 构造交叉验证集
train_query : hs_dssm_dic_query_1 - | id | words_mainse_ids | se_keyword |
train_title : hs_dssm_dic_title_3 - | id | words_mainse_ids | title |
inference_query : hs_dssm_dic_query_inf_1 - | id | words_mainse_ids | query |
inference_title : hs_dssm_dic_title_inf_1 - | id | words_mainse_ids | title |
train_query & inference_title
-- Cross set 1: inference titles paired with random TRAIN query ids
-- (query_id drawn uniformly from [2, 10001) per title row).
-- NOTE(review): rand() is unseeded, so this table is not reproducible across runs.
-- FIX: standard SQL equality is "=", not "==".
create table hs_tmp_135 as select id as title_id, words_mainse_ids as title_ws, int(rand() * 9999 + 2) as query_id from hs_dssm_dic_title_inf_1;
create table hs_tmp_136 as
select a.query_id, b.words_mainse_ids as query_ws, a.title_id as video_id, a.title_ws as video_ws
from (select * from hs_tmp_135) a
left join (select * from hs_dssm_dic_query_1) b
on a.query_id = b.id;
inference_query & train_title
-- Cross set 2: 200k TRAIN titles paired with random INFERENCE query ids.
-- NOTE(review): LIMIT without ORDER BY and unseeded rand() make this non-deterministic.
-- FIX: standard SQL equality is "=", not "==".
create table hs_tmp_137 as select id as title_id, words_mainse_ids as title_ws, int(rand() * 9999 + 2) as query_id from hs_dssm_dic_title_3 limit 200000;
create table hs_tmp_138 as
select a.query_id, b.words_mainse_ids as query_ws, a.title_id as video_id, a.title_ws as video_ws
from (select * from hs_tmp_137) a
left join (select * from hs_dssm_dic_query_inf_1) b
on a.query_id = b.id;
train set
train_query : hs_dssm_dic_query_1 - | id | words_mainse_ids | se_keyword |
train_title : hs_dssm_dic_title_3 - | id | words_mainse_ids | title |
-- Train/train pairing: reuse hs_tmp_137's random query ids but join the TRAIN query dictionary.
-- FIX: standard SQL equality is "=", not "==".
create table hs_tmp_147 as
select a.query_id, b.words_mainse_ids as query_ws, a.title_id as video_id, a.title_ws as video_ws
from (select * from hs_tmp_137) a
left join (select * from hs_dssm_dic_query_1) b
on a.query_id = b.id;
- 测试交叉验证集效果
inference set : hs_tmp_129
-- Score the inference set hs_tmp_129 with the 3e_5 checkpoint.
-- FIX: "tabel" -> "table".
truncate table hs_dssm_result_0;
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="inference_v4.py" -Dcluster='{"worker":{"count":1, "cpu":200, "memory":4000}, "ps":{"count":1, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_tmp_129" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_0" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=False --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_3e_5.ckpt-3" -DuseSparseClusterSchema=True;
关联:
drop table hs_tmp_131;
yes
-- Rebuild the readable (query, title, score) view from the fresh hs_dssm_result_0.
-- FIX: standard SQL equality is "=", not "==".
create table hs_tmp_131 as
select c.query, d.title, c.score
from (
    select a.*, b.query
    from (select * from hs_dssm_result_0) a
    left join (select * from hs_tmp_122) b
    on a.query_id = b.query_id
) c
left join (select * from hs_tmp_123) d
on c.video_id = d.item_id;
train_query & inference_title : hs_tmp_136
-- Score cross set 1 (train queries x inference titles, hs_tmp_136).
-- FIX: "tabel" -> "table".
truncate table hs_dssm_result_2;
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="inference_v4.py" -Dcluster='{"worker":{"count":1, "cpu":200, "memory":4000}, "ps":{"count":1, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_tmp_136" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_2" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=False --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_3e_4.ckpt-3" -DuseSparseClusterSchema=True;
关联:
drop table hs_tmp_139;
yes
-- Readable view for cross set 1: train-query keyword, inference title, model score.
-- FIX: standard SQL equality is "=", not "==".
create table hs_tmp_139 as
select c.se_keyword, d.title, c.score
from (
    select a.*, b.se_keyword
    from (select * from hs_dssm_result_2) a
    left join (select * from hs_dssm_dic_query_1) b
    on a.query_id = b.id
) c
left join (select * from hs_dssm_dic_title_inf_1) d
on c.video_id = d.id;
inference_query & train_title : hs_tmp_138
-- Score cross set 2 (inference queries x train titles, hs_tmp_138).
-- FIX: "tabel" -> "table".
truncate table hs_dssm_result_3;
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="inference_v4.py" -Dcluster='{"worker":{"count":1, "cpu":200, "memory":4000}, "ps":{"count":1, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_tmp_138" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_3" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=False --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_3e_4.ckpt-3" -DuseSparseClusterSchema=True;
关联:
drop table hs_tmp_141;
yes
-- Readable view for cross set 2: inference query, train title, model score.
-- FIX: standard SQL equality is "=", not "==".
create table hs_tmp_141 as
select c.query, d.title, c.score
from (
    select a.*, b.query
    from (select * from hs_dssm_result_3) a
    left join (select * from hs_dssm_dic_query_inf_1) b
    on a.query_id = b.id
) c
left join (select * from hs_dssm_dic_title_3) d
on c.video_id = d.id;
train set : hs_tmp_133
train_query : hs_dssm_dic_query_1 - | id | words_mainse_ids | se_keyword |
train_title : hs_dssm_dic_title_3 - | id | words_mainse_ids | title |
-- Score the sampled train set hs_tmp_133.
-- FIX: "tabel" -> "table".
truncate table hs_dssm_result_1;
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="inference_v4.py" -Dcluster='{"worker":{"count":1, "cpu":200, "memory":4000}, "ps":{"count":1, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_tmp_133" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_1" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=False --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_3e_4.ckpt-3" -DuseSparseClusterSchema=True;
drop table hs_tmp_134;
yes
-- Re-pair predicted scores with ground-truth labels on (query_id, video_id).
-- FIX: standard SQL equality is "=", not "==".
create table hs_tmp_134 as
select a.score, b.*
from (select query_id, video_id, score from hs_dssm_result_1) a
left join (select * from hs_tmp_132) b
on a.query_id = b.query_id and a.video_id = b.video_id;
train set : hs_tmp_147
train_query : hs_dssm_dic_query_1 - | id | words_mainse_ids | se_keyword |
train_title : hs_dssm_dic_title_3 - | id | words_mainse_ids | title |
-- Score the train/train pairing hs_tmp_147 (overwrites hs_dssm_result_1 again).
-- FIX: "tabel" -> "table".
truncate table hs_dssm_result_1;
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="inference_v4.py" -Dcluster='{"worker":{"count":1, "cpu":200, "memory":4000}, "ps":{"count":1, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_tmp_147" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_1" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=False --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_3e_4.ckpt-3" -DuseSparseClusterSchema=True;
drop table hs_tmp_140;
yes
-- Readable view for the train/train pairing: train-query keyword, train title, score.
-- FIX: standard SQL equality is "=", not "==".
create table hs_tmp_140 as
select c.se_keyword, d.title, c.score
from (
    select a.*, b.se_keyword
    from (select * from hs_dssm_result_1) a
    left join (select * from hs_dssm_dic_query_1) b
    on a.query_id = b.id
) c
left join (select * from hs_dssm_dic_title_3) d
on c.video_id = d.id;
- 使用knn进行关联:效果不好
-- KNN-based association experiment (noted above as ineffective).
-- Query-side embeddings, keyed by query_id.
-- NOTE(review): hs_tmp_141 was earlier created with (query, title, score) columns;
-- this insert overwrite assumes it has since been recreated as (node_id, emb) -- confirm.
insert overwrite table hs_tmp_141
select distinct query_id as node_id, query_emb as emb from hs_dssm_result_0;
-- Video-side embeddings, keyed by video_id.
insert overwrite table hs_tmp_142
select distinct video_id as node_id, video_emb as emb from hs_dssm_result_0;
drop table if exists graph_embedding.hs_tmp_143;
yes
-- Output table for the nearest-neighbor job below; auto-expires after 14 days.
create table if not exists graph_embedding.hs_tmp_143(
node_id bigint,
emb string
) LIFECYCLE 14;
-- KNN retrieval: for every query embedding (hs_tmp_141) find the top-10 nearest
-- video embeddings (hs_tmp_142) under L2 distance; results go to hs_tmp_143.
-- FIX: -Dcluster used unescaped double quotes inside a double-quoted value
-- ("{"worker"...}"); single-quote the JSON, consistent with every other
-- pai command in this file.
PAI -name am_vsearch_nearest_neighbor_014 -project algo_market
-Dcluster='{"worker":{"count":40,"gpu":100}}'
-Ddim=64
-Did_col="node_id"
-Dvector_col="emb"
-Dinput_slice=40
-Dtopk=10
-Dnprob=512
-Dmetric="l2"
-Dinput="odps://graph_embedding/tables/hs_tmp_142"
-Dquery="odps://graph_embedding/tables/hs_tmp_141"
-Doutputs="odps://graph_embedding/tables/hs_tmp_143"
-DenableDynamicCluster=true -DmaxTrainingTimeInHour=60;
分割结果:效果不好
drop table hs_tmp_144;
yes
-- Explode the space-separated neighbor list: one (query_id, title_id) row per neighbor.
create table hs_tmp_144 as select bi_udf:bi_split_value(node_id, emb, " ") as (query_id, title_id) from hs_tmp_143;
drop table hs_tmp_145;
yes
-- Split each "title_id:score" pair into separate columns via the project UDTF hs_split.
create table hs_tmp_145 as select graph_embedding:hs_split(query_id, title_id, ":") as (query_id, title_id, score) from hs_tmp_144;
inference_query : hs_dssm_dic_query_inf_1 - | id | words_mainse_ids | query |
inference_title : hs_dssm_dic_title_inf_1 - | id | words_mainse_ids | title |
-- Readable view of the KNN results: inference query text, inference title text, score.
-- FIX: standard SQL equality is "=", not "==".
create table hs_tmp_146 as
select c.query, d.title, c.score
from (
    select a.*, b.query
    from (select * from hs_tmp_145) a
    left join (select * from hs_dssm_dic_query_inf_1) b
    on a.query_id = b.id
) c
left join (select * from hs_dssm_dic_title_inf_1) d
on c.title_id = d.id;
- 训练集应该没有问题吧
train_query : hs_dssm_dic_query_1 - | id | words_mainse_ids | se_keyword |
train_title : hs_dssm_dic_title_3 - | id | words_mainse_ids | title |
drop table hs_tmp_145;
yes
-- Sanity check of the training set: rebuild readable (se_keyword, title, label) rows.
-- FIX: standard SQL equality is "=", not "==".
-- NOTE(review): d.title is aliased as title_mainse_ws although hs_dssm_dic_title_3 also
-- carries words_mainse_ids; confirm the raw title (not the token ids) is intended here.
create table hs_tmp_145
as select c.se_keyword, d.title as title_mainse_ws, c.label
from (
    select a.words_mainse_ids as se_keyword_mainse_ws, a.se_keyword, b.*
    from (select * from hs_dssm_dic_query_1) a
    right join (select * from hs_dssm_train_v2_0) b
    on a.id = b.query_id
) c
left join (select * from hs_dssm_dic_title_3) d
on c.item_id = d.id;
-
问题
4种组合的测试
词太短的影响
输入比较简单
分字会不会有所改善?
重新训练,测试train set
-- Retrain DSSM on the v2_2 split; this run saves checkpoints as hs_ugc_video_3e_5.ckpt
-- (distinct from the earlier 3e_4 run) so the two models can be compared on the train set.
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="train_v4.py" -Dcluster='{"worker":{"count":30, "cpu":200, "memory":4000}, "ps":{"count":10, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_train_data_dssm_v2_2,odps://graph_embedding/tables/hs_test_data_dssm_v2_2" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=True --attention_type=1 --num_epochs=10 --ckpt=hs_ugc_video_3e_5.ckpt" -DuseSparseClusterSchema=True;
- 把attention换成ave pool:
网友评论