
20190731 Work Progress

Author: Songger | Published 2019-08-01 01:00

Yesterday's work:
Manually converted the training data to IDs, modified the DSSM network, and retrained it on the new data; current results: acc 0.94, AUC 0.93. Training is still running.

Today's plan:
Evaluate the network on top-query and UGC data, and analyze directions for improvement.

inference

truncate table hs_dssm_result_0;
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="inference_v4.py" -Dcluster='{"worker":{"count":1, "cpu":200, "memory":4000}, "ps":{"count":1, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_tmp_129" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_0" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=False --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_3e_4.ckpt-3" -DuseSparseClusterSchema=True;

Sample join

hs_dssm_result_0:| query_id | video_id | score | query_emb | video_emb |
hs_tmp_122:| query_id | query |
hs_tmp_123:| item_id | title |

drop table hs_tmp_131;
yes
create table hs_tmp_131 as
select c.query, d.title, c.score from
(select a.*, b.query from (select * from hs_dssm_result_0)a left join (select * from hs_tmp_122)b on a.query_id == b.query_id)c left join (select * from hs_tmp_123)d on c.video_id == d.item_id;
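A quick spot-check of the joined results, as a minimal sketch: it assumes a higher score means a better query-title match, so flip the ordering if the model emits a distance instead.

-- eyeball the top-scoring query/title pairs
select query, title, score from hs_tmp_131 order by score desc limit 100;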

Category filtering

drop table if exists graph_embedding.hs_query_ugc_keywords_cate_top_;
yes
create table if not exists graph_embedding.hs_query_ugc_keywords_cate_top_ LIFECYCLE 2
as select query_id, se_keyword, process_query_cate(cate_id, freq) as cate_list
from graph_embedding.zj_query_ugc_keywords_cate_freq_infos_ where query_id != 0
group by query_id, se_keyword;

--- Get the first-page item_list for each query term

drop table if exists graph_embedding.hs_query_ugc_keywords_page_item_list_;
yes
create table if not exists graph_embedding.hs_query_ugc_keywords_page_item_list_ lifecycle 2
as select se_keyword, item_id from (
select distinct se_keyword, item_id from (
select bi_udf:bi_split_value(se_keyword, item_list, ",") as (se_keyword, item_id)
from (
select distinct se_keyword, item_list
from graph_embedding.jl_jingyan_query_related_top_query_detailed
where ds=MAX_PT('graph_embedding.jl_jingyan_query_related_top_query_detailed') and page_seq=1
)a
)b
)c;

select a.se_keyword, a.item_id, b.cate_id
from graph_embedding.hs_query_ugc_keywords_page_item_list_ a JOIN
(
select item_id, cate_id from tbcdm.dim_tb_itm
where ds=MAX_PT('tbcdm.dim_tb_itm') and is_online="Y"
)b on a.item_id=b.item_id

  1. Retrain

pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="train_v4.py" -Dcluster='{"worker":{"count":30, "cpu":200, "memory":4000}, "ps":{"count":10, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_train_data_dssm_v2_2,odps://graph_embedding/tables/hs_test_data_dssm_v2_2" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=True --attention_type=1 --num_epochs=10 --ckpt=hs_ugc_video_3e_4.ckpt" -DuseSparseClusterSchema=True;

  1. Test with the train/test split

hs_dssm_dic_query_0:| id | words_mainse_ids |
hs_dssm_dic_title_2:| id | words_mainse_ids |
hs_dssm_train_v2_0:| query_id | item_id | label |

query_id, query_ws, video_id, video_ws

drop table hs_tmp_132;
yes
create table hs_tmp_132
as select c.query_id, c.se_keyword_mainse_ws as query_ws, d.id as video_id, d.words_mainse_ids as video_ws, c.label from
(select a.words_mainse_ids as se_keyword_mainse_ws, b.* from(select * from hs_dssm_dic_query_0)a right join (select * from hs_dssm_train_v2_0)b on a.id == b.query_id)c left join (select * from hs_dssm_dic_title_2)d on c.item_id == d.id;

create table hs_tmp_133 as
select query_id, query_ws, video_id, video_ws from hs_tmp_132 limit 200000;

inference

truncate table hs_dssm_result_1;
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="inference_v4.py" -Dcluster='{"worker":{"count":1, "cpu":200, "memory":4000}, "ps":{"count":1, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_tmp_133" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_1" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=False --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_3e_4.ckpt-3" -DuseSparseClusterSchema=True;

Compare with ground-truth labels

drop table hs_tmp_134;
yes
create table hs_tmp_134 as
select a.score, b.* from
(select query_id, video_id, score from hs_dssm_result_1)a left join (select * from hs_tmp_132)b on a.query_id==b.query_id and a.video_id == b.video_id;
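A rough label-agreement check on top of hs_tmp_134, as a hedged sketch: it assumes score is a match probability in [0,1] and label is 0/1, and thresholds at an arbitrary 0.5 cutoff.

-- approximate accuracy at a 0.5 cutoff; unmatched rows without labels are skipped
select avg(case when (score >= 0.5 and label = 1)
                  or (score < 0.5 and label = 0) then 1.0 else 0.0 end) as acc
from hs_tmp_134
where label is not null;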

  1. Training-set fix: drop rows whose query is empty (should overly short queries be dropped as well? See the sketch after the two statements below.)

insert overwrite table hs_tmp_124 select * from hs_tmp_124 where se_keyword_mainse_ws != '0';

insert overwrite table hs_tmp_124 select * from hs_tmp_124 where title_mainse_ws is not NULL and se_keyword_mainse_ws is not NULL;
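For the short-query question above, a hedged sketch: assuming se_keyword_mainse_ws is a space-separated ID sequence, the MaxCompute 2.0 builtins split and size can drop one-token queries; the minimum length of 2 is a guess to be tuned.

-- drop queries with fewer than two tokens (threshold is an assumption)
insert overwrite table hs_tmp_124
select * from hs_tmp_124
where size(split(se_keyword_mainse_ws, ' ')) >= 2;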

drop table hs_train_data_dssm_v2_2;
yes
drop table hs_test_data_dssm_v2_2;
yes
PAI -name split -project algo_public
-DinputTableName=graph_embedding.hs_tmp_124
-Doutput1TableName=graph_embedding.hs_train_data_dssm_v2_2
-Doutput2TableName=graph_embedding.hs_test_data_dssm_v2_2
-Dfraction=0.8
-DmemSizePerCore=4096
-DcoreNum=100
;

pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="train_v4.py" -Dcluster='{"worker":{"count":30, "cpu":200, "memory":4000}, "ps":{"count":10, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_train_data_dssm_v2_2,odps://graph_embedding/tables/hs_test_data_dssm_v2_2" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=True --attention_type=1 --num_epochs=10 --ckpt=hs_ugc_video_3e_4.ckpt" -DuseSparseClusterSchema=True;

  1. Build cross-validation sets

train_query : hs_dssm_dic_query_1 - | id | words_mainse_ids | se_keyword |
train_title : hs_dssm_dic_title_3 - | id | words_mainse_ids | title |

inference_query : hs_dssm_dic_query_inf_1 - | id | words_mainse_ids | query |
inference_title : hs_dssm_dic_title_inf_1 - | id | words_mainse_ids | title |

train_query & inference_title

create table hs_tmp_135 as select id as title_id, words_mainse_ids as title_ws, int(rand() * 9999 + 2) as query_id from hs_dssm_dic_title_inf_1;

create table hs_tmp_136 as
select a.query_id, b.words_mainse_ids as query_ws, a.title_id as video_id, a.title_ws as video_ws from
(select * from hs_tmp_135)a left join (select * from hs_dssm_dic_query_1)b on a.query_id == b.id;

inference_query & train_title

create table hs_tmp_137 as select id as title_id, words_mainse_ids as title_ws, int(rand() * 9999 + 2) as query_id from hs_dssm_dic_title_3 limit 200000;

create table hs_tmp_138 as
select a.query_id, b.words_mainse_ids as query_ws, a.title_id as video_id, a.title_ws as video_ws from
(select * from hs_tmp_137)a left join (select * from hs_dssm_dic_query_inf_1)b on a.query_id == b.id;

train set

train_query : hs_dssm_dic_query_1 - | id | words_mainse_ids | se_keyword |
train_title : hs_dssm_dic_title_3 - | id | words_mainse_ids | title |

create table hs_tmp_147 as
select a.query_id, b.words_mainse_ids as query_ws, a.title_id as video_id, a.title_ws as video_ws from
(select * from hs_tmp_137)a left join (select * from hs_dssm_dic_query_1)b on a.query_id == b.id;

  1. Evaluate the cross-validation sets

inference set : hs_tmp_129

truncate table hs_dssm_result_0;
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="inference_v4.py" -Dcluster='{"worker":{"count":1, "cpu":200, "memory":4000}, "ps":{"count":1, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_tmp_129" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_0" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=False --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_3e_5.ckpt-3" -DuseSparseClusterSchema=True;

Join:

drop table hs_tmp_131;
yes
create table hs_tmp_131 as
select c.query, d.title, c.score from
(select a.*, b.query from (select * from hs_dssm_result_0)a left join (select * from hs_tmp_122)b on a.query_id == b.query_id)c left join (select * from hs_tmp_123)d on c.video_id == d.item_id;

train_query & inference_title : hs_tmp_136

truncate table hs_dssm_result_2;
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="inference_v4.py" -Dcluster='{"worker":{"count":1, "cpu":200, "memory":4000}, "ps":{"count":1, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_tmp_136" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_2" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=False --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_3e_4.ckpt-3" -DuseSparseClusterSchema=True;

Join:

drop table hs_tmp_139;
yes
create table hs_tmp_139 as
select c.se_keyword, d.title, c.score from
(select a.*, b.se_keyword from (select * from hs_dssm_result_2)a left join (select * from hs_dssm_dic_query_1)b on a.query_id == b.id)c left join (select * from hs_dssm_dic_title_inf_1)d on c.video_id == d.id;

inference_query & train_title : hs_tmp_138

truncate table hs_dssm_result_3;
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="inference_v4.py" -Dcluster='{"worker":{"count":1, "cpu":200, "memory":4000}, "ps":{"count":1, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_tmp_138" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_3" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=False --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_3e_4.ckpt-3" -DuseSparseClusterSchema=True;

Join:

drop table hs_tmp_141;
yes
create table hs_tmp_141 as
select c.query, d.title, c.score from
(select a.*, b.query from (select * from hs_dssm_result_3)a left join (select * from hs_dssm_dic_query_inf_1)b on a.query_id == b.id)c left join (select * from hs_dssm_dic_title_3)d on c.video_id == d.id;

train set : hs_tmp_133

train_query : hs_dssm_dic_query_1 - | id | words_mainse_ids | se_keyword |
train_title : hs_dssm_dic_title_3 - | id | words_mainse_ids | title |

truncate table hs_dssm_result_1;
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="inference_v4.py" -Dcluster='{"worker":{"count":1, "cpu":200, "memory":4000}, "ps":{"count":1, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_tmp_133" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_1" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=False --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_3e_4.ckpt-3" -DuseSparseClusterSchema=True;

drop table hs_tmp_134;
yes
create table hs_tmp_134 as
select a.score, b.* from
(select query_id, video_id, score from hs_dssm_result_1)a left join (select * from hs_tmp_132)b on a.query_id==b.query_id and a.video_id == b.video_id;

train set : hs_tmp_147

train_query : hs_dssm_dic_query_1 - | id | words_mainse_ids | se_keyword |
train_title : hs_dssm_dic_title_3 - | id | words_mainse_ids | title |

truncate table hs_dssm_result_1;
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="inference_v4.py" -Dcluster='{"worker":{"count":1, "cpu":200, "memory":4000}, "ps":{"count":1, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_tmp_147" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_1" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=False --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_3e_4.ckpt-3" -DuseSparseClusterSchema=True;

drop table hs_tmp_140;
yes
create table hs_tmp_140 as
select c.se_keyword, d.title, c.score from
(select a.*, b.se_keyword from (select * from hs_dssm_result_1)a left join (select * from hs_dssm_dic_query_1)b on a.query_id == b.id)c left join (select * from hs_dssm_dic_title_3)d on c.video_id == d.id;

  1. Use kNN for retrieval: results are poor

insert overwrite table hs_tmp_141
select distinct query_id as node_id, query_emb as emb from hs_dssm_result_0;

insert overwrite table hs_tmp_142
select distinct video_id as node_id, video_emb as emb from hs_dssm_result_0;

drop table if exists graph_embedding.hs_tmp_143;
yes
create table if not exists graph_embedding.hs_tmp_143(
node_id bigint,
emb string
) LIFECYCLE 14;

PAI -name am_vsearch_nearest_neighbor_014 -project algo_market
-Dcluster='{"worker":{"count":40,"gpu":100}}'
-Ddim=64
-Did_col="node_id"
-Dvector_col="emb"
-Dinput_slice=40
-Dtopk=10
-Dnprob=512
-Dmetric="l2"
-Dinput="odps://graph_embedding/tables/hs_tmp_142"
-Dquery="odps://graph_embedding/tables/hs_tmp_141"
-Doutputs="odps://graph_embedding/tables/hs_tmp_143"
-DenableDynamicCluster=true -DmaxTrainingTimeInHour=60;

Split the kNN output: results are poor

drop table hs_tmp_144;
yes
create table hs_tmp_144 as select bi_udf:bi_split_value(node_id, emb, " ") as (query_id, title_id) from hs_tmp_143;

drop table hs_tmp_145;
yes
create table hs_tmp_145 as select graph_embedding:hs_split(query_id, title_id, ":") as (query_id, title_id, score) from hs_tmp_144;

inference_query : hs_dssm_dic_query_inf_1 - | id | words_mainse_ids | query |
inference_title : hs_dssm_dic_title_inf_1 - | id | words_mainse_ids | title |

create table hs_tmp_146 as
select c.query, d.title, c.score from
(select a.*, b.query from (select * from hs_tmp_145)a left join (select * from hs_dssm_dic_query_inf_1)b on a.query_id == b.id)c left join (select * from hs_dssm_dic_title_inf_1)d on c.title_id == d.id;
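Since the kNN job ran with the l2 metric, the parsed score is a distance and smaller means closer; a minimal sketch for eyeballing the closest pairs (it assumes hs_split emits score as a string, hence the explicit cast).

-- closest query/title pairs first
select query, title, score from hs_tmp_146 order by cast(score as double) asc limit 100;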

  1. The training set itself should be fine, right?

train_query : hs_dssm_dic_query_1 - | id | words_mainse_ids | se_keyword |
train_title : hs_dssm_dic_title_3 - | id | words_mainse_ids | title |

drop table hs_tmp_145;
yes
create table hs_tmp_145
as select c.se_keyword, d.title as title_mainse_ws, c.label from
(select a.words_mainse_ids as se_keyword_mainse_ws, a.se_keyword, b.* from(select * from hs_dssm_dic_query_1)a right join (select * from hs_dssm_train_v2_0)b on a.id == b.query_id)c left join (select * from hs_dssm_dic_title_3)d on c.item_id == d.id;

  1. Open issues
    Test all four query/title combinations
    The impact of overly short terms
    The input features are fairly simple
    Would character-level tokenization help?

  2. Retrain, then test on the train set

pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="train_v4.py" -Dcluster='{"worker":{"count":30, "cpu":200, "memory":4000}, "ps":{"count":10, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_train_data_dssm_v2_2,odps://graph_embedding/tables/hs_test_data_dssm_v2_2" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=True --attention_type=1 --num_epochs=10 --ckpt=hs_ugc_video_3e_5.ckpt" -DuseSparseClusterSchema=True;

  1. Replace attention with average pooling:

http://logview.odps.aliyun-inc.com:8080/logview/?h=http://service-corp.odps.aliyun-inc.com/api&p=graph_embedding&i=20190731165543924gr2n4gep2_1e032e5a_0693_40ff_abc8_8e075ebaa001&token=NHkrK0tRK0RWWTdOcEVEMG1RZ0dySHdJT0Q0PSxPRFBTX09CTzoxMjkzMzAzOTgzMjUxNTQ4LDE1NjUxOTY5NDUseyJTdGF0ZW1lbnQiOlt7IkFjdGlvbiI6WyJvZHBzOlJlYWQiXSwiRWZmZWN0IjoiQWxsb3ciLCJSZXNvdXJjZSI6WyJhY3M6b2RwczoqOnByb2plY3RzL2dyYXBoX2VtYmVkZGluZy9pbnN0YW5jZXMvMjAxOTA3MzExNjU1NDM5MjRncjJuNGdlcDJfMWUwMzJlNWFfMDY5M180MGZmX2FiYzhfOGUwNzVlYmFhMDAxIl19XSwiVmVyc2lvbiI6IjEifQ==
