2019-08-14 Work Progress

Author: Songger | Published 2019-08-15 00:17

Yesterday's work:
Filtered the image data. After filtering, roughly 5.3M images in the training set still need processing, and roughly 24K remain in the validation set. Image feature extraction is now complete, so network training can begin today.

Today's plan:

  1. mvdssm: image feature extraction is done; once the train/test data processing is finished, model training can begin;

  2. Keyword-table optimization: try several different word-segmentation schemes to enrich the query keyword vocabulary, and consider pruning unimportant query keywords; it would help a lot if we could get brand terms.

  3. Expand the keyword table.


train_query : hs_dssm_dic_query_1 - | id | words_mainse_ids | se_keyword |
train_title : hs_dssm_dic_title_3 - | id | words_mainse_ids | title |


inference_query : hs_dssm_dic_query_inf_1 - | id | words_mainse_ids | query |
inference_title : hs_dssm_dic_title_inf_1 - | id | words_mainse_ids | title |


create table hs_tmp_218 as select se_keyword as word from hs_dssm_dic_query_1;
insert into table hs_tmp_218 select query from hs_dssm_dic_query_inf_1;

create table hs_tmp_219 as select search_kg:alinlp_segment(word, "YOUKU", "0", "1") as keyword from hs_tmp_218;
insert into table hs_tmp_219 select search_kg:alinlp_segment(word, "GENERAL_CHN", "0", "1") as keyword from hs_tmp_218;
insert into table hs_tmp_219 select search_kg:alinlp_segment(word, "CONTENT_SEARCH", "0", "1") as keyword from hs_tmp_218;
insert into table hs_tmp_219 select search_kg:alinlp_segment(word, "MAINSE", "0", "1") as keyword from hs_tmp_218;
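The four segmenters will often produce the same tokenization for a given word, so a dedup pass keeps hs_tmp_219 compact (a minimal sketch, following the same insert-overwrite-distinct pattern used for hs_tmp_224 and hs_tmp_221 below):

insert overwrite table hs_tmp_219 select distinct keyword from hs_tmp_219;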

  1. mvdssm training-set data processing

  2. word emb

train_query : hs_tmp_202 : | id | emb |
train_title : hs_tmp_203 : | id | emb | ; hs_tmp_214 : | item_id | title | pic_url | pic_ws |
inference_query : hs_tmp_204 : | id | emb |
inference_title : hs_tmp_205 : | id | emb | ; hs_tmp_217 : | item_id | title | pic_url | pic_ws |

train_set : hs_dssm_train_v3_0 : | query_id | item_id | label |

  1. Build the train/test/validation sets

Train/test set construction: se_keyword_mainse_ws, title_mainse_ws, pic_mainse_ws

drop table hs_tmp_220;
yes
create table hs_tmp_220 as
select c.*, d.emb as title_mainse_ws from (select a.*, b.emb as se_keyword_mainse_ws from (select * from hs_dssm_train_v3_0)a join (select * from hs_tmp_202)b on a.query_id == b.id)c join (select * from hs_tmp_203)d on c.item_id == d.id;
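The inner joins silently drop any training pair whose query or title embedding is missing, so a quick coverage check is worth running (a sketch; the actual counts are not in this log):

select count(*) from hs_dssm_train_v3_0;
select count(*) from hs_tmp_220;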

create table hs_tmp_223 as
select a.*, b.pic_ws from hs_dssm_train_v3_0 a join hs_tmp_214 b on a.item_id == b.item_id;

create table hs_tmp_224 as
select a.se_keyword_mainse_ws, a.title_mainse_ws, b.pic_ws as pic_mainse_ws from hs_tmp_220 a join hs_tmp_223 b on a.item_id == b.item_id;

insert overwrite table hs_tmp_224 select distinct * from hs_tmp_224;

Validation set construction: query_id, query_ws, video_id, video_ws, pic_ws

create table hs_tmp_221 as
select e.query_id, e.title_id as video_id, e.query_ws, e.video_ws, f.pic_ws from
(select c.*, d.emb as video_ws from (select a.*, b.emb as query_ws from hs_tmp_157 a join hs_tmp_204 b on a.query_id == b.id)c join hs_tmp_205 d on c.title_id == d.id)e join hs_tmp_217 f on e.title_id == f.item_id;

insert overwrite table hs_tmp_221 select distinct * from hs_tmp_221;

train & inference

create table hs_tmp_231 as select * from hs_tmp_224 limit 1000000;
se_keyword_mainse_ws,title_mainse_ws, pic_mainse_ws, label
query_id, query_ws, video_id, video_ws, pic_ws

create table hs_tmp_250 as select *, 1 as label from hs_tmp_224;
create table hs_tmp_251 as select *, 1 as label from hs_tmp_231;
create table hs_tmp_253 as select * from hs_tmp_251 limit 10000;

pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="train_mv_dssm_v2.py" -Dcluster='{"worker":{"count":10, "cpu":200, "memory":4000}, "ps":{"count":10, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_tmp_250,odps://graph_embedding/tables/hs_tmp_251,odps://graph_embedding/tables/hs_tmp_245" -Doutputs="odps://graph_embedding/tables/hs_tmp_211" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=True --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_4e_5.ckpt" -DuseSparseClusterSchema=True;

hs_tmp_252 : | query_id | video_id | score | active_view |
hs_tmp_215: | item_id | title | pic_url |
hs_dssm_dic_query_inf_1 : | id | words_mainse_ids | query |

drop table hs_tmp_248;
yes
create table hs_tmp_248 as select c.*, d.query from
(select a.*, b.title, b.pic_url from hs_tmp_211 a join hs_tmp_215 b on a.video_id == b.item_id)c join hs_dssm_dic_query_inf_1 d on c.query_id == d.id;

  1. Rebuild the negative samples

graph_embedding.jl_jingyan_query_related_top_query_detailed :

create table if not exists graph_embedding.hs_query_ugc_keywords_cate_freq_infos_1
as select b.query_id, a.se_keyword, a.cate_id, a.freq from (
select se_keyword, cate_id, count(item_id) as freq
from (
select a.se_keyword, a.item_id, b.cate_id
from graph_embedding.hs_query_ugc_keywords_page_item_list_ a JOIN
(
select item_id, cate_id from tbcdm.dim_tb_itm
where ds=MAX_PT('tbcdm.dim_tb_itm') and is_online="Y"
)b on a.item_id=b.item_id
)c group by se_keyword, cate_id
)a join (
select id as query_id, query
from graph_embedding.hs_tmp_233
)b on a.se_keyword=b.query;

drop resource hs_udaf_82_1.py;
yes
add py /home/hengsong/query_co_video/src/hs_udaf_82_1.py;
CREATE FUNCTION hs_return_merge_2 AS hs_udaf_82_1.hs_return_merge USING hs_udaf_82_1.py;

create table hs_tmp_234 as select query_id, se_keyword, hs_return_merge_2(cate_id) as cate_ids from hs_query_ugc_keywords_cate_freq_infos_1 group by query_id, se_keyword;

add table graph_embedding.hs_tmp_234 as hs_tmp_234;

drop resource hs_udtf_82_1.py;
yes
add py /home/hengsong/query_co_video/src/hs_udtf_82_1.py;
CREATE FUNCTION return_query_sim_2 AS hs_udtf_82_1.return_query_sim USING hs_udtf_82_1.py,hs_tmp_234;

drop table hs_tmp_235;
yes
create table hs_tmp_235 as
select return_query_sim_2(query_id, se_keyword, cate_ids) as (query_id, query, corr_query, corr_len) from hs_tmp_234;

drop table hs_tmp_236;
yes
create table hs_tmp_236 as
select a.*, b.se_keyword from hs_tmp_235 a join hs_tmp_234 b on a.corr_query == b.query_id;

drop table hs_tmp_237;
yes
create table hs_tmp_237 as select a.* from hs_tmp_236 a where 5 > (select count(*) from hs_tmp_236 where query_id == a.query_id and corr_len > a.corr_len ) order by a.corr_len, a.query_id;

drop table hs_tmp_239;
yes
create table hs_tmp_239 as
select *,row_number() over (partition by query_id order by cast(corr_len as bigint) desc) from hs_tmp_236;

drop table hs_tmp_240;
yes
create table hs_tmp_240 as select query_id, query, corr_query, corr_len, se_keyword, _c1 as index from hs_tmp_239 where _c1 = 5 or _c1 = 50 or _c1 = 500 or _c1 = 5000;
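Side note: relying on the auto-generated column name _c1 is brittle. Aliasing the window column makes the filter self-documenting (a small variant, not what was actually run above):

create table hs_tmp_239 as
select *, row_number() over (partition by query_id order by cast(corr_len as bigint) desc) as rn from hs_tmp_236;
create table hs_tmp_240 as select query_id, query, corr_query, corr_len, se_keyword, rn as index from hs_tmp_239 where rn in (5, 50, 500, 5000);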

https://dacoolbaby.iteye.com/blog/1876638

To find how many items each query corresponds to:

create table hs_tmp_238 as select se_keyword, count(*) as freq from hs_query_ugc_keywords_page_item_list_ group by se_keyword order by freq desc;

Sample balancing

create table hs_tmp_241 as
select b.query_id, b.se_keyword, a.item_id from hs_query_ugc_keywords_page_item_list_ a join hs_tmp_234 b on a.se_keyword == b.se_keyword;

drop table hs_tmp_243;
yes
create table hs_tmp_243 as
select *,row_number() over (partition by se_keyword order by cast(query_id as bigint) desc) from hs_tmp_241;

create table hs_tmp_244 as
select * from hs_tmp_243 where _c1 < 6000;

drop table hs_tmp_242;
yes
create table hs_tmp_242 as
select query_id, item_id, 1 as label from hs_tmp_244;

insert into table hs_tmp_242 select distinct a.query_id, b.item_id, 0 as label from hs_tmp_240 a join hs_tmp_244 b on a.corr_query == b.query_id;
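Since the negatives are drawn from correlated queries, a (query_id, item_id) pair could in principle appear with both labels; a quick overlap check (sketch) catches that before training:

select count(*) from
(select query_id, item_id from hs_tmp_242 where label == 1)a join
(select query_id, item_id from hs_tmp_242 where label == 0)b
on a.query_id == b.query_id and a.item_id == b.item_id;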

insert overwrite table hs_tmp_242 select * from hs_tmp_242 DISTRIBUTE by random();
(the shuffle above was run nine times in a row)
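After shuffling, the positive/negative ratio is worth a quick look (sketch):

select label, count(*) as cnt from hs_tmp_242 group by label;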

train_query : hs_tmp_234 : | query_id | se_keyword | cate_ids |
train_title : hs_tmp_246 : | item_id | title |

This table does not need to be regenerated:
drop table hs_tmp_246;
yes
create table hs_tmp_246 as
select b.* from
(select distinct item_id from hs_tmp_244)a join
(
select item_id, title from tbcdm.dim_tb_itm
where ds=MAX_PT('tbcdm.dim_tb_itm') and is_online="Y"
)b
on a.item_id == b.item_id;

drop table hs_tmp_247;
yes
create table hs_tmp_247 as select c.*, d.title from (select a.*, b.se_keyword as query from hs_tmp_242 a join hs_tmp_234 b on a.query_id == b.query_id)c join hs_tmp_246 d on c.item_id == d.item_id;

hs_dssm_dic_query_1 : | id | words_mainse_ids | se_keyword |
hs_tmp_202 : | id | emb |
hs_tmp_247 : | query_id | item_id | label | query | title |
hs_tmp_203 : | id | emb |

target : se_keyword_mainse_ws,title_mainse_ws,label

drop table hs_tmp_249;
yes
create table hs_tmp_249 as
select d.se_keyword_mainse_ws, c.title_mainse_ws, c.label from
(select a.*, b.emb as title_mainse_ws from hs_tmp_247 a join hs_tmp_203 b on a.item_id == b.id)c join (select e.se_keyword as query, f.emb as se_keyword_mainse_ws from hs_dssm_dic_query_1 e join hs_tmp_202 f on e.id == f.id)d on c.query == d.query;

drop table hs_train_data_dssm_v2_7;
yes
drop table hs_test_data_dssm_v2_7;
yes
PAI -name split -project algo_public
-DinputTableName=graph_embedding.hs_tmp_249
-Doutput1TableName=graph_embedding.hs_train_data_dssm_v2_7
-Doutput2TableName=graph_embedding.hs_test_data_dssm_v2_7
-Dfraction=0.9
-DmemSizePerCore=4096
-DcoreNum=100
;
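With -Dfraction=0.9 the two outputs should split roughly 9:1; a quick check (sketch):

select count(*) from hs_train_data_dssm_v2_7;
select count(*) from hs_test_data_dssm_v2_7;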

pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="train_inference_v7.py" -Dcluster='{"worker":{"count":10, "cpu":200, "memory":4000}, "ps":{"count":10, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_train_data_dssm_v2_4,odps://graph_embedding/tables/hs_test_data_dssm_v2_4,odps://graph_embedding/tables/hs_tmp_207" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_3" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=True --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_4e_ .ckpt" -DuseSparseClusterSchema=True;

7 : 20190816112659657gpb2kbs9
4 : 20190816113025356gysftuvj2
