2019-08-14 Work Progress

Author: Songger | Published 2019-08-15 00:17

Yesterday's work:
Filtered the image data. After filtering, roughly 5.3M images in the training set still need processing, and roughly 24K remain in the validation set. Image feature extraction is now complete, so network training can begin today.

Today's plan:

  1. mvdssm: image feature extraction is done; once the train/test data processing is finished, model training can begin;

  2. Keyword-table optimization: try several different word-segmentation schemes to enrich the query keyword vocabulary, and consider pruning unimportant query keywords; it would help a lot if we could get brand terms.

  3. Expand the keyword table.


train_query : hs_dssm_dic_query_1 - | id | words_mainse_ids | se_keyword |
train_title : hs_dssm_dic_title_3 - | id | words_mainse_ids | title |


inference_query : hs_dssm_dic_query_inf_1 - | id | words_mainse_ids | query |
inference_title : hs_dssm_dic_title_inf_1 - | id | words_mainse_ids | title |


create table hs_tmp_218 as select se_keyword as word from hs_dssm_dic_query_1;
insert into table hs_tmp_218 select query from hs_dssm_dic_query_inf_1;

create table hs_tmp_219 as select search_kg:alinlp_segment(word, "YOUKU", "0", "1") as keyword from hs_tmp_218;
insert into table hs_tmp_219 select search_kg:alinlp_segment(word, "GENERAL_CHN", "0", "1") as keyword from hs_tmp_218;
insert into table hs_tmp_219 select search_kg:alinlp_segment(word, "CONTENT_SEARCH", "0", "1") as keyword from hs_tmp_218;
insert into table hs_tmp_219 select search_kg:alinlp_segment(word, "MAINSE", "0", "1") as keyword from hs_tmp_218;
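The four segmenters will often produce the same tokenization for a given word, so a dedup pass keeps hs_tmp_219 compact (a minimal sketch, following the same insert-overwrite-distinct pattern used for hs_tmp_224 and hs_tmp_221 below):

insert overwrite table hs_tmp_219 select distinct keyword from hs_tmp_219;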

  1. mvdssm training-set data processing

  2. word emb

train_query : hs_tmp_202 : | id | emb |
train_title : hs_tmp_203 : | id | emb | ; hs_tmp_214 : | item_id | title | pic_url | pic_ws |
inference_query : hs_tmp_204 : | id | emb |
inference_title : hs_tmp_205 : | id | emb | ; hs_tmp_217 : | item_id | title | pic_url | pic_ws |

train_set : hs_dssm_train_v3_0 : | query_id | item_id | label |

  1. Build the train/test/validation sets

Train/test set construction: se_keyword_mainse_ws, title_mainse_ws, pic_mainse_ws

drop table hs_tmp_220;
yes
create table hs_tmp_220 as
select c.*, d.emb as title_mainse_ws from (select a.*, b.emb as se_keyword_mainse_ws from (select * from hs_dssm_train_v3_0)a join (select * from hs_tmp_202)b on a.query_id == b.id)c join (select * from hs_tmp_203)d on c.item_id == d.id;
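The inner joins silently drop any training pair whose query or title embedding is missing, so a quick coverage check is worth running (a sketch; the actual counts are not in this log):

select count(*) from hs_dssm_train_v3_0;
select count(*) from hs_tmp_220;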

create table hs_tmp_223 as
select a.*, b.pic_ws from hs_dssm_train_v3_0 a join hs_tmp_214 b on a.item_id == b.item_id;

create table hs_tmp_224 as
select a.se_keyword_mainse_ws, a.title_mainse_ws, b.pic_ws as pic_mainse_ws from hs_tmp_220 a join hs_tmp_223 b on a.item_id == b.item_id;

insert overwrite table hs_tmp_224 select distinct * from hs_tmp_224;

Validation set construction: query_id, query_ws, video_id, video_ws, pic_ws

create table hs_tmp_221 as
select e.query_id, e.title_id as video_id, e.query_ws, e.video_ws, f.pic_ws from
(select c.*, d.emb as video_ws from (select a.*, b.emb as query_ws from hs_tmp_157 a join hs_tmp_204 b on a.query_id == b.id)c join hs_tmp_205 d on c.title_id == d.id)e join hs_tmp_217 f on e.title_id == f.item_id;

insert overwrite table hs_tmp_221 select distinct * from hs_tmp_221;

train & inference

create table hs_tmp_231 as select * from hs_tmp_224 limit 1000000;
se_keyword_mainse_ws,title_mainse_ws, pic_mainse_ws, label
query_id, query_ws, video_id, video_ws, pic_ws

create table hs_tmp_250 as select *, 1 as label from hs_tmp_224;
create table hs_tmp_251 as select *, 1 as label from hs_tmp_231;
create table hs_tmp_253 as select * from hs_tmp_251 limit 10000;

pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="train_mv_dssm_v2.py" -Dcluster='{"worker":{"count":10, "cpu":200, "memory":4000}, "ps":{"count":10, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_tmp_250,odps://graph_embedding/tables/hs_tmp_251,odps://graph_embedding/tables/hs_tmp_245" -Doutputs="odps://graph_embedding/tables/hs_tmp_211" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=True --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_4e_5.ckpt" -DuseSparseClusterSchema=True;

hs_tmp_252 : | query_id | video_id | score | active_view |
hs_tmp_215: | item_id | title | pic_url |
hs_dssm_dic_query_inf_1 : | id | words_mainse_ids | query |

drop table hs_tmp_248;
yes
create table hs_tmp_248 as select c.*, d.query from
(select a.*, b.title, b.pic_url from hs_tmp_211 a join hs_tmp_215 b on a.video_id == b.item_id)c join hs_dssm_dic_query_inf_1 d on c.query_id == d.id;

  1. Rebuild the negative samples

graph_embedding.jl_jingyan_query_related_top_query_detailed :

create table if not exists graph_embedding.hs_query_ugc_keywords_cate_freq_infos_1
as select b.query_id, a.se_keyword, a.cate_id, a.freq from (
select se_keyword, cate_id, count(item_id) as freq
from (
select a.se_keyword, a.item_id, b.cate_id
from graph_embedding.hs_query_ugc_keywords_page_item_list_ a JOIN
(
select item_id, cate_id from tbcdm.dim_tb_itm
where ds=MAX_PT('tbcdm.dim_tb_itm') and is_online="Y"
)b on a.item_id=b.item_id
)c group by se_keyword, cate_id
)a join (
select id as query_id, query
from graph_embedding.hs_tmp_233
)b on a.se_keyword=b.query;

drop resource hs_udaf_82_1.py;
yes
add py /home/hengsong/query_co_video/src/hs_udaf_82_1.py;
CREATE FUNCTION hs_return_merge_2 AS hs_udaf_82_1.hs_return_merge USING hs_udaf_82_1.py;

create table hs_tmp_234 as select query_id, se_keyword, hs_return_merge_2(cate_id) as cate_ids from hs_query_ugc_keywords_cate_freq_infos_1 group by query_id, se_keyword;

add table graph_embedding.hs_tmp_234 as hs_tmp_234;

drop resource hs_udtf_82_1.py;
yes
add py /home/hengsong/query_co_video/src/hs_udtf_82_1.py;
CREATE FUNCTION return_query_sim_2 AS hs_udtf_82_1.return_query_sim USING hs_udtf_82_1.py,hs_tmp_234;

drop table hs_tmp_235;
yes
create table hs_tmp_235 as
select return_query_sim_2(query_id, se_keyword, cate_ids) as (query_id, query, corr_query, corr_len) from hs_tmp_234;

drop table hs_tmp_236;
yes
create table hs_tmp_236 as
select a.*, b.se_keyword from hs_tmp_235 a join hs_tmp_234 b on a.corr_query == b.query_id;

drop table hs_tmp_237;
yes
create table hs_tmp_237 as select a.* from hs_tmp_236 a where 5 > (select count(*) from hs_tmp_236 where query_id == a.query_id and corr_len > a.corr_len ) order by a.corr_len, a.query_id;

drop table hs_tmp_239;
yes
create table hs_tmp_239 as
select *,row_number() over (partition by query_id order by cast(corr_len as bigint) desc) from hs_tmp_236;

drop table hs_tmp_240;
yes
create table hs_tmp_240 as select query_id, query, corr_query, corr_len, se_keyword, _c1 as index from hs_tmp_239 where _c1 = 5 or _c1 = 50 or _c1 = 500 or _c1 = 5000;
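Side note: relying on the auto-generated column name _c1 is brittle. Aliasing the window column makes the filter self-documenting (a small variant, not what was actually run above):

create table hs_tmp_239 as
select *, row_number() over (partition by query_id order by cast(corr_len as bigint) desc) as rn from hs_tmp_236;
create table hs_tmp_240 as select query_id, query, corr_query, corr_len, se_keyword, rn as index from hs_tmp_239 where rn in (5, 50, 500, 5000);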

https://dacoolbaby.iteye.com/blog/1876638

To find how many items each query corresponds to:

create table hs_tmp_238 as select se_keyword, count(*) as freq from hs_query_ugc_keywords_page_item_list_ group by se_keyword order by freq desc;

Sample balancing

create table hs_tmp_241 as
select b.query_id, b.se_keyword, a.item_id from hs_query_ugc_keywords_page_item_list_ a join hs_tmp_234 b on a.se_keyword == b.se_keyword;

drop table hs_tmp_243;
yes
create table hs_tmp_243 as
select *,row_number() over (partition by se_keyword order by cast(query_id as bigint) desc) from hs_tmp_241;

create table hs_tmp_244 as
select * from hs_tmp_243 where _c1 < 6000;

drop table hs_tmp_242;
yes
create table hs_tmp_242 as
select query_id, item_id, 1 as label from hs_tmp_244;

insert into table hs_tmp_242 select distinct a.query_id, b.item_id, 0 as label from hs_tmp_240 a join hs_tmp_244 b on a.corr_query == b.query_id;
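Since the negatives are drawn from correlated queries, a (query_id, item_id) pair could in principle appear with both labels; a quick overlap check (sketch) catches that before training:

select count(*) from
(select query_id, item_id from hs_tmp_242 where label == 1)a join
(select query_id, item_id from hs_tmp_242 where label == 0)b
on a.query_id == b.query_id and a.item_id == b.item_id;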

insert overwrite table hs_tmp_242 select * from hs_tmp_242 DISTRIBUTE by random();
(the shuffle above was run nine times in a row)
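After shuffling, the positive/negative ratio is worth a quick look (sketch):

select label, count(*) as cnt from hs_tmp_242 group by label;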

train_query : hs_tmp_234 : | query_id | se_keyword | cate_ids |
train_title : hs_tmp_246 : | item_id | title |

This table does not need to be regenerated:
drop table hs_tmp_246;
yes
create table hs_tmp_246 as
select b.* from
(select distinct item_id from hs_tmp_244)a join
(
select item_id, title from tbcdm.dim_tb_itm
where ds=MAX_PT('tbcdm.dim_tb_itm') and is_online="Y"
)b
on a.item_id == b.item_id;

drop table hs_tmp_247;
yes
create table hs_tmp_247 as select c.*, d.title from (select a.*, b.se_keyword as query from hs_tmp_242 a join hs_tmp_234 b on a.query_id == b.query_id)c join hs_tmp_246 d on c.item_id == d.item_id;

hs_dssm_dic_query_1 : | id | words_mainse_ids | se_keyword |
hs_tmp_202 : | id | emb |
hs_tmp_247 : | query_id | item_id | label | query | title |
hs_tmp_203 : | id | emb |

target : se_keyword_mainse_ws,title_mainse_ws,label

drop table hs_tmp_249;
yes
create table hs_tmp_249 as
select d.se_keyword_mainse_ws, c.title_mainse_ws, c.label from
(select a.*, b.emb as title_mainse_ws from hs_tmp_247 a join hs_tmp_203 b on a.item_id == b.id)c join (select e.se_keyword as query, f.emb as se_keyword_mainse_ws from hs_dssm_dic_query_1 e join hs_tmp_202 f on e.id == f.id)d on c.query == d.query;

drop table hs_train_data_dssm_v2_7;
yes
drop table hs_test_data_dssm_v2_7;
yes
PAI -name split -project algo_public
-DinputTableName=graph_embedding.hs_tmp_249
-Doutput1TableName=graph_embedding.hs_train_data_dssm_v2_7
-Doutput2TableName=graph_embedding.hs_test_data_dssm_v2_7
-Dfraction=0.9
-DmemSizePerCore=4096
-DcoreNum=100
;
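With -Dfraction=0.9 the two outputs should split roughly 9:1; a quick check (sketch):

select count(*) from hs_train_data_dssm_v2_7;
select count(*) from hs_test_data_dssm_v2_7;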

pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="train_inference_v7.py" -Dcluster='{"worker":{"count":10, "cpu":200, "memory":4000}, "ps":{"count":10, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_train_data_dssm_v2_4,odps://graph_embedding/tables/hs_test_data_dssm_v2_4,odps://graph_embedding/tables/hs_tmp_207" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_3" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=True --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_4e_ .ckpt" -DuseSparseClusterSchema=True;

7 : 20190816112659657gpb2kbs9
4 : 20190816113025356gysftuvj2
