2019-08-28工作进展

作者: Songger | 来源:发表于2019-08-28 19:04 被阅读0次

2019-08-28工作进展
2019-08-28工作进展
2019-08-28
[补]Lan的ScalersTalk第四轮新概念朗读持续力训练D
说了不一定有机会，但不说一定没机会
它是四川街边小吃第一名！我用三分钟就做出来了！
DALS028-批次效应04-因子分析
工作进展
文先森的日常 -- 第一天
行事手段

哪些图片需要保留？
保留条件（where 子句）：image_position > 0 and image_position < 6 and type = 1;

-- Copy of hs_tmp_318, requested in item_id order.
-- NOTE(review): MaxCompute by default rejects ORDER BY without LIMIT
-- (odps.sql.validate.orderby.limit) — confirm the session setting before re-running.
create table hs_tmp_322 as
select *
from hs_tmp_318
order by item_id;

-- Rows of hs_tmp_318 that satisfy 0 < image_position < 6 and type = 1.
create table graph_embedding.hs_tmp_323 as
select *
from graph_embedding.hs_tmp_318
where type = 1
  and image_position > 0
  and image_position < 6;

-- Image rows from the newest ds partition of tbcdm.dim_tb_itm_image, with
-- image_url turned into an absolute URL under https://img.alicdn.com/imgextra/.
create table graph_embedding.hs_tmp_324 as
select
    item_id,
    concat('https://img.alicdn.com/imgextra/', image_url) as url,
    image_position,
    type,
    status
from tbcdm.dim_tb_itm_image
where ds = max_pt('tbcdm.dim_tb_itm_image');

-- Images from hs_tmp_324 whose item_id also appears in hs_tmp_303, numbered with
-- row_number() and bucketed into parts of 40,000,000 rows.
--
-- This replaces the original two-step form (create, then insert overwrite).
-- The overwrite could not run: it referenced the alias `index` inside the select
-- list that defines it, wrote six columns into the five-column table, and dropped
-- `status`, which the later `status = 1` filter on hs_tmp_328 needs.
-- The numbering is done in a subquery so `part` can be derived from it, and
-- `status` is kept.
-- row_number() has no ordering, so the index assigned to a row is arbitrary.
create table graph_embedding.hs_tmp_327 as
select
    numbered.item_id,
    numbered.url,
    numbered.image_position,
    numbered.type,
    numbered.status,
    numbered.index,
    cast(numbered.index / 40000000 as bigint) as part
from (
    select
        img.item_id,
        img.url,
        img.image_position,
        img.type,
        img.status,
        row_number() over () as index
    from graph_embedding.hs_tmp_324 img
    join graph_embedding.hs_tmp_303 itm
        on img.item_id = itm.item_id
) numbered;

-- Rows of hs_tmp_327 with 0 < image_position < 6, type = 1 and status = 1.
-- The status filter used to run as a separate self-overwrite against an
-- unqualified table name. It is applied here in the same statement, so the table
-- is written once and always resolved in the graph_embedding project.
create table graph_embedding.hs_tmp_328 as
select *
from graph_embedding.hs_tmp_327
where image_position > 0
  and image_position < 6
  and type = 1
  and status = 1;

-- hs_tmp_328 with a row number (`index`) and a bucket id (`part`), where
-- part = index / 10,000,000 cast to bigint.
-- The original created the table with a placeholder `0 as part` and then rewrote
-- the whole table to fill it in. Numbering in a subquery lets `part` be computed
-- in the same statement and created as bigint directly.
-- row_number() has no ordering, so the index assigned to a row is arbitrary.
create table graph_embedding.hs_tmp_329 as
select
    numbered.item_id,
    numbered.url,
    numbered.image_position,
    numbered.type,
    numbered.status,
    numbered.index,
    cast(numbered.index / 10000000 as bigint) as part
from (
    select
        item_id,
        url,
        image_position,
        type,
        status,
        row_number() over () as index
    from graph_embedding.hs_tmp_328
) numbered;

图片特征提取

-- Target table for the per-part image feature jobs: one pic_ws string per
-- (item_id, url, index) row, partitioned by part.
create table hs_tmp_330 (
    item_id bigint,
    url     string,
    index   bigint,
    pic_ws  string
)
partitioned by (part bigint);

0 : 2019082805554137gxy0mh39
1 : 20190828055802932gfgetuvj2
2 : 20190828060315633gz55as69
3 : 2019082806034297gl15drw
4 : 20190828060413623g2dxzuvj2
5 : 20190828060445251gczk4m39
6 : 20190828060513845g9edtuvj2
7 : 20190828060552361g4pwwtyi2

-- Fill partition part = 7 of hs_tmp_330 from the rows of hs_tmp_329 with part = 7.
-- pic_ws is the result of the search_offline function chain applied to url:
-- single_img_get -> Imgto1d_Yuyan_python -> yuyan_udf_resnet50_fullcate_fc.
-- NOTE(review): presumably re-run once per part value (0..7) — confirm.
insert overwrite table hs_tmp_330 partition (part = 7)
select
    item_id,
    url,
    index,
    search_offline:yuyan_udf_resnet50_fullcate_fc(
        search_offline:Imgto1d_Yuyan_python(
            search_offline:single_img_get(url)
        )
    ) as pic_ws
from hs_tmp_329
where part = 7;

-- Switch to the graph_embedding project and set session flags.
-- NOTE(review): presumably these apply to the hs_tmp_336 job that follows — confirm.
use graph_embedding;
-- Job priority flag.
set odps.instance.priority=0;
-- NOTE(review): presumably required by the search_offline functions — confirm.
SET odps.isolation.session.enable = true;
-- Reducer instance count for the job.
set odps.sql.reducer.instances=2500;
set odps.pypy.enabled=true;
-- CPU setting per reducer. NOTE(review): unit presumably 1/100 core — confirm.
set odps.sql.reducer.cpu=400;

-- Rebuild hs_tmp_336: pic_ws for every row of hs_tmp_329 in a single job.
-- The inner query redistributes rows with `distribute by rand()` before the
-- search_offline function chain is applied to url.
-- The bare `yes` line is kept from the original transcript.
-- NOTE(review): presumably the console confirmation for the drop, not SQL — confirm.
drop table if exists graph_embedding.hs_tmp_336;
yes
create table graph_embedding.hs_tmp_336 as
select
    src.item_id,
    src.url,
    src.index,
    search_offline:yuyan_udf_resnet50_fullcate_fc(
        search_offline:Imgto1d_Yuyan_python(
            search_offline:single_img_get(src.url)
        )
    ) as pic_ws
from (
    select *
    from graph_embedding.hs_tmp_329
    distribute by rand()
) src;

20190828124345952g8et0yyi2

还是数据集的问题？

hs_tmp_300 : | index | item_id | label |

-- hs_tmp_331: every row of hs_tmp_300 (index, item_id, label) together with the
-- title from hs_tmp_303 (joined on item_id) and the query from hs_tmp_304
-- (joined on index).
-- The `*` after `c.` and `a.` was missing from the text (most likely swallowed as
-- emphasis markup), which left `select c., d.query` and `select a., b.title`
-- unparseable. It is restored here; the downstream statements read index,
-- item_id, label, title and query from this table.
create table hs_tmp_331 as
select c.*, d.query
from (
    select a.*, b.title
    from hs_tmp_300 a
    join hs_tmp_303 b on a.item_id == b.item_id
) c
join hs_tmp_304 d on c.index == d.index;

-- hs_tmp_331 plus two derived columns produced by search_kg:alinlp_segment on
-- the hs_return_clean()-ed title and query.
-- The query column is named query_emb although it comes from the same
-- segmentation call as title_seg; it is read under that name downstream.
create table hs_tmp_332 as
select
    *,
    search_kg:alinlp_segment(hs_return_clean(title), "MAINSE", "0", "1") as title_seg,
    search_kg:alinlp_segment(hs_return_clean(query), "MAINSE", "0", "1") as query_emb
from hs_tmp_331;

-- Run hs_return_correct_label over hs_tmp_332; its six output columns are named
-- index, item_id, label_origin, label_new, title_seg, query_seg.
-- NOTE(review): presumably label_new is the corrected label — confirm against the function.
create table hs_tmp_333 as
select
    hs_return_correct_label(index, item_id, label, title_seg, query_emb)
        as (index, item_id, label_origin, label_new, title_seg, query_seg)
from hs_tmp_332;

-- Reduce hs_tmp_333 to (index, item_id, label): index and label_new are cast to
-- bigint, and label_new is exposed as label.
create table hs_tmp_334 as
select
    cast(index as bigint)     as index,
    item_id,
    cast(label_new as bigint) as label
from hs_tmp_333;

-- Final labelled dataset: for each row of hs_tmp_334, the query_emb from
-- hs_tmp_313 (matched on index = id) and the title_emb from hs_tmp_312
-- (matched on item_id = id), plus the label.
create table hs_tmp_335 as
select
    lbl.query_emb as se_keyword_mainse_ws,
    ttl.title_emb as title_mainse_ws,
    lbl.label
from (
    select s.*, q.query_emb
    from hs_tmp_334 s
    inner join hs_tmp_313 q
        on s.index = q.id
) lbl
inner join hs_tmp_312 ttl
    on lbl.item_id = ttl.id;

-- Remove the previous train/test tables before the split writes them again.
-- `if exists` keeps the drops from failing on a first run, and the names are
-- project-qualified to match the tables the split command writes.
-- The bare `yes` lines are kept from the original transcript.
-- NOTE(review): presumably the console confirmations for the drops — confirm.
drop table if exists graph_embedding.hs_train_data_dssm_v2_8;
yes
drop table if exists graph_embedding.hs_test_data_dssm_v2_8;
yes
-- Split graph_embedding.hs_tmp_335 into hs_train_data_dssm_v2_8 (output 1) and
-- hs_test_data_dssm_v2_8 (output 2) with fraction 0.9.
-- NOTE(review): presumably 0.9 is the share written to output 1 — confirm against the split component docs.
PAI -name split -project algo_public
-DinputTableName=graph_embedding.hs_tmp_335
-Doutput1TableName=graph_embedding.hs_train_data_dssm_v2_8
-Doutput2TableName=graph_embedding.hs_test_data_dssm_v2_8
-Dfraction=0.9
-DmemSizePerCore=4096
-DcoreNum=100
;

train & inference

-- tensorflow140 job running train_inference_v10.py from origin_deep_cluster_odps_8.tar.gz.
-- Reads hs_train_data_dssm_v2_8, hs_test_data_dssm_v2_8 and hs_tmp_267; writes hs_dssm_result_5.
-- -DuseSparseClusterSchema=True was passed twice with the same value; the trailing duplicate is removed.
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="train_inference_v10.py" -Dcluster='{"worker":{"count":50, "cpu":200, "memory":4000}, "ps":{"count":10, "cpu":200, "memory":5000}}' -DuseSparseClusterSchema=True -DenableDynamicCluster=True -Dtables="odps://graph_embedding/tables/hs_train_data_dssm_v2_8,odps://graph_embedding/tables/hs_test_data_dssm_v2_8,odps://graph_embedding/tables/hs_tmp_267" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_5" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=True --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_2e_2.ckpt";