- 哪些图片需要保留?
image_position > 0 and image_position < 6 and type = 1;
-- Copy of hs_tmp_318 (resolved against the current default project), sorted by item_id.
-- NOTE(review): MaxCompute normally rejects "order by" without "limit" unless
-- odps.sql.validate.orderby.limit=false is set — confirm this statement ran as written.
create table hs_tmp_322 as
select *
from hs_tmp_318
order by item_id;

-- Subset of hs_tmp_318: rows with image_position strictly between 0 and 6 and type = 1.
create table graph_embedding.hs_tmp_323 as
select *
from graph_embedding.hs_tmp_318
where image_position > 0
  and image_position < 6
  and type = 1;
-- Build hs_tmp_324 from the ds = max_pt(...) partition of tbcdm.dim_tb_itm_image,
-- turning image_url into a full url by prefixing 'https://img.alicdn.com/imgextra/'.
create table graph_embedding.hs_tmp_324 as
select
    item_id,
    concat('https://img.alicdn.com/imgextra/', image_url) as url,
    image_position,
    type,
    status
from tbcdm.dim_tb_itm_image
where ds = max_pt('tbcdm.dim_tb_itm_image');
-- Inner-join hs_tmp_324 to hs_tmp_303 on item_id, keeping only the hs_tmp_324 columns.
create table graph_embedding.hs_tmp_327 as
select img.*
from graph_embedding.hs_tmp_324 img
inner join graph_embedding.hs_tmp_303 itm
    on img.item_id == itm.item_id;
-- Attempts to rewrite hs_tmp_327 with a row_number() index and a part bucket
-- computed as index / 40000000 cast to bigint.
-- NOTE(review): this select list has six columns, while the create-table statement for
-- hs_tmp_327 yields five (item_id, url, image_position, type, status). It also omits
-- status, which later statements filter on and select, and it references the alias
-- "index" inside the same select list. Presumably this did not run as written — confirm.
insert overwrite table graph_embedding.hs_tmp_327 select item_id, url, image_position, type, row_number()over() as index, cast(index / 40000000 as bigint) as part from graph_embedding.hs_tmp_327;
-- Restrict hs_tmp_327 to rows with image_position strictly between 0 and 6 and type = 1.
create table graph_embedding.hs_tmp_328 as
select *
from graph_embedding.hs_tmp_327
where image_position > 0
  and image_position < 6
  and type = 1;

-- Rewrite hs_tmp_328 in place, keeping only the rows with status = 1.
insert overwrite table hs_tmp_328
select *
from hs_tmp_328
where status = 1;
-- Number the hs_tmp_328 rows. The window has no ordering, so which row gets which
-- index is arbitrary. part is a placeholder 0 here; the next statement fills it in.
create table graph_embedding.hs_tmp_329 as
select
    item_id,
    url,
    image_position,
    type,
    status,
    row_number() over () as index,
    0 as part
from graph_embedding.hs_tmp_328;

-- Rewrite hs_tmp_329 in place with part = index / 10000000 cast to bigint.
insert overwrite table graph_embedding.hs_tmp_329
select
    item_id,
    url,
    image_position,
    type,
    status,
    index,
    cast(index / 10000000 as bigint) as part
from graph_embedding.hs_tmp_329;
- 图片特征提取
-- Output table for the per-image feature strings, partitioned by part.
create table hs_tmp_330 (
    item_id bigint,
    url     string,
    index   bigint,
    pic_ws  string
)
partitioned by (part bigint);
0 : 2019082805554137gxy0mh39
1 : 20190828055802932gfgetuvj2
2 : 20190828060315633gz55as69
3 : 2019082806034297gl15drw
4 : 20190828060413623g2dxzuvj2
5 : 20190828060445251gczk4m39
6 : 20190828060513845g9edtuvj2
7 : 20190828060552361g4pwwtyi2
-- Fill partition part = 7 of hs_tmp_330 from the part = 7 rows of hs_tmp_329.
-- pic_ws is the url passed through the search_offline UDF chain
-- single_img_get -> Imgto1d_Yuyan_python -> yuyan_udf_resnet50_fullcate_fc.
insert overwrite table hs_tmp_330 partition (part = 7)
select
    item_id,
    url,
    index,
    search_offline:yuyan_udf_resnet50_fullcate_fc(
        search_offline:Imgto1d_Yuyan_python(
            search_offline:single_img_get(url)
        )
    ) as pic_ws
from hs_tmp_329
where part = 7;
-- Session setup for the feature-extraction job that builds hs_tmp_336.
use graph_embedding;

-- Job priority. NOTE(review): presumably 0 is the highest priority — confirm.
set odps.instance.priority=0;
-- NOTE(review): presumably required by the UDFs used in the job — confirm.
set odps.isolation.session.enable = true;
-- Reducer instance count for the job.
set odps.sql.reducer.instances=2500;
set odps.pypy.enabled=true;
-- CPU setting per reducer. NOTE(review): presumably 100 = one core — confirm.
set odps.sql.reducer.cpu=400;

-- Remove any previous result so the create-table statement can run again.
drop table if exists graph_embedding.hs_tmp_336;
yes
-- Build hs_tmp_336 from every hs_tmp_329 row. pic_ws is the url passed through the
-- search_offline UDF chain single_img_get -> Imgto1d_Yuyan_python ->
-- yuyan_udf_resnet50_fullcate_fc.
-- The inner query redistributes rows by rand() before the UDFs are applied.
create table graph_embedding.hs_tmp_336 as
select
    shuffled.item_id,
    shuffled.url,
    shuffled.index,
    search_offline:yuyan_udf_resnet50_fullcate_fc(
        search_offline:Imgto1d_Yuyan_python(
            search_offline:single_img_get(shuffled.url)
        )
    ) as pic_ws
from (
    select *
    from graph_embedding.hs_tmp_329
    distribute by rand()
) shuffled;
20190828124345952g8et0yyi2
- 还是数据集的问题?
hs_tmp_300 : | index | item_id | label |
-- Attach title (from hs_tmp_303, matched on item_id) and query (from hs_tmp_304,
-- matched on index) to the hs_tmp_300 rows.
-- The inner query keeps every hs_tmp_300 column plus title; the outer query keeps all
-- of those plus query.
create table hs_tmp_331 as
select
    c.*,
    d.query
from (
    select
        a.*,
        b.title
    from hs_tmp_300 a
    inner join hs_tmp_303 b
        on a.item_id == b.item_id
) c
inner join hs_tmp_304 d
    on c.index == d.index;
-- Add segmented versions of title and query to hs_tmp_331. Both go through
-- hs_return_clean and then search_kg:alinlp_segment with the same arguments.
-- Despite its name, query_emb holds the segmentation of query, not an embedding.
create table hs_tmp_332 as
select
    *,
    search_kg:alinlp_segment(hs_return_clean(title), "MAINSE", "0", "1") as title_seg,
    search_kg:alinlp_segment(hs_return_clean(query), "MAINSE", "0", "1") as query_emb
from hs_tmp_331;
-- Run the hs_return_correct_label table function over hs_tmp_332. Its six output
-- columns are named index, item_id, label_origin, label_new, title_seg and query_seg.
create table hs_tmp_333 as
select
    hs_return_correct_label(index, item_id, label, title_seg, query_emb)
        as (index, item_id, label_origin, label_new, title_seg, query_seg)
from hs_tmp_332;
-- Keep index, item_id and the corrected label from hs_tmp_333.
-- index and label_new are cast to bigint; label_new is exposed as label.
create table hs_tmp_334 as
select
    cast(index as bigint)     as index,
    item_id,
    cast(label_new as bigint) as label
from hs_tmp_333;
-- Assemble the (query, title, label) triples. hs_tmp_334 rows pick up query_emb from
-- hs_tmp_313 (index = id) and title_emb from hs_tmp_312 (item_id = id).
create table hs_tmp_335 as
select
    labelled.query_emb as se_keyword_mainse_ws,
    titles.title_emb   as title_mainse_ws,
    labelled.label
from (
    select
        samples.*,
        queries.query_emb
    from hs_tmp_334 samples
    inner join hs_tmp_313 queries
        on samples.index == queries.id
) labelled
inner join hs_tmp_312 titles
    on labelled.item_id == titles.id;
-- Remove the previous training split before it is regenerated.
-- "if exists" keeps a re-run from failing when the table is already gone.
drop table if exists hs_train_data_dssm_v2_8;
yes
-- Remove the previous test split before it is regenerated.
-- "if exists" keeps a re-run from failing when the table is already gone.
drop table if exists hs_test_data_dssm_v2_8;
yes
-- Split hs_tmp_335 into hs_train_data_dssm_v2_8 (output1) and hs_test_data_dssm_v2_8 (output2).
-- NOTE(review): fraction=0.9 presumably sends about 90% of the rows to output1 — confirm
-- against the PAI split component documentation.
pai -name split -project algo_public
    -DinputTableName=graph_embedding.hs_tmp_335
    -Doutput1TableName=graph_embedding.hs_train_data_dssm_v2_8
    -Doutput2TableName=graph_embedding.hs_test_data_dssm_v2_8
    -Dfraction=0.9
    -DmemSizePerCore=4096
    -DcoreNum=100;
- train & inference
-- Launch train_inference_v10.py from the origin_deep_cluster_odps_8 archive on PAI TensorFlow.
-- Inputs, in order: the train split, the test split, and hs_tmp_267.
-- Output: hs_dssm_result_5.
-- -DuseSparseClusterSchema=True is passed once; the command previously repeated it at the end.
pai -name tensorflow140
    -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz"
    -DentryFile="train_inference_v10.py"
    -Dcluster='{"worker":{"count":50, "cpu":200, "memory":4000}, "ps":{"count":10, "cpu":200, "memory":5000}}'
    -DuseSparseClusterSchema=True
    -DenableDynamicCluster=True
    -Dtables="odps://graph_embedding/tables/hs_train_data_dssm_v2_8,odps://graph_embedding/tables/hs_test_data_dssm_v2_8,odps://graph_embedding/tables/hs_tmp_267"
    -Doutputs="odps://graph_embedding/tables/hs_dssm_result_5"
    -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com"
    -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=True --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_2e_2.ckpt";
20190828122545404g7c74wyi2
网友评论