-
给之己的表
-- Build hs_result_title_query_1w_top1000_filtered_2: the rows of
-- hs_result_title_query_1w_top1000_filtered_1 enriched with the video_id found
-- in the video pool, matched on the full play URL.
--
-- Source a: for each row of hs_jingyan_query_related_video_pool_2_3, rebuild the
--           absolute play URL (fixed host prefix + play_url read from the JSON
--           `body`) and read video_id from the same JSON.
-- Source b: hs_result_title_query_1w_top1000_filtered_1 (provides every other
--           output column).
--
-- Fixes versus the previous version:
--   * get_json_object paths now start at the JSON root `$`; the old paths began
--     with '.' and the second one was split across two lines, embedding a
--     newline inside the path literal.
--   * the single-argument coalesce() wrappers were no-ops and are removed.
--   * b is joined directly instead of through `select *`.
-- NOTE(review): the key names `play_url/s` and `video_id/l` are kept verbatim;
-- confirm they match the keys actually present in `body`.
-- `distinct` is kept: several pool rows may resolve to the same url/video_id.
create table hs_result_title_query_1w_top1000_filtered_2 as
select distinct
    b.index,
    b.query,
    a.video_id,
    b.title,
    b.item_id,
    b.score,
    b.cate_id,
    b.cate_name,
    b.url
from (
    select
        concat('http://cloud.video.taobao.com',
               get_json_object(body, '$.entities.k3.play_url/s')) as url,
        get_json_object(body, '$.entities.k2.video_id/l') as video_id
    from hs_jingyan_query_related_video_pool_2_3
) a
join hs_result_title_query_1w_top1000_filtered_1 b
    on a.url = b.url;
-
DSSM tensorflow pai命令:
-- Launch the text2tag inference job through the PAI `tensorflow140` component.
--   script      : video_text2tag_v1.tar.gz (resource of project ps_ads_model_train),
--                 entry file inference.py
--   cluster     : 30 "worker" nodes and 5 "ps" nodes, resources as given in -Dcluster
--   input table : palgo_wow.zhiji_gul_video_dssm_text_idx_v2_fushi_need_to_tag
--   output table: palgo_wow.text2tag_2_class_inference_result_fushi
--   checkpoints : OSS directory bucket-automl/text2tag_m1/, accessed with the
--                 role_arn/host given in -DcheckpointDir
--   user params : forwarded to the script as-is via -DuserDefinedParameters
-- NOTE(review): presumably --ckpt=text2tags_fushi.ckpt-2 selects the checkpoint
-- inside checkpointDir, and training-style flags (--learning_rate, --num_epochs)
-- are ignored by inference.py -- confirm against the script.
pai -name tensorflow140 -Dscript="odps://ps_ads_model_train/resources/video_text2tag_v1.tar.gz"
-DentryFile="inference.py" -Dcluster='{"worker":{"count":30, "gpu":50, "cpu":400, "memory":5000}, "ps":{"count":5, "cpu":200, "memory":5000}}'
-Dtables="odps://palgo_wow/tables/zhiji_gul_video_dssm_text_idx_v2_fushi_need_to_tag"
-DcheckpointDir="oss://bucket-automl/text2tag_m1/?role_arn=acs:ram::1293303983251548:role/video2019&host=cn-hangzhou.oss-internal.aliyun-inc.com"
-Doutputs="odps://palgo_wow/tables/text2tag_2_class_inference_result_fushi"
-DuserDefinedParameters="--learning_rate=1e-2 --vocab_size=221675 --batch_size=8 --maxinputsize_summary=80 --maxinputsize_title=16 --attention_type=1 --num_epochs=1 --ckpt=text2tags_fushi.ckpt-2 --max_area_number=2"
-DuseSparseClusterSchema=True;
分词并去掉常用词
-- Rebuild the temporary word-segmented copy of
-- palgo_wow.zj_video_trigger_info_without_tags.
-- title, summary and tag are each passed through the search_kg:alinlp_segment
-- UDF and stored as *_ws columns; video_id, duration and cover_url are copied
-- unchanged. The table is created with LIFECYCLE 2.
-- NOTE(review): the meaning of the UDF arguments "YOUKU", "0", "1" is not
-- visible here -- confirm against the alinlp_segment documentation.
DROP TABLE IF EXISTS palgo_wow.zj_video_trigger_info_without_tags_ws_tmp;

CREATE TABLE IF NOT EXISTS palgo_wow.zj_video_trigger_info_without_tags_ws_tmp
LIFECYCLE 2
AS
SELECT
    video_id,
    search_kg:alinlp_segment(title, "YOUKU", "0", "1")   AS title_ws,
    search_kg:alinlp_segment(summary, "YOUKU", "0", "1") AS summary_ws,
    duration,
    cover_url,
    search_kg:alinlp_segment(tag, "YOUKU", "0", "1")     AS tag_ws
FROM palgo_wow.zj_video_trigger_info_without_tags;
-- Remove common (high-frequency) words.
-- Drops any previous output table, then runs the PAI FilterNoise component from
-- project algo_public:
--   input  : palgo_wow.zj_video_trigger_info_without_tags_ws_tmp (segmented text)
--   noise  : palgo_wow.qber_videotags_pool_tmp1_Noise_Word_Pool_Current
--   output : palgo_wow.zj_video_trigger_info_without_tags_FreqWord_RM, lifecycle 30
--   columns processed: title_ws, summary_ws, tag_ws
-- NOTE(review): presumably columns not listed in -DselectedColNames are carried
-- through unchanged -- confirm against the FilterNoise documentation.
DROP TABLE IF EXISTS palgo_wow.zj_video_trigger_info_without_tags_FreqWord_RM;
PAI -name FilterNoise -project algo_public
-DinputTableName=palgo_wow.zj_video_trigger_info_without_tags_ws_tmp
-DnoiseTableName=palgo_wow.qber_videotags_pool_tmp1_Noise_Word_Pool_Current
-DoutputTableName=palgo_wow.zj_video_trigger_info_without_tags_FreqWord_RM
-DselectedColNames="title_ws,summary_ws,tag_ws"
-Dlifecycle=30;
palgo_wow.zhiji_gul_video_dssm_text_idx_v2_fushi_need_to_tag
palgo_wow.qber_videotags_pool_tmp1_Noise_Word_Pool_Current
网友评论