
2019-07-30 Work Progress

Author: Songger | Published 2019-07-30 10:02

Give the job a buff (speed it up):

algo_platform
set odps.sql.mapper.split.size=1;

Yesterday's work:

  1. Discussed the network and training-data issues with 阔姐 and 之己
  2. Regenerated the training/test data: took the first-page results of graph_embedding.jl_jingyan_query_related_top_query_detailed as the training/test data, cleaned them, and started training the DSSM network. Current test-set results: acc: 0.93; auc: 0.907 (a minimal evaluation sketch follows below)
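
A minimal sketch of how acc/auc numbers like the ones above can be computed, assuming scikit-learn and toy labels/scores (this is not the project's actual evaluation code):

import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score

# toy ground-truth labels and model scores, for illustration only
labels = np.array([1, 0, 1, 1, 0])
scores = np.array([0.9, 0.2, 0.8, 0.6, 0.4])

acc = accuracy_score(labels, (scores >= 0.5).astype(int))  # threshold scores at 0.5
auc = roc_auc_score(labels, scores)
print("acc: %.3f  auc: %.3f" % (acc, auc))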

Today's plan:
Use the newly trained DSSM model to match the top 10k queries against UGC video titles and evaluate the results.

  1. Inference test

truncate table hs_dssm_result_0;
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="inference_v4.py" -Dcluster='{"worker":{"count":1, "cpu":200, "memory":4000}, "ps":{"count":1, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_dssm_inference_2" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_0" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=False --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_3e_1.ckpt-1" -DuseSparseClusterSchema=True;

drop table hs_tmp_117;
yes
create table hs_tmp_117 as
select distinct video_id, video_ws from hs_dssm_inference_2;

drop table hs_tmp_118;
yes
create table hs_tmp_118 as
select distinct query_id, query_ws from hs_dssm_inference_2;

drop table hs_tmp_116;
yes
create table hs_tmp_116 as
select c.video_id, c.video_ws, c.query_id, d.query_ws, c.score from
(select a.video_id, a.query_id, a.score, b.video_ws from (select video_id as query_id, query_id as video_id, score from hs_dssm_result_0)a left join (select * from hs_tmp_117)b on a.video_id == b.video_id)c left join (select * from hs_tmp_118)d on c.query_id == d.query_id;

  1. Rebuild the inference data

Build the training vocabulary: hs_clean_words_info_ (flag = 0: query; flag = 1: title)

create table hs_clean_title_words_info_ as select query_word as words from hs_tmp_113 where freq > 2;

drop table hs_tmp_120;
yes
create table hs_tmp_120 as select bi_udf:bi_split_value(query_id, words_mainse_ws, " ") as (index, query_word) from hs_tmp_108;

drop table hs_tmp_121;
yes
create table hs_tmp_121 as select query_word, count(*) as freq from hs_tmp_120 group by query_word order by freq desc;

create table hs_clean_words_info_ as select words as word, row_number()over() + 2 as id, 1 as flag from hs_clean_title_words_info_;

insert into table hs_clean_words_info_ select query_word as word,row_number()over() + 2 as id, 0 as flag from hs_tmp_121;

Filter: map words that do not appear in the training vocabulary to UNK

add table graph_embedding.hs_clean_words_info_ as hs_clean_words_info_;

drop resource hs_udtf_730_4.py;
yes
add py /home/hengsong/query_co_video/src/hs_udtf_730_4.py;
CREATE FUNCTION hs_filter_dssm_inference_12 AS hs_udtf_730_4.Processor USING hs_udtf_730_4.py,hs_clean_words_info_;

select hs_filter_dssm_inference_9(query_id, query_ws, 0) as (query_id, query_ws) from hs_tmp_118 limit 10;

from odps.udf import BaseUDTF
from odps.distcache import get_cache_table
import string
import random
import sys

class Processor(BaseUDTF):
    def __init__(self):
        # vocabulary table hs_clean_words_info_: (word, id, flag);
        # flag 0 = query word, flag 1 = title word
        FEATURE_TABLE = "hs_clean_words_info_"
        self.querys = set()
        self.titles = set()
        for ele in get_cache_table(FEATURE_TABLE):
            rec = list(ele)
            word = rec[0]
            flag = rec[2]
            if flag == 0:
                self.querys.add(word)
            else:
                self.titles.add(word)

    def process(self, id, words, flag=0):
        # replace every word not in the vocabulary of the given side with UNK
        wordlist = words.split(" ")
        candidate_list = self.querys if flag == 0 else self.titles
        result = []

        for x in wordlist:
            if x in candidate_list:
                result.append(x)
            else:
                result.append('UNK')
        self.forward(id, ' '.join(result))
  1. Check what is going on with prod
    Forgot to add L2 normalization... (see the sketch after this list)

  2. Manual ID mapping
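
Regarding the missing L2 normalization noted in item 1 above: in a DSSM the query/title score is meant to be a cosine similarity, so both embedding vectors should be L2-normalized before taking the dot product. A minimal sketch, assuming TF 1.x and hypothetical tensor names query_vec / title_vec with an assumed embedding size of 128 (this is not the project's train_v4.py code):

import tensorflow as tf

# hypothetical query/title embeddings; the 128-dim size is an assumption
query_vec = tf.placeholder(tf.float32, [None, 128])
title_vec = tf.placeholder(tf.float32, [None, 128])

# L2-normalize each side so the dot product is a cosine score in [-1, 1]
query_norm = tf.nn.l2_normalize(query_vec, dim=1)  # dim= for TF 1.4; axis= in newer TF
title_norm = tf.nn.l2_normalize(title_vec, dim=1)
score = tf.reduce_sum(query_norm * title_norm, axis=1)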

Build the ID dictionary

hs_tmp_108:| query_id | se_keyword | words_mainse_ws |
hs_tmp_109:| item_id | title | words_mainse_ws |

create table hs_dssm_dic_query_0 as
select hs_filter_dssm_inference_8(query_id, words_mainse_ws) as (id, words_mainse_ids) from hs_tmp_108;

drop table graph_embedding.hs_dssm_dic_title_2;
yes
create table graph_embedding.hs_dssm_dic_title_2 as select graph_embedding.hs_filter_dssm_inference_10(item_id, words_mainse_ws) as (id, words_mainse_ids) from graph_embedding.hs_tmp_109;

from odps.udf import BaseUDTF
from odps.distcache import get_cache_table
import string
import random
import sys

class Processor(BaseUDTF):
    def __init__(self):
        # vocabulary table hs_clean_words_info_: (word, id, flag);
        # flag 0 = query word, flag 1 = title word
        FEATURE_TABLE = "hs_clean_words_info_"
        self.querys = {}
        self.titles = {}
        for ele in get_cache_table(FEATURE_TABLE):
            rec = list(ele)
            word = rec[0]
            id = rec[1]
            flag = rec[2]
            if flag == 0:
                self.querys[word] = id
            else:
                self.titles[word] = id
        self.querys[''] = 0
        self.titles[''] = 0

    def process(self, index, words, flag=0):
        # map each word to its vocabulary id; unknown words map to 0 (UNK)
        wordlist = words.split(' ')
        candidate_list = self.querys if flag == 0 else self.titles

        result = []

        for x in wordlist:
            if x in candidate_list:
                result.append(candidate_list[x])
            else:
                result.append(0)

        result = ' '.join([str(i) for i in result])
        self.forward(str(index), result)

Join up the data

drop table hs_tmp_124;
yes
create table hs_tmp_124
as select c.se_keyword_mainse_ws, d.words_mainse_ids as title_mainse_ws, c.label from
(select a.words_mainse_ids as se_keyword_mainse_ws, b.* from(select * from hs_dssm_dic_query_0)a right join (select * from hs_dssm_train_v2_0)b on a.id == b.query_id)c left join (select * from hs_dssm_dic_title_2)d on c.item_id == d.id;

Split into training and test sets

drop table hs_train_data_dssm_v2_2;
yes
drop table hs_test_data_dssm_v2_2;
yes
PAI -name split -project algo_public
-DinputTableName=graph_embedding.hs_tmp_124
-Doutput1TableName=graph_embedding.hs_train_data_dssm_v2_2
-Doutput2TableName=graph_embedding.hs_test_data_dssm_v2_2
-Dfraction=0.8
-DmemSizePerCore=4096
-DcoreNum=100
;

Start training

pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="train_v4.py" -Dcluster='{"worker":{"count":30, "cpu":200, "memory":4000}, "ps":{"count":10, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_train_data_dssm_v2_2,odps://graph_embedding/tables/hs_test_data_dssm_v2_2" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=True --attention_type=1 --num_epochs=10 --ckpt=hs_ugc_video_3e_2.ckpt" -DuseSparseClusterSchema=True;

http://logview.odps.aliyun-inc.com:8080/logview/?h=http://service-corp.odps.aliyun-inc.com/api&p=graph_embedding&i=20190730132712622g0qr4gep2_5c69c7b5_0f68_4ba8_a5e6_75ce6075c262&token=Q2VYd1JTMzg4VmFJVTE3VzJHQ2gyeTE4TExNPSxPRFBTX09CTzoxMjkzMzAzOTgzMjUxNTQ4LDE1NjUwOTgwMzMseyJTdGF0ZW1lbnQiOlt7IkFjdGlvbiI6WyJvZHBzOlJlYWQiXSwiRWZmZWN0IjoiQWxsb3ciLCJSZXNvdXJjZSI6WyJhY3M6b2RwczoqOnByb2plY3RzL2dyYXBoX2VtYmVkZGluZy9pbnN0YW5jZXMvMjAxOTA3MzAxMzI3MTI2MjJnMHFyNGdlcDJfNWM2OWM3YjVfMGY2OF80YmE4X2E1ZTZfNzVjZTYwNzVjMjYyIl19XSwiVmVyc2lvbiI6IjEifQ==

  1. Build the inference data: query & UGC video data

create table hs_tmp_122 as
select row_number()over() as query_id, query from
graph_embedding.jl_jingyan_query_related_top_query where ds=max_pt('graph_embedding.jl_jingyan_query_related_top_query');

create table hs_tmp_123 as
select id as item_id, title from
graph_embedding.jl_jingyan_query_related_video_pool where ds=max_pt('graph_embedding.jl_jingyan_query_related_video_pool');

MAINSE word segmentation
hs_tmp_122:query_id, query
hs_tmp_123:item_id, title
hs_dssm_train_v2_0

create table if not exists hs_tmp_125 LIFECYCLE 20 as select query_id, query, search_kg:alinlp_segment(query, "MAINSE", "0", "1") as words_mainse_ws from hs_tmp_122;

create table if not exists hs_tmp_126 LIFECYCLE 20 as select item_id, title, search_kg:alinlp_segment(title, "MAINSE", "0", "1") as words_mainse_ws from hs_tmp_123;

ID mapping

create table graph_embedding.hs_dssm_dic_query_inf_0 as select graph_embedding.hs_filter_dssm_inference_12(query_id, words_mainse_ws, 0) as (id, words_mainse_ids) from graph_embedding.hs_tmp_125;

create table graph_embedding.hs_dssm_dic_title_inf_0 as select graph_embedding.hs_filter_dssm_inference_12(item_id, words_mainse_ws, 1) as (id, words_mainse_ids) from graph_embedding.hs_tmp_126;

Randomly sample pairs (each title is assigned a random query id)

create table hs_tmp_128 as select id as title_id, words_mainse_ids as title_ws, int(rand() * 10000 + 1) as query_id from hs_dssm_dic_title_inf_0;

hs_tmp_129: | query_id | query_ws | video_id | video_ws |

create table hs_tmp_129 as
select a.query_id, b.words_mainse_ids as query_ws, a.title_id as video_id, a.title_ws as video_ws from
(select * from hs_tmp_128)a left join (select * from hs_dssm_dic_query_inf_0)b on a.query_id == b.id;

inference

truncate table hs_dssm_result_0;
pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="inference_v4.py" -Dcluster='{"worker":{"count":1, "cpu":200, "memory":4000}, "ps":{"count":1, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_tmp_129" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_0" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=False --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_3e_2.ckpt-1" -DuseSparseClusterSchema=True;

http://logview.odps.aliyun-inc.com:8080/logview/?h=http://service-corp.odps.aliyun-inc.com/api&p=graph_embedding&i=20190730142154665gxul4gep2_6622aa91_a73b_4129_9e04_7e46c60ffb95&token=QWxVUzhscmhCV1Q4RUVhcVFwcG14bFVOYUc4PSxPRFBTX09CTzoxMjkzMzAzOTgzMjUxNTQ4LDE1NjUxMDEzMTYseyJTdGF0ZW1lbnQiOlt7IkFjdGlvbiI6WyJvZHBzOlJlYWQiXSwiRWZmZWN0IjoiQWxsb3ciLCJSZXNvdXJjZSI6WyJhY3M6b2RwczoqOnByb2plY3RzL2dyYXBoX2VtYmVkZGluZy9pbnN0YW5jZXMvMjAxOTA3MzAxNDIxNTQ2NjVneHVsNGdlcDJfNjYyMmFhOTFfYTczYl80MTI5XzllMDRfN2U0NmM2MGZmYjk1Il19XSwiVmVyc2lvbiI6IjEifQ==

Join the samples

  1. Build the inference data: test-set data

  2. Is there a problem with the training data?

hs_dssm_dic_query_0:| id | words_mainse_ids |
hs_dssm_train_v2_0:| query_id | item_id | label |
hs_dssm_dic_title_2:
hs_tmp_110:| query_id | se_keyword | words_mainse_ws |
hs_tmp_111:| item_id | title | words_mainse_ws |

drop table hs_tmp_127;
yes
create table hs_tmp_127
as select c.se_keyword_mainse_ws, d.words_mainse_ids as title_mainse_ws, c.label from
(select a.words_mainse_ids as se_keyword_mainse_ws, b.* from(select * from hs_dssm_dic_query_0)a right join (select * from hs_dssm_train_v2_0)b on a.id == b.query_id)c left join (select * from hs_dssm_dic_title_2)d on c.item_id == d.id;

create table hs_tmp_127
as select c.query_id, c.item_id, c.se_keyword_mainse_ws, d.words_mainse_ws as title_mainse_ws, c.label from
(select a.words_mainse_ws as se_keyword_mainse_ws, b.* from(select * from hs_tmp_110)a right join (select * from hs_dssm_train_v2_0)b on a.query_id == b.query_id)c left join (select * from hs_tmp_111)d on c.item_id == d.item_id;

hs_tmp_127:| query_id | item_id | se_keyword_mainse_ws | title_mainse_ws | label |
hs_dssm_dic_query_0:| id | words_mainse_ids |
hs_dssm_dic_title_2:| id | words_mainse_ids |

