美文网首页
20190807工作进展

20190807工作进展

作者: Songger | 来源:发表于2019-08-07 20:50 被阅读0次

    昨天工作:

    1. 将经过叶子类目过滤后的结果使用alinlp预训练的dssm模型重新计算相关性,使用0.5作为阈值的话可以得到比较满意的结果,最后结果剩余300+w条数据,其中关联到的视频9.7w左右。结果在:graph_embedding.hs_tmp_176
    2. 问题:发现最终结果中服饰相关的query中经常会有带有季节信息的query,但是关联到的结果会忽视这一信息。比如"query:高跟鞋女夏; title:2018新款秋白色高跟鞋"这种情况,还在想办法改善。

    今天计划:
    改善dssm模型inference结果中季节信息丢失的问题

    alinlp的dssm充当过滤器:graph_embedding.hs_tmp_176
    测试将季节的比重调高一点会不会有效果
    query和title叶子类目是否靠谱

    1. 使用阔姐给的叶子类目表重新过滤

    graph_embedding.jl_top_query_related_cate

    drop table hs_tmp_175;
    yes
    create table hs_tmp_175 as
    select query, title, video_id, item_id, search_kg:alinlp_dssm_text_similarity(search_kg:alinlp_segment(query, 'CONTENT_SEARCH', ' '), search_kg:alinlp_segment(title, 'CONTENT_SEARCH', ' '),' ') as score from hs_query_ugc_co_video_final_result_info_1;

    drop table hs_query_ugc_co_video_final_result_info_3;
    yes
    create table hs_query_ugc_co_video_final_result_info_3 as
    select distinct a.query, a.title, a.video_id, a.item_id from
    (select * from hs_tmp_170 where score > 0.5 )a left join (select * from jl_top_query_related_cate)b on
    a.query == b.se_keyword and a.cate_id == b.cate_id;

    1. query和title的分数分布怎么样?

    hs_dssm_dic_query_inf_without_space_0 : | query_id | query |

    create table graph_embedding.hs_dssm_dic_query_2 as
    select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from
    (select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from
    (select id, search_kg:alinlp_termweight_ecom(se_keyword, "%", "{word}|{weight}|${type}", 1, 1) as tag_result from graph_embedding.hs_dssm_dic_query_1 where lengthb(se_keyword) > 0 limit 20)a)b where lengthb(b.pair) > 0;

    drop table hs_tmp_149;
    yes
    create table hs_tmp_149 as select id, words_mainse_ids, search_kg:alinlp_remove_stop_words(title, "") as title from hs_dssm_dic_title_3;

    set odps.sql.udf.jvm.memory=4096;
    create table graph_embedding.hs_dssm_dic_title_4 as
    select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from (select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from (select id, search_kg:alinlp_termweight_ecom(title, "%", "{word}|{weight}", 1, 0) as tag_result from graph_embedding.hs_tmp_149)a)b where lengthb(b.pair) > 0;

    1. 分词结果是否靠谱

    create table hs_tmp_179 as
    select id, search_kg:alinlp_termweight_ecom(se_keyword, "%", "{word}|{weight}|${type}", 1, 1) as tag_result from graph_embedding.hs_dssm_dic_query_1 where lengthb(se_keyword) > 0;

    create table hs_tmp_180 as
    select id, search_kg:alinlp_termweight_ecom(title, "%", "{word}|{weight}|${type}", 1, 0) as tag_result from graph_embedding.hs_tmp_149;

    1. 有问题的结果(query)

    | 37 | 40|3|268435456%岁|3|536870912%女短裤|93|65537 |
    | 38 | 40|2|268435456%岁|2|536870912%妈妈|20|16%夏装|75|1073807361 |
    | 39 | 45|2|268435456%岁|2|536870912%女|20|16%连衣裙|75|65537 |
    | 40 | 50|2|268435456%岁|2|536870912%妈妈|20|16%夏装|75|1073807361 |
    | 41 | 50|2|268435456%岁|2|536870912%妈妈|17|16%夏装|64|1073807361% |0|1024%洋气|13|128 |
    | 46 | 7分裤|61|65537%女|19|16% |0|1024%薄款|19|16 |
    aj|3|0%女鞋|96|65537 |
    | 100 | diy|11|128%小屋|88|65537 |
    | 92 | cosply|23|8%服装|58|1073807361%女|17|16 |
    %网|70|131073%红包|70|

    1. 有问题的结果(title)

    %ins|20|128%超|40|16%火包|66|4 |
    分词会分错;权重是按照词的种类给的。是否对命中关键字(白名单)的词,当其权重小于70时直接提升到70?

    1. 使用电商命名实体进行权重确定(不好使)

    2. 白名单
      ['春', '夏', '秋', '冬', '男', '女', '宝宝', '童', '学生', '爸', '妈', '黑', '白', '灰', '厚', '薄', '暖', '凉', '式', '风', '码', '长', '短', '大', '小', '红', '橙', '黄', '绿', '青', '蓝', '紫', '岁', '年', 'ins', '冰', '欧', '韩', '宽松', '紧身', '胖', '瘦', '单', '双', '2019', '2018']

    处理方式:
    query:白名单中的词的权重提升到 max(本句中最大词权重的一半, 原权重)
    title:白名单中的词的权重提升到 max(75, 原权重)

    1. 数据预处理

    train_query : hs_dssm_dic_query_1 - | id | words_mainse_ids | se_keyword |
    train_title : hs_dssm_dic_title_3 - | id | words_mainse_ids | title |


    inference_query : hs_dssm_dic_query_inf_1 - | id | words_mainse_ids | query |
    inference_title : hs_dssm_dic_title_inf_1 - | id | words_mainse_ids | title |


    train query 数据重新获取

    drop table hs_dssm_dic_query_4;
    yes
    create table graph_embedding.hs_dssm_dic_query_4 as
    select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from
    (select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from
    (select id, search_kg:alinlp_termweight_ecom(se_keyword, "%", "{word}|{weight}", 1, 0) as tag_result from graph_embedding.hs_dssm_dic_query_1 where lengthb(se_keyword) > 0)a)b where lengthb(b.pair) > 0;

    drop resource hs_udf_87_1.py;
    yes
    add py /home/hengsong/query_co_video/src/hs_udf_87_1.py;
    CREATE FUNCTION hs_return_clean AS hs_udf_87_1.return_clean USING hs_udf_87_1.py;
    CREATE FUNCTION hs_change_score_1 AS hs_udf_87_1.Processor USING hs_udf_87_1.py;

    drop table hs_dssm_dic_query_5;
    yes
    create table hs_dssm_dic_query_5 as select id, word, weight, hs_change_score_1(word, weight) as new_weight, search_kg:alinlp_word_embedding(word, "100", "CONTENT_SEARCH") as word_emb from hs_dssm_dic_query_4;

    select title, hs_return_clean(title) as clean_title from graph_embedding.hs_tmp_149 limit 100;

    drop table hs_tmp_181;
    yes
    create table graph_embedding.hs_tmp_181 as select id, graph_embedding:hs_merge_emb_15(new_weight, word_emb) as emb from graph_embedding.hs_dssm_dic_query_5 group by id;

    train title 数据的重新获取

    create table graph_embedding.hs_dssm_dic_title_4 as
    select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from (select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from (select id, search_kg:alinlp_termweight_ecom(title, "%", "{word}|{weight}", 1, 0) as tag_result from graph_embedding.hs_tmp_149)a)b where lengthb(b.pair) > 0;

    create table hs_dssm_dic_title_6 as select id, word, weight, hs_change_score_1(word, weight) as new_weight, word_emb from hs_dssm_dic_title_5;

    drop resource hs_udaf_82_1.py;
    yes
    add py /home/hengsong/query_co_video/src/hs_udaf_82_1.py;
    CREATE FUNCTION hs_merge_emb_15 AS hs_udaf_82_1.Average USING hs_udaf_82_1.py;

    create table graph_embedding.hs_tmp_182 as select id, graph_embedding:hs_merge_emb_15(new_weight, word_emb) as emb from graph_embedding.hs_dssm_dic_title_6 group by id;

    inference query 数据重新获取

    drop table hs_dssm_dic_query_inf_4;
    yes
    create table graph_embedding.hs_dssm_dic_query_inf_4 as
    select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from
    (select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from
    (select id, search_kg:alinlp_termweight_ecom(query, "%", "{word}|{weight}", 1, 0) as tag_result from graph_embedding.hs_dssm_dic_query_inf_1 where lengthb(query) > 0)a)b where lengthb(b.pair) > 0;

    drop table hs_dssm_dic_query_inf_5;
    yes
    create table hs_dssm_dic_query_inf_5 as select id, word, weight, hs_change_score_1(word, weight) as new_weight, search_kg:alinlp_word_embedding(word, "100", "CONTENT_SEARCH") as word_emb from hs_dssm_dic_query_inf_4;

    drop table hs_tmp_184;
    yes
    create table graph_embedding.hs_tmp_184 as select id, graph_embedding:hs_merge_emb_15(new_weight, word_emb) as emb from graph_embedding.hs_dssm_dic_query_inf_5 group by id;

    inference title 数据重新获取

    create table graph_embedding.hs_dssm_dic_title_4 as
    select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from (select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from (select id, search_kg:alinlp_termweight_ecom(title, "%", "{word}|{weight}", 1, 0) as tag_result from graph_embedding.hs_tmp_149)a)b where lengthb(b.pair) > 0;

    create table hs_dssm_dic_title_inf_6 as select id, word, weight, hs_change_score_1(word, weight) as new_weight, word_emb from hs_dssm_dic_title_inf_3;

    create table graph_embedding.hs_tmp_185 as select id, graph_embedding:hs_merge_emb_15(new_weight, word_emb) as emb from graph_embedding.hs_dssm_dic_title_inf_6 group by id;

    1. 得到新的训练集

    drop table hs_tmp_183;
    yes
    create table hs_tmp_183
    as select c.se_keyword_mainse_ws, d.emb as title_mainse_ws, c.label from
    (select a.*, b.emb as se_keyword_mainse_ws from (select * from hs_dssm_train_v2_0)a left join (select * from hs_tmp_181)b on a.query_id == b.id)c left join (select * from hs_tmp_182)d on c.item_id == d.id;

    记得除掉NULL!!!

    drop table hs_train_data_dssm_v2_4;
    yes
    drop table hs_test_data_dssm_v2_4;
    yes
    PAI -name split -project algo_public
    -DinputTableName=graph_embedding.hs_tmp_183
    -Doutput1TableName=graph_embedding.hs_train_data_dssm_v2_4
    -Doutput2TableName=graph_embedding.hs_test_data_dssm_v2_4
    -Dfraction=0.8
    -DmemSizePerCore=4096
    -DcoreNum=100
    ;

    1. 构造测试集

    drop table hs_tmp_187;
    yes
    create table graph_embedding.hs_tmp_187 as
    select c.query_id, c.title_id, c.query, d.emb as title from
    (select a.*, b.emb as query from (select * from graph_embedding.hs_tmp_157)a left join (select * from graph_embedding.hs_tmp_184)b on a.query_id == b.id)c left join (select * from graph_embedding.hs_tmp_185)d on c.title_id == d.id;

    1. train & inference

    | query_id | video_id | query_ws | video_ws |
    alter table hs_tmp_187 change column query query_ws string;
    alter table hs_tmp_187 change column title_id video_id string;
    alter table hs_tmp_187 change column title video_ws string;

    pai -name tensorflow140 -Dscript="file:///home/hengsong/origin_deep_cluster_odps_8.tar.gz" -DentryFile="train_inference_v7.py" -Dcluster='{"worker":{"count":50, "cpu":200, "memory":4000}, "ps":{"count":10, "cpu":200, "memory":5000}}' -Dtables="odps://graph_embedding/tables/hs_train_data_dssm_v2_4,odps://graph_embedding/tables/hs_test_data_dssm_v2_4,odps://graph_embedding/tables/hs_tmp_187" -Doutputs="odps://graph_embedding/tables/hs_dssm_result_3" -DcheckpointDir="oss://bucket-automl/hengsong/?role_arn=acs:ram::1293303983251548:role/graph2018&host=cn-hangzhou.oss-internal.aliyun-inc.com" -DuserDefinedParameters="--learning_rate=3e-4 --batch_size=1024 --is_save_model=True --attention_type=1 --num_epochs=1 --ckpt=hs_ugc_video_4e_1.ckpt" -DuseSparseClusterSchema=True;

    相关文章

      网友评论

          本文标题:20190807工作进展

          本文链接:https://www.haomeiwen.com/subject/jhmndctx.html