Extracting Hive Table Lineage


Author: 嘻嘻是小猪 | Published 2020-06-19 17:59

    Extracting lineage from Hive is fairly easy, and the available approaches are mature.

    org.apache.hadoop.hive.ql.tools.LineageInfo (table-level lineage)

    Using LineageInfo to analyze table-level lineage in HiveQL

    The class ships with its own main method, so it can be run standalone with the SQL statement passed in as an argument:

    public static void main(String[] args) throws IOException, ParseException,
          SemanticException {
    
        String query = args[0];
    
        LineageInfo lep = new LineageInfo();
    
        lep.getLineageInfo(query);
    
        for (String tab : lep.getInputTableList()) {
          System.out.println("InputTable=" + tab);
        }
    
        for (String tab : lep.getOutputTableList()) {
          System.out.println("OutputTable=" + tab);
        }
      }
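
    For a quick test you can simply delegate to that main method. Below is a minimal sketch, assuming hive-exec and its transitive dependencies are on the classpath; the query string is only illustrative:

    // Minimal driver around org.apache.hadoop.hive.ql.tools.LineageInfo
    public class LineageInfoDemo {
        public static void main(String[] args) throws Exception {
            String query = "INSERT OVERWRITE TABLE dw.t_out SELECT a.id FROM dw.t_in a";
            // prints one InputTable=... / OutputTable=... line per table found in the query
            org.apache.hadoop.hive.ql.tools.LineageInfo.main(new String[] { query });
        }
    }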
    

    The Hive source already provides a good, ready-to-use example.
    Note, however, that it does not appear to support statements such as CREATE TABLE AS SELECT or LOAD.
    You can add your own handling based on the code in org.apache.hadoop.hive.ql.tools.LineageInfo#process, as sketched below.
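
    A rough sketch of such an extension follows. Assumptions: a CTAS statement is parsed into a TOK_CREATETABLE node whose TOK_TABNAME child names the target table, and pt / OutputTableList are the local variable and field already used inside process(); verify the AST shape against your Hive version before relying on this.

    // Hypothetical extra case added to the switch in LineageInfo#process
    case HiveParser.TOK_CREATETABLE:
        for (int i = 0; i < pt.getChildCount(); i++) {
            ASTNode child = (ASTNode) pt.getChild(i);
            if (child.getToken().getType() == HiveParser.TOK_TABNAME) {
                // record the created table as an output, mirroring the TOK_TAB case
                OutputTableList.add(BaseSemanticAnalyzer.getUnescapedName(child));
            }
        }
        break;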

    org.apache.hadoop.hive.ql.hooks.LineageLogger (column-level lineage)

    Using LineageLogger to analyze column-level lineage in HiveQL

    This is a hook that ships with Hive and is simple to use.
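
    If you just want to see its output, register it as a post-execution hook, for example:

    set hive.exec.post.hooks=org.apache.hadoop.hive.ql.hooks.LineageLogger;

    It then emits the column-level lineage of each qualifying statement as JSON; where that output lands depends on your logging configuration.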


    I followed the second approach for my own table-level exploration,
    making a few changes to the original LineageLogger design:

    • Added support for the LOAD statement, since data flowing from outside into Hive is, in my view, also part of the lineage
    • Filtered out simple queries that have no output target

    The code is as follows.

    Maven dependency:

    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-exec</artifactId>
        <version>2.3.4</version>
    </dependency>
    
    import java.util.HashSet;

    import org.apache.hadoop.hive.ql.QueryPlan;
    import org.apache.hadoop.hive.ql.hooks.Entity;
    import org.apache.hadoop.hive.ql.hooks.ExecuteWithHookContext;
    import org.apache.hadoop.hive.ql.hooks.HookContext;
    import org.apache.hadoop.hive.ql.hooks.ReadEntity;
    import org.apache.hadoop.hive.ql.hooks.WriteEntity;
    import org.apache.hadoop.hive.ql.metadata.Table;
    import org.apache.hadoop.hive.ql.optimizer.lineage.LineageCtx;
    import org.apache.hadoop.hive.ql.plan.HiveOperation;
    import org.apache.hadoop.hive.ql.session.SessionState;

    public class LineageHook implements ExecuteWithHookContext {

        // statement types for which lineage is collected
        private static final HashSet<String> OPERATION_NAMES = new HashSet<String>();
        // source / target names collected for the current statement
        private static final HashSet<String> INPUTS = new HashSet<String>();
        private static final HashSet<String> OUTPUTS = new HashSet<String>();
    
        static {
            OPERATION_NAMES.add(HiveOperation.QUERY.getOperationName());
            OPERATION_NAMES.add(HiveOperation.CREATETABLE_AS_SELECT.getOperationName());
            OPERATION_NAMES.add(HiveOperation.ALTERVIEW_AS.getOperationName());
            OPERATION_NAMES.add(HiveOperation.CREATEVIEW.getOperationName());
            OPERATION_NAMES.add(HiveOperation.LOAD.getOperationName()); // added to the original set: also handle LOAD statements
        } 
    
        public void run(HookContext hookContext) throws Exception {
    
            INPUTS.clear();
            OUTPUTS.clear();
    
            QueryPlan plan = hookContext.getQueryPlan();
            LineageCtx.Index index = hookContext.getIndex();
            SessionState ss = SessionState.get();
            if (ss != null && index != null
                    && OPERATION_NAMES.contains(plan.getOperationName())
                    && !plan.isExplain()) {
    
                System.out.println(plan.getOperationName());
    
            // outputs: collect target tables / directories
                for (WriteEntity output : plan.getOutputs()) {
                    Entity.Type entityType = output.getType();
                    if (entityType == Entity.Type.TABLE
                            || entityType == Entity.Type.PARTITION
                            || entityType == Entity.Type.LOCAL_DIR // also allow LOCAL_DIR
                            || entityType == Entity.Type.DFS_DIR // also allow DFS_DIR
                            ) {
                        Table trgTb = output.getTable();
                        String trgTbName = null;
                        if (trgTb!=null) {
                            trgTbName = trgTb.getDbName()+"."+trgTb.getTableName();
                        }else {
                            trgTbName = output.getD().toString();
                            //hdfs://master:8020/tmp/hive/admin/27808155-878a-4446-9c4e-a2f3388301fc/hive_2020-06-19_16-47-52_939_789950828629061887-1/-mr-10001
                            if (trgTbName.matches("hdfs://.+/tmp/hive/.+")) { // skip paths of intermediate MR staging data
                                continue;
                            }
                        }
    //                    System.out.println("target table "+trgTbName);
                        if (OUTPUTS.contains(trgTbName)) {
                            continue;
                        }else {
                            OUTPUTS.add(trgTbName);
                        }
                        break; // only the first real output entity is recorded
                    }
                }
    
            if (OUTPUTS.size()==0) { // no output: skip the inputs, which filters out simple queries without an output target
                    return;
                }
    
            // inputs: collect source tables / directories
                for (ReadEntity input : plan.getInputs()) {
                    Entity.Type entityType = input.getType();
                    if (entityType == Entity.Type.TABLE
                            || entityType == Entity.Type.PARTITION
                            || entityType == Entity.Type.LOCAL_DIR
                            || entityType == Entity.Type.DFS_DIR
                            ) {
                        Table srcTb = input.getTable();
    
                        String srcTbName = null;
                        if (srcTb!=null) {
                            srcTbName = srcTb.getDbName()+"."+srcTb.getTableName();
                        }else {
                            srcTbName = input.getD().toString();
                            if (srcTbName.matches("hdfs://.+/tmp/hive/.+")) {
                                continue;
                            }
                        }
                        INPUTS.add(srcTbName); // use a HashSet for source names: a multi-partition input yields multiple ReadEntity objects with the same table name
    //                    System.out.println("src table "+srcTbName);
                    }
                }
    
                for (String input : INPUTS) {
                    System.out.println("INPUT="+input);
                }
    
                for (String output : OUTPUTS) {
                    System.out.println("OUTPUT="+output);
                }
            }
        }
    }
    

    Now for the experiment.

    1. Start the Hive CLI (the jar containing LineageHook has to be on the CLI classpath)
    2. set hive.exec.pre.hooks=LineageHook
    insert overwrite table gdm.gdm_cus_tag_di partition (dt)
    select tmp.user_id, tag_name, tmp.act_type, sum(tmp.cnt) as cnt, tmp.dt from 
    (select a.user_id as user_id, b.tags as tags, 2 as act_type, a.match_cnt as cnt, a.dt as dt 
    from fdm.fdm_cms_matches_da b, gdm.gdm_cus_match_di a where a.match_id = b.match_id and a.dt='2020-05-25'
    union all
    select a.user_id as user_id, b.tags as tags, 1 as act_type, a.game_cnt as cnt, a.dt as dt 
    from fdm.fdm_cms_subgame_public_da b, gdm.gdm_cus_game_di a where a.game_id = b.game_id and a.dt='2020-05-25'
    union all
    select a.user_id as user_id, b.tags as tags, 3 as act_type, a.sign_cnt as cnt, a.dt as dt
    from fdm.fdm_cms_matches_da b, gdm.gdm_cus_signup_di a where a.match_id = b.match_id and a.dt='2020-05-25'
    union all
    select a.user_id as user_id, b.tags as tags, 4 as act_type, a.cancel_cnt as cnt, a.dt as dt
    from fdm.fdm_cms_matches_da b, gdm.gdm_cus_cl_signup_di a where a.match_id = b.match_id and a.dt='2020-05-25') tmp
    lateral view explode(split(tmp.tags, ',')) tagtable as tag_name 
    group by user_id, tag_name, act_type, dt;
    
    INSERT OVERWRITE LOCAL DIRECTORY '/tmp/hadoop/output' ROW FORMAT DELIMITED FIELDS TERMINATED by ',' select * from fdm.fdm_cms_matches_da; 
    
    LOAD DATA INPATH '/user/hive/external/mongo_ipt/relation/follow_num/follow_num_sum.csv'
    OVERWRITE INTO table bdm.bdm_relation_follow_num_sum_di partition(dt='2020-06-19');
    

    For now, only a few typical cases have been tested.


    Thanks to the authors whose articles I referenced.

    That's a wrap!!!
