Obtaining lineage information from Hive is relatively easy, and the available approaches are mature.
org.apache.hadoop.hive.ql.tools.LineageInfo: table-level lineage
Using LineageInfo to analyze table-level lineage in HiveQL
The class ships with its own main method, so it can be run standalone with the SQL statement passed in as an argument:
public static void main(String[] args) throws IOException, ParseException,
        SemanticException {
    String query = args[0];
    LineageInfo lep = new LineageInfo();
    lep.getLineageInfo(query);
    for (String tab : lep.getInputTableList()) {
        System.out.println("InputTable=" + tab);
    }
    for (String tab : lep.getOutputTableList()) {
        System.out.println("OutputTable=" + tab);
    }
}
The Hive source already gives a good working example here, so it can be used as-is.
Note, however, that the class does not appear to handle statements such as CREATE TABLE AS SELECT or LOAD.
You can add your own handling by building on the code in org.apache.hadoop.hive.ql.tools.LineageInfo#process, as sketched below.
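As one possible direction, here is a minimal sketch of that idea, assuming hive-exec 2.3.x: it parses a statement with ParseDriver, walks the AST with a plain recursion instead of LineageInfo's graph walker, and adds a case for TOK_CREATETABLE so that the target of a CREATE TABLE AS SELECT is reported as an output (TOK_TAB and TOK_TABREF are the tokens LineageInfo#process already handles; a TOK_LOAD case could be added the same way). The class name, the sample SQL, and the child-index assumptions are my own, not Hive's, and should be checked against your Hive version.

import org.apache.hadoop.hive.ql.parse.ASTNode;
import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer;
import org.apache.hadoop.hive.ql.parse.HiveParser;
import org.apache.hadoop.hive.ql.parse.ParseDriver;

public class CtasLineageSketch {
    public static void main(String[] args) throws Exception {
        // hypothetical CTAS statement, used only for illustration
        String sql = "CREATE TABLE db1.t_new AS SELECT id FROM db1.t_src";
        ASTNode tree = new ParseDriver().parse(sql);
        // strip the token-less wrapper nodes, the same way LineageInfo#getLineageInfo does
        while (tree.getToken() == null && tree.getChildCount() > 0) {
            tree = (ASTNode) tree.getChild(0);
        }
        walk(tree);
    }

    // Plain recursive walk instead of a DefaultGraphWalker, to keep the sketch small.
    private static void walk(ASTNode node) {
        if (node.getToken() != null) {
            switch (node.getToken().getType()) {
                case HiveParser.TOK_CREATETABLE: // CTAS: first child is the TOK_TABNAME of the new table (assumption)
                    System.out.println("OutputTable="
                            + BaseSemanticAnalyzer.getUnescapedName((ASTNode) node.getChild(0)));
                    break;
                case HiveParser.TOK_TAB:         // INSERT target table
                    System.out.println("OutputTable="
                            + BaseSemanticAnalyzer.getUnescapedName((ASTNode) node.getChild(0)));
                    break;
                case HiveParser.TOK_TABREF:      // tables referenced in FROM clauses
                    System.out.println("InputTable="
                            + BaseSemanticAnalyzer.getUnescapedName((ASTNode) node.getChild(0)));
                    break;
                default:
                    break;
            }
        }
        for (int i = 0; i < node.getChildCount(); i++) {
            walk((ASTNode) node.getChild(i));
        }
    }
}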
org.apache.hadoop.hive.ql.hooks.LineageLogger: column-level lineage
Using LineageLogger to analyze column-level lineage in HiveQL
This is a hook that ships with Hive, and it is also easy to use.
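For reference, it is registered as a post-execution hook, i.e. set hive.exec.post.hooks=org.apache.hadoop.hive.ql.hooks.LineageLogger; in the CLI or in hive-site.xml. The snippet below is only my own minimal sketch of setting that same property programmatically through HiveConf, for embedded or test drivers; it is not taken from the post.

import org.apache.hadoop.hive.conf.HiveConf;

public class EnableLineageLogger {
    public static void main(String[] args) {
        // Register Hive's built-in column-lineage hook as a post-execution hook.
        // Equivalent CLI form: set hive.exec.post.hooks=org.apache.hadoop.hive.ql.hooks.LineageLogger;
        HiveConf conf = new HiveConf();
        conf.set("hive.exec.post.hooks", "org.apache.hadoop.hive.ql.hooks.LineageLogger");
        System.out.println("hive.exec.post.hooks=" + conf.get("hive.exec.post.hooks"));
    }
}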
I followed the second approach for my own table-level exploration, and made a few changes to the original LineageLogger idea:
- Added support for the LOAD statement, since data flowing from outside into Hive is, in my view, also part of the lineage.
- Filtered out simple queries that have no output target.
The code is as follows.
pom dependency:
<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-exec</artifactId>
    <version>2.3.4</version>
</dependency>
import java.util.HashSet;

import org.apache.hadoop.hive.ql.QueryPlan;
import org.apache.hadoop.hive.ql.hooks.Entity;
import org.apache.hadoop.hive.ql.hooks.ExecuteWithHookContext;
import org.apache.hadoop.hive.ql.hooks.HookContext;
import org.apache.hadoop.hive.ql.hooks.ReadEntity;
import org.apache.hadoop.hive.ql.hooks.WriteEntity;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.optimizer.lineage.LineageCtx;
import org.apache.hadoop.hive.ql.plan.HiveOperation;
import org.apache.hadoop.hive.ql.session.SessionState;

public class LineageHook implements ExecuteWithHookContext {

    private static final HashSet<String> OPERATION_NAMES = new HashSet<String>();
    private static final HashSet<String> INPUTS = new HashSet<String>();
    private static final HashSet<String> OUTPUTS = new HashSet<String>();

    static {
        OPERATION_NAMES.add(HiveOperation.QUERY.getOperationName());
        OPERATION_NAMES.add(HiveOperation.CREATETABLE_AS_SELECT.getOperationName());
        OPERATION_NAMES.add(HiveOperation.ALTERVIEW_AS.getOperationName());
        OPERATION_NAMES.add(HiveOperation.CREATEVIEW.getOperationName());
        // on top of the original list, also allow LOAD statements
        OPERATION_NAMES.add(HiveOperation.LOAD.getOperationName());
    }

    public void run(HookContext hookContext) throws Exception {
        INPUTS.clear();
        OUTPUTS.clear();
        QueryPlan plan = hookContext.getQueryPlan();
        LineageCtx.Index index = hookContext.getIndex();
        SessionState ss = SessionState.get();
        if (ss != null && index != null
                && OPERATION_NAMES.contains(plan.getOperationName())
                && !plan.isExplain()) {
            System.out.println(plan.getOperationName());
            // outputs
            for (WriteEntity output : plan.getOutputs()) {
                Entity.Type entityType = output.getType();
                if (entityType == Entity.Type.TABLE
                        || entityType == Entity.Type.PARTITION
                        || entityType == Entity.Type.LOCAL_DIR // also accept LOCAL_DIR
                        || entityType == Entity.Type.DFS_DIR   // also accept DFS_DIR
                ) {
                    Table trgTb = output.getTable();
                    String trgTbName = null;
                    if (trgTb != null) {
                        trgTbName = trgTb.getDbName() + "." + trgTb.getTableName();
                    } else {
                        trgTbName = output.getD().toString();
                        // e.g. hdfs://master:8020/tmp/hive/admin/27808155-878a-4446-9c4e-a2f3388301fc/hive_2020-06-19_16-47-52_939_789950828629061887-1/-mr-10001
                        // skip the paths where MR stages intermediate data
                        if (trgTbName.matches("hdfs://.+/tmp/hive/.+")) {
                            continue;
                        }
                    }
                    // System.out.println("target table " + trgTbName);
                    if (OUTPUTS.contains(trgTbName)) {
                        continue;
                    } else {
                        OUTPUTS.add(trgTbName);
                    }
                    break;
                }
            }
            // if there is no output, skip the inputs; this filters out simple queries with no output target
            if (OUTPUTS.size() == 0) {
                return;
            }
            // inputs
            for (ReadEntity input : plan.getInputs()) {
                Entity.Type entityType = input.getType();
                if (entityType == Entity.Type.TABLE
                        || entityType == Entity.Type.PARTITION
                        || entityType == Entity.Type.LOCAL_DIR
                        || entityType == Entity.Type.DFS_DIR
                ) {
                    Table srcTb = input.getTable();
                    String srcTbName = null;
                    if (srcTb != null) {
                        srcTbName = srcTb.getDbName() + "." + srcTb.getTableName();
                    } else {
                        srcTbName = input.getD().toString();
                        if (srcTbName.matches("hdfs://.+/tmp/hive/.+")) {
                            continue;
                        }
                    }
                    // a HashSet deduplicates the sources: a multi-partition input yields several ReadEntity objects with the same table name
                    INPUTS.add(srcTbName);
                    // System.out.println("src table " + srcTbName);
                }
            }
            for (String input : INPUTS) {
                System.out.println("INPUT=" + input);
            }
            for (String output : OUTPUTS) {
                System.out.println("OUTPUT=" + output);
            }
        }
    }
}
Now for some experiments:
- Start the Hive CLI (the compiled LineageHook class must be on Hive's classpath)
- set hive.exec.pre.hooks=LineageHook
insert overwrite table gdm.gdm_cus_tag_di partition (dt)
select tmp.user_id, tag_name, tmp.act_type, sum(tmp.cnt) as cnt, tmp.dt from
(select a.user_id as user_id, b.tags as tags, 2 as act_type, a.match_cnt as cnt, a.dt as dt
from fdm.fdm_cms_matches_da b, gdm.gdm_cus_match_di a where a.match_id = b.match_id and a.dt='2020-05-25'
union all
select a.user_id as user_id, b.tags as tags, 1 as act_type, a.game_cnt as cnt, a.dt as dt
from fdm.fdm_cms_subgame_public_da b, gdm.gdm_cus_game_di a where a.game_id = b.game_id and a.dt='2020-05-25'
union all
select a.user_id as user_id, b.tags as tags, 3 as act_type, a.sign_cnt as cnt, a.dt as dt
from fdm.fdm_cms_matches_da b, gdm.gdm_cus_signup_di a where a.match_id = b.match_id and a.dt='2020-05-25'
union all
select a.user_id as user_id, b.tags as tags, 4 as act_type, a.cancel_cnt as cnt, a.dt as dt
from fdm.fdm_cms_matches_da b, gdm.gdm_cus_cl_signup_di a where a.match_id = b.match_id and a.dt='2020-05-25') tmp
lateral view explode(split(tmp.tags, ',')) tagtable as tag_name
group by user_id, tag_name, act_type, dt;
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/hadoop/output' ROW FORMAT DELIMITED FIELDS TERMINATED by ',' select * from fdm.fdm_cms_matches_da;
LOAD DATA INPATH '/user/hive/external/mongo_ipt/relation/follow_num/follow_num_sum.csv'
OVERWRITE INTO table bdm.bdm_relation_follow_num_sum_di partition(dt='2020-06-19');
For now I have only tested a few typical cases.
Thanks to the authors whose work I referenced for this post.
That's a wrap!!!