How Hive Translates HQL into MR Programs

Author: 烂泥_119c | Published 2020-01-05 23:36

    How does Hive translate HQL into MapReduce programs?


    In short, Hive offers users a set of interactive interfaces through which it receives SQL statements; its Driver, together with the metadata in the Metastore, translates those statements into MapReduce jobs, submits them to Hadoop for execution, and finally returns the results to the user interface.

    1. User interfaces: Client
      CLI (the hive shell), JDBC/ODBC (programmatic access, e.g. from Java), and the Web UI (browser access to Hive); a minimal JDBC sketch follows this list.
    2. Metadata: Metastore
      The metadata covers table names, the database each table belongs to (default by default), the table owner, column and partition fields, the table type (managed or external), the directory where the table data is stored, and so on.
      By default it lives in the embedded Derby database; MySQL is recommended for the Metastore in practice.
    3. Hadoop
      HDFS is used for storage and MapReduce for computation.
    4. Driver
      (1) Parser (SQL Parser): converts the SQL string into an abstract syntax tree (AST), usually with a third-party parser generator such as ANTLR, then checks the AST, e.g. whether the tables and columns exist and whether the SQL semantics are valid.
      (2) Compiler: compiles the AST into a logical execution plan.
      (3) Optimizer (Query Optimizer): optimizes the logical execution plan.
      (4) Executor (Execution): turns the logical execution plan into a runnable physical plan, which for Hive means MR or Spark jobs.
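    As a minimal sketch of the JDBC/ODBC interface in item 1, the program below opens a HiveServer2 connection and submits an HQL statement. The class name, connection URL, credentials and table are placeholders, and the hive-jdbc driver must be on the classpath:

    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.ResultSet;
    import java.sql.Statement;

    public class HiveJdbcExample {
      public static void main(String[] args) throws Exception {
        // HiveServer2 JDBC URL; host, port and database are assumptions for this sketch.
        String url = "jdbc:hive2://localhost:10000/default";
        try (Connection conn = DriverManager.getConnection(url, "hive", "");
             Statement stmt = conn.createStatement();
             ResultSet rs = stmt.executeQuery(
                 "SELECT pageid, age, count(1) FROM page_view GROUP BY pageid, age")) {
          // The statement is handed to the Driver, which parses, compiles, optimizes
          // and runs it; the rows come back through the same JDBC interface.
          while (rs.next()) {
            System.out.println(rs.getString(1) + "," + rs.getString(2) + "," + rs.getLong(3));
          }
        }
      }
    }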
    • Execution flow:

      1. ANTLR parses the SQL according to Hive's grammar file into a syntax tree made up of tokens and literals, e.g. TOK_QUERY, TOK_FROM, TOK_SELECT (see the sketch after this list for how to dump such a tree).
      2. The syntax tree is walked and abstracted into QueryBlocks, the basic query units, each containing an input source, a computation, and an output; a QueryBlock can be thought of as a subquery.
      3. Each QueryBlock is walked to generate an operator tree, containing TableScanOperator, SelectOperator, and so on.
      4. The optimizer transforms the operator tree to, among other things, reduce the number of MR jobs and shuffle stages.
      5. The operator tree is translated into the final MR program and the job is submitted.
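    Step 1 can be seen directly by calling Hive's own parser. The sketch below uses org.apache.hadoop.hive.ql.parse.ParseDriver to dump the AST of a query; it assumes the hive-exec dependency is on the classpath, and the parse API differs slightly across Hive versions, so treat it as illustrative only:

    import org.apache.hadoop.hive.ql.parse.ASTNode;
    import org.apache.hadoop.hive.ql.parse.ParseDriver;

    public class DumpAst {
      public static void main(String[] args) throws Exception {
        ParseDriver pd = new ParseDriver();
        // Turn the HQL string into Hive's abstract syntax tree.
        ASTNode ast = pd.parse("SELECT pageid, age, count(1) FROM page_view GROUP BY pageid, age");
        // dump() prints the tree, with nodes such as TOK_QUERY, TOK_FROM,
        // TOK_TABREF, TOK_SELECT and TOK_GROUPBY.
        System.out.println(ast.dump());
      }
    }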
    • Example: the execution plan of a simple filter query. Because it involves no aggregation or join, it compiles to a single fetch stage (TableScan → Filter → Select) and never launches a MapReduce job; queries with GROUP BY or JOIN, such as the ones mimicked by the programs below, would.

    explain select * from sqoop where id > 0;
    Stage-0
       Fetch Operator
          limit:-1
          Select Operator [SEL_2]
             outputColumnNames:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9","_col10","_col11","_col12","_col13","_col14","_col15","_col16","_col17","_col18","_col19","_col20","_col21","_col22","_col23","_col24","_col25","_col26","_col27","_col28","_col29","_col30","_col31"]
             Filter Operator [FIL_4]
                predicate:(id > 0) (type: boolean)
                TableScan [TS_0]
                   alias:sqoop
    
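    To make the translation concrete, the hand-written MapReduce job below does roughly what Hive's generated job would do for the GROUP BY query in its leading comment. It assumes each line of the input file pageAge.txt is already in "pageid,age" form.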
    package com.test;
    
    import java.io.IOException;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    /**
     * @author phil.zhang
     * @date 2019/4/3
     */
    // SELECT pageid, age, count(1) FROM TABLE GROUP BY pageid,age
    public class Hive2MR {

      // Mapper: each input line is assumed to already be "pageid,age"; the whole line
      // is emitted as the grouping key with a count of 1 (the map side of the GROUP BY).
      static class PageMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
          String data = value.toString();
          context.write(new Text(data), new IntWritable(1));
        }
      }

      // Reducer: the shuffle groups records by key ("pageid,age"), so summing the 1s
      // gives count(1) for each group.
      static class PageReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
          int total = 0;
          for (IntWritable value : values) {
            total = total + value.get();
          }
          context.write(key, new IntWritable(total));
        }
      }

      public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        System.setProperty("hadoop.home.dir", "c:\\hadoop\\2.7.3");
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(Hive2MR.class);
        job.setMapperClass(PageMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setReducerClass(PageReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path("C:\\zf\\pageAge.txt"));
        FileOutputFormat.setOutputPath(job, new Path("C:\\zf\\result"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
      }
    }
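    Since Hive can also use Spark as its execution engine, the following program sketches how the logical plan for the JOIN query in its comment maps onto RDD operations: both inputs are keyed by userid, joined, and then projected down to pageid and age.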
    
    package com.test;
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.SparkContext;
    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.rdd.RDD;
    import scala.Tuple2;
    
    /**
     * @author phil.zhang
     * @date 2019/4/3
     */
    // SELECT pv.pageid, u.age FROM page_view pv JOIN user u ON (pv.userid = u.userid);
    //
    // page.txt rows: pageid,userid,time
    // user.txt rows: userid,age,gender
    public class Hive2Spark {
    
      public static void main(String[] args) {
        System.setProperty("hadoop.home.dir","c:\\hadoop\\2.7.3");
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("hive");
        SparkContext context = new SparkContext(conf);
        // Load the two inputs with a single partition each (small local test files).
        RDD<String> page = context.textFile("c:/zf/page.txt", 1);
        RDD<String> user = context.textFile("c:/zf/user.txt", 1);
        // page rows "pageid,userid,time" -> key by userid, keep pageid as the value
        JavaPairRDD<String, String> pagePair = page.toJavaRDD()
            .map(str -> str.split(",")).mapToPair(strs -> new Tuple2<>(strs[1], strs[0]));
        for (Tuple2<String, String> tuple2 : pagePair.collect()) {
          System.out.println(tuple2._1 + ":" + tuple2._2);
        }
        // user rows "userid,age,gender" -> key by userid, keep age as the value
        JavaPairRDD<String, String> userPair = user.toJavaRDD()
            .map(str -> str.split(",")).mapToPair(strs -> new Tuple2<>(strs[0], strs[1]));
        for (Tuple2<String, String> tuple2 : userPair.collect()) {
          System.out.println(tuple2._1 + ":" + tuple2._2);
        }
        // Shuffle join on userid, producing (userid, (pageid, age)) pairs.
        JavaPairRDD<String, Tuple2<String, String>> pairRDD = pagePair.join(userPair);
        for (Tuple2<String, Tuple2<String, String>> tuple2 : pairRDD.collect()) {
          System.out.println(tuple2._1 + ":" + tuple2._2()._1() +"," + tuple2._2()._2());
        }
        // Keep only the SELECT list: pageid, age.
        JavaRDD<String> result = pairRDD.map(pair -> pair._2()._1 + "," + pair._2()._2());
        for (String s : result.collect()) {
          System.out.println(s);
        }
      }
    }
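    Because pagePair and userPair are both keyed by userid, join performs a shuffle join on that column, which corresponds to the common (reduce-side) join Hive would generate on MapReduce when a map-side join is not applicable; the final map keeps only the SELECT list (pageid, age). The intermediate collect()/println loops are only there to inspect each stage when running locally.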
    
