Hive-UDAF

作者: raincoffee | 来源:发表于2017-04-06 16:21 被阅读3132次

    UDAF

    前两节分别介绍了基础UDF和UDTF,这一节我们将介绍最复杂的用户自定义聚合函数(UDAF)。用户自定义聚合函数(UDAF)接受从零行到多行的零个到多个列,然后返回单一值,如sum()、count()。要实现UDAF,我们需要实现下面的类:

    org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver

    org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator

    AbstractGenericUDAFResolver检查输入参数,并且指定使用哪个evaluator。在AbstractGenericUDAFResolver里,只需要实现一个方法:

    // The single method a resolver has to implement: given the argument types of the
    // call, return the evaluator that should perform the aggregation.
    public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException;  
    

    但是,主要的逻辑处理还是在Evaluator中。我们需要继承GenericUDAFEvaluator,并且实现下面几个方法:

    
    // Both the inputs and the returned value are ObjectInspectors.
    public  ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException;  
      
    // The AggregationBuffer holds the intermediate result of the aggregation.
    abstract AggregationBuffer getNewAggregationBuffer() throws HiveException;  
      
    // Resets the given AggregationBuffer.
    public void reset(AggregationBuffer agg) throws HiveException;  
      
    // Processes one input record.
    public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException;  
      
    // Produces a partial result, i.e. one portion of the overall output data.
    public Object terminatePartial(AggregationBuffer agg) throws HiveException;  
      
    // Merges a partial result into the given buffer.
    public void merge(AggregationBuffer agg, Object partial) throws HiveException;  
      
    // Produces the final result.
    public Object terminate(AggregationBuffer agg) throws HiveException;  
    

    在处理之前,先看下UDAF的Enum GenericUDAFEvaluator.Mode。Mode有4种情况:

    • PARTIAL1:Mapper阶段。从原始数据到部分聚合,会调用iterate()和terminatePartial()。
    • PARTIAL2:Combiner阶段,在Mapper端合并Mapper的结果数据。从部分聚合到部分聚合,会调用merge()和terminatePartial()。
    • FINAL:Reducer阶段。从部分聚合数据到完全聚合,会调用merge()和terminate()。
    • COMPLETE:出现这个阶段,表示MapReduce中只用Mapper没有Reducer,所以Mapper端直接输出结果了。从原始数据到完全聚合,会调用iterate()和terminate()。

    GenericUDAFResolver2

    /**
     * Resolver contract that selects an evaluator from the argument types alone.
     *
     * <p>Marked deprecated; {@code GenericUDAFResolver2} extends this interface with a
     * richer {@code getEvaluator} overload.
     */
    @Deprecated
    public interface GenericUDAFResolver {
        /**
         * Returns the evaluator to use for a call with the given argument types.
         *
         * @param paramArrayOfTypeInfo type information of the call's arguments
         * @return the evaluator that performs the aggregation
         * @throws SemanticException if the arguments are not acceptable
         */
        GenericUDAFEvaluator getEvaluator(TypeInfo[] paramArrayOfTypeInfo) throws SemanticException;
    }
    

    已废弃

    /**
     * Resolver contract that receives a {@code GenericUDAFParameterInfo} describing the
     * call instead of only the bare argument types.
     */
    public interface GenericUDAFResolver2 extends GenericUDAFResolver {
        /**
         * Returns the evaluator to use for the call described by the given parameter info.
         *
         * @param paramGenericUDAFParameterInfo description of the call's parameters
         * @return the evaluator that performs the aggregation
         * @throws SemanticException if the parameters are not acceptable
         */
        GenericUDAFEvaluator getEvaluator(GenericUDAFParameterInfo paramGenericUDAFParameterInfo)
                throws SemanticException;
    }
    

    GenericUDAFEvaluator

    /**
     * Base class for the evaluation logic of a generic UDAF.
     *
     * <p>Subclasses implement the buffer management and the four aggregation steps
     * ({@link #iterate}, {@link #terminatePartial}, {@link #merge}, {@link #terminate});
     * {@link #aggregate} and {@link #evaluate} pick the right step from the {@link Mode}
     * stored by {@link #init}.
     */
    @UDFType(deterministic = true)
    public abstract class GenericUDAFEvaluator implements Closeable {
        /** Mode recorded by {@link #init}; drives {@link #aggregate} and {@link #evaluate}. */
        Mode mode;

        /**
         * Tells whether the memory footprint of the given buffer can be estimated.
         *
         * @param buffer the aggregation buffer to inspect
         * @return {@code true} only when the buffer extends {@link AbstractAggregationBuffer}
         *         and its class carries {@code @AggregationType(estimable = true)}
         */
        public static boolean isEstimable(AggregationBuffer buffer) {
            if (!(buffer instanceof AbstractAggregationBuffer)) {
                return false;
            }
            // Reflective lookup: only works because AggregationType has RUNTIME retention.
            AggregationType annotation = buffer.getClass().getAnnotation(AggregationType.class);
            return annotation != null && annotation.estimable();
        }

        /**
         * Hook for receiving the MapReduce context; does nothing by default.
         *
         * @param mapredContext the context handed in by the caller
         */
        public void configure(MapredContext mapredContext) {
        }

        /**
         * Records the mode this evaluator runs in. This base implementation returns
         * {@code null}; subclasses override it and return the inspector of their output.
         *
         * @param m          the mode of this evaluation
         * @param parameters inspectors of the incoming values
         * @return {@code null} in this base implementation
         * @throws HiveException declared for overriding implementations
         */
        public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
            this.mode = m;
            return null;
        }

        /** Creates a new buffer for intermediate aggregation state. */
        public abstract AggregationBuffer getNewAggregationBuffer() throws HiveException;

        /** Resets the given buffer. */
        public abstract void reset(AggregationBuffer agg) throws HiveException;

        /** Releases resources; does nothing by default. */
        @Override
        public void close() throws IOException {
        }

        /**
         * Feeds one input into the buffer: {@link #iterate} in {@code PARTIAL1} and
         * {@code COMPLETE} mode, otherwise {@link #merge} with the single element of
         * {@code parameters}.
         *
         * @param agg        the buffer to update
         * @param parameters the raw row values, or a one-element array holding a partial result
         * @throws HiveException propagated from the invoked step
         */
        public void aggregate(AggregationBuffer agg, Object[] parameters) throws HiveException {
            if (this.mode == Mode.PARTIAL1 || this.mode == Mode.COMPLETE) {
                iterate(agg, parameters);
            } else {
                assert (parameters.length == 1);
                merge(agg, parameters[0]);
            }
        }

        /**
         * Produces the output for the buffer: {@link #terminatePartial} in {@code PARTIAL1}
         * and {@code PARTIAL2} mode, otherwise {@link #terminate}.
         *
         * @param agg the buffer to read
         * @return the partial or final result, depending on the mode
         * @throws HiveException propagated from the invoked step
         */
        public Object evaluate(AggregationBuffer agg) throws HiveException {
            if (this.mode == Mode.PARTIAL1 || this.mode == Mode.PARTIAL2) {
                return terminatePartial(agg);
            }
            return terminate(agg);
        }

        /** Processes one row of original input values. */
        public abstract void iterate(AggregationBuffer agg, Object[] parameters)
                throws HiveException;

        /** Returns the partial aggregation result held in the buffer. */
        public abstract Object terminatePartial(AggregationBuffer agg) throws HiveException;

        /** Merges a partial aggregation result into the buffer. */
        public abstract void merge(AggregationBuffer agg, Object partial) throws HiveException;

        /** Returns the final aggregation result held in the buffer. */
        public abstract Object terminate(AggregationBuffer agg) throws HiveException;

        /** Buffer base class that can report a size estimate. */
        public static abstract class AbstractAggregationBuffer implements AggregationBuffer {
            /**
             * Returns the size estimate of this buffer; {@code -1} unless overridden.
             */
            public int estimate() {
                return -1;
            }
        }

        /** Marker interface for aggregation buffers. */
        public interface AggregationBuffer {
        }

        /** The four modes an evaluator can be initialised with. */
        public enum Mode {
            PARTIAL1, PARTIAL2, FINAL, COMPLETE
        }

        /**
         * Marks a buffer class as estimable (or not). Needs RUNTIME retention: with the
         * default CLASS retention {@link #isEstimable} would never see the annotation.
         */
        @java.lang.annotation.Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
        public @interface AggregationType {
            boolean estimable();
        }
    }
    

    例子

    count
    /*** Eclipse Class Decompiler plugin, copyright (c) 2016 Chen Chao (cnfree2000@hotmail.com) ***/
    package org.apache.hadoop.hive.ql.udf.generic;
    
    import org.apache.commons.logging.Log;
    import org.apache.commons.logging.LogFactory;
    import org.apache.hadoop.hive.ql.exec.Description;
    import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
    import org.apache.hadoop.hive.ql.metadata.HiveException;
    import org.apache.hadoop.hive.ql.parse.SemanticException;
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
    import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
    import org.apache.hadoop.io.LongWritable;
    
    /**
     * Resolver for the {@code count} aggregate. It validates the call shape
     * ({@code count(*)}, {@code count(expr)}, {@code count(DISTINCT expr, ...)}) and hands
     * out a {@link GenericUDAFCountEvaluator}.
     */
    @Description(name = "count", value = "_FUNC_(*) - Returns the total number of retrieved rows, including rows containing NULL values.\n_FUNC_(expr) - Returns the number of rows for which the supplied expression is non-NULL.\n_FUNC_(DISTINCT expr[, expr...]) - Returns the number of rows for which the supplied expression(s) are unique and non-NULL.")
    public class GenericUDAFCount implements GenericUDAFResolver2 {
        private static final Log LOG = LogFactory.getLog(GenericUDAFCount.class.getName());

        /**
         * Returns a count evaluator without inspecting the argument types.
         *
         * @param parameters argument types of the call (not inspected)
         * @return a new evaluator that counts rows whose arguments are all non-null
         */
        @Override
        public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException {
            return new GenericUDAFCountEvaluator();
        }

        /**
         * Validates the call and returns a count evaluator configured for it.
         *
         * @param paramInfo description of the call's parameters
         * @return a new evaluator; it counts every row when {@code paramInfo.isAllColumns()}
         * @throws SemanticException (as {@code UDFArgumentException}) when no argument is given
         *         without {@code *}, or several arguments are given without DISTINCT
         */
        @Override
        public GenericUDAFEvaluator getEvaluator(GenericUDAFParameterInfo paramInfo) throws SemanticException {
            TypeInfo[] parameters = paramInfo.getParameters();

            if (parameters.length == 0) {
                if (!paramInfo.isAllColumns()) {
                    throw new UDFArgumentException("Argument expected");
                }
                // Restored from the decompiler's "$assertionsDisabled" expansion.
                assert !paramInfo.isDistinct() : "DISTINCT not supported with *";
            } else {
                if (parameters.length > 1 && !paramInfo.isDistinct()) {
                    throw new UDFArgumentException("DISTINCT keyword must be specified");
                }
                assert !paramInfo.isAllColumns() : "* not supported in expression list";
            }

            return new GenericUDAFCountEvaluator().setCountAllColumns(paramInfo.isAllColumns());
        }

        /**
         * Evaluator that keeps a running row count in a {@link CountAgg} buffer.
         */
        public static class GenericUDAFCountEvaluator extends GenericUDAFEvaluator {
            /** When true every row is counted; otherwise only rows with all arguments non-null. */
            private boolean countAllColumns = false;
            /** Inspector used by {@link #merge} to read partial counts. */
            private LongObjectInspector partialCountAggOI;
            /** Reused output holder returned by {@link #terminate}. */
            private LongWritable result;

            /**
             * Stores the mode via the superclass and prepares the inspector and result holder.
             *
             * @return the writable-long inspector, used for partial and final output alike
             */
            @Override
            public ObjectInspector init(GenericUDAFEvaluator.Mode m, ObjectInspector[] parameters) throws HiveException {
                super.init(m, parameters);
                this.partialCountAggOI = PrimitiveObjectInspectorFactory.writableLongObjectInspector;

                this.result = new LongWritable(0L);
                return PrimitiveObjectInspectorFactory.writableLongObjectInspector;
            }

            /** Switches between count-every-row and count-non-null behaviour; returns {@code this}. */
            private GenericUDAFCountEvaluator setCountAllColumns(boolean countAllCols) {
                this.countAllColumns = countAllCols;
                return this;
            }

            /** Creates a buffer whose count starts at zero. */
            @Override
            public GenericUDAFEvaluator.AggregationBuffer getNewAggregationBuffer() throws HiveException {
                CountAgg buffer = new CountAgg();
                reset(buffer);
                return buffer;
            }

            /** Sets the buffer's count back to zero. */
            @Override
            public void reset(GenericUDAFEvaluator.AggregationBuffer agg) throws HiveException {
                ((CountAgg) agg).value = 0L;
            }

            /**
             * Counts one row. A {@code null} parameter array is ignored; otherwise the row
             * is counted unconditionally in count-all mode, or only when no argument is null.
             */
            @Override
            public void iterate(GenericUDAFEvaluator.AggregationBuffer agg, Object[] parameters) throws HiveException {
                if (parameters == null) {
                    return;
                }
                if (this.countAllColumns) {
                    assert (parameters.length == 0);
                    ((CountAgg) agg).value += 1L;
                } else {
                    assert (parameters.length > 0);
                    boolean countThisRow = true;
                    for (Object nextParam : parameters) {
                        if (nextParam == null) {
                            countThisRow = false;
                            break;
                        }
                    }
                    if (countThisRow) {
                        ((CountAgg) agg).value += 1L;
                    }
                }
            }

            /** Adds a non-null partial count to the buffer. */
            @Override
            public void merge(GenericUDAFEvaluator.AggregationBuffer agg, Object partial) throws HiveException {
                if (partial != null) {
                    long p = this.partialCountAggOI.get(partial);
                    ((CountAgg) agg).value += p;
                }
            }

            /** Copies the buffer's count into the reused result holder and returns it. */
            @Override
            public Object terminate(GenericUDAFEvaluator.AggregationBuffer agg) throws HiveException {
                this.result.set(((CountAgg) agg).value);
                return this.result;
            }

            /** The partial result has the same form as the final one. */
            @Override
            public Object terminatePartial(GenericUDAFEvaluator.AggregationBuffer agg) throws HiveException {
                return terminate(agg);
            }

            /** Aggregation buffer holding the running count. */
            @GenericUDAFEvaluator.AggregationType(estimable = true)
            static class CountAgg extends GenericUDAFEvaluator.AbstractAggregationBuffer {
                long value;

                /** Size estimate; presumably the 8 bytes of the {@code long} field. */
                @Override
                public int estimate() {
                    return 8;
                }
            }
        }
    }
    
    sum

    udaf 需要hive的sql和group by联合使用。hive的group by对于每个分组,只能返回一条记录。

    开发通用udaf有两个步骤,一个是编写resolver类,第二个是编写evaluator类。resolver负责类型检查,操作符重载。evaluator负责实现真正的udaf逻辑。

    以sum为例。

    resolver通常实现GenericUDAFResolver2接口,但是建议继承AbstractGenericUDAFResolver,以隔离将来hive接口的变化。

    public class GenericUDAFSum extends AbstractGenericUDAFResolver {
        static final Log LOG = LogFactory.getLog(GenericUDAFSum.class.getName());
    
        public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters)
        throws SemanticException
      {
        if (parameters.length != 1) {
          throw new UDFArgumentTypeException(parameters.length - 1, "Exactly one argument is expected.");
        }
    
        if (parameters[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
          throw new UDFArgumentTypeException(0, "Only primitive type arguments are accepted but " + parameters[0].getTypeName() + " is passed.");
        }
    
        switch (1.$SwitchMap$org$apache$hadoop$hive$serde2$objectinspector$PrimitiveObjectInspector$PrimitiveCategory[((org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo)parameters[0]).getPrimitiveCategory().ordinal()]) {
        case 1:
        case 2:
        case 3:
        case 4:
          return new GenericUDAFSumLong();
        case 5:
        case 6:
        case 7:
        case 8:
        case 9:
        case 10:
          return new GenericUDAFSumDouble();
        case 11:
          return new GenericUDAFSumHiveDecimal();
        case 12:
        case 13:
        }
        throw new UDFArgumentTypeException(0, "Only numeric or string type arguments are accepted but " + parameters[0].getTypeName() + " is passed.");
      }
    

    这就是udaf的代码骨架。创建一个log对象。 重写getEvaluator方法。根据sql传入的参数类型,返回正确的evaluator。主要实现操作符的重载。

    实现evaluator

    下面以genericudafsumlong为例。

    public static class GenericUDAFSumLong extends GenericUDAFEvaluator {
            private PrimitiveObjectInspector inputOI;
            private LongWritable result;
            private boolean warned;
    
            public GenericUDAFSumLong() {
                this.warned = false;
            }
            //这个方法返回可udaf的返回类型。这里定义返回类型为long
            public ObjectInspector init(GenericUDAFEvaluator.Mode m, ObjectInspector[] parameters) throws HiveException {
                assert (parameters.length == 1);
                super.init(m, parameters);
                this.result = new LongWritable(0L);
                this.inputOI = ((PrimitiveObjectInspector) parameters[0]);
                return PrimitiveObjectInspectorFactory.writableLongObjectInspector;
            }
    
            //创建新的聚合计算需要的内存,用来存储mapper,combiner,reducer运算过程中的相加总和。
            public GenericUDAFEvaluator.AggregationBuffer getNewAggregationBuffer() throws HiveException {
                SumLongAgg result = new SumLongAgg();
                reset(result);
                return result;
            }
    
            //mr支持mapper和reducer的重用,所以为了兼容,也要做内存的重用
            public void reset(GenericUDAFEvaluator.AggregationBuffer agg) throws HiveException {
                SumLongAgg myagg = (SumLongAgg) agg;
                myagg.empty = true;
                myagg.sum = 0L;
            }
            
            //map阶段,只要把保存道歉和的对象agg,再加上输入的参数,就可以了。
            public void iterate(GenericUDAFEvaluator.AggregationBuffer agg, Object[] parameters) throws HiveException {
                assert (parameters.length == 1);
                try {
                    merge(agg, parameters[0]);
                } catch (NumberFormatException e) {
                    if (!(this.warned)) {
                        this.warned = true;
                        GenericUDAFSum.LOG.warn(super.getClass().getSimpleName() + " " + StringUtils.stringifyException(e));
                    }
                }
            }
    
            //mapper结束要返回的结果和combiner结束要返回的结果。
            public Object terminatePartial(GenericUDAFEvaluator.AggregationBuffer agg) throws HiveException {
                return terminate(agg);
            }
            
            //combiner合并map返回的结果,还有reducer合并mapper或combiner返回的结果
            public void merge(GenericUDAFEvaluator.AggregationBuffer agg, Object partial) throws HiveException {
                if (partial != null) {
                    SumLongAgg myagg = (SumLongAgg) agg;
                    myagg.sum += PrimitiveObjectInspectorUtils.getLong(partial, this.inputOI);
                    myagg.empty = false;
                }
            }
    
            //reducer返回结果,或者是只有mapper,没有reducer,在mapper端返回结果。
            public Object terminate(GenericUDAFEvaluator.AggregationBuffer agg) throws HiveException {
                SumLongAgg myagg = (SumLongAgg) agg;
                if (myagg.empty) {
                    return null;
                }
                this.result.set(myagg.sum);
                return this.result;
            }
            
            //存储sum值得类
            @GenericUDAFEvaluator.AggregationType(estimable = true)
            static class SumLongAgg extends GenericUDAFEvaluator.AbstractAggregationBuffer {
                boolean empty;
                long sum;
    
                public int estimate() {
                    return 12;
                }
            }
        }
    

    相关文章

      网友评论

        本文标题:Hive-UDAF

        本文链接:https://www.haomeiwen.com/subject/bdeqattx.html