UDAF
前兩節分別介紹了基礎UDF和UDTF,這一節我們將介紹最複雜的用戶自定義聚合函數(UDAF)。用戶自定義聚合函數(UDAF)接受從零行到多行的零個到多個列,然後返回單一值,如sum()、count()。要實現UDAF,我們需要實現下面的類:
org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver
org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator
AbstractGenericUDAFResolver檢查輸入參數,並且指定使用哪個evaluator。在AbstractGenericUDAFResolver裡,只需要實現一個方法:
public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException;
但是,主要的邏輯處理還是在Evaluator中。我們需要繼承GenericUDAFEvaluator,并且實現下面幾個方法:
// The inputs and the output are all described by ObjectInspectors
public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException;
// The AggregationBuffer holds the intermediate result of the data processing
abstract AggregationBuffer getNewAggregationBuffer() throws HiveException;
// Resets the AggregationBuffer
public void reset(AggregationBuffer agg) throws HiveException;
// Processes an input record
public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException;
// Produces a partial result (one portion of the overall output data)
public Object terminatePartial(AggregationBuffer agg) throws HiveException;
// Combines two partial results (the buffer and the given partial)
public void merge(AggregationBuffer agg, Object partial) throws HiveException;
// Outputs the final result
public Object terminate(AggregationBuffer agg) throws HiveException;
在處理之前,先看下UDAF的枚舉類型GenericUDAFEvaluator.Mode。Mode有4種情況:
- PARTIAL1:Mapper階段。從原始數據到部分聚合,會調用iterate()和terminatePartial()。
- PARTIAL2:Combiner階段,在Mapper端合并Mapper的結果數據。從部分聚合到部分聚合,會調用merge()和terminatePartial()。
- FINAL:Reducer階段。從部分聚合數據到完全聚合,會調用merge()和terminate()。
- COMPLETE:出現這個階段,表示MapReduce中只有Mapper沒有Reducer,所以Mapper端直接輸出結果。從原始數據到完全聚合,會調用iterate()和terminate()。
GenericUDAFResolver2
/**
 * Resolver contract that selects an evaluator from the argument types alone.
 *
 * <p>The interface is marked {@code @Deprecated}; {@code GenericUDAFResolver2}
 * extends it with an overload that receives a {@code GenericUDAFParameterInfo}.
 */
@Deprecated
public interface GenericUDAFResolver {

    /**
     * Returns the evaluator to use for the given argument types.
     *
     * @param parameters type information of the arguments passed to the function
     * @return the evaluator that performs the aggregation
     * @throws SemanticException declared for implementations that reject the arguments
     */
    GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException;
}
上面的GenericUDAFResolver接口已廢棄(標註了@Deprecated);下面的GenericUDAFResolver2在它的基礎上擴展:
/**
 * Extension of {@code GenericUDAFResolver} whose resolution method receives a
 * {@code GenericUDAFParameterInfo} instead of a bare {@code TypeInfo[]}.
 */
public interface GenericUDAFResolver2 extends GenericUDAFResolver {

    /**
     * Returns the evaluator to use for the described invocation.
     *
     * @param parameterInfo description of the arguments the function was invoked with
     * @return the evaluator that performs the aggregation
     * @throws SemanticException declared for implementations that reject the arguments
     */
    GenericUDAFEvaluator getEvaluator(GenericUDAFParameterInfo parameterInfo) throws SemanticException;
}
GenericUDAFEvaluator
/**
 * Base class for the object that carries out the actual aggregation of a UDAF.
 *
 * <p>Subclasses supply the buffer handling ({@link #getNewAggregationBuffer},
 * {@link #reset}) and the four aggregation callbacks ({@link #iterate},
 * {@link #terminatePartial}, {@link #merge}, {@link #terminate}). The
 * {@link #aggregate} and {@link #evaluate} entry points pick the callback that
 * matches the {@link Mode} recorded by {@link #init}.
 */
@UDFType(deterministic = true)
public abstract class GenericUDAFEvaluator implements Closeable {

    /** Mode stored by {@link #init}; drives the dispatch in aggregate/evaluate. */
    Mode mode;

    /**
     * Tells whether a buffer declares itself as size-estimable.
     *
     * @param buffer the buffer to inspect
     * @return {@code true} only when the buffer is an {@link AbstractAggregationBuffer}
     *         whose class carries {@code @AggregationType(estimable = true)}
     */
    public static boolean isEstimable(AggregationBuffer buffer) {
        if (buffer instanceof AbstractAggregationBuffer) {
            // Typed Class token instead of a raw Class, so no cast is needed below.
            Class<? extends AggregationBuffer> clazz = buffer.getClass();
            AggregationType annotation = clazz.getAnnotation(AggregationType.class);
            return annotation != null && annotation.estimable();
        }
        return false;
    }

    /**
     * Configuration hook; the base implementation does nothing.
     *
     * @param mapredContext context handed to the evaluator
     */
    public void configure(MapredContext mapredContext) {
    }

    /**
     * Records the mode the evaluator runs in.
     *
     * @param m          the mode to run in
     * @param parameters inspectors for the inputs (unused by the base implementation)
     * @return {@code null} in the base implementation
     * @throws HiveException declared for overriding implementations
     */
    public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
        this.mode = m;
        return null;
    }

    /** Creates a buffer for the intermediate aggregation state. */
    public abstract AggregationBuffer getNewAggregationBuffer() throws HiveException;

    /** Resets the given buffer. */
    public abstract void reset(AggregationBuffer agg) throws HiveException;

    /** Release hook; the base implementation does nothing. */
    @Override
    public void close() throws IOException {
    }

    /**
     * Feeds one row into the buffer, choosing the callback by mode:
     * {@code PARTIAL1} and {@code COMPLETE} call {@link #iterate}; the other
     * modes call {@link #merge} with the first element of {@code parameters}.
     *
     * @param agg        buffer to update
     * @param parameters the row's values
     * @throws HiveException if the selected callback fails
     */
    public void aggregate(AggregationBuffer agg, Object[] parameters) throws HiveException {
        if ((this.mode == Mode.PARTIAL1) || (this.mode == Mode.COMPLETE)) {
            iterate(agg, parameters);
        } else {
            // In the merging modes only parameters[0] is used.
            assert (parameters.length == 1);
            merge(agg, parameters[0]);
        }
    }

    /**
     * Produces the output for the buffer, choosing the callback by mode:
     * {@code PARTIAL1} and {@code PARTIAL2} call {@link #terminatePartial};
     * the other modes call {@link #terminate}.
     *
     * @param agg buffer to read
     * @return whatever the selected callback returns
     * @throws HiveException if the selected callback fails
     */
    public Object evaluate(AggregationBuffer agg) throws HiveException {
        if ((this.mode == Mode.PARTIAL1) || (this.mode == Mode.PARTIAL2)) {
            return terminatePartial(agg);
        }
        return terminate(agg);
    }

    /** Callback used by {@link #aggregate} in {@code PARTIAL1} and {@code COMPLETE} mode. */
    public abstract void iterate(AggregationBuffer agg, Object[] parameters)
            throws HiveException;

    /** Callback used by {@link #evaluate} in {@code PARTIAL1} and {@code PARTIAL2} mode. */
    public abstract Object terminatePartial(AggregationBuffer agg) throws HiveException;

    /** Callback used by {@link #aggregate} in {@code PARTIAL2} and {@code FINAL} mode. */
    public abstract void merge(AggregationBuffer agg, Object partial) throws HiveException;

    /** Callback used by {@link #evaluate} in {@code FINAL} and {@code COMPLETE} mode. */
    public abstract Object terminate(AggregationBuffer agg) throws HiveException;

    /** Buffer base class that adds a size estimate. */
    public static abstract class AbstractAggregationBuffer implements GenericUDAFEvaluator.AggregationBuffer {
        /**
         * @return the estimate for this buffer; {@code -1} in the base implementation
         */
        public int estimate() {
            return -1;
        }
    }

    /** Marker interface for intermediate aggregation state. */
    public interface AggregationBuffer {
    }

    /** The modes an evaluator can be initialised with. */
    public enum Mode {
        PARTIAL1, PARTIAL2, FINAL, COMPLETE;
    }

    /**
     * Annotation read reflectively by {@link #isEstimable}.
     *
     * <p>Runtime retention is required: without it the annotation defaults to
     * CLASS retention, {@code Class.getAnnotation} never sees it, and
     * {@code isEstimable} would always return {@code false}. Fully qualified
     * names are used so no additional import is needed.
     */
    @java.lang.annotation.Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
    public static @interface AggregationType {
        boolean estimable();
    }
}
例子
count
/*** Eclipse Class Decompiler plugin, copyright (c) 2016 Chen Chao (cnfree2000@hotmail.com) ***/
package org.apache.hadoop.hive.ql.udf.generic;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.io.LongWritable;
/**
 * Resolver for the {@code count} aggregate. Validates the invocation and hands
 * out a {@link GenericUDAFCountEvaluator}.
 */
@Description(name = "count", value = "_FUNC_(*) - Returns the total number of retrieved rows, including rows containing NULL values.\n_FUNC_(expr) - Returns the number of rows for which the supplied expression is non-NULL.\n_FUNC_(DISTINCT expr[, expr...]) - Returns the number of rows for which the supplied expression(s) are unique and non-NULL.")
public class GenericUDAFCount implements GenericUDAFResolver2 {

    // Initialised inline rather than in a separate static block.
    private static final Log LOG = LogFactory.getLog(GenericUDAFCount.class.getName());

    /**
     * Type-only resolution: performs no validation and returns a fresh evaluator.
     *
     * @param parameters argument types (not inspected)
     * @return a new evaluator with {@code countAllColumns} left at {@code false}
     * @throws SemanticException never thrown by this implementation
     */
    public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException {
        return new GenericUDAFCountEvaluator();
    }

    /**
     * Validates the invocation and returns a configured evaluator.
     *
     * <ul>
     *   <li>No arguments: only accepted when {@code isAllColumns()} is true.</li>
     *   <li>More than one argument: only accepted when {@code isDistinct()} is true.</li>
     * </ul>
     *
     * @param paramInfo description of the invocation
     * @return an evaluator whose {@code countAllColumns} flag mirrors {@code paramInfo.isAllColumns()}
     * @throws SemanticException ({@code UDFArgumentException}) when the rules above are violated
     */
    public GenericUDAFEvaluator getEvaluator(GenericUDAFParameterInfo paramInfo) throws SemanticException {
        TypeInfo[] parameters = paramInfo.getParameters();
        if (parameters.length == 0) {
            if (!paramInfo.isAllColumns()) {
                throw new UDFArgumentException("Argument expected");
            }
            // Written as a plain assert; the decompiled "$assertionsDisabled" check is
            // the compiler's expansion of this statement and is not legal source.
            assert !paramInfo.isDistinct() : "DISTINCT not supported with *";
        } else {
            if ((parameters.length > 1) && !paramInfo.isDistinct()) {
                throw new UDFArgumentException("DISTINCT keyword must be specified");
            }
            assert !paramInfo.isAllColumns() : "* not supported in expression list";
        }
        return new GenericUDAFCountEvaluator().setCountAllColumns(paramInfo.isAllColumns());
    }

    /**
     * Evaluator that keeps a running row count in a {@link CountAgg} buffer.
     */
    public static class GenericUDAFCountEvaluator extends GenericUDAFEvaluator {
        /** When true, {@link #iterate} counts every row without looking at values. */
        private boolean countAllColumns;
        /** Inspector used by {@link #merge} to read a partial count. */
        private LongObjectInspector partialCountAggOI;
        /** Reused output holder returned by {@link #terminate}. */
        private LongWritable result;

        public GenericUDAFCountEvaluator() {
            this.countAllColumns = false;
        }

        /**
         * Records the mode and prepares the inspector and output holder.
         *
         * @return the writable-long inspector, for every mode
         */
        public ObjectInspector init(GenericUDAFEvaluator.Mode m, ObjectInspector[] parameters) throws HiveException {
            super.init(m, parameters);
            this.partialCountAggOI = PrimitiveObjectInspectorFactory.writableLongObjectInspector;
            this.result = new LongWritable(0L);
            return PrimitiveObjectInspectorFactory.writableLongObjectInspector;
        }

        /** Sets the count-all flag and returns {@code this} for chaining. */
        private GenericUDAFCountEvaluator setCountAllColumns(boolean countAllCols) {
            this.countAllColumns = countAllCols;
            return this;
        }

        /** Creates a {@link CountAgg} and resets it to zero. */
        public GenericUDAFEvaluator.AggregationBuffer getNewAggregationBuffer() throws HiveException {
            CountAgg buffer = new CountAgg();
            reset(buffer);
            return buffer;
        }

        /** Sets the buffer's count back to zero. */
        public void reset(GenericUDAFEvaluator.AggregationBuffer agg) throws HiveException {
            ((CountAgg) agg).value = 0L;
        }

        /**
         * Counts one input row. A {@code null} parameter array is ignored. In
         * count-all mode the row is always counted; otherwise it is counted
         * only when none of the parameters is {@code null}.
         */
        public void iterate(GenericUDAFEvaluator.AggregationBuffer agg, Object[] parameters) throws HiveException {
            if (parameters == null) {
                return;
            }
            if (this.countAllColumns) {
                assert (parameters.length == 0);
                ((CountAgg) agg).value += 1L;
            } else {
                assert (parameters.length > 0);
                boolean countThisRow = true;
                for (Object nextParam : parameters) {
                    if (nextParam == null) {
                        countThisRow = false;
                        break;
                    }
                }
                if (countThisRow) {
                    ((CountAgg) agg).value += 1L;
                }
            }
        }

        /** Adds a non-null partial count to the buffer. */
        public void merge(GenericUDAFEvaluator.AggregationBuffer agg, Object partial) throws HiveException {
            if (partial != null) {
                long p = this.partialCountAggOI.get(partial);
                ((CountAgg) agg).value += p;
            }
        }

        /** Copies the buffer's count into the reused holder and returns it. */
        public Object terminate(GenericUDAFEvaluator.AggregationBuffer agg) throws HiveException {
            this.result.set(((CountAgg) agg).value);
            return this.result;
        }

        /** The partial result has the same form as the final one. */
        public Object terminatePartial(GenericUDAFEvaluator.AggregationBuffer agg) throws HiveException {
            return terminate(agg);
        }

        /** Buffer holding the running count. */
        @GenericUDAFEvaluator.AggregationType(estimable = true)
        static class CountAgg extends GenericUDAFEvaluator.AbstractAggregationBuffer {
            long value;

            // NOTE(review): 8 presumably reflects the single long field - confirm.
            public int estimate() {
                return 8;
            }
        }
    }
}
sum
udaf 需要hive的sql和group by聯合使用。hive的group by對于每個分組,只能返回一條記錄。
開發通用udaf有兩個步驟:第一個是編寫resolver類,第二個是編寫evaluator類。resolver負責類型檢查和操作符重載,evaluator負責實現真正的udaf邏輯。
以sum為例。
resolver通常實現GenericUDAFResolver2接口,但是建議繼承AbstractGenericUDAFResolver,以隔離將來hive接口的變化。
public class GenericUDAFSum extends AbstractGenericUDAFResolver {
static final Log LOG = LogFactory.getLog(GenericUDAFSum.class.getName());
public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters)
throws SemanticException
{
if (parameters.length != 1) {
throw new UDFArgumentTypeException(parameters.length - 1, "Exactly one argument is expected.");
}
if (parameters[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
throw new UDFArgumentTypeException(0, "Only primitive type arguments are accepted but " + parameters[0].getTypeName() + " is passed.");
}
switch (1.$SwitchMap$org$apache$hadoop$hive$serde2$objectinspector$PrimitiveObjectInspector$PrimitiveCategory[((org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo)parameters[0]).getPrimitiveCategory().ordinal()]) {
case 1:
case 2:
case 3:
case 4:
return new GenericUDAFSumLong();
case 5:
case 6:
case 7:
case 8:
case 9:
case 10:
return new GenericUDAFSumDouble();
case 11:
return new GenericUDAFSumHiveDecimal();
case 12:
case 13:
}
throw new UDFArgumentTypeException(0, "Only numeric or string type arguments are accepted but " + parameters[0].getTypeName() + " is passed.");
}
這就是udaf的代碼骨架:創建一個log對象,重寫getEvaluator方法,根據sql傳入的參數類型返回正確的evaluator。主要實現操作符的重載。
實現evaluator
下面以genericudafsumlong為例。
/**
 * Evaluator that accumulates its input into a {@code long} total held in a
 * {@link SumLongAgg} buffer.
 */
public static class GenericUDAFSumLong extends GenericUDAFEvaluator {
    /** Inspector for the single input, captured in {@link #init}. */
    private PrimitiveObjectInspector inputOI;
    /** Reused output holder returned by {@link #terminate}. */
    private LongWritable result;
    /** Set once the first NumberFormatException has been logged. */
    private boolean warned;

    public GenericUDAFSumLong() {
        this.warned = false;
    }

    /**
     * Records the mode, allocates the output holder and keeps the inspector of
     * the first parameter.
     *
     * @return the writable-long inspector, i.e. the result type is long
     */
    public ObjectInspector init(GenericUDAFEvaluator.Mode m, ObjectInspector[] parameters) throws HiveException {
        assert (parameters.length == 1);
        super.init(m, parameters);
        this.result = new LongWritable(0L);
        this.inputOI = (PrimitiveObjectInspector) parameters[0];
        return PrimitiveObjectInspectorFactory.writableLongObjectInspector;
    }

    /** Creates a {@link SumLongAgg} buffer and puts it into its reset state. */
    public GenericUDAFEvaluator.AggregationBuffer getNewAggregationBuffer() throws HiveException {
        SumLongAgg fresh = new SumLongAgg();
        reset(fresh);
        return fresh;
    }

    /** Returns the buffer to "empty, total zero" so it can be used again. */
    public void reset(GenericUDAFEvaluator.AggregationBuffer agg) throws HiveException {
        SumLongAgg acc = (SumLongAgg) agg;
        acc.sum = 0L;
        acc.empty = true;
    }

    /**
     * Adds one input value to the buffer by delegating to {@link #merge}.
     * A NumberFormatException is swallowed; only the first one is logged.
     */
    public void iterate(GenericUDAFEvaluator.AggregationBuffer agg, Object[] parameters) throws HiveException {
        assert (parameters.length == 1);
        try {
            merge(agg, parameters[0]);
        } catch (NumberFormatException e) {
            if (this.warned) {
                return;
            }
            this.warned = true;
            GenericUDAFSum.LOG.warn(getClass().getSimpleName() + " " + StringUtils.stringifyException(e));
        }
    }

    /** The partial result has the same form as the final one. */
    public Object terminatePartial(GenericUDAFEvaluator.AggregationBuffer agg) throws HiveException {
        return terminate(agg);
    }

    /**
     * Adds a value (raw input or partial sum) to the buffer's total and marks
     * the buffer as non-empty. A {@code null} value is ignored.
     */
    public void merge(GenericUDAFEvaluator.AggregationBuffer agg, Object partial) throws HiveException {
        if (partial == null) {
            return;
        }
        SumLongAgg acc = (SumLongAgg) agg;
        acc.sum += PrimitiveObjectInspectorUtils.getLong(partial, this.inputOI);
        acc.empty = false;
    }

    /**
     * Returns the total, or {@code null} when nothing was added to the buffer.
     */
    public Object terminate(GenericUDAFEvaluator.AggregationBuffer agg) throws HiveException {
        SumLongAgg acc = (SumLongAgg) agg;
        if (acc.empty) {
            return null;
        }
        this.result.set(acc.sum);
        return this.result;
    }

    /** Buffer that stores the running sum and whether anything was added. */
    @GenericUDAFEvaluator.AggregationType(estimable = true)
    static class SumLongAgg extends GenericUDAFEvaluator.AbstractAggregationBuffer {
        boolean empty;
        long sum;

        // NOTE(review): 12 is presumably a rough size for the boolean + long fields - confirm.
        public int estimate() {
            return 12;
        }
    }
}