This article shows how to implement custom statistics in Spark 2 with a user-defined accumulator.
Since Spark 2.x, the old Accumulator API is deprecated and has been replaced by AccumulatorV2.
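For simple counters the built-in implementations are enough; a minimal sketch of the new API, assuming a local SparkContext (the object and accumulator names here are illustrative):

import org.apache.spark.{SparkConf, SparkContext}

object BuiltInAccumulatorExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("builtin-acc").setMaster("local"))
    // Spark 2 exposes ready-made AccumulatorV2 instances on SparkContext
    val counter = sc.longAccumulator("line-counter")
    sc.parallelize(1 to 100).foreach(_ => counter.add(1L))
    println(counter.value) // 100
    sc.stop()
  }
}

A custom subclass is only needed when the accumulated value is not a simple number, as in the string-keyed counter below.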
1. Custom Accumulator
import org.apache.spark.util.AccumulatorV2

// Accumulates counts per key into a concatenated string such as "a=10|b=20"
class StrAccumulator extends AccumulatorV2[String, String] {

  // Internal state, e.g. "a=10|b=20"
  private var v = ""

  override def isZero: Boolean = v == ""

  override def copy(): AccumulatorV2[String, String] = {
    val newAcc = new StrAccumulator
    newAcc.v = this.v
    newAcc
  }

  override def reset(): Unit = v = ""

  // Increment the count of the given field by 1, appending it if absent
  override def add(v: String): Unit = {
    if (v == null || v == "") {
      return
    }
    val oldValue = getFieldFromConcatString(this.v, "\\|", v)
    if (oldValue != null) {
      val newValue = (oldValue.toInt + 1).toString
      this.v = setFieldInConcatString(this.v, "\\|", v, newValue)
    } else {
      if (isZero) {
        this.v = v + "=" + 1
      } else {
        this.v = this.v + "|" + v + "=" + 1
      }
    }
  }

  // Merge field by field so that counts for the same key are summed;
  // a plain string concatenation would corrupt the "k=v|k=v" format
  override def merge(other: AccumulatorV2[String, String]): Unit = other match {
    case o: StrAccumulator =>
      for (field <- o.v.split("\\|") if field.split("=").length == 2) {
        val name = field.split("=")(0)
        val count = field.split("=")(1)
        val oldValue = getFieldFromConcatString(this.v, "\\|", name)
        if (oldValue != null) {
          this.v = setFieldInConcatString(this.v, "\\|", name, (oldValue.toInt + count.toInt).toString)
        } else if (isZero) {
          this.v = field
        } else {
          this.v = this.v + "|" + field
        }
      }
    case _ => throw new UnsupportedOperationException(
      s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
  }

  override def value: String = v

  // Return the value of `field` in a string like "a=10|b=20", or null if absent
  def getFieldFromConcatString(str: String, delimiter: String, field: String): String = {
    val fields = str.split(delimiter)
    for (concatField <- fields) {
      if (concatField.split("=").length == 2) {
        val fieldName = concatField.split("=")(0)
        val fieldValue = concatField.split("=")(1)
        if (fieldName == field)
          return fieldValue
      }
    }
    null
  }

  // Replace the value of `field` in a string like "a=10|b=20" with `newValue`
  def setFieldInConcatString(str: String, delimiter: String, field: String, newValue: String): String = {
    val fields = str.split(delimiter)
    var break = false
    for (i <- 0 until fields.length if !break) {
      if (fields(i).split("=")(0) == field) {
        fields(i) = field + "=" + newValue
        break = true
      }
    }
    fields.mkString("|")
  }
}
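Before wiring the accumulator into a job, it can be exercised directly on the driver. A quick sanity check (not from the original post, shown only to illustrate the string format):

val acc = new StrAccumulator
acc.add("session-1")
acc.add("session-1")
acc.add("session-2")
println(acc.value) // session-1=2|session-2=1

// merge combines two partially aggregated accumulators, as Spark does across tasks
val other = new StrAccumulator
other.add("session-1")
acc.merge(other)
println(acc.value) // session-1=3|session-2=1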
2. Usage
Requirement: while counting the total number of sessions, also compute each session's step length (the number of pages visited in that session).
Test data
session-1 1
session-1 2
session-1 3
session-2 1
session-2 2
Test code
import org.apache.spark.{SparkConf, SparkContext}

object AccumulatorTest {
  def main(args: Array[String]): Unit = {
    // Create the Spark configuration
    val conf = new SparkConf()
      .setAppName("AccumulatorTest")
      .setMaster("local")
    // Create the SparkContext, the core entry point
    val sc = new SparkContext(conf)

    // Register the custom accumulator with the SparkContext
    val strAccumulator = new StrAccumulator
    sc.register(strAccumulator)

    // Count pages per session; the accumulator tracks the step length as a side effect
    sc.textFile("D:\\workspaces\\idea\\hadoop\\spark\\data\\session.txt")
      .map(line => {
        val lines = line.split("\t")
        val sessionId = lines(0)
        val pageId = lines(1) // unused here, but available for further statistics
        // Record one step for this session in the accumulator
        strAccumulator.add(sessionId)
        (sessionId, 1L)
      })
      .reduceByKey(_ + _)
      .sortBy(_._2, false)
      .foreach(println)

    println(strAccumulator.value)

    // Stop the SparkContext
    sc.stop()
  }
}
Output
(session-1,3)
(session-2,2)
session-1=3|session-2=2
This way, while counting the number of sessions we also compute each session's step length, and the same pattern extends to other per-session attributes: session duration, duration-range distributions, step-length-range distributions, and so on. One caveat: Spark guarantees exactly-once accumulator updates only inside actions; updates made in a transformation such as map may be applied more than once if a task or stage is re-executed, so counts collected there should be treated as approximate.
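As a design note: the concatenated-string encoding keeps the example self-contained, but parsing and rebuilding the string on every update is fragile. If you extend this approach, a Map-backed accumulator is a more natural fit. A hedged sketch (the MapAccumulator class is my own naming, not part of the original code):

import org.apache.spark.util.AccumulatorV2
import scala.collection.mutable

// Counts occurrences per key in a mutable Map instead of a concatenated string
class MapAccumulator extends AccumulatorV2[String, Map[String, Long]] {
  private val counts = mutable.Map.empty[String, Long]

  override def isZero: Boolean = counts.isEmpty

  override def copy(): MapAccumulator = {
    val newAcc = new MapAccumulator
    newAcc.counts ++= counts
    newAcc
  }

  override def reset(): Unit = counts.clear()

  // Increment the count for one key
  override def add(key: String): Unit =
    counts(key) = counts.getOrElse(key, 0L) + 1L

  // Sum per-key counts from another partially aggregated accumulator
  override def merge(other: AccumulatorV2[String, Map[String, Long]]): Unit =
    other.value.foreach { case (k, n) => counts(k) = counts.getOrElse(k, 0L) + n }

  override def value: Map[String, Long] = counts.toMap
}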