ProjectCount3.scala
package day08
import java.net.URL
import org.apache.spark.rdd.RDD
import org.apache.spark.{HashPartitioner, Partitioner, SparkConf, SparkContext}
import scala.collection.mutable
/**
 * Caching mechanism.
 * Defines a custom partitioner so that each subject's data
 * ends up in its own partition.
 */
object ProjectCount3 {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("ProjectCount3").setMaster("local[2]")
    val sc: SparkContext = new SparkContext(conf)
    // Load the data
    val file: RDD[String] = sc.textFile("D:/teachingprogram/Spark學習視頻/day08/access.txt")
    // Extract the URL from each line and pair it with a count of 1
    val urlAndOne: RDD[(String, Int)] = file.map(line => {
      val fields = line.split("\t")
      val url = fields(1)
      (url, 1)
    })
    // Aggregate the counts of identical URLs
    val sumedUrl: RDD[(String, Int)] = urlAndOne.reduceByKey(_ + _)
    // Extract the subject (the host part of the URL) for each entry;
    // cache() because this RDD is reused twice below (distinct keys, then partitionBy)
    val cachedProject: RDD[(String, (String, Int))] = sumedUrl.map(x => {
      val url = x._1
      val project = new URL(url).getHost
      val count = x._2
      (project, (url, count))
    }).cache()
    // Spark's built-in HashPartitioner may hash different subjects into the
    // same partition (hash collisions), so a custom partitioner is needed
    // val res: RDD[(String, (String, Int))] = cachedProject.partitionBy(new HashPartitioner(3))
    // res.saveAsTextFile("d://out")
    // Collect all distinct subjects
    val projects: Array[String] = cachedProject.keys.distinct().collect()
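    // Note: collect() pulls all distinct subjects to the driver, which
    // assumes the number of subjects is small enough to fit in driver memory.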
    // Build the custom partitioner from the subject list
    val partitioner: ProjectPartitioner = new ProjectPartitioner(projects)
    // Repartition so that each subject's records land in their own partition
    val partitioned: RDD[(String, (String, Int))] = cachedProject.partitionBy(partitioner)
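    // Note: partitionBy triggers a shuffle; afterwards all records of a given
    // subject sit in one partition, so the per-partition top 3 below is in
    // fact a per-subject top 3.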
    // Sort each partition's data by count and take the top 3
    val res: RDD[(String, (String, Int))] = partitioned.mapPartitions(it => {
      it.toList.sortBy(_._2._2).reverse.take(3).iterator
    })
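    // Note: it.toList materializes a whole partition in memory; this is
    // assumed acceptable here because each partition holds only a single
    // subject's aggregated URL counts.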
    res.saveAsTextFile("d://out")
    sc.stop()
  }
}
class ProjectPartitioner(projects: Array[String]) extends Partitioner {
  // Maps each subject to its partition number
  private val projectsAndPartNum = new mutable.HashMap[String, Int]
  // Counter used to hand out partition numbers
  var n = 0
  for (pro <- projects) {
    projectsAndPartNum += (pro -> n)
    n += 1
  }
  // Number of partitions: one per subject
  override def numPartitions: Int = projects.length
  // Return the partition number for a key; unknown keys fall back to partition 0
  override def getPartition(key: Any): Int = {
    projectsAndPartNum.getOrElse(key.toString, 0)
  }
}
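// A minimal, hypothetical sketch showing how ProjectPartitioner assigns
// partition numbers; the host names are illustrative, not taken from the
// real access.txt data.
object ProjectPartitionerDemo {
  def main(args: Array[String]): Unit = {
    val subjects = Array("java.example.com", "php.example.com", "net.example.com")
    val partitioner = new ProjectPartitioner(subjects)
    println(partitioner.numPartitions)                   // 3
    println(partitioner.getPartition("php.example.com")) // 1
    println(partitioner.getPartition("unknown.host"))    // 0 (fallback partition)
  }
}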