原始文件baidu.log內容(各欄位實際以Tab分隔,下面為方便閱讀以空格顯示):
hello world spark hadoop hive 223.104.18.110 v1.go2yd.com 17168 http://v1.go2yd.com/video/38ddjsic89s8je8sjxjcdie89.mp4_sd.mp4
world spark hello mysql sqoop 113.101.75.194 v2.go2yd.com 17222 http://v2.go2yd.com/video/5federste456yrtdstr5y4eygtdy5.mp4_bd.mp4
spark hello mysql hive world 27.17.127.135 v2.go2yd.com 1556 http://v2.go2yd.com/video/5federste456yrtdstr5y4eygtdy5.mp4_bd.mp4
......
......
需求:求每個域名下訪問次數最多的文件資源
一般情況下url格式為:http://domain/a/b/c/xxx.mp4?x=y&w=z.... 資源應該是:/a/b/c/xxx.mp4這一段,即域名之後的第一個/(包含該/)到第一個?之前(不包含?)的內容,因此要做一個截取
/**
 * Spark job that finds, for every domain in an access log, the resource
 * (URL path) that was requested most often.
 *
 * Each log line is split on TAB; field 6 is used as the domain and field 8 as
 * the full request URL (assumed layout, based on the sample log shown with
 * this program -- confirm against the real data).
 */
object test {

  /**
   * Entry point.
   *
   * @param args optional; `args(0)` overrides the input path. When absent the
   *             original hard-coded log location is used.
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("test").setMaster("local[2]")
    val sc = new SparkContext(sparkConf)
    // How many top resources to keep per domain; the requirement asks for the single most visited one.
    val topN = 1
    try {
      val inputPath = args.headOption.getOrElse("file:///E:/BigDataSoftware/data/baidu.log")
      val lines = sc.textFile(inputPath)

      val url = lines
        .map(_.split("\t"))
        // Skip blank/malformed lines instead of failing the whole job on tmp(8).
        .filter(_.length > 8)
        .map(tmp => ((tmp(6), getresources(tmp(8))), 1)) // see result1
        .reduceByKey(_ + _) // see result2

      // Gather all (resource, count) records of one domain together.
      val a = url.groupBy(_._1._1) // see result3

      // Order each domain's records by count, highest first, and keep only the top entry.
      // sortBy + reverse is kept (rather than a descending sort) so ties resolve as before.
      val b = a.mapValues(_.toList.sortBy(_._2).reverse.take(topN)) // see result4

      val c = b.flatMap(_._2) // see result5

      // Transformations are lazy: without an action nothing above is executed.
      c.collect().foreach(println)
    } finally {
      // Always release the SparkContext, even when the job fails.
      sc.stop()
    }
  }

  /**
   * Extracts the resource path from a URL: everything from the first "/" after
   * the host up to (but not including) the first "?" or "#".
   *
   * Examples:
   *   "http://v1.go2yd.com/video/a.mp4?x=y" yields "/video/a.mp4"
   *   "https://v1.go2yd.com/video/a.mp4"    yields "/video/a.mp4"
   *   "http://v1.go2yd.com"                 yields ""
   *
   * @param url the full request URL, with or without a scheme prefix
   * @return the path including its leading "/", or "" when the URL has no path
   */
  def getresources(url: String): String = {
    // Remove a leading scheme ("http://", "https://", ...) so the first "/" left belongs to the path.
    val withoutScheme = url.trim.replaceFirst("^[A-Za-z][A-Za-z0-9+.-]*://", "")
    // Cut the query string / fragment first, so a "/" inside them is never taken as the path start.
    val hostAndPath = withoutScheme.takeWhile(ch => ch != '?' && ch != '#')
    val pathStart = hostAndPath.indexOf('/')
    if (pathStart == -1) "" else hostAndPath.substring(pathStart)
  }
}
result1:
--------------------------------------------
((v1.go2yd.com,/video/38ddjsic89s8je8sjxjcdie89.mp4_sd.mp4),1)
((v2.go2yd.com,/video/5federste456yrtdstr5y4eygtdy5.mp4_bd.mp4),1)
((v2.go2yd.com,/video/5federste456yrtdstr5y4eygtdy5.mp4_bd.mp4),1)
((v3.go2yd.com,/video/u65eu56trhydxry56e.mp4_bd.mp4),1)
((v4.go2yd.com,/video/65e54e87okiuygguyo8y7to6t7ru6.mp4_bd.mp4),1)
((v4.go2yd.com,/user_upload/5r7564e5ghdrhdrfu654e.mp4_bd.mp4),1)
((v3.go2yd.com,/video/u64e545y4wy4ergrdjsu7567.mp4_bd.mp4),1)
((v1.go2yd.com,/video/54wt4regshy65r675785865dyhdxh.mp4_bd.mp4),1)
((v4.go2yd.com,/video/d4765476eytrdyhbsdy54e7657.mp4_bd.mp4),1)
((v1.go2yd.com,/video/y54ey54y5hdxshtr6u4w4y2tg2.mp4_bd.mp4),1)
((v1.go2yd.com,/user_upload/4346547u6ytsgrfgsersa23tr4egst4.mp4_bd.mp4),1)
((v2.go2yd.com,/video/4ste57r7d8udytdyyyyy43433.mp4_bd.mp4),1)
((v3.go2yd.com,/video/465esgdsju7i7uyvgfjyd5ytrdxg.mp4_bd.mp4),1)
((v3.go2yd.com,/user_upload/u64e545y4wy4ergrdjsu7567.mp4_bd.mp4),1)
((v1.go2yd.com,/video/654rythdju65787ikyukjfvkyi8.mp4_bd.mp4),1)
((v3.go2yd.com,/video/65764ydxse5y34est4343.mp4_bd.mp4),1)
((v3.go2yd.com,/video/4365u7tyfdjhudxyhs43t43t54765u6d.mp4_bd.mp4),1)
result2:
--------------------------------------------
((v4.go2yd.com,/user_upload/5r7564e5ghdrhdrfu654e.mp4_bd.mp4),1)
((v4.go2yd.com,/video/65e54e87okiuygguyo8y7to6t7ru6.mp4_bd.mp4),1)
((v1.go2yd.com,/video/654rythdju65787ikyukjfvkyi8.mp4_bd.mp4),1)
((v1.go2yd.com,/user_upload/4346547u6ytsgrfgsersa23tr4egst4.mp4_bd.mp4),1)
((v3.go2yd.com,/user_upload/u64e545y4wy4ergrdjsu7567.mp4_bd.mp4),1)
((v4.go2yd.com,/video/d4765476eytrdyhbsdy54e7657.mp4_bd.mp4),1)
((v3.go2yd.com,/video/u65eu56trhydxry56e.mp4_bd.mp4),1)
((v3.go2yd.com,/video/4365u7tyfdjhudxyhs43t43t54765u6d.mp4_bd.mp4),1)
((v2.go2yd.com,/video/4ste57r7d8udytdyyyyy43433.mp4_bd.mp4),1)
((v3.go2yd.com,/video/65764ydxse5y34est4343.mp4_bd.mp4),1)
((v1.go2yd.com,/video/54wt4regshy65r675785865dyhdxh.mp4_bd.mp4),1)
((v1.go2yd.com,/video/y54ey54y5hdxshtr6u4w4y2tg2.mp4_bd.mp4),1)
((v3.go2yd.com,/video/u64e545y4wy4ergrdjsu7567.mp4_bd.mp4),1)
((v3.go2yd.com,/video/465esgdsju7i7uyvgfjyd5ytrdxg.mp4_bd.mp4),1)
((v2.go2yd.com,/video/5federste456yrtdstr5y4eygtdy5.mp4_bd.mp4),2)
((v1.go2yd.com,/video/38ddjsic89s8je8sjxjcdie89.mp4_sd.mp4),1)
result3:
--------------------------------------------
(v2.go2yd.com,CompactBuffer(((v2.go2yd.com,/video/4ste57r7d8udytdyyyyy43433.mp4_bd.mp4),1), ((v2.go2yd.com,/video/5federste456yrtdstr5y4eygtdy5.mp4_bd.mp4),2)))
(v4.go2yd.com,CompactBuffer(((v4.go2yd.com,/user_upload/5r7564e5ghdrhdrfu654e.mp4_bd.mp4),1), ((v4.go2yd.com,/video/65e54e87okiuygguyo8y7to6t7ru6.mp4_bd.mp4),1), ((v4.go2yd.com,/video/d4765476eytrdyhbsdy54e7657.mp4_bd.mp4),1)))
(v3.go2yd.com,CompactBuffer(((v3.go2yd.com,/user_upload/u64e545y4wy4ergrdjsu7567.mp4_bd.mp4),1), ((v3.go2yd.com,/video/u65eu56trhydxry56e.mp4_bd.mp4),1), ((v3.go2yd.com,/video/4365u7tyfdjhudxyhs43t43t54765u6d.mp4_bd.mp4),1), ((v3.go2yd.com,/video/65764ydxse5y34est4343.mp4_bd.mp4),1), ((v3.go2yd.com,/video/u64e545y4wy4ergrdjsu7567.mp4_bd.mp4),1), ((v3.go2yd.com,/video/465esgdsju7i7uyvgfjyd5ytrdxg.mp4_bd.mp4),1)))
(v1.go2yd.com,CompactBuffer(((v1.go2yd.com,/video/654rythdju65787ikyukjfvkyi8.mp4_bd.mp4),1), ((v1.go2yd.com,/user_upload/4346547u6ytsgrfgsersa23tr4egst4.mp4_bd.mp4),1), ((v1.go2yd.com,/video/54wt4regshy65r675785865dyhdxh.mp4_bd.mp4),1), ((v1.go2yd.com,/video/y54ey54y5hdxshtr6u4w4y2tg2.mp4_bd.mp4),1), ((v1.go2yd.com,/video/38ddjsic89s8je8sjxjcdie89.mp4_sd.mp4),1)))
result4:
--------------------------------------------
(v2.go2yd.com,List(((v2.go2yd.com,/video/5federste456yrtdstr5y4eygtdy5.mp4_bd.mp4),2)))
(v4.go2yd.com,List(((v4.go2yd.com,/video/d4765476eytrdyhbsdy54e7657.mp4_bd.mp4),1)))
(v3.go2yd.com,List(((v3.go2yd.com,/video/465esgdsju7i7uyvgfjyd5ytrdxg.mp4_bd.mp4),1)))
(v1.go2yd.com,List(((v1.go2yd.com,/video/38ddjsic89s8je8sjxjcdie89.mp4_sd.mp4),1)))
result5:
--------------------------------------------
((v2.go2yd.com,/video/5federste456yrtdstr5y4eygtdy5.mp4_bd.mp4),2)
((v4.go2yd.com,/video/d4765476eytrdyhbsdy54e7657.mp4_bd.mp4),1)
((v3.go2yd.com,/video/465esgdsju7i7uyvgfjyd5ytrdxg.mp4_bd.mp4),1)
((v1.go2yd.com,/video/38ddjsic89s8je8sjxjcdie89.mp4_sd.mp4),1)