原始文件baidu.log内容(各字段之间以制表符\t分隔):
hello world spark hadoop hive 223.104.18.110 v1.go2yd.com 17168 http://v1.go2yd.com/video/38ddjsic89s8je8sjxjcdie89.mp4_sd.mp4
world spark hello mysql sqoop 113.101.75.194 v2.go2yd.com 17222 http://v2.go2yd.com/video/5federste456yrtdstr5y4eygtdy5.mp4_bd.mp4
spark hello mysql hive world 27.17.127.135 v2.go2yd.com 1556 http://v2.go2yd.com/video/5federste456yrtdstr5y4eygtdy5.mp4_bd.mp4
......
......
需求:求每个域名下访问次数最多的文件资源
一般情况下url格式为:http://domain/a/b/c/xxx.mp4?x=y&w=z.... 资源应该是:/a/b/c/xxx.mp4这一段,即从域名之后的第一个/(包含该/)开始,到第一个?(不包含该?)之前的内容,因此要做一个截取
/**
 * Spark job that reports, for every domain found in an access log, the
 * resource path that was requested most often.
 */
object test {

  /**
   * Reads the log file, counts requests per (domain, resource) pair, keeps the
   * most requested resource of each domain and prints the result to stdout.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("test").setMaster("local[2]")
    val sc = new SparkContext(sparkConf)
    try {
      val lines = sc.textFile("file:///E:/BigDataSoftware/data/baidu.log")
      val url = lines
        .map(_.split("\t"))
        // Fields 6 (domain) and 8 (url) are read below; rows with fewer than
        // 9 tab-separated fields are skipped instead of aborting the job.
        .filter(_.length > 8)
        .map(tmp => ((tmp(6), getresources(tmp(8))), 1)) // see result1
        .reduceByKey(_ + _) // see result2
      val a = url.groupBy(_._1._1) // see result3
      // Sort each domain's entries by count, highest first, and keep only the
      // top one so that a single resource per domain remains.
      val b = a.mapValues(_.toList.sortBy(_._2).reverse.take(1)) // see result4
      val c = b.flatMap(_._2) // see result5
      // Transformations are lazy: without an action nothing is computed.
      c.collect().foreach(println)
    } finally {
      // Stop the SparkContext even when the job fails.
      sc.stop()
    }
  }

  /**
   * Extracts the resource path from a url.
   *
   * The first "//" is removed, then everything from the first remaining "/"
   * (inclusive) up to the first "?" (exclusive) is returned.
   *
   * @param url the url to inspect, e.g. "http://domain/a/b/c/xxx.mp4?x=y"
   * @return the resource path, e.g. "/a/b/c/xxx.mp4"; the empty string when
   *         no "/" is left after removing the first "//"
   */
  def getresources(url: String): String = {
    // Drop the first "//" so the next "/" found is the start of the path.
    val pathTemp = url.replaceFirst("//", "")
    val pathStart = pathTemp.indexOf("/")
    if (pathStart == -1) {
      ""
    } else {
      val path = pathTemp.substring(pathStart)
      // Cut off the query string, if there is one.
      val queryStart = path.indexOf("?")
      if (queryStart == -1) path else path.substring(0, queryStart)
    }
  }
}
result1:
--------------------------------------------
((v1.go2yd.com,/video/38ddjsic89s8je8sjxjcdie89.mp4_sd.mp4),1)
((v2.go2yd.com,/video/5federste456yrtdstr5y4eygtdy5.mp4_bd.mp4),1)
((v2.go2yd.com,/video/5federste456yrtdstr5y4eygtdy5.mp4_bd.mp4),1)
((v3.go2yd.com,/video/u65eu56trhydxry56e.mp4_bd.mp4),1)
((v4.go2yd.com,/video/65e54e87okiuygguyo8y7to6t7ru6.mp4_bd.mp4),1)
((v4.go2yd.com,/user_upload/5r7564e5ghdrhdrfu654e.mp4_bd.mp4),1)
((v3.go2yd.com,/video/u64e545y4wy4ergrdjsu7567.mp4_bd.mp4),1)
((v1.go2yd.com,/video/54wt4regshy65r675785865dyhdxh.mp4_bd.mp4),1)
((v4.go2yd.com,/video/d4765476eytrdyhbsdy54e7657.mp4_bd.mp4),1)
((v1.go2yd.com,/video/y54ey54y5hdxshtr6u4w4y2tg2.mp4_bd.mp4),1)
((v1.go2yd.com,/user_upload/4346547u6ytsgrfgsersa23tr4egst4.mp4_bd.mp4),1)
((v2.go2yd.com,/video/4ste57r7d8udytdyyyyy43433.mp4_bd.mp4),1)
((v3.go2yd.com,/video/465esgdsju7i7uyvgfjyd5ytrdxg.mp4_bd.mp4),1)
((v3.go2yd.com,/user_upload/u64e545y4wy4ergrdjsu7567.mp4_bd.mp4),1)
((v1.go2yd.com,/video/654rythdju65787ikyukjfvkyi8.mp4_bd.mp4),1)
((v3.go2yd.com,/video/65764ydxse5y34est4343.mp4_bd.mp4),1)
((v3.go2yd.com,/video/4365u7tyfdjhudxyhs43t43t54765u6d.mp4_bd.mp4),1)
result2:
--------------------------------------------
((v4.go2yd.com,/user_upload/5r7564e5ghdrhdrfu654e.mp4_bd.mp4),1)
((v4.go2yd.com,/video/65e54e87okiuygguyo8y7to6t7ru6.mp4_bd.mp4),1)
((v1.go2yd.com,/video/654rythdju65787ikyukjfvkyi8.mp4_bd.mp4),1)
((v1.go2yd.com,/user_upload/4346547u6ytsgrfgsersa23tr4egst4.mp4_bd.mp4),1)
((v3.go2yd.com,/user_upload/u64e545y4wy4ergrdjsu7567.mp4_bd.mp4),1)
((v4.go2yd.com,/video/d4765476eytrdyhbsdy54e7657.mp4_bd.mp4),1)
((v3.go2yd.com,/video/u65eu56trhydxry56e.mp4_bd.mp4),1)
((v3.go2yd.com,/video/4365u7tyfdjhudxyhs43t43t54765u6d.mp4_bd.mp4),1)
((v2.go2yd.com,/video/4ste57r7d8udytdyyyyy43433.mp4_bd.mp4),1)
((v3.go2yd.com,/video/65764ydxse5y34est4343.mp4_bd.mp4),1)
((v1.go2yd.com,/video/54wt4regshy65r675785865dyhdxh.mp4_bd.mp4),1)
((v1.go2yd.com,/video/y54ey54y5hdxshtr6u4w4y2tg2.mp4_bd.mp4),1)
((v3.go2yd.com,/video/u64e545y4wy4ergrdjsu7567.mp4_bd.mp4),1)
((v3.go2yd.com,/video/465esgdsju7i7uyvgfjyd5ytrdxg.mp4_bd.mp4),1)
((v2.go2yd.com,/video/5federste456yrtdstr5y4eygtdy5.mp4_bd.mp4),2)
((v1.go2yd.com,/video/38ddjsic89s8je8sjxjcdie89.mp4_sd.mp4),1)
result3:
--------------------------------------------
(v2.go2yd.com,CompactBuffer(((v2.go2yd.com,/video/4ste57r7d8udytdyyyyy43433.mp4_bd.mp4),1), ((v2.go2yd.com,/video/5federste456yrtdstr5y4eygtdy5.mp4_bd.mp4),2)))
(v4.go2yd.com,CompactBuffer(((v4.go2yd.com,/user_upload/5r7564e5ghdrhdrfu654e.mp4_bd.mp4),1), ((v4.go2yd.com,/video/65e54e87okiuygguyo8y7to6t7ru6.mp4_bd.mp4),1), ((v4.go2yd.com,/video/d4765476eytrdyhbsdy54e7657.mp4_bd.mp4),1)))
(v3.go2yd.com,CompactBuffer(((v3.go2yd.com,/user_upload/u64e545y4wy4ergrdjsu7567.mp4_bd.mp4),1), ((v3.go2yd.com,/video/u65eu56trhydxry56e.mp4_bd.mp4),1), ((v3.go2yd.com,/video/4365u7tyfdjhudxyhs43t43t54765u6d.mp4_bd.mp4),1), ((v3.go2yd.com,/video/65764ydxse5y34est4343.mp4_bd.mp4),1), ((v3.go2yd.com,/video/u64e545y4wy4ergrdjsu7567.mp4_bd.mp4),1), ((v3.go2yd.com,/video/465esgdsju7i7uyvgfjyd5ytrdxg.mp4_bd.mp4),1)))
(v1.go2yd.com,CompactBuffer(((v1.go2yd.com,/video/654rythdju65787ikyukjfvkyi8.mp4_bd.mp4),1), ((v1.go2yd.com,/user_upload/4346547u6ytsgrfgsersa23tr4egst4.mp4_bd.mp4),1), ((v1.go2yd.com,/video/54wt4regshy65r675785865dyhdxh.mp4_bd.mp4),1), ((v1.go2yd.com,/video/y54ey54y5hdxshtr6u4w4y2tg2.mp4_bd.mp4),1), ((v1.go2yd.com,/video/38ddjsic89s8je8sjxjcdie89.mp4_sd.mp4),1)))
result4:
--------------------------------------------
(v2.go2yd.com,List(((v2.go2yd.com,/video/5federste456yrtdstr5y4eygtdy5.mp4_bd.mp4),2)))
(v4.go2yd.com,List(((v4.go2yd.com,/video/d4765476eytrdyhbsdy54e7657.mp4_bd.mp4),1)))
(v3.go2yd.com,List(((v3.go2yd.com,/video/465esgdsju7i7uyvgfjyd5ytrdxg.mp4_bd.mp4),1)))
(v1.go2yd.com,List(((v1.go2yd.com,/video/38ddjsic89s8je8sjxjcdie89.mp4_sd.mp4),1)))
result5:
--------------------------------------------
((v2.go2yd.com,/video/5federste456yrtdstr5y4eygtdy5.mp4_bd.mp4),2)
((v4.go2yd.com,/video/d4765476eytrdyhbsdy54e7657.mp4_bd.mp4),1)
((v3.go2yd.com,/video/465esgdsju7i7uyvgfjyd5ytrdxg.mp4_bd.mp4),1)
((v1.go2yd.com,/video/38ddjsic89s8je8sjxjcdie89.mp4_sd.mp4),1)
网友评论