package com.soul.bigdata.day0511
import com.soul.bigdata.Utils.LogUtils
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Spark job that reports, for every domain found in a CDN access log, the
 * three most frequently requested resource files, ordered by request count
 * descending.
 *
 * Each printed line has the shape
 * `(domain,(resourceFile,requestCount,totalTraffic))`.
 */
object Task01 {

  import scala.util.Try

  /** Number of tab-separated fields expected in a parsed log line. */
  private val FieldCount = 8

  /** How many resource files are kept per domain. */
  private val TopN = 3

  /** URL fragment that precedes the resource file name. */
  private val ResourceMarker = "upload/"

  /**
   * Converts one parsed, tab-separated log line into a
   * `((domain, resourceFile), (1, traffic))` record.
   *
   * Field layout (by index): 0 cdn, 1 region, 2 level, 3 time, 4 ip,
   * 5 domain, 6 url, 7 traffic.
   *
   * @param log the parsed log line
   * @return `None` when the line does not have exactly [[FieldCount]] fields
   *         or its URL contains no text after [[ResourceMarker]]; otherwise
   *         the keyed record. A traffic field that is not a valid `Long`
   *         counts as `0L`.
   */
  private def toHit(log: String): Option[((String, String), (Int, Long))] = {
    val fields = log.split("\t")
    if (fields.length != FieldCount) {
      None
    } else {
      val urlParts = fields(6).split(ResourceMarker)
      if (urlParts.length < 2) {
        // No resource name can be extracted; skip the record instead of
        // failing the whole job with an index error.
        None
      } else {
        val traffic = Try(fields(7).toLong).getOrElse(0L)
        Some(((fields(5), urlParts(1)), (1, traffic)))
      }
    }
  }

  /**
   * Entry point: reads the log file, aggregates hits per (domain, resource)
   * and prints the top [[TopN]] resources of every domain.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Set before the SparkContext is created so the property is already in
    // place when Spark/Hadoop initialise.
    System.setProperty("hadoop.home.dir", "D:\\Hadoop\\tar.gz\\hadoop-2.6.0-cdh5.7.0\\hadoop-2.6.0-cdh5.7.0")

    val sparkConf = new SparkConf().setAppName("Task01").setMaster("local[2]")
    val sc = new SparkContext(sparkConf)

    try {
      val tools = new LogUtils()

      // ((domain, resourceFile), (1, traffic)); malformed lines are dropped.
      val hits = sc.textFile("D:\\RZ-G6\\2019G6\\data\\task0511.log")
        .flatMap(line => toHit(tools.parse(line)).toList)

      val topPerDomain = hits
        // Sum request count and traffic per (domain, resourceFile).
        .reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2))
        // Re-key by domain only: (domain, (resourceFile, count, traffic)).
        .map { case ((domain, resource), (count, traffic)) =>
          (domain, (resource, count, traffic))
        }
        .groupByKey()
        // Keep the TopN most requested resources of each domain.
        .flatMapValues(_.toSeq.sortBy { case (_, count, _) => -count }.take(TopN))
        .collect()

      // At most TopN rows per domain remain, so the final ordering is done
      // on the driver: domain ascending, then request count descending.
      topPerDomain
        .sortBy { case (domain, (_, count, _)) => (domain, -count) }
        .foreach(println)
    } finally {
      // Release Spark resources even when the job fails.
      sc.stop()
    }
  }
}
// Sample output from a run of the job:
// (v1.go2yd.com,(44c.mp4_bd.mp4,1762,1762))
// (v1.go2yd.com,(45c.mp4_bd.mp4,1715,1715))
// (v1.go2yd.com,(42c.mp4_bd.mp4,1698,1698))
// (v10.go2yd.com,(43c.mp4_bd.mp4,1735,1735))
// (v10.go2yd.com,(41c.mp4_bd.mp4,1689,1689))
// (v10.go2yd.com,(42c.mp4_bd.mp4,1682,1682))
// (v2.go2yd.com,(4c.mp4_bd.mp4,1727,1727))
// (v2.go2yd.com,(43c.mp4_bd.mp4,1723,1723))
// (v2.go2yd.com,(42c.mp4_bd.mp4,1696,1696))
// (v3.go2yd.com,(4c.mp4_bd.mp4,1751,1751))
// (v3.go2yd.com,(42c.mp4_bd.mp4,1741,1741))
// (v3.go2yd.com,(45c.mp4_bd.mp4,1716,1716))
// (v4.go2yd.com,(4c.mp4_bd.mp4,1751,1751))
// (v4.go2yd.com,(42c.mp4_bd.mp4,1712,1712))
// (v4.go2yd.com,(44c.mp4_bd.mp4,1699,1699))
// (v5.go2yd.com,(45c.mp4_bd.mp4,1780,1780))
// (v5.go2yd.com,(42c.mp4_bd.mp4,1756,1756))
// (v5.go2yd.com,(4c.mp4_bd.mp4,1708,1708))
// (v6.go2yd.com,(4c.mp4_bd.mp4,1713,1713))
// (v6.go2yd.com,(42c.mp4_bd.mp4,1697,1697))
// (v6.go2yd.com,(45c.mp4_bd.mp4,1682,1682))
// (v7.go2yd.com,(44c.mp4_bd.mp4,1799,1799))
// (v7.go2yd.com,(42c.mp4_bd.mp4,1722,1722))
// (v7.go2yd.com,(41c.mp4_bd.mp4,1722,1722))
// (v8.go2yd.com,(42c.mp4_bd.mp4,1734,1734))
// (v8.go2yd.com,(45c.mp4_bd.mp4,1725,1725))
// (v8.go2yd.com,(44c.mp4_bd.mp4,1714,1714))
// (v9.go2yd.com,(44c.mp4_bd.mp4,1768,1768))
// (v9.go2yd.com,(4c.mp4_bd.mp4,1742,1742))
// (v9.go2yd.com,(43c.mp4_bd.mp4,1732,1732))
// 网友评论