93. Implement WordCount with Spark
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

/**
 * Created by Administrator on 2019/11/25
 */
object TOK {
  def main(args: Array[String]): Unit = {
    // Turn off verbose logging so only the results are printed
    Logger.getLogger("org").setLevel(Level.OFF)
    Logger.getLogger("akka").setLevel(Level.OFF)
    System.setProperty("hadoop.home.dir", "E:\\hadoop-2.6.0-cdh5.15.0")

    val ss = SparkSession
      .builder()
      .appName("spark 2.0")
      .master("local")
      .getOrCreate()
    val sc = ss.sparkContext                    // obtain the underlying SparkContext
    val lines = sc.textFile("D:\\hadoop.txt")   // read the input file as an RDD of lines

    lines
      .flatMap(_.split(","))      // split each line into words
      .map(word => (word, 1))     // pair each word with an initial count of 1
      .reduceByKey(_ + _)         // sum the counts per word
      .foreach(println)

    ss.stop()
  }
}

For an input file containing the comma-separated words hello, jack, desire, hello, mask, hello on one line, the program prints:
(hello,3)
(jack,1)
(desire,1)
(mask,1)
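For comparison, here is a minimal sketch of the same word count written with the Dataset API instead of the RDD API. It assumes the same SparkSession ss and input path as above; the column name value is the default column produced by textFile.

    import ss.implicits._

    val counts = ss.read.textFile("D:\\hadoop.txt")   // Dataset[String], one row per line, column "value"
      .flatMap(_.split(","))                          // Dataset[String] of individual words
      .groupBy("value")                               // group identical words together
      .count()                                        // add a "count" column per word
    counts.show()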
1.1. The difference between RDD.persist and RDD.cache in Spark
cache() is simply persist() with the default MEMORY_ONLY storage level, so the data is kept only in memory. persist() lets you choose among several storage levels (for example MEMORY_AND_DISK, DISK_ONLY, or serialized and replicated variants), so persistence is not limited to memory.
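A minimal sketch illustrating the difference; the sample data and the MEMORY_AND_DISK level are arbitrary choices for demonstration.

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.storage.StorageLevel

    object CacheVsPersist {
      def main(args: Array[String]): Unit = {
        val ss = SparkSession.builder().appName("cache vs persist").master("local").getOrCreate()
        val rdd = ss.sparkContext.parallelize(1 to 1000)

        // cache() is equivalent to persist(StorageLevel.MEMORY_ONLY): partitions live only in memory
        rdd.cache()
        rdd.count()        // the first action materializes the cached partitions
        rdd.unpersist()

        // persist() accepts an explicit storage level, e.g. spill to disk when memory is insufficient
        rdd.persist(StorageLevel.MEMORY_AND_DISK)
        rdd.count()

        ss.stop()
      }
    }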