// Word count as an interactive script. `sc` is not defined in this snippet;
// presumably it is the SparkContext that spark-shell provides -- confirm.

// Load the input file from HDFS as an RDD of text lines.
val lines = sc.textFile("hdfs://spark1:9000/spark.txt")
// Break each line into words, splitting on single space characters.
val words = lines.flatMap(_.split(" "))
// Pair every word with an initial count of 1.
val pairs = words.map(word => word -> 1)
// Add up the counts of identical words.
val wordCounts = pairs.reduceByKey((left, right) => left + right)
// Print one "<word> appeared <count> times" line per distinct word.
wordCounts.foreach { case (word, count) =>
  println(s"$word appeared $count times")
}
2. Scala for IDEA
package cn.spark.study.core
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
/**
 * Word-count job meant to be launched through spark-submit.
 *
 * Reads the text file named by the first program argument, splits every line
 * on single spaces, counts how often each word occurs, and prints one
 * "<word> appeared <count> times" line per distinct word.
 */
object WordCount {

  /**
   * Entry point.
   *
   * @param args args(0) is the path of the input text file
   */
  def main(args: Array[String]): Unit = {
    // Exit with a usage message instead of an ArrayIndexOutOfBoundsException
    // when the input path is missing.
    if (args.isEmpty) {
      System.err.println("Usage: WordCount <input-file>")
      sys.exit(1)
    }

    // Master and other settings are left to the launcher; only supply an
    // application name when none has been configured already.
    val conf = new SparkConf().setIfMissing("spark.app.name", "WordCount")
    val sc = new SparkContext(conf)

    try {
      // Second argument is the minimum number of partitions for the input.
      val lines = sc.textFile(args(0), 1)
      val words = lines.flatMap(line => line.split(" "))
      val pairs = words.map(word => (word, 1))
      val wordCount = pairs.reduceByKey(_ + _)

      // NOTE: foreach runs on the executors, so on a cluster these lines are
      // written to the executors' stdout rather than the driver console.
      wordCount.foreach { case (word, count) =>
        println(s"$word appeared $count times")
      }
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }
}
最后,需要使用 spark-submit 将程序提交到 Spark 集群中运行,执行脚本如下:
--class cn.spark.study.core.WordCount
注意:提交前需要先停止 spark-shell,否则它会继续占用集群资源,作业可能因资源(内存或 CPU 核)不足而无法被调度,并出现如下警告(Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources)