WordCount
使用 Java 编写 Spark 程序并提交执行。
Maven pom.xml 配置如下：
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>cn.zb</groupId>
<artifactId>spark</artifactId>
<version>1.0-SNAPSHOT</version>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.2.1</version>
</dependency>
</dependencies>
</project>
项目结构
image.png

Java 代码：
package cn.zb;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
import java.util.Arrays;
/**
 * Spark word-count driver: reads a text file, counts whitespace-separated
 * words, and writes the resulting (word, count) pairs as text to an output
 * directory.
 *
 * <p>Submitted via {@code spark-submit --class cn.zb.WordCount <jar> <input> <outputDir>}.
 */
public class WordCount {

    /**
     * Entry point.
     *
     * @param args args[0] = input file path, args[1] = output directory path
     *             (must not already exist — Spark's saveAsTextFile fails otherwise)
     */
    public static void main(String[] args) {
        // Fail fast with a usage message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: WordCount <inputFile> <outputDir>");
            System.exit(1);
        }
        String inputFile = args[0];
        String outputFile = args[1];

        // Master is intentionally not set here; it is supplied by spark-submit.
        SparkConf sparkConf = new SparkConf().setAppName("wordcount");
        JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
        try {
            // Read the input as an RDD of lines.
            JavaRDD<String> lines = sparkContext.textFile(inputFile);
            // Split each line on single spaces into individual words.
            // NOTE(review): split(" ") yields empty tokens on consecutive spaces;
            // switch to split("\\s+") if those should not be counted.
            JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
            // Map each word to (word, 1), then sum the counts per word.
            JavaPairRDD<String, Integer> counts = words.mapToPair(word -> new Tuple2<>(word, 1))
                    .reduceByKey((v1, v2) -> v1 + v2);
            // Write results as text files under the output directory.
            counts.saveAsTextFile(outputFile);
        } finally {
            // Ensure the context is stopped even if the job fails.
            sparkContext.stop();
        }
    }
}
bash提交任务到Spark
drwxrwxr-x 14 zb zb 4.0K Jan 22 22:58 ./
drwxrwxr-x 8 zb zb 4.0K Jan 19 05:14 ../
drwxrwxr-x 2 zb zb 4.0K Nov 24 13:52 bin/
drwxrwxr-x 2 zb zb 4.0K Nov 24 13:52 conf/
drwxrwxr-x 5 zb zb 4.0K Nov 24 13:52 data/
drwxrwxr-x 4 zb zb 4.0K Nov 24 13:52 examples/
drwxrwxr-x 2 zb zb 12K Nov 24 13:52 jars/
-rw-rw-r-- 1 zb zb 18K Nov 24 13:52 LICENSE
drwxrwxr-x 2 zb zb 4.0K Nov 24 13:52 licenses/
-rw-rw-r-- 1 zb zb 25K Nov 24 13:52 NOTICE
drwxrwxr-x 2 zb zb 4.0K Jan 22 22:58 outputFile/
drwxrwxr-x 6 zb zb 4.0K Nov 24 13:52 python/
drwxrwxr-x 3 zb zb 4.0K Nov 24 13:52 R/
-rw-rw-r-- 1 zb zb 3.8K Nov 24 13:52 README.md
-rw-rw-r-- 1 zb zb 128 Nov 24 13:52 RELEASE
drwxrwxr-x 2 zb zb 4.0K Nov 24 13:52 sbin/
drwxrwxr-x 2 zb zb 4.0K Jan 22 22:54 workspace/
drwxrwxr-x 2 zb zb 4.0K Nov 24 13:52 yarn/
zb@ubuntu:~/programs/spark-2.2.1-bin-hadoop2.6$
zb@ubuntu:~/programs/spark-2.2.1-bin-hadoop2.6$ ./bin/spark-submit --class cn.zb.WordCount ./workspace/spark-1.0-SNAPSHOT.jar ./README.md ./outputFile
网友评论