spark wordcount

Author: 持而盈 | Published 2018-01-23 15:09

    wordcount

    A Spark WordCount job written in Java, packaged with Maven, and submitted with spark-submit.

    Maven pom.xml:

    <?xml version="1.0" encoding="UTF-8"?>
    <project xmlns="http://maven.apache.org/POM/4.0.0"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>
    
        <groupId>cn.zb</groupId>
        <artifactId>spark</artifactId>
        <version>1.0-SNAPSHOT</version>
        <build>
            <plugins>
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <configuration>
                        <source>1.8</source>
                        <target>1.8</target>
                    </configuration>
                </plugin>
            </plugins>
        </build>
    
    
        <dependencies>
            <dependency>
                <groupId>org.apache.spark</groupId>
                <artifactId>spark-core_2.11</artifactId>
                <version>2.2.1</version>
            </dependency>
        </dependencies>
    
    </project>
    
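    Packaging the jar is a plain mvn package; a minimal sketch (the workspace/ destination is an assumption, matching the directory listing further down):

    # build; with the pom above the artifact lands in target/spark-1.0-SNAPSHOT.jar
    mvn clean package
    # copy it next to the Spark installation so spark-submit can pick it up
    # (path assumed from the listing below)
    cp target/spark-1.0-SNAPSHOT.jar ~/programs/spark-2.2.1-bin-hadoop2.6/workspace/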

    Project structure

    (image: a standard Maven layout, with the class at src/main/java/cn/zb/WordCount.java)

    Java code:

    package cn.zb;
    
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import scala.Tuple2;
    
    import java.util.Arrays;
    
    public class WordCount {
    
        public static void main(String[] args) {
            String inputFile = args[0];
            String outputFile = args[1];
    
            SparkConf sparkConf = new SparkConf().setAppName("wordcount");
            JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
        // read the input file as an RDD of lines
            JavaRDD<String> lines = sparkContext.textFile(inputFile);
        // split each line into words
            JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
        // map each word to a (word, 1) pair, then sum the counts per word
            JavaPairRDD<String, Integer> counts = words.mapToPair(word -> new Tuple2<>(word, 1))
                    .reduceByKey((v1, v2) -> v1 + v2);
        // save the result to the output path
            counts.saveAsTextFile(outputFile);
            sparkContext.stop();
        }
    }
    
    
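    Note that the code leaves the master unset, so it is taken from spark-submit (which defaults to local[*] when no --master flag is given). For running straight from the IDE you could hard-code it instead; a sketch, intended for local testing only:

    // local-testing variant (assumption: not for cluster submission, where the
    // master should come from spark-submit rather than being hard-coded)
    SparkConf sparkConf = new SparkConf()
            .setAppName("wordcount")
            .setMaster("local[*]");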

    Submitting the job to Spark from bash

    drwxrwxr-x 14 zb zb 4.0K Jan 22 22:58 ./
    drwxrwxr-x  8 zb zb 4.0K Jan 19 05:14 ../
    drwxrwxr-x  2 zb zb 4.0K Nov 24 13:52 bin/
    drwxrwxr-x  2 zb zb 4.0K Nov 24 13:52 conf/
    drwxrwxr-x  5 zb zb 4.0K Nov 24 13:52 data/
    drwxrwxr-x  4 zb zb 4.0K Nov 24 13:52 examples/
    drwxrwxr-x  2 zb zb  12K Nov 24 13:52 jars/
    -rw-rw-r--  1 zb zb  18K Nov 24 13:52 LICENSE
    drwxrwxr-x  2 zb zb 4.0K Nov 24 13:52 licenses/
    -rw-rw-r--  1 zb zb  25K Nov 24 13:52 NOTICE
    drwxrwxr-x  2 zb zb 4.0K Jan 22 22:58 outputFile/
    drwxrwxr-x  6 zb zb 4.0K Nov 24 13:52 python/
    drwxrwxr-x  3 zb zb 4.0K Nov 24 13:52 R/
    -rw-rw-r--  1 zb zb 3.8K Nov 24 13:52 README.md
    -rw-rw-r--  1 zb zb  128 Nov 24 13:52 RELEASE
    drwxrwxr-x  2 zb zb 4.0K Nov 24 13:52 sbin/
    drwxrwxr-x  2 zb zb 4.0K Jan 22 22:54 workspace/
    drwxrwxr-x  2 zb zb 4.0K Nov 24 13:52 yarn/
    zb@ubuntu:~/programs/spark-2.2.1-bin-hadoop2.6$ 
    zb@ubuntu:~/programs/spark-2.2.1-bin-hadoop2.6$ ./bin/spark-submit --class cn.zb.WordCount ./workspace/spark-1.0-SNAPSHOT.jar ./README.md ./outputFile
    
    
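    saveAsTextFile writes a directory rather than a single file: outputFile/ holds one part-NNNNN file per partition plus a _SUCCESS marker, and the job fails if the directory already exists. A sketch of a typical re-run-and-inspect cycle:

    # the output directory must not exist, so clear it before re-running
    rm -rf ./outputFile
    ./bin/spark-submit --class cn.zb.WordCount ./workspace/spark-1.0-SNAPSHOT.jar ./README.md ./outputFile
    # each line is a Tuple2 rendered as (word,count); exact values depend on the input
    cat ./outputFile/part-*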
