美文网首页
04 kafka作为Flink的数据源完成词频统计

04 kafka作为Flink的数据源完成词频统计

作者: 张力的程序园 | 来源:发表于2020-06-29 22:51 被阅读0次

本节将展示使用kafka作为Flink的数据来源,该例子也是一个Flink流处理的demo。

1、前提约束

  • 已安装kafka
    https://www.jianshu.com/p/1a7b9970d073
    假设kafka所在的服务器的ip为192.168.100.141,且已关闭防火墙。
    kafka的安装目录为:/root/kafka_2.11-2.2.1
    zookeeper的安装目录为:/root/zookeeper-3.4.11

2、操作

  • 1 在idea中创建一个maven项目
  • 2 修改该maven项目的pom.xml中的依赖
   <dependencies>
        <dependency>
            <!-- Spark dependency (not used by KafkaToFlink.java; kept for the rest of the project) -->
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.2.0</version>
        </dependency>
        <!-- Scala library dependency (not used by KafkaToFlink.java) -->
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>2.11.8</version>
        </dependency>
        <!-- Storm dependency (not used by KafkaToFlink.java) -->
        <dependency>
            <groupId>org.apache.storm</groupId>
            <artifactId>storm-core</artifactId>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>log4j-over-slf4j</artifactId>
                </exclusion>
            </exclusions>
            <version>1.2.1</version>
        </dependency>
        <!-- Flink dependencies: keep every Flink artifact on the same version -->
        <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-core -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-core</artifactId>
            <version>1.5.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>1.5.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.11</artifactId>
            <version>1.5.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_2.11</artifactId>
            <version>1.5.1</version>
        </dependency>
        <!-- Kafka 0.10 connector; version aligned with the other Flink modules (was 1.5.0) -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka-0.10_2.11</artifactId>
            <version>1.5.1</version>
        </dependency>
    </dependencies>
  • 在项目的src/main/java文件夹下创建KafkaToFlink.java
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010;
import org.apache.flink.util.Collector;

import java.util.Properties;

public class KafkaToFlink {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.enableCheckpointing(5000);
        Properties properties = new Properties();
        properties.put("bootstrap.servers", "192.168.100.141:9092");
        properties.put("zookeeper.connect", "192.168.100.141:2181");
        properties.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        properties.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        properties.put("group.id", "kf");
        FlinkKafkaConsumer010<String> myConsumer = new FlinkKafkaConsumer010<String>("kafka-flink", new SimpleStringSchema(),
                properties);

        DataStream<String> dataStream = env.addSource(myConsumer);


        DataStream<WordWithCount> windowCounts = dataStream.rebalance().flatMap(new FlatMapFunction<String, WordWithCount>() {
            public void flatMap(String value, Collector<WordWithCount> out) {
                System.out.println("接收到kafka数据:" + value);
                for (String word : value.split("\\s")) {
                    out.collect(new WordWithCount(word, 1L));
                }
            }
        }).keyBy("word")
                .timeWindow(Time.seconds(2))
                .reduce(new ReduceFunction<WordWithCount>() {
                    public WordWithCount reduce(WordWithCount a, WordWithCount b) {
                        return new WordWithCount(a.word, a.count + b.count);
                    }
                });
        windowCounts.print().setParallelism(1);
        env.execute("KafkaToFlink");
    }

    public static class WordWithCount {
        public String word;
        public long count;

        public WordWithCount() {
        }

        public WordWithCount(String word, long count) {
            this.word = word;
            this.count = count;
        }

        @Override
        public String toString() {
            return word + " : " + count;
        }
    }
}
  • 在192.168.100.141中执行以下命令:
# Start ZooKeeper
cd /root/zookeeper-3.4.11/bin
./zkServer.sh start
# Start the Kafka broker in the background (-daemon); without it the broker stays in the
# foreground and the commands below cannot be run from the same shell
cd /root/kafka_2.11-2.2.1
bin/kafka-server-start.sh -daemon config/server.properties
# Create the topic (give the broker a few seconds to finish starting first)
bin/kafka-topics.sh --create --bootstrap-server 192.168.100.141:9092 --replication-factor 1 --partitions 1 --topic kafka-flink
# Start a console producer for the topic
bin/kafka-console-producer.sh --broker-list 192.168.100.141:9092 --topic kafka-flink
  • 测试
    运行KafkaToFlink.java中的main方法,然后在kafka生产者的命令行中连续输入若干行字符串,即可在运行main方法的控制台中看到词频统计的结果。
    以上就是kafka作为Flink的数据源完成词频统计的演示。

相关文章

网友评论

      本文标题:04 kafka作为Flink的数据源完成词频统计

      本文链接:https://www.haomeiwen.com/subject/cmbafktx.html