Apache Flink Study Notes (2)

Author: 憨人Zoe | Published 2018-09-19 20:40

    The previous post, Apache Flink Study Notes (1), gave a quick demonstration of batch processing; this post shows how to work with stream processing.

    Stream processing is also called unbounded processing, because data keeps flowing in continuously; it is built on the DataStream class. The demo in this post reads from Kafka (my company already has a message producer in place).

    The Kafka message body is JSON of the following form:

    {
        "appId":"xxxx",
        "module":"xxxx"
        // other fields omitted
    }
    

    Now I want to compute a count every 10 seconds, grouped by appId (a deliberately simple requirement), using a processing-time tumbling window (TumblingProcessingTimeWindows).

    import com.alibaba.fastjson.JSONObject;
    import org.apache.flink.api.common.functions.AggregateFunction;
    import org.apache.flink.api.common.functions.MapFunction;
    import org.apache.flink.api.java.tuple.Tuple2;
    import org.apache.flink.streaming.api.TimeCharacteristic;
    import org.apache.flink.streaming.api.datastream.DataStream;
    import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
    import org.apache.flink.streaming.api.functions.sink.SinkFunction;
    import org.apache.flink.streaming.api.windowing.time.Time;
    import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer09;

    import java.util.Properties;
    
    public class Demo3 {
        public static void main(String[] args) {
            // create the streaming execution environment (StreamExecutionEnvironment)
            final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
            env.getConfig().enableSysoutLogging(); // log to stdout
            env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime); // windows are driven by processing time
            env.setParallelism(2); // global parallelism
            // configure kafka bootstrap.servers
            Properties properties = new Properties();
            properties.setProperty("bootstrap.servers", "kafka bootstrap.servers");
            // configure the topic and application name (used as the consumer group); FlinkKafkaManager is a custom helper class, source below
            FlinkKafkaManager<JSONObject> manager = new FlinkKafkaManager<>("kafka.topic", "app.name", properties);
            // deserialize Kafka messages into JSONObject
            FlinkKafkaConsumer09<JSONObject> consumer = manager.build(JSONObject.class);
            // start consuming from the latest offset
            consumer.setStartFromLatest();
            // obtain the DataStream
            DataStream<JSONObject> messageStream = env.addSource(consumer);
            // convert to a POJO
            DataStream<Bean3> bean3DataStream = messageStream.map(new FlatMap());
            bean3DataStream
                    .keyBy(Bean3::getAppId) // the field name "appId" would also work
                    .timeWindow(Time.seconds(10)) // equivalent to the commented line below, because TimeCharacteristic.ProcessingTime was set above
                    // .window(TumblingProcessingTimeWindows.of(Time.seconds(10))) // processing-time window
                    .aggregate(new Agg()) // aggregate function; a reduce function (as in demo2) would also work
                    .addSink(new Sink()); // sink function
            try {
                env.execute("app.name"); // a streaming job must be triggered explicitly with execute()
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    
        // MapFunction that converts each JSONObject message into a Bean3
        public static class FlatMap implements MapFunction<JSONObject, Bean3> {
            @Override
            public Bean3 map(JSONObject jsonObject) throws Exception {
                return new Bean3(jsonObject.getString("appId"), jsonObject.getString("module"));
            }
        }
    
        // AggregateFunction that counts elements per key; the accumulator keeps the first Bean3 seen plus a running count
        public static class Agg implements AggregateFunction<Bean3, Tuple2<Bean3, Long>, Tuple2<Bean3, Long>> {
            @Override
            public Tuple2<Bean3, Long> createAccumulator() {
                return new Tuple2<Bean3, Long>();
            }
    
            @Override
            public Tuple2<Bean3, Long> add(Bean3 bean3, Tuple2<Bean3, Long> bean3LongTuple2) {
                Bean3 bean = bean3LongTuple2.f0;
                Long count = bean3LongTuple2.f1;
                if (bean == null) {
                    bean = bean3;
                }
                if (count == null) {
                    count = 1L;
                } else {
                    count++;
                }
                return new Tuple2<>(bean, count);
            }
    
            @Override
            public Tuple2<Bean3, Long> getResult(Tuple2<Bean3, Long> bean3LongTuple2) {
                return bean3LongTuple2;
            }
    
            @Override
            public Tuple2<Bean3, Long> merge(Tuple2<Bean3, Long> bean3LongTuple2, Tuple2<Bean3, Long> acc1) {
                Bean3 bean = bean3LongTuple2.f0;
                Long count = bean3LongTuple2.f1;
                Long acc = acc1.f1;
                return new Tuple2<>(bean, count + acc);
            }
        }
    
        // simple sink that prints each windowed count
        public static class Sink implements SinkFunction<Tuple2<Bean3, Long>> {
            @Override
            public void invoke(Tuple2<Bean3, Long> value, Context context) throws Exception {
                System.out.println(value.f0.toString() + "," + value.f1);
            }
        }
    
        public static class Bean3 {
            public String appId;
            public String module;
    
            public Bean3() {
            }
    
            public Bean3(String appId, String module) {
                this.appId = appId;
                this.module = module;
            }
    
            public String getAppId() {
                return appId;
            }
    
            public void setAppId(String appId) {
                this.appId = appId;
            }
    
            public String getModule() {
                return module;
            }
    
            public void setModule(String module) {
                this.module = module;
            }
    
            @Override
            public String toString() {
                return "Bean3{" +
                        "appId='" + appId + '\'' +
                        ", module='" + module + '\'' +
                        '}';
            }
        }
    }
    

    Compared with the batch-processing demo in the previous post, stream processing looks a lot more complex. In fact the two have much in common: groupBy in batch processing and keyBy in stream processing, for instance, both group the data by a specified dimension, as the short sketch below illustrates.
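
    The following is a minimal, self-contained sketch (the class name and sample data are made up for illustration and are not part of the demo above) that sums values per key, first with the batch API's groupBy and then with the streaming API's keyBy plus a window:

    import org.apache.flink.api.java.DataSet;
    import org.apache.flink.api.java.ExecutionEnvironment;
    import org.apache.flink.api.java.tuple.Tuple2;
    import org.apache.flink.streaming.api.datastream.DataStream;
    import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
    import org.apache.flink.streaming.api.windowing.time.Time;

    public class GroupVsKeyDemo {
        public static void main(String[] args) throws Exception {
            // batch: group a DataSet by tuple field 0 and sum field 1; print() triggers execution
            ExecutionEnvironment batchEnv = ExecutionEnvironment.getExecutionEnvironment();
            DataSet<Tuple2<String, Integer>> batch =
                    batchEnv.fromElements(Tuple2.of("a", 1), Tuple2.of("a", 2), Tuple2.of("b", 3));
            batch.groupBy(0).sum(1).print();

            // streaming: keyBy plays the role of groupBy, but the aggregation runs per window
            StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
            DataStream<Tuple2<String, Integer>> stream =
                    streamEnv.fromElements(Tuple2.of("a", 1), Tuple2.of("a", 2), Tuple2.of("b", 3));
            stream.keyBy(0).timeWindow(Time.seconds(10)).sum(1).print();
            // with this tiny bounded source the processing-time window may not fire before
            // the job finishes; with an unbounded source such as Kafka it fires every 10 s
            streamEnv.execute("group-vs-key");
        }
    }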

    Stream processing also introduces the concept of windows. As mentioned earlier, streaming data is unbounded; Flink uses windows to slice the unbounded stream into a series of small "batches" that can then be computed over. There are several kinds of windows, such as tumbling windows, sliding windows and session windows; see the official documentation for details. How each window's time boundaries are determined depends on the chosen time characteristic; this example uses ProcessingTime, i.e. processing time.
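
    As a quick illustration of the three window types, here is a hedged sketch using the processing-time window assigners (the class name, sample data and window sizes are arbitrary and only for illustration):

    import org.apache.flink.api.java.tuple.Tuple;
    import org.apache.flink.api.java.tuple.Tuple2;
    import org.apache.flink.streaming.api.datastream.DataStream;
    import org.apache.flink.streaming.api.datastream.KeyedStream;
    import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
    import org.apache.flink.streaming.api.windowing.assigners.ProcessingTimeSessionWindows;
    import org.apache.flink.streaming.api.windowing.assigners.SlidingProcessingTimeWindows;
    import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
    import org.apache.flink.streaming.api.windowing.time.Time;

    public class WindowKindsDemo {
        public static void main(String[] args) throws Exception {
            StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
            // tiny bounded demo source; in this article the real source is an unbounded Kafka stream
            DataStream<Tuple2<String, Integer>> stream =
                    env.fromElements(Tuple2.of("a", 1), Tuple2.of("a", 2), Tuple2.of("b", 3));
            KeyedStream<Tuple2<String, Integer>, Tuple> keyed = stream.keyBy(0);

            // tumbling window: fixed, non-overlapping 10-second buckets
            keyed.window(TumblingProcessingTimeWindows.of(Time.seconds(10))).sum(1).print();
            // sliding window: 10-second windows starting every 5 seconds, so they overlap
            keyed.window(SlidingProcessingTimeWindows.of(Time.seconds(10), Time.seconds(5))).sum(1).print();
            // session window: a window closes once a key has seen no events for 30 seconds
            keyed.window(ProcessingTimeSessionWindows.withGap(Time.seconds(30))).sum(1).print();

            env.execute("window-kinds");
        }
    }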

    Below I rework demo3 to use EventTime, so that the window boundaries are driven by the timestamp carried in the data itself (the time the event occurred).

    Change 1
    // add a timestamp field to the POJO Bean3
    public static class Bean3 {
        public Long timestamp;//add event time
        public String appId;
        public String module;
    
        public Bean3() {
        }
    
        public Bean3(Long timestamp, String appId, String module) {
            this.timestamp = timestamp;
            this.appId = appId;
            this.module = module;
        }
    
        public long getTimestamp() {
            return timestamp;
        }
    
        public void setTimestamp(Long timestamp) {
            this.timestamp = timestamp;
        }
    // other fields and methods unchanged
    }
    
    Change 2
    // use event time as the time characteristic for windows
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); 
    
    Change 3
    // new code
    // extract the event timestamp from each record; Time.seconds(int) sets the maximum allowed out-of-orderness (message delay)
    DataStream<Bean3> bean3DataStreamWithAssignTime = 
    bean3DataStream.assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<Bean3>(Time.seconds(0)) {
        @Override
        public long extractTimestamp(Bean3 element) {
            return element.getTimestamp();
        }
    });
    
    Change 4
    bean3DataStreamWithAssignTime
                    .keyBy(Bean3::getAppId)
                    .window(TumblingEventTimeWindows.of(Time.seconds(10))) // event-time window
                    .allowedLateness(Time.seconds(5)) // how long late records are still accepted; use with care, late arrivals make the window fire again and emit updated results that can look like duplicates
    // rest unchanged
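
    For reference, here is a rough sketch of how the four changes fit together into one event-time job. It reuses Demo3's nested Bean3 (assumed to already have the timestamp field from Change 1), Agg and Sink classes and the FlinkKafkaManager helper (assumed to be in the same package or imported), and it assumes the Kafka message carries a numeric "timestamp" field, which the JSON example at the top does not show; adjust the timestamp extraction to your actual schema.

    import com.alibaba.fastjson.JSONObject;
    import org.apache.flink.api.common.functions.MapFunction;
    import org.apache.flink.streaming.api.TimeCharacteristic;
    import org.apache.flink.streaming.api.datastream.DataStream;
    import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
    import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
    import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
    import org.apache.flink.streaming.api.windowing.time.Time;
    import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer09;

    import java.util.Properties;

    public class Demo4 {
        public static void main(String[] args) throws Exception {
            StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
            env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); // change 2

            Properties properties = new Properties();
            properties.setProperty("bootstrap.servers", "kafka bootstrap.servers");
            FlinkKafkaManager<JSONObject> manager = new FlinkKafkaManager<>("kafka.topic", "app.name", properties);
            FlinkKafkaConsumer09<JSONObject> consumer = manager.build(JSONObject.class);
            consumer.setStartFromLatest();

            DataStream<JSONObject> messageStream = env.addSource(consumer);
            // change 1: Bean3 now carries the event timestamp; here it is read from an assumed
            // numeric "timestamp" field in the Kafka message
            DataStream<Demo3.Bean3> beanStream = messageStream.map(new MapFunction<JSONObject, Demo3.Bean3>() {
                @Override
                public Demo3.Bean3 map(JSONObject json) {
                    return new Demo3.Bean3(json.getLong("timestamp"), json.getString("appId"), json.getString("module"));
                }
            });

            // change 3: extract the event timestamp, allowing zero out-of-orderness
            DataStream<Demo3.Bean3> withTimestamps = beanStream.assignTimestampsAndWatermarks(
                    new BoundedOutOfOrdernessTimestampExtractor<Demo3.Bean3>(Time.seconds(0)) {
                        @Override
                        public long extractTimestamp(Demo3.Bean3 element) {
                            return element.getTimestamp();
                        }
                    });

            // change 4: event-time tumbling window with 5 seconds of allowed lateness
            withTimestamps
                    .keyBy(Demo3.Bean3::getAppId)
                    .window(TumblingEventTimeWindows.of(Time.seconds(10)))
                    .allowedLateness(Time.seconds(5))
                    .aggregate(new Demo3.Agg())
                    .addSink(new Demo3.Sink());

            env.execute("app.name");
        }
    }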
    
    FlinkKafkaManager source code
    package flink.test.manager;
    
    import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer09;
    import java.util.Properties;
    
    public class FlinkKafkaManager<T> {
        private String topic;
        private String groupId;
        private Properties properties;
    
        public FlinkKafkaManager(String topic, String groupId, Properties properties) {
            this.topic = topic;
            this.groupId = groupId;
            this.properties = properties;
            this.properties.setProperty("group.id", this.groupId);
            // apply basic defaults for users of a standard Kafka setup
            this.setDefaultKafkaProperties();
        }
    
        private void setDefaultKafkaProperties() {
            // enable auto offset commit, once every 5 seconds
            this.properties.setProperty("enable.auto.commit", "true");
            this.properties.setProperty("auto.commit.interval.ms", "5000");
        }
    
        public FlinkKafkaConsumer09<T> build(Class<T> clazz) {
            if (checkProperties()) {
                return new FlinkKafkaConsumer09<T>(topic, new ConsumerDeserializationSchema<>(clazz), properties);
            } else {
                return null;
            }
        }
    
        private boolean checkProperties() {
            boolean isValued = true;
    
            if (!properties.containsKey("bootstrap.servers")) {
                isValued = false;
            } else {
                String brokers = properties.getProperty("bootstrap.servers");
                if (brokers == null || brokers.isEmpty()) {
                    isValued = false;
                }
            }
    
            if (this.topic == null || this.topic.isEmpty()) {
                isValued = false;
            }
    
            if (!properties.containsKey("group.id")) {
                isValued = false;
            } else {
                String groupId = properties.getProperty("group.id");
                if (groupId == null || groupId.isEmpty()) {
                    isValued = false;
                }
            }
    
            return isValued;
        }
    }
    
    ConsumerDeserializationSchema source code
    package flink.test.manager;
    
    import com.alibaba.fastjson.JSON;
    import org.apache.flink.api.common.serialization.DeserializationSchema;
    import org.apache.flink.api.common.typeinfo.TypeInformation;
    import org.apache.flink.api.java.typeutils.TypeExtractor;
    
    import java.io.IOException;
    
    public class ConsumerDeserializationSchema<T> implements DeserializationSchema<T> {
        private Class<T> clazz;
    
        public ConsumerDeserializationSchema(Class<T> clazz) {
            this.clazz = clazz;
        }
    
        @Override
        public T deserialize(byte[] bytes) throws IOException {
            // assumes new String(bytes) is valid JSON; if it is not, parse it yourself
            return JSON.parseObject(new String(bytes), clazz);
        }
    
        @Override
        public boolean isEndOfStream(T t) {
            return false;
        }
    
        @Override
        public TypeInformation<T> getProducedType() {
            return TypeExtractor.getForClass(clazz);
        }
    }
    
