Apache Flink Study Notes (2)

Author: 憨人Zoe | Published 2018-09-19 20:40

    The previous post, Apache Flink Study Notes (1), gave a quick demonstration of batch processing; this post shows how to work with stream processing.

    Stream processing is also called unbounded processing, because data keeps flowing in continuously; it is built on the DataStream class. The demo in this post reads from Kafka (my company already has a message producer in place).

    The Kafka message body is JSON of the following form:

    {
        "appId":"xxxx",
        "module":"xxxx"
        // other fields omitted
    }
    

    Now I want to compute a count every 10 seconds, grouped by appId (a deliberately simple requirement), using a processing-time tumbling window (TumblingProcessingTimeWindows).

    import com.alibaba.fastjson.JSONObject;
    import org.apache.flink.api.common.functions.AggregateFunction;
    import org.apache.flink.api.common.functions.MapFunction;
    import org.apache.flink.api.java.tuple.Tuple2;
    import org.apache.flink.streaming.api.TimeCharacteristic;
    import org.apache.flink.streaming.api.datastream.DataStream;
    import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
    import org.apache.flink.streaming.api.functions.sink.SinkFunction;
    import org.apache.flink.streaming.api.windowing.time.Time;
    import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer09;

    import java.util.Properties;
    
    public class Demo3 {
        public static void main(String[] args) {
            // create the streaming execution environment (StreamExecutionEnvironment)
            final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
            env.getConfig().enableSysoutLogging(); // log to stdout
            env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime); // windows are driven by processing time
            env.setParallelism(2); // global parallelism
            // configure kafka bootstrap.servers
            Properties properties = new Properties();
            properties.setProperty("bootstrap.servers", "kafka bootstrap.servers");
            // configure the topic and application name (used as the consumer group); FlinkKafkaManager is a custom helper class, source below
            FlinkKafkaManager<JSONObject> manager = new FlinkKafkaManager<>("kafka.topic", "app.name", properties);
            // deserialize Kafka messages into JSONObject
            FlinkKafkaConsumer09<JSONObject> consumer = manager.build(JSONObject.class);
            // start consuming from the latest offset
            consumer.setStartFromLatest();
            // obtain the DataStream
            DataStream<JSONObject> messageStream = env.addSource(consumer);
            // convert to a POJO
            DataStream<Bean3> bean3DataStream = messageStream.map(new FlatMap());
            bean3DataStream
                    .keyBy(Bean3::getAppId) // the field name "appId" would also work
                    .timeWindow(Time.seconds(10)) // equivalent to the commented line below, because TimeCharacteristic.ProcessingTime was set above
                    // .window(TumblingProcessingTimeWindows.of(Time.seconds(10))) // processing-time window
                    .aggregate(new Agg()) // aggregate function; a reduce function (as in demo2) would also work
                    .addSink(new Sink()); // sink function
            try {
                env.execute("app.name"); // a streaming job must be triggered explicitly with execute()
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    
        // MapFunction that converts each JSONObject message into a Bean3
        public static class FlatMap implements MapFunction<JSONObject, Bean3> {
            @Override
            public Bean3 map(JSONObject jsonObject) throws Exception {
                return new Bean3(jsonObject.getString("appId"), jsonObject.getString("module"));
            }
        }
    
        // AggregateFunction that counts elements per key; the accumulator keeps the first Bean3 seen plus a running count
        public static class Agg implements AggregateFunction<Bean3, Tuple2<Bean3, Long>, Tuple2<Bean3, Long>> {
            @Override
            public Tuple2<Bean3, Long> createAccumulator() {
                return new Tuple2<Bean3, Long>();
            }
    
            @Override
            public Tuple2<Bean3, Long> add(Bean3 bean3, Tuple2<Bean3, Long> bean3LongTuple2) {
                Bean3 bean = bean3LongTuple2.f0;
                Long count = bean3LongTuple2.f1;
                if (bean == null) {
                    bean = bean3;
                }
                if (count == null) {
                    count = 1L;
                } else {
                    count++;
                }
                return new Tuple2<>(bean, count);
            }
    
            @Override
            public Tuple2<Bean3, Long> getResult(Tuple2<Bean3, Long> bean3LongTuple2) {
                return bean3LongTuple2;
            }
    
            @Override
            public Tuple2<Bean3, Long> merge(Tuple2<Bean3, Long> bean3LongTuple2, Tuple2<Bean3, Long> acc1) {
                Bean3 bean = bean3LongTuple2.f0;
                Long count = bean3LongTuple2.f1;
                Long acc = acc1.f1;
                return new Tuple2<>(bean, count + acc);
            }
        }
    
        // simple sink that prints each windowed count
        public static class Sink implements SinkFunction<Tuple2<Bean3, Long>> {
            @Override
            public void invoke(Tuple2<Bean3, Long> value, Context context) throws Exception {
                System.out.println(value.f0.toString() + "," + value.f1);
            }
        }
    
        public static class Bean3 {
            public String appId;
            public String module;
    
            public Bean3() {
            }
    
            public Bean3(String appId, String module) {
                this.appId = appId;
                this.module = module;
            }
    
            public String getAppId() {
                return appId;
            }
    
            public void setAppId(String appId) {
                this.appId = appId;
            }
    
            public String getModule() {
                return module;
            }
    
            public void setModule(String module) {
                this.module = module;
            }
    
            @Override
            public String toString() {
                return "Bean3{" +
                        "appId='" + appId + '\'' +
                        ", module='" + module + '\'' +
                        '}';
            }
        }
    }
    

    Compared with the batch-processing demo in the previous post, stream processing looks a lot more complex. In fact the two have much in common: groupBy in batch processing and keyBy in stream processing, for instance, both group the data by a specified dimension, as the short sketch below illustrates.
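
    The following is a minimal, self-contained sketch (the class name and sample data are made up for illustration and are not part of the demo above) that sums values per key, first with the batch API's groupBy and then with the streaming API's keyBy plus a window:

    import org.apache.flink.api.java.DataSet;
    import org.apache.flink.api.java.ExecutionEnvironment;
    import org.apache.flink.api.java.tuple.Tuple2;
    import org.apache.flink.streaming.api.datastream.DataStream;
    import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
    import org.apache.flink.streaming.api.windowing.time.Time;

    public class GroupVsKeyDemo {
        public static void main(String[] args) throws Exception {
            // batch: group a DataSet by tuple field 0 and sum field 1; print() triggers execution
            ExecutionEnvironment batchEnv = ExecutionEnvironment.getExecutionEnvironment();
            DataSet<Tuple2<String, Integer>> batch =
                    batchEnv.fromElements(Tuple2.of("a", 1), Tuple2.of("a", 2), Tuple2.of("b", 3));
            batch.groupBy(0).sum(1).print();

            // streaming: keyBy plays the role of groupBy, but the aggregation runs per window
            StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
            DataStream<Tuple2<String, Integer>> stream =
                    streamEnv.fromElements(Tuple2.of("a", 1), Tuple2.of("a", 2), Tuple2.of("b", 3));
            stream.keyBy(0).timeWindow(Time.seconds(10)).sum(1).print();
            // with this tiny bounded source the processing-time window may not fire before
            // the job finishes; with an unbounded source such as Kafka it fires every 10 s
            streamEnv.execute("group-vs-key");
        }
    }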

    Stream processing also introduces the concept of windows. As mentioned earlier, streaming data is unbounded; Flink uses windows to slice the unbounded stream into a series of small "batches" that can then be computed over. There are several kinds of windows, such as tumbling windows, sliding windows and session windows; see the official documentation for details. How each window's time boundaries are determined depends on the chosen time characteristic; this example uses ProcessingTime, i.e. processing time.
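
    As a quick illustration of the three window types, here is a hedged sketch using the processing-time window assigners (the class name, sample data and window sizes are arbitrary and only for illustration):

    import org.apache.flink.api.java.tuple.Tuple;
    import org.apache.flink.api.java.tuple.Tuple2;
    import org.apache.flink.streaming.api.datastream.DataStream;
    import org.apache.flink.streaming.api.datastream.KeyedStream;
    import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
    import org.apache.flink.streaming.api.windowing.assigners.ProcessingTimeSessionWindows;
    import org.apache.flink.streaming.api.windowing.assigners.SlidingProcessingTimeWindows;
    import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
    import org.apache.flink.streaming.api.windowing.time.Time;

    public class WindowKindsDemo {
        public static void main(String[] args) throws Exception {
            StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
            // tiny bounded demo source; in this article the real source is an unbounded Kafka stream
            DataStream<Tuple2<String, Integer>> stream =
                    env.fromElements(Tuple2.of("a", 1), Tuple2.of("a", 2), Tuple2.of("b", 3));
            KeyedStream<Tuple2<String, Integer>, Tuple> keyed = stream.keyBy(0);

            // tumbling window: fixed, non-overlapping 10-second buckets
            keyed.window(TumblingProcessingTimeWindows.of(Time.seconds(10))).sum(1).print();
            // sliding window: 10-second windows starting every 5 seconds, so they overlap
            keyed.window(SlidingProcessingTimeWindows.of(Time.seconds(10), Time.seconds(5))).sum(1).print();
            // session window: a window closes once a key has seen no events for 30 seconds
            keyed.window(ProcessingTimeSessionWindows.withGap(Time.seconds(30))).sum(1).print();

            env.execute("window-kinds");
        }
    }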

    Below I rework demo3 to use EventTime, so that the window boundaries are driven by the timestamp carried in the data itself (the time the event occurred).

    Change 1
    // add a timestamp field to the POJO Bean3
    public static class Bean3 {
        public Long timestamp;//add event time
        public String appId;
        public String module;
    
        public Bean3() {
        }
    
        public Bean3(Long timestamp, String appId, String module) {
            this.timestamp = timestamp;
            this.appId = appId;
            this.module = module;
        }
    
        public long getTimestamp() {
            return timestamp;
        }
    
        public void setTimestamp(Long timestamp) {
            this.timestamp = timestamp;
        }
    // other fields and methods unchanged
    }
    
    Change 2
    // use event time as the time characteristic for windows
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); 
    
    Change 3
    // new code
    // extract the event timestamp from each record; Time.seconds(int) sets the maximum allowed out-of-orderness (message delay)
    DataStream<Bean3> bean3DataStreamWithAssignTime = 
    bean3DataStream.assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<Bean3>(Time.seconds(0)) {
        @Override
        public long extractTimestamp(Bean3 element) {
            return element.getTimestamp();
        }
    });
    
    Change 4
    bean3DataStreamWithAssignTime
                    .keyBy(Bean3::getAppId)
                    .window(TumblingEventTimeWindows.of(Time.seconds(10))) // event-time window
                    .allowedLateness(Time.seconds(5)) // how long late records are still accepted; use with care, late arrivals make the window fire again and emit updated results that can look like duplicates
    // rest unchanged
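
    For reference, here is a rough sketch of how the four changes fit together into one event-time job. It reuses Demo3's nested Bean3 (assumed to already have the timestamp field from Change 1), Agg and Sink classes and the FlinkKafkaManager helper (assumed to be in the same package or imported), and it assumes the Kafka message carries a numeric "timestamp" field, which the JSON example at the top does not show; adjust the timestamp extraction to your actual schema.

    import com.alibaba.fastjson.JSONObject;
    import org.apache.flink.api.common.functions.MapFunction;
    import org.apache.flink.streaming.api.TimeCharacteristic;
    import org.apache.flink.streaming.api.datastream.DataStream;
    import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
    import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
    import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
    import org.apache.flink.streaming.api.windowing.time.Time;
    import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer09;

    import java.util.Properties;

    public class Demo4 {
        public static void main(String[] args) throws Exception {
            StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
            env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); // change 2

            Properties properties = new Properties();
            properties.setProperty("bootstrap.servers", "kafka bootstrap.servers");
            FlinkKafkaManager<JSONObject> manager = new FlinkKafkaManager<>("kafka.topic", "app.name", properties);
            FlinkKafkaConsumer09<JSONObject> consumer = manager.build(JSONObject.class);
            consumer.setStartFromLatest();

            DataStream<JSONObject> messageStream = env.addSource(consumer);
            // change 1: Bean3 now carries the event timestamp; here it is read from an assumed
            // numeric "timestamp" field in the Kafka message
            DataStream<Demo3.Bean3> beanStream = messageStream.map(new MapFunction<JSONObject, Demo3.Bean3>() {
                @Override
                public Demo3.Bean3 map(JSONObject json) {
                    return new Demo3.Bean3(json.getLong("timestamp"), json.getString("appId"), json.getString("module"));
                }
            });

            // change 3: extract the event timestamp, allowing zero out-of-orderness
            DataStream<Demo3.Bean3> withTimestamps = beanStream.assignTimestampsAndWatermarks(
                    new BoundedOutOfOrdernessTimestampExtractor<Demo3.Bean3>(Time.seconds(0)) {
                        @Override
                        public long extractTimestamp(Demo3.Bean3 element) {
                            return element.getTimestamp();
                        }
                    });

            // change 4: event-time tumbling window with 5 seconds of allowed lateness
            withTimestamps
                    .keyBy(Demo3.Bean3::getAppId)
                    .window(TumblingEventTimeWindows.of(Time.seconds(10)))
                    .allowedLateness(Time.seconds(5))
                    .aggregate(new Demo3.Agg())
                    .addSink(new Demo3.Sink());

            env.execute("app.name");
        }
    }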
    
    FlinkKafkaManager source code
    package flink.test.manager;
    
    import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer09;
    import java.util.Properties;
    
    public class FlinkKafkaManager<T> {
        private String topic;
        private String groupId;
        private Properties properties;
    
        public FlinkKafkaManager(String topic, String groupId, Properties properties) {
            this.topic = topic;
            this.groupId = groupId;
            this.properties = properties;
            this.properties.setProperty("group.id", this.groupId);
            // apply basic defaults for users of a standard Kafka setup
            this.setDefaultKafkaProperties();
        }
    
        private void setDefaultKafkaProperties() {
            // enable auto offset commit, once every 5 seconds
            this.properties.setProperty("enable.auto.commit", "true");
            this.properties.setProperty("auto.commit.interval.ms", "5000");
        }
    
        public FlinkKafkaConsumer09<T> build(Class<T> clazz) {
            if (checkProperties()) {
                return new FlinkKafkaConsumer09<T>(topic, new ConsumerDeserializationSchema<>(clazz), properties);
            } else {
                return null;
            }
        }
    
        private boolean checkProperties() {
            boolean isValued = true;
    
            if (!properties.containsKey("bootstrap.servers")) {
                isValued = false;
            } else {
                String brokers = properties.getProperty("bootstrap.servers");
                if (brokers == null || brokers.isEmpty()) {
                    isValued = false;
                }
            }
    
            if (this.topic == null || this.topic.isEmpty()) {
                isValued = false;
            }
    
            if (!properties.containsKey("group.id")) {
                isValued = false;
            } else {
                String groupId = properties.getProperty("group.id");
                if (groupId == null || groupId.isEmpty()) {
                    isValued = false;
                }
            }
    
            return isValued;
        }
    }
    
    ConsumerDeserializationSchema source code
    package flink.test.manager;
    
    import com.alibaba.fastjson.JSON;
    import org.apache.flink.api.common.serialization.DeserializationSchema;
    import org.apache.flink.api.common.typeinfo.TypeInformation;
    import org.apache.flink.api.java.typeutils.TypeExtractor;
    
    import java.io.IOException;
    
    public class ConsumerDeserializationSchema<T> implements DeserializationSchema<T> {
        private Class<T> clazz;
    
        public ConsumerDeserializationSchema(Class<T> clazz) {
            this.clazz = clazz;
        }
    
        @Override
        public T deserialize(byte[] bytes) throws IOException {
            // assumes new String(bytes) is valid JSON; if it is not, parse it yourself
            return JSON.parseObject(new String(bytes), clazz);
        }
    
        @Override
        public boolean isEndOfStream(T t) {
            return false;
        }
    
        @Override
        public TypeInformation<T> getProducedType() {
            return TypeExtractor.getForClass(clazz);
        }
    }
    
