美文网首页
Spark Streaming 从指定时间戳开始消费 kafka

Spark Streaming 从指定时间戳开始消费 kafka

作者: 吃货大米饭 | 来源:发表于2019-10-28 09:32 被阅读0次

    一、需求

    从指定时间戳(比如 2 小时前)开始消费 Kafka 数据

    二、思路

    我们知道通过 Kafka 的 API 可以得到指定时间戳对应数据所在的 segment 的起始 offset。那么就可以通过这个功能来粗略地实现需求。

    三、实现

    我们知道 KafkaUtils.createDirectStream 这个接口可以指定起始点的 offset,那么我们需要做的就变成如下三步:

    • 获取 topic 对应的 TopicAndPartitions ,得到当前 topic 有多少 partition
    • 从 Kafka 获取每个 partition 指定时间戳所在 segment 的起始 offset
    • 将步骤 2 中的 offset 作为参数传入 createDirectStream 即可
    package com.ruozedata.bigdata.spark.streaming01
    
    import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord, KafkaConsumer}
    import org.apache.kafka.common.TopicPartition
    import org.apache.kafka.common.serialization.StringDeserializer
    import org.apache.spark.SparkConf
    import org.apache.spark.streaming.dstream.InputDStream
    import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
    import org.apache.spark.streaming.{Seconds, StreamingContext}
    import org.joda.time.DateTime
    import org.joda.time.format.{DateTimeFormat, DateTimeFormatter}
    
    import scala.collection.JavaConverters._
    import scala.collection.mutable
    
    /**
     * Spark Streaming job that prints records from a Kafka topic using the direct-stream API.
     *
     * When a datetime argument is supplied, consumption starts at the per-partition offsets
     * that Kafka reports for that point in time; otherwise the stream starts wherever the
     * consumer group / `auto.offset.reset` setting dictates.
     */
    object SparkStreamingWithTimestamp {

      /** Pattern used to parse the optional datetime argument (24-hour clock). */
      private val DateTimePattern = "yyyy-MM-dd HH:mm:ss"

      /**
       * Entry point.
       *
       * @param args at most one element: the datetime to start consuming from,
       *             formatted as `yyyy-MM-dd HH:mm:ss`
       */
      def main(args: Array[String]): Unit = {
        if (args.length > 1) {
          // The advertised format must match the pattern actually used for parsing (HH, not hh).
          System.err.println(
            s"""
               |Usage: SparkStreamingWithTimestamp [datetime]
               |  [datetime] is a kafka offset datetime.The format is $DateTimePattern
               |
            """.stripMargin)
          System.exit(1)
        }

        val conf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[*]")
          .set("spark.serializer","org.apache.spark.serializer.KryoSerializer")
        // Register Kryo classes BEFORE the context is created: Spark clones the SparkConf when
        // the context is built, so later modifications of `conf` are not seen by the running job.
        conf.registerKryoClasses(Array(classOf[ConsumerRecord[String,String]]))
        val ssc = new StreamingContext(conf, Seconds(10))
        ssc.sparkContext.setLogLevel("WARN")

        val topicsSet = "test".split(",").toSet
        val kafkaParams = Map[String, Object](
          ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.174.120:9092",
          ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
          ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
          ConsumerConfig.GROUP_ID_CONFIG -> "test",
          ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "earliest",
          ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> (false: java.lang.Boolean)
        )

        // Subscribe with explicit starting offsets only when a datetime was given.
        val subscription = args.headOption match {
          case Some(datetime) =>
            ConsumerStrategies.Subscribe[String, String](
              topicsSet, kafkaParams, getOffsetByTimestamp(kafkaParams, datetime, topicsSet))
          case None =>
            ConsumerStrategies.Subscribe[String, String](topicsSet, kafkaParams)
        }
        val messages: InputDStream[ConsumerRecord[String, String]] =
          KafkaUtils.createDirectStream[String, String](
            ssc, LocationStrategies.PreferConsistent, subscription)

        messages.print()

        ssc.start()
        ssc.awaitTermination()
      }

      /**
       * Looks up, for every partition of the given topics, the offset to start consuming from
       * for the given point in time.
       *
       * For each partition the result is the offset returned by `KafkaConsumer.offsetsForTimes`.
       * When Kafka reports no offset for a partition (it maps the partition to `null`, e.g.
       * because no record has a timestamp at or after the requested time), the partition's
       * current end offset is used instead, so that only newly arriving records are consumed.
       *
       * @param kafkaParams consumer configuration used to create a temporary `KafkaConsumer`
       * @param time        datetime formatted as `yyyy-MM-dd HH:mm:ss`
       * @param topics      topics whose partitions are looked up; defaults to the single topic
       *                    `test`, which is what this method used before the parameter existed
       * @return a map from topic-partition to starting offset
       * @throws IllegalArgumentException if `time` does not match the expected pattern
       */
      def getOffsetByTimestamp(
          kafkaParams: collection.Map[String, Object],
          time: String,
          topics: Iterable[String] = Seq("test")): mutable.HashMap[TopicPartition, Long] = {
        // Parse first so that a malformed datetime fails before a consumer is opened.
        val fetchTime = DateTimeFormat.forPattern(DateTimePattern).parseMillis(time)
        val consumer = new KafkaConsumer[String, String](
          new java.util.HashMap[String, Object](kafkaParams.asJava))
        try {
          // (topic, partition) -> timestamp to search for
          val timestampToSearch: java.util.Map[TopicPartition, java.lang.Long] =
            new java.util.HashMap[TopicPartition, java.lang.Long]()
          topics.foreach { topic =>
            // partitionsFor lists every partition of the topic; guard against a null result.
            val partitionInfos = Option(consumer.partitionsFor(topic)).map(_.asScala.toList).getOrElse(Nil)
            partitionInfos.foreach { info =>
              timestampToSearch.put(new TopicPartition(info.topic(), info.partition()), fetchTime)
            }
          }

          // partition -> resolved starting offset
          val partitionOffset = new mutable.HashMap[TopicPartition, Long]
          // partitions for which offsetsForTimes returned no offset
          val unresolved = mutable.ArrayBuffer.empty[TopicPartition]
          for ((tp, offsetAndTimestamp) <- consumer.offsetsForTimes(timestampToSearch).asScala) {
            Option(offsetAndTimestamp) match {
              case Some(found) => partitionOffset += tp -> found.offset()
              case None        => unresolved += tp
            }
          }

          if (unresolved.nonEmpty) {
            for ((tp, endOffset) <- consumer.endOffsets(unresolved.asJava).asScala) {
              partitionOffset += tp -> endOffset.longValue()
            }
          }
          partitionOffset
        } finally {
          // Always release the temporary consumer, even when a lookup fails.
          consumer.close()
        }
      }

    }
    
    

    相关文章

      网友评论

          本文标题:Spark Streaming 从指定时间戳开始消费 kafka

          本文链接:https://www.haomeiwen.com/subject/ckpdvctx.html