Kafka data fault tolerance: saving Spark's consumed offsets in HBase

Author: 博弈史密斯 | Published 2018-07-31 15:20

    This article is reposted from: https://blog.csdn.net/xnlej/article/details/79037145

    Spark Streaming's direct approach to consuming Kafka has clear advantages, but on its own it is still easy to lose or re-process data: it only guarantees at-least-once, not exactly-once. To guarantee the latter, you have to save the Kafka offsets manually. The implementation below is adapted from another developer's Java code, rewritten in Scala with the related bugs fixed, and further extended with a transaction-like mechanism on top of it.
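    The overall shape of the approach, as a minimal sketch only (loadOffsetsFromHbase, saveOffsetsToHbase and process are hypothetical placeholder names for the HBase access and business logic that the full program below implements; ssc and kafkaParams are defined there as well):

    import org.apache.kafka.common.TopicPartition
    import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils, LocationStrategies}

    // Sketch only: read the stored offsets, consume from exactly those positions,
    // and persist the new until-offsets together with the results of the work.
    val fromOffsets: Map[TopicPartition, Long] = loadOffsetsFromHbase()
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Assign[String, String](fromOffsets.keys.toList, kafkaParams, fromOffsets))
    stream.foreachRDD { rdd =>
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      process(rdd)                      // business computation
      saveOffsetsToHbase(offsetRanges)  // save the until-offsets only after the work succeeds
    }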

    From the official Spark documentation we know that the Kafka offset information Spark maintains internally is stored in the offsetRanges of the HasOffsetRanges class, and we can read it inside a Spark Streaming program:

    val offsetsList = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    This gives us the consumption information for every partition; we only need to iterate over offsetsList, for example:
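    A minimal sketch of that iteration (streams is the direct stream created in the full program below):

    streams.foreachRDD { rdd =>
      val offsetsList = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      offsetsList.foreach { range =>
        // each OffsetRange describes what this batch consumed from one partition
        println(s"${range.topic} ${range.partition}: ${range.fromOffset} -> ${range.untilOffset}")
      }
    }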

    import java.net.URLDecoder
    
    import com.dianyou.utl.PropertiesUtil
    import org.apache.hadoop.hbase._
    import org.apache.hadoop.hbase.client._
    import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp
    import org.apache.hadoop.hbase.filter.{BinaryComparator, RowFilter}
    import org.apache.hadoop.hbase.util.Bytes
    import org.apache.kafka.clients.consumer.ConsumerRecord
    import org.apache.kafka.common.TopicPartition
    import org.apache.kafka.common.serialization.StringDeserializer
    import org.apache.spark.rdd.RDD
    import org.apache.spark.streaming.dstream.InputDStream
    import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils, LocationStrategies}
    import org.apache.spark.streaming.{Seconds, StreamingContext, Time}
    import org.apache.spark.{SparkConf, SparkContext}
    
    import scala.collection.mutable
    
    /**
      * Manual offset management:
      *   1. Read the stored offsets from HBase, then pull data from Kafka starting there.
      *   2. After the data has been processed, save the until-offsets back to HBase.
      *   3. If Kafka has been down for a long time, reading restarts from Kafka's earliest
      *      offsets (this case still needs to be handled).
      * Created by Administrator on 2017/12/28.
      */
    object OffsetOperate {
      var hbaseProp = PropertiesUtil.getProperties("hbase")
      var kafkaconsumePro = PropertiesUtil.getProperties("kafkaconsume")
      def main(args: Array[String]): Unit = {
    
      val conf = new SparkConf().setAppName("sparkStreaming - offset operate")
        .setMaster("local[2]") // --master local[2] | spark://xx:7077 | yarn
        .set("spark.testing.memory", "2147480000")
        val sc = new SparkContext(conf)
        val ssc = new StreamingContext(sc,Seconds(5))
    
        // Kafka consumer configuration
        val kafkaParams = Map[String, Object](
          "bootstrap.servers" -> kafkaconsumePro.getProperty("bootstrap.servers"),
          "key.deserializer" -> classOf[StringDeserializer],
          "value.deserializer" -> classOf[StringDeserializer],
          "group.id" -> kafkaconsumePro.getProperty("group"),
          "auto.offset.reset" -> "earliest", // 第一次读取时从topic 首位置开始读取
          "enable.auto.commit" -> (false: java.lang.Boolean)// kafka 不保存消费的offset
        )
    
        // Topic(s) to consume
        val topics = Array(kafkaconsumePro.getProperty("topics"))
        // HBase connection settings
        val hbaseConf = HBaseConfiguration.create()
        hbaseConf.set("hbase.zookeeper.quorum", hbaseProp.getProperty("quorum")) // ZooKeeper quorum
        hbaseConf.set("hbase.zookeeper.property.clientPort", "2181") // ZooKeeper client port
        hbaseConf.set("hbase.master", hbaseProp.getProperty("hbase_master"))
        hbaseConf.set("hbase.defaults.for.version.skip", "true")
    
        // Create the HBase connection and admin client
        val conn = ConnectionFactory.createConnection(hbaseConf)
        val admin = conn.getAdmin
    
        val tn = TableName.valueOf("hbase_consumer_offset") // HBase table that stores the offsets
        val isExist = admin.tableExists(tn)
        val streams : InputDStream[ConsumerRecord[String,String]]= {
        if(isExist) {
          val table = new HTable(hbaseConf, "hbase_consumer_offset")
          val filter = new RowFilter(CompareOp.GREATER_OR_EQUAL, new BinaryComparator(Bytes.toBytes(topics(0) + "_"))) // row keys are "<topic>_<partition>"; assumes a single topic
          println("============ row filter created ==========")
          val s = new Scan()
          s.setFilter(filter)
          val rs = table.getScanner(s)
    
          // Rebuild the starting offsets from the stored rows
          val fromOffsets = scala.collection.mutable.Map[TopicPartition, Long]()
          var s1 = ""
          var s2 = 0
          var s3: Long = 0
            for (r: Result <- rs.next(200)) {
              println("rowKey : " + new String(r.getRow))
              for (keyvalue: KeyValue <- r.raw()) {
                if ("topic".equals(new String(keyvalue.getQualifier))) {
                  s1 = new String(keyvalue.getValue)
                  println("columnFamily :" + new String(keyvalue.getFamily) + " column :" +new String( keyvalue.getQualifier) + s1)
                } else if ("partition".equals(new String(keyvalue.getQualifier))){
                  s2 = Bytes.toInt(keyvalue.getValue)
                  println("columnFamily :" +  new String(keyvalue.getFamily) + " column :" + new String( keyvalue.getQualifier) + s2)
                } else if("offset".equals(new String(keyvalue.getQualifier))) { //if("offset".equals(new String(keyvalue.getQualifier)))
                  s3 = Bytes.toLong(keyvalue.getValue)
                  println("columnFamily :" + new String(keyvalue.getFamily) + " column :" + new String( keyvalue.getQualifier) + s3)
                }
              }
              fromOffsets.put(new TopicPartition(s1, s2), s3)
            }
          println("fromOffset is : "+fromOffsets)
            KafkaUtils.createDirectStream[String, String](ssc, LocationStrategies.PreferConsistent,
              ConsumerStrategies.Assign[String, String](fromOffsets.keySet, kafkaParams, fromOffsets))
          } else { // the offset table does not exist in HBase yet; consume from the beginning of the topic
            val htable = new HTableDescriptor(TableName.valueOf("hbase_consumer_offset"))
            htable.addFamily(new HColumnDescriptor("topic_partition_offset"))
            admin.createTable(htable)
            println("offset table created ======== " + htable)
            KafkaUtils.createDirectStream[String, String](ssc, LocationStrategies.PreferConsistent,
              ConsumerStrategies.Subscribe[String, String](topics, kafkaParams))
          }
        }

        // After the processing succeeds, update the offsets in HBase
        streams.foreachRDD{ rdd =>
          // Bundle the business computation and the offset save into one "transaction":
          // they either both succeed or both fail, giving effectively exactly-once consumption.
          import scala.collection.JavaConversions._
          val table = new HTable(hbaseConf,"hbase_consumer_offset")
          table.setAutoFlush(false, false)
          var putList:List[Put] = List()
            val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges // cast to HasOffsetRanges to get this batch's offset ranges
            for(offsetRange <- offsetRanges){
              println("the topic is "+offsetRange.topic)
              println("the partition is "+offsetRange.partition)
              println("the fromOffset is "+offsetRange.fromOffset)
              println("the untilOffset is "+offsetRange.untilOffset)
              println("the object is "+offsetRange)
              val put  = new Put(Bytes.toBytes(offsetRange.topic+"_"+offsetRange.partition))
              put.add(Bytes.toBytes("topic_partition_offset"),Bytes.toBytes("topic"),Bytes.toBytes(offsetRange.topic))
              put.add(Bytes.toBytes("topic_partition_offset"),Bytes.toBytes("partition"),Bytes.toBytes(offsetRange.partition))
              put.add(Bytes.toBytes("topic_partition_offset"),Bytes.toBytes("offset"),Bytes.toBytes(offsetRange.untilOffset))
              putList = put+:putList
             // println("add data success !")
            }
    
            println("the RDD records are "+rdd.map{x =>URLDecoder.decode(x.value())}.collect.foreach(println)) // 程序的计算逻辑
          //  }
          table.put(putList)
          table.flushCommits()
          println("add and compute data success !")
          }
        ssc.start()
        ssc.awaitTermination()
      }
    }
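    The code above reads its settings through the author's com.dianyou.utl.PropertiesUtil, which is not included in the post. Judging from the getProperty calls, the Kafka properties need bootstrap.servers, group and topics, and the HBase properties need quorum and hbase_master. A minimal stand-in for the utility, assuming each name maps to a "<name>.properties" file on the classpath, might look like this:

    import java.util.Properties

    object PropertiesUtil {
      // Minimal stand-in for the author's utility: loads "<name>.properties" from the classpath.
      def getProperties(name: String): Properties = {
        val props = new Properties()
        val in = Thread.currentThread().getContextClassLoader.getResourceAsStream(s"$name.properties")
        require(in != null, s"$name.properties not found on classpath")
        try props.load(in) finally in.close()
        props
      }
    }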
    

    One problem remains: if Kafka has already purged the data, reading the stored offsets from HBase and querying Kafka with them finds nothing and throws an array-index-out-of-bounds error. I will come back and finish this when I have time.
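    One possible direction (a sketch only, not the original author's fix): before handing the stored offsets to ConsumerStrategies.Assign, clamp each one to the earliest offset Kafka still retains, using KafkaConsumer.beginningOffsets, so offsets that point at purged data fall back to the oldest available record instead of failing. The helper name clampToEarliest below is hypothetical.

    import scala.collection.JavaConverters._
    import org.apache.kafka.clients.consumer.KafkaConsumer
    import org.apache.kafka.common.TopicPartition

    // Hypothetical helper: clamp offsets loaded from HBase to the earliest offsets
    // Kafka still retains, so stale offsets do not cause out-of-range errors.
    def clampToEarliest(stored: Map[TopicPartition, Long],
                        kafkaParams: Map[String, Object]): Map[TopicPartition, Long] = {
      val consumer = new KafkaConsumer[String, String](kafkaParams.asJava)
      try {
        val earliest = consumer.beginningOffsets(stored.keySet.asJava).asScala
        stored.map { case (tp, off) => tp -> math.max(off, earliest(tp).longValue()) }
      } finally {
        consumer.close()
      }
    }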
    Reference: https://www.jianshu.com/p/667e0f58b7b9
