import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase._
import org.apache.hadoop.hbase.client.{Result, Scan}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.util.{Base64, Bytes}
import org.apache.spark.{SparkConf, SparkContext}
object SparkReadHBaseDemo {

  val HBASE_ZOOKEEPER_QUORUM = "xxx1.com.cn,xxx2.com.cn,xxx3.com.cn"

  // Entry point
  def main(args: Array[String]): Unit = {
    // Configure the Spark context
    val conf = new SparkConf().setAppName("SparkReadHBaseDemo")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .setMaster("local") // local master for debugging only; drop this when submitting to a cluster
    val sc = new SparkContext(conf)
    // Build an RDD over the HBase table via TableInputFormat
    val hbaseRDD = sc.newAPIHadoopRDD(getHbaseConf(), classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result])
    hbaseRDD.map(_._2).map(getRes).count()
    sc.stop()
  }
  // Extract the row key and the f:addr value from one Result
  def getRes(result: Result): String = {
    val rowkey = Bytes.toString(result.getRow())
    val addr = Bytes.toString(result.getValue("f".getBytes, "addr".getBytes))
    // Note: this println runs inside a transformation, i.e. on the executors;
    // it only reaches the console here because the master is local
    println(rowkey + "---" + addr)
    addr
  }
  // Build the HBase configuration
  def getHbaseConf(): Configuration = {
    val conf: Configuration = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    conf.set("zookeeper.znode.parent", "/hbase-unsecure")
    conf.set("hbase.zookeeper.quorum", HBASE_ZOOKEEPER_QUORUM)
    // Name of the table to scan
    conf.set(TableInputFormat.INPUT_TABLE, "test_shx")
    conf.set(TableInputFormat.SCAN, getScanStr())
    conf
  }
  // Serialize a Scan into the Base64 string that TableInputFormat expects
  def getScanStr(): String = {
    val scan = new Scan()
    val proto = ProtobufUtil.toScan(scan)
    Base64.encodeBytes(proto.toByteArray())
  }
}
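The empty Scan above pulls every column of every row in test_shx. When only the f:addr column is needed, the Scan can be narrowed before it is serialized. A minimal sketch, where the row-key bounds and the caching value are illustrative assumptions rather than values from the original table:

// Sketch: a narrower Scan. addColumn limits the read to f:addr, the
// start/stop rows ("row-000"/"row-999") are hypothetical bounds, and
// setCaching(500) cuts RPC round trips per scanner.next() call.
def getNarrowScanStr(): String = {
  val scan = new Scan()
  scan.addColumn("f".getBytes, "addr".getBytes)
  scan.setStartRow("row-000".getBytes)
  scan.setStopRow("row-999".getBytes)
  scan.setCaching(500)
  Base64.encodeBytes(ProtobufUtil.toScan(scan).toByteArray())
}

The Maven pom.xml for the project: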
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.xcar.etl</groupId>
  <artifactId>kafka.hbase.spark.hive</artifactId>
  <version>1.0-SNAPSHOT</version>
  <packaging>jar</packaging>
  <name>kafka.hbase.spark.hive</name>
  <url>http://maven.apache.org</url>

  <repositories>
    <repository>
      <id>cloudera</id>
      <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    </repository>
  </repositories>

  <properties>
    <cdh.hbase.version>1.2.0-cdh5.7.0</cdh.hbase.version>
    <cdh.spark.version>1.6.0-cdh5.7.0</cdh.spark.version>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.62</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.10</artifactId>
      <version>${cdh.spark.version}</version>
      <!--<scope>provided</scope>-->
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-server</artifactId>
      <version>${cdh.hbase.version}</version>
    </dependency>
  </dependencies>
  <build>
    <plugins>
      <!-- Assumed build setup: Maven needs the scala-maven-plugin (or an
           equivalent) to compile the Scala sources; the version here is
           illustrative, not taken from the original project. -->
      <plugin>
        <groupId>net.alchim31.maven</groupId>
        <artifactId>scala-maven-plugin</artifactId>
        <version>3.2.2</version>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
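One caveat on the demo itself: the println in getRes executes inside a Spark transformation, so on a real cluster its output lands in executor logs, not the driver console. A minimal sketch of an alternative that brings a small sample back to the driver; it would replace the count() line inside main:

// Sketch: map each Result to a (rowkey, addr) pair and sample it on the
// driver. take(10) keeps the transfer small; avoid collect() on big tables.
val pairs = hbaseRDD.map { case (_, result) =>
  (Bytes.toString(result.getRow()),
   Bytes.toString(result.getValue("f".getBytes, "addr".getBytes)))
}
pairs.take(10).foreach { case (rowkey, addr) => println(rowkey + "---" + addr) }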