ClickHouse Reading and Writing HDFS
1. Exporting Data from ClickHouse to HDFS
1.1 Prepare the test data
First create a local table in ClickHouse and generate some test data:
-- Create the table
drop table if exists account_info_src;
create table account_info_src(
account_id UInt64 comment '',
user_id String comment '',
open_id String comment '',
device_id String comment '',
client_id String comment '',
car_number String comment '',
tel String comment '',
process_date UInt32 DEFAULT toYYYYMMDD(today()) comment '',
create_time DateTime DEFAULT now() ,
update_time DateTime DEFAULT now()
)
ENGINE=MergeTree()
order by account_id
PARTITION BY process_date
;
-- Insert test data into the table
insert into account_info_src(account_id, user_id, open_id, device_id, client_id, car_number, tel, process_date)
select number as account_id,
transform(number % 8, [0, 1, 4], ['', 'unknown', 'UNKNOWN'], concat('userid_', toString(number)) ) AS user_id,
transform(number % 7, [1, 5], ['', 'unknown'], concat('openid_', toString(number)) ) AS open_id,
transform(number % 6, [2, 6], ['', 'unknown'], concat('deviceid_', toString(number)) ) AS device_id,
transform(number % 6, [2, 6], ['', 'unknown'], concat('clientid_', toString(number)) ) AS client_id,
transform(number % 9, [5, 3], ['', 'unknown'], concat('car_number_', toString(number)) ) AS car_number,
transform(number % 10, [1, 4], ['', 'unknown'], concat('tel_', toString(number)) ) AS tel,
toYYYYMMDD(today()) AS process_date
from numbers(100)
;
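To confirm the test data was generated as expected, a quick sanity check (assuming the 100-row insert above) can be run in ClickHouse:
-- Row count and a small sample of the generated rows
select count() from account_info_src;
select * from account_info_src order by account_id limit 5;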
1.2 Create the ClickHouse HDFS engine table
First create the data directory in HDFS:
hdfs dfs -mkdir -p /bdp/tmp/clickhouse/account
Then create the HDFS engine table in ClickHouse:
drop table if exists account_info_hdfs;
create table account_info_hdfs(
account_id UInt64 comment '',
user_id String comment '',
open_id String comment '',
device_id String comment '',
client_id String comment '',
car_number String comment '',
tel String comment '',
process_date UInt32 DEFAULT toYYYYMMDD(today()) comment '',
create_time DateTime DEFAULT now() ,
update_time DateTime DEFAULT now()
)
engine=HDFS('hdfs://node1:8020/bdp/tmp/clickhouse/account/info.Parquet','Parquet')
;
-- Note: the export from ClickHouse creates the info.Parquet file automatically; running the insert a second time fails because the file already exists
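If you need to re-run the export, one simple option is to delete the previously generated file first, for example:
# Remove the old export so the insert can create info.Parquet again
hdfs dfs -rm /bdp/tmp/clickhouse/account/info.Parquet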
1.3 Export the data
Copy the data from account_info_src (stored in ClickHouse) into account_info_hdfs (backed by HDFS):
insert into account_info_hdfs
select * from account_info_src
;
The ClickHouse data has now been exported to HDFS.
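A quick way to verify is to query the HDFS engine table itself, which reads the Parquet file back from HDFS:
-- Should match the row count of account_info_src (100 rows for the test data above)
select count() from account_info_hdfs;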
1.4 View the data through a Hive table
Create an external Hive table over the same HDFS directory:
drop table if exists account_info_hive;
create external table if not exists account_info_hive (
account_id BIGINT comment '',
user_id String comment '',
open_id String comment '',
device_id String comment '',
client_id String comment '',
car_number String comment '',
tel String comment '',
process_date BIGINT ,
create_time BIGINT ,
update_time BIGINT
)
stored as parquet
location '/bdp/tmp/clickhouse/account';
drop table if exists account_info_hive2;
create external table if not exists account_info_hive2 (
account_id BIGINT comment '',
user_id String comment '',
open_id String comment '',
device_id String comment '',
client_id String comment '',
car_number String comment '',
tel String comment ''
)
stored as parquet
location '/bdp/tmp/clickhouse/account2';
You can now query the data in Hive.
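For example (exact values depend on the generated test data):
-- Row count and a small sample, queried from Hive
select count(*) from account_info_hive;
select account_id, user_id, tel, process_date from account_info_hive limit 5;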
Notes:
- A ClickHouse DateTime column can only be mapped to BIGINT on the Hive side; declaring it as timestamp caused errors at first (see the conversion sketch after this list).
- You may run into HDFS permission problems; if so, change the ownership of the export directory:
  sudo -u hdfs hdfs dfs -chown -R clickhouse:clickhouse /bdp/tmp/clickhouse/account
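Since ClickHouse stores DateTime as Unix-epoch seconds, the BIGINT values can still be made readable on the Hive side; a minimal sketch, assuming that representation carries through to the Parquet file:
-- Convert the epoch-second BIGINT columns to readable timestamps in Hive
select account_id,
       from_unixtime(create_time) as create_time,
       from_unixtime(update_time) as update_time
from account_info_hive
limit 5;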
2. Importing HDFS Data into ClickHouse
2.1 Prepare the data
Reuse the data that was just exported from ClickHouse in section 1.
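A quick listing confirms the Parquet file is in place (same directory as in section 1.2):
hdfs dfs -ls /bdp/tmp/clickhouse/account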
2.2 Create the ClickHouse table
drop table if exists account_info_hdfs2;
create table account_info_hdfs2(
account_id UInt64 comment '',
user_id String comment '',
open_id String comment '',
device_id String comment '',
client_id String comment '',
car_number String comment '',
tel String comment '',
process_date UInt32 DEFAULT toYYYYMMDD(today()) comment '',
create_time DateTime DEFAULT now() ,
update_time DateTime DEFAULT now()
)
engine=HDFS('hdfs://node1:8020/bdp/tmp/clickhouse/account/*.Parquet','Parquet')
;
-- Note: *.Parquet in the path makes the table read all matching files
That is all that is needed; querying account_info_hdfs2 now reads the Parquet data from HDFS.
To verify, add another file to the HDFS directory:
hdfs dfs -cp /bdp/tmp/clickhouse/account/info.Parquet /bdp/tmp/clickhouse/account/a.Parquet
Querying the table in ClickHouse again shows that the row count has doubled.
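A count query makes this easy to check (assuming the 100-row test set from section 1.1):
-- Both files now match the *.Parquet wildcard, so the count doubles from 100 to 200
select count() from account_info_hdfs2;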
Done.