Hive高级第二部分: *****
Hive:复杂数据类型、JDBC编程
ZK:

Compression
压缩比
解压速度
1G的没压缩数据:
1G的gzip压缩数据:
codec:我们只需要配置在hadoop的配置文件中即可

压缩的使用

core-site.xml
<property>
    <name>io.compression.codecs</name>
    <value>
        org.apache.hadoop.io.compress.GzipCodec,
        org.apache.hadoop.io.compress.DefaultCodec,
        org.apache.hadoop.io.compress.BZip2Codec
    </value>
</property>
mapred-site.xml
<property>
    <name>mapreduce.output.fileoutputformat.compress</name>
    <value>true</value>
</property>
<property>
    <name>mapreduce.output.fileoutputformat.compress.codec</name>
    <value>org.apache.hadoop.io.compress.BZip2Codec</value>
</property>

create table ruoze_page_views(
    track_time string,
    url string,
    session_id string,
    referer string,
    ip string,
    end_user_id string,
    city_id string
)
row format delimited fields terminated by '\t';

load data local inpath '/home/hadoop/data/page_views.dat' overwrite into table ruoze_page_views;

SET hive.exec.compress.output=true;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.BZip2Codec;

create table ruoze_page_views_bzip2
row format delimited fields terminated by '\t'
as select * from ruoze_page_views;

set hive.exec.compress.output=false;

-----------------------

Storage Format

STORED AS file_format

create table ruoze_b(id int)
stored as
    INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';

行式存储 vs 列式存储
保证一行所有的列都在一个block里面
大数据中:一个表非常多的字段,我们大部分场景只用到其中的某些字段

TEXTFILE
SEQUENCEFILE
...

create table ruoze_page_views_seq(
    track_time string,
    url string,
    session_id string,
    referer string,
    ip string,
    end_user_id string,
    city_id string
)
row format delimited fields terminated by '\t'
stored as SEQUENCEFILE;

load data local inpath '/home/hadoop/data/page_views.dat' overwrite into table ruoze_page_views_seq;

insert into table ruoze_page_views_seq select * from ruoze_page_views;

create table ruoze_page_views_rc(
    track_time string,
    url string,
    session_id string,
    referer string,
    ip string,
    end_user_id string,
    city_id string
)
row format delimited fields terminated by '\t'
stored as rcfile;

insert into table ruoze_page_views_rc select * from ruoze_page_views;

create table ruoze_page_views_orc(
    track_time string,
    url string,
    session_id string,
    referer string,
    ip string,
    end_user_id string,
    city_id string
)
row format delimited fields terminated by '\t'
stored as orc;

insert into table ruoze_page_views_orc select * from ruoze_page_views;

create table ruoze_page_views_orc_null(
    track_time string,
    url string,
    session_id string,
    referer string,
    ip string,
    end_user_id string,
    city_id string
)
row format delimited fields terminated by '\t'
stored as orc tblproperties ("orc.compress"="NONE");

insert into table ruoze_page_views_orc_null select * from ruoze_page_views;

parquet: dremel

create table ruoze_page_views_parquet(
    track_time string,
    url string,
    session_id string,
    referer string,
    ip string,
    end_user_id string,
    city_id string
)
row format delimited fields terminated by '\t'
stored as parquet;

insert into table ruoze_page_views_parquet select * from ruoze_page_views;

set parquet.compression=GZIP;

create table ruoze_page_views_parquet_gzip
row format delimited fields terminated by '\t'
stored as parquet
as select * from ruoze_page_views;

select count(1) from ruoze_page_views where session_id='B58W48U4WKZCJ5D1T3Z9ZY88RU7QA7B1';
19022752

select count(1) from ruoze_page_views_orc where session_id='B58W48U4WKZCJ5D1T3Z9ZY88RU7QA7B1';
1257523

select count(1) from ruoze_page_views_parquet where session_id='B58W48U4WKZCJ5D1T3Z9ZY88RU7QA7B1';
26870773496487
网友评论