Hive2

Author: lufaqiang | Published 2017-11-19 08:29

Deploying Hive
Unpack the tarball, rename the directory, and set the environment variables.
In $HIVE_HOME/conf/, rename the template: mv hive-default.xml.template hive-site.xml
hive.metastore.warehouse.dir: directory where Hive stores table data
hive.exec.scratchdir: directory for Hive's temporary files
In $HIVE_HOME/conf/, rename the template: mv hive-env.sh.template hive-env.sh
Edit Hadoop's configuration file hadoop-env.sh as follows:
export HADOOP_CLASSPATH=.:$CLASSPATH:$HADOOP_CLASSPATH:$HADOOP_HOME/bin
In $HIVE_HOME/bin, edit hive-config.sh (this step can be skipped)
hive-site.xml (metastore connection settings; javax.jdo.option.ConnectionDriverName must also point at the MySQL driver, with the connector JAR placed in $HIVE_HOME/lib)

<property>
  <name>javax.jdo.option.ConnectionURL</name>
  <value>jdbc:mysql://localhost:3306/hive?createDatabaseIfNotExist=true</value>
  <description>JDBC connect string for a JDBC metastore</description>
</property>
<property>
  <name>javax.jdo.option.ConnectionUserName</name>
  <value>hive</value>
  <description>username to use against metastore database</description>
</property>
<property>
  <name>javax.jdo.option.ConnectionPassword</name>
  <value>123456</value>
  <description>password to use against metastore database</description>
</property>
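For reference, a minimal sketch of the matching MySQL-side setup; the database name, user, and password simply mirror the values above, so adjust them to your environment (with createDatabaseIfNotExist=true in the URL, the CREATE DATABASE step is optional):

CREATE DATABASE hive;
CREATE USER 'hive'@'localhost' IDENTIFIED BY '123456';  -- same credentials as in hive-site.xml above
GRANT ALL PRIVILEGES ON hive.* TO 'hive'@'localhost';
FLUSH PRIVILEGES;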

Check the metastore schema version:
mysql> select * from VERSION;

List the tables Hive knows about:
mysql> select * from TBLS\G

View the HDFS storage-descriptor metadata for tables:
mysql> select * from SDS\G

View a table's partitions:
mysql> select * from PARTITIONS where TBL_ID=1\G

View a table's columns:
mysql> select * from COLUMNS_V2;

View a table's partition keys:
mysql> select * from PARTITION_KEYS;
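Putting these together: a sketch that joins TBLS and SDS to list each table with its HDFS location (SD_ID and LOCATION are standard metastore schema columns):

mysql> select t.TBL_NAME, s.LOCATION from TBLS t join SDS s on t.SD_ID = s.SD_ID;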

Connecting to the bigdata metastore database

Find ConnectionUserName and ConnectionPassword in /usr/local/hive/conf/hive-site.xml (here: hive/hive2017).

Jump to host bigdata003, then:

mysql -u hive -phive2017 -D hive

Hive vs. HBase
Similarities
1. Both HBase and Hive sit on top of Hadoop and use HDFS as their underlying storage.
2. Both expose data as tables and serve table-style queries.

Differences
1. Hive is a batch system built on Hadoop to cut down on hand-written MapReduce jobs; HBase is a project meant to make up for Hadoop's lack of real-time operations.
2. Hive itself neither stores nor computes data; it depends entirely on HDFS and MapReduce. A Hive table is purely logical: a schema recorded over files in HDFS.
3. An HBase table is physical, not logical. HBase provides a huge hash-table-like store that can hold, for example, a search engine's indexes for fast lookups.

    create table city(
    province_code INT,
    province_name string,
    city_code INT,
    city_name string
    )
    row FORMAT delimited
    fields terminated by ','
    lines terminated by '\n';

load data local inpath '/home/bigdata/hive/city.txt' into table city; -- load the local file into the city table

    alter table city rename to city_new;

select * from city_new limit 10; -- fetch ten rows

select count(*) from city_new; -- total number of rows

select * from city_new order by city_code desc limit 10; -- top 10 by city_code (largest first)

select province_name,count(*) as cnt from city_new group by province_name order by cnt desc limit 10;
-- when an aggregate feeds an order by, give it an alias and order by the alias.

select count(distinct province_name) from city_new; -- number of distinct provinces

    select
    count(*)
    from
    (
    select
    province_name
    from
    city_new
    group by
    province_name
)a; -- number of provinces, subquery version

-- find provinces that contain only one city
1. First, count the cities per province.
2. Then keep only the provinces whose count is 1 (both variants below work).

    select province_name,count(*) as su from city_new group by province_name having su = 1;

    select province_name,su from (
    select province_name,count(*) as su from city_new group by province_name

    )a
    where su = 1;

External vs. managed (internal) tables
-- create an external table over the files under '/user/cll/city/'
create EXTERNAL table city_ex(
    province_code int,
    province_name String,
    city_code int,
    city_name String

    )
    row FORMAT delimited
    fields terminated by ','
    lines terminated by '\n'
    location '/user/cll/city/';

-- create a managed (internal) table over the same path
-- '/user/cll/city/' is where city.txt was put on HDFS
    create table city_in(
    province_code int,
    province_name String,
    city_code int,
    city_name String

    )
    row FORMAT delimited
    fields terminated by ','
    lines terminated by '\n'
    location '/user/cll/city/';

    show create table city_ex;
    show create table city_in;
-- after dropping the external table, the loaded files still exist on HDFS: hadoop fs -ls /user/cll/city still shows them
-- after dropping the managed table, the files are gone as well
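To confirm which kind a table is, check the Table Type field (MANAGED_TABLE vs. EXTERNAL_TABLE) in the detailed description:

desc formatted city_ex;
desc formatted city_in;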

Partitioning:
    CREATE TABLE user(
    uid INT,
    city_code INT,
    model string,
    access string
    )
    row FORMAT delimited
    fields terminated by ','
    lines terminated by '\n';

    load data local inpath '/home/bigdata/tanqi/hive/user.txt' into table user;

Create a partitioned table:
    create table user_daily(
    uid int,
    city_code int,
    model String,
    access String

    )
    partitioned by (p_date String);

-- select rows from user and insert them into the 2017-09-01 partition of user_daily
-- static partition (the partition value is fixed in the statement)
    insert OVERWRITE table user_daily partition (p_date = '2017-09-01')
    select * from user;

-- after the insert, list the table's partitions
    show partitions user_daily;

-- dynamic partition (the date comes from the SELECT list, not the partition clause);
-- needs hive.exec.dynamic.partition=true (the default in recent Hive) plus nonstrict mode
set hive.exec.dynamic.partition.mode=nonstrict;
    INSERT OVERWRITE TABLE user_daily PARTITION (p_date)
    SELECT *,'2017-09-02' FROM user
    UNION ALL
    SELECT *,'2017-09-03' FROM user
    ;

    ALTER TABLE user_daily PARTITION (p_date='2017-01-01') RENAME TO PARTITION (p_date='20170101');

    insert OVERWRITE table user_daily partition (p_date = '2017-01-01')
    select * from user;

    alter table user_daily drop partition (p_date='20170101');

    select * from user_daily where p_date='2017-09-02';

    select p_date,count(*) from user_daily group by p_date;

Exercise 1:

a) Top 5 cities by number of WIFI users
select city_code, count(uid) as cun from user where access='WIFI' group by city_code order by cun desc limit 5; -- prefix with explain to see the execution plan
b) (access, city_code) combinations with at least 3 users

select city_code,access,count(uid) as cun from user group by city_code,access having cun>=3;
c) For models containing 'ne' (case-insensitive), each access type and its user count
select access,count(uid) from user where lower(model) like '%ne%' group by access order by access desc;

select sum(if(access='2G',1,0))/count(*) from user; -- share of one access type among all users (here, the 2G share)
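The original note asks for the WIFI share; the same pattern works with the condition swapped:

select sum(if(access='WIFI',1,0))/count(*) from user;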

-- count users by the last digit of uid: [0-3], [4-7], [8-9]
    select
    case
    when uid % 10 in (0,1,2,3) then '0-3'
    when uid % 10 in (4,5,6,7) then '4-7'
    else '8-9'
end as `interval`, -- backquoted: interval is a reserved word in Hive 1.2+
    count(*) as cun
    from user
    group by
    case
    when uid % 10 in (0,1,2,3) then '0-3'
    when uid % 10 in (4,5,6,7) then '4-7'
    else '8-9'
    end;
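To avoid repeating the whole CASE expression in the group by, compute it once in a subquery; a sketch with an illustrative alias tail_bucket (equivalent result):

select tail_bucket, count(*) as cun
from
(
select
case
when uid % 10 in (0,1,2,3) then '0-3'
when uid % 10 in (4,5,6,7) then '4-7'
else '8-9'
end as tail_bucket
from user
) t
group by tail_bucket;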

-- collection aggregates
select collect_set(access) from user; -- deduplicated
select collect_list(access) from user; -- keeps duplicates
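These aggregates are usually applied per group, e.g. the set of access types seen in each city; a sketch:

select city_code, collect_set(access) from user group by city_code;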

-- joins: the same two subqueries combined with right, inner, and full outer joins; unmatched rows come back as NULLs

    select user.uid, user.city_code, city_new.city_name
    from
    (select * from user where uid <= 100) user
    right join
    (select * from city_new where province_code <= 30) city_new
    on (user.city_code = city_new.city_code)
    limit 20;

    select user.uid, user.city_code, city_new.city_name
    from
    (select * from user where uid <= 100) user
    inner join
    (select * from city_new where province_code <= 30) city_new
    on (user.city_code = city_new.city_code)
    limit 20;

    select user.uid, user.city_code, city_new.city_name
    from
    (select * from user where uid <= 100) user
    full join
    (select * from city_new where province_code <= 30) city_new
    on (user.city_code = city_new.city_code)
    limit 20;
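For completeness, the left-join counterpart of the same query; user rows with no matching city keep a NULL city_name:

select user.uid, user.city_code, city_new.city_name
from
(select * from user where uid <= 100) user
left join
(select * from city_new where province_code <= 30) city_new
on (user.city_code = city_new.city_code)
limit 20;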

Window functions: (a, b)
a) Per-group top-N with a window function:
1. partition by access
2. number the rows within each partition (row_number)
3. keep row_num = 1 for the top 1
    select access,city_code,uid
    from
    (
    select uid,access,city_code,
    row_number() over (partition by access order by
    city_code desc) as row_num
    from user
    )a
    where row_num = 1;
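For a top-N per group rather than top-1, keep row_num <= N, e.g. the top 3 city codes per access type:

select access,city_code,uid
from
(
select uid,access,city_code,
row_number() over (partition by access order by
city_code desc) as row_num
from user
)a
where row_num <= 3;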

b) Cumulative sum
    select p_date,
    sum(cun) over (order by p_date asc rows between unbounded preceding and current row)
    from
    (
    select p_date,count(*) as cun
    from user_daily
    where p_date between '2017-09-01' and '2017-09-30'
    group by p_date
    )a
    ;

    create table phone (
    model String,
    os String,
    brand String,
    resolution String,
    launch_date String
    )
    row FORMAT delimited
    fields terminated by ','
    lines terminated by '\n'
    location '/user/cll/phone/';

    select * from phone limit 10;
set hive.cli.print.header=true; -- print column headers in CLI output

    select user.uid, user.city_code, city.province_name
    from user left join city
    on (user.city_code = city.city_code)
    limit 20;

    select user.model,user.access, user.city_code, phone.launch_date
    from user left join phone
    on (user.model = phone.model)
    limit 20;

    create table user_info(
    uid int,
    access String,
    city_code int,
    province_name String,
    model String,
    launch_date String
    )
    row FORMAT delimited
    fields terminated by ','
lines terminated by '\n';

-- if the insert went in wrong and you rerun it, use a replacing insert (insert overwrite) instead
insert into table user_info
    select user.uid, user.access,user.city_code, city.province_name,user.model,phone.launch_date
    from user
    left join city on (user.city_code = city.city_code)
    left join phone on (user.model = phone.model)
    ;

    select * from phone;

    select * from user_info limit 10;

Exercise 2
    create table phone (
    model string,
    os string,
    brand string,
    resolution string,
launch_date string
    )
    row FORMAT delimited
    fields terminated by ','
    lines terminated by '\n';
    load data local inpath '/home/bigdata/tanqi/hive/phone.txt' into table phone;
    set hive.cli.print.header=true;

    create table user_info (
    uid int,
    access string,
    city_code string,
province_name string,
model string,
launch_date string
    )
    row FORMAT delimited
    fields terminated by ','
    lines terminated by '\n'
    ;
Load the data.
Using user as the base table, left-join phone and city to produce a new table user_info
a) join user to city on equal city_code; join user to phone on equal model
b) user_info contains the fields: uid, access, city_code, province_name, model, launch_date

    insert into table user_info
select user.uid, user.access, user.city_code, city.province_name, user.model, phone.launch_date
    from
    user
    left join
    city
    on (user.city_code = city.city_code)
    left join
    phone
    on (user.model = phone.model);

a) For each province, find province_name, uid, model at the maximum uid
select province_name,uid,model from
(
select province_name,uid,model,
row_number() over (partition by province_name order by uid desc) as row_num
    from user_info
    )a
    where row_num=1;

b) Using launch_date, compute the cumulative total user count per year:
select launch_date,
sum(cnt) over (order by launch_date asc rows between unbounded preceding and current row)
from
(
select launch_date,count(*) as cnt from user_info
where launch_date between '2002' and '2016'
group by launch_date
    )a
    ;
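The query above assumes launch_date holds just a year string like '2014' (an assumption; check the data). If it is a full date such as '2014-09-19', group by the year prefix instead:

select launch_year,
sum(cnt) over (order by launch_year asc rows between unbounded preceding and current row)
from
(
select substr(launch_date,1,4) as launch_year, count(*) as cnt -- launch_year: illustrative alias
from user_info
group by substr(launch_date,1,4)
)a;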
