1. 基本sql操作

-- 设置队列
set mapreduce.job.queuename=xxxxx;

-- 创建和删除数据库
create database if not exists userdb;
drop databases if exists userdb;

-- 列出所有数据库
show databases;  

-- 使用数据库
use databases;

-- 列出数据库中所有的表
show tables;

-- 列出表中的所有字段
desc table;

-- 统计表的行数
show count(*) from table;

2. hive表操作

-- 创建hive表
create external table if not exists table_name (
  `id` string comment 'id号',
  `name` string comment '名字',
  `height` double comment '身高',
) comment '学生信息表'
partition by (dt string)
row format delimited
fields terminated by '\t'
lines terminated by '\n'
stored as textfile
location 'hdfs文件路径';

-- 删除hive表
drop table if exists table_name;

-- 插入到Hive表
insert overwrite table sale_detail_insert partition (sale_date='2013', region='china')
select customer_id, shop_name, total_price from sale_detail;

3. 分区操作

-- 删除分区
alter table table_name drop if exists partition(dt = '20200820');

-- 新建分区，存入数据
alter table table_name add if not exists partition(dt = '20200820') location 'hdfs文件路径';

-- 重命名分区
alter table table_name partition xxxxx rename to partition xxxxx;

4. sql高级操作

--- 排序
SELECT id, name, gender, score FROM students ORDER BY score;  -- 正排
SELECT id, name, gender, score FROM students ORDER BY score DESC; -- 倒排
SELECT id, name, gender, score FROM students ORDER BY score DESC, gender;

-- 分页
SELECT id, name, gender, score
FROM students
ORDER BY score DESC
LIMIT 3 OFFSET 6;  -- 查询第3页， 第一个是偏移量；OFFSET代表初始偏移；

select id
for students
limit 1,1  -- LIMIT后的第一个参数是输出记录的初始位置，第二个参数偏移量，偏移多少，输出的条目就是多少。

-- 聚合查询
SELECT COUNT(*) FROM students;
SELECT AVG(score) average FROM students WHERE gender = 'M';

-- 分组查询
SELECT COUNT(*) num FROM students GROUP BY class_id; -- 按照class_id分组
SELECT class_id, gender, COUNT(*) num FROM students GROUP BY class_id, gender; -- 按照class_id， gender分组

-- 多表查询
SELECT
    students.id sid,
    students.name,
    students.gender,
    students.score,
    classes.id cid,
    classes.name cname
FROM students, classes;

SELECT
    s.id sid,
    s.name,
    s.gender,
    s.score,
    c.id cid,
    c.name cname
FROM students s, classes c
WHERE s.gender = 'M' AND c.id = 1;

-- 连接查询
SELECT s.id, s.name, s.class_id, c.name class_name, s.gender, s.score
FROM students s
INNER JOIN classes c
ON s.class_id = c.id; -- INNER JOIN查询（内连接）

SELECT s.id, s.name, s.class_id, c.name class_name, s.gender, s.score
FROM students s
RIGHT OUTER JOIN classes c
ON s.class_id = c.id; -- RIGHT OUTER JOIN查询（外连接）

-- 多表连接查询
select * 
from (
    select userId from table_a where dt=20160731
) a join (
    select userId from table_b where dt=20160731
) b  on a.userId=b.userId 
join (
    select userId from table_c where dt=20160731
) c on a.userId=c.userId

-- with as 语句优化sql
WITH t1 AS (
        SELECT *
        FROM carinfo
    ), 
    t2 AS (
        SELECT *
        FROM car_blacklist
    )
SELECT *
FROM t1, t2

5. Hive内置函数

（1）数学函数

常用的数学函数都有：round、floor、ceil、rand、exp 等；

（2）集合函数

size：求数组的长度；
sort_array：对数组进行排序并返回；

（3）数据类型转换

binary：将输入的数字转为二进制；
cast：将输入的数据转为特定的类型；如：cast("1" as BIGINT) 将字符串1转换成了BIGINT类型，如果转换失败将返回NULL；

（4）日期函数

from_unixtime(bigint unixtime[, string format])：将时间的秒值转换成format格式（format可为“yyyy-MM-dd hh:mm:ss”,“yyyy-MM-dd hh”,“yyyy-MM-dd hh:mm”等等）如from_unixtime(1250111000,"yyyy-MM-dd") 得到2009-03-12
unix_timestamp(string date)：将格式为yyyy-MM-dd HH:mm:ss的时间字符串转换成时间戳如unix_timestamp('2009-03-20 11:30:01') = 1237573801；
to_date(string timestamp)：Returns the date part of a timestamp string: to_date("1970-01-01 00:00:00") = "1970-01-01". 返回时间字符串的日期部分；
current_date：返回当前的时间日期；
current_timestamp：返回当前的时间戳；

（5）条件函数

if(boolean testCondition, T valueTrue, T valueFalseOrNull)：如果testCondition 为true就返回valueTrue, 否则返回valueFalseOrNull；
isnull(a)：如果a为null就返回true，否则返回false；
isnotnull(a)：如果a为非null就返回true，否则返回false；

（6）字符串函数

concat(string|binary A, string|binary B...)：对字符串进行拼接；
concat_ws(string SEP, string A, string B...)：使用指定的分隔符sep，对字符串进行拼接；
get_json_object(string json_string, string path)：解析json字符串，例如：get_json_object(para['response'], '$.result')；
instr(string str, string substr)：查找字符串str中子字符串substr出现的位置，如果查找失败将返回0，如果任一参数为Null将返回null，注意位置为从1开始的；
length(string A)：返回字符串长度；
locate(string substr, string str[, int pos])：查找字符串str中的pos位置后字符串substr第一次出现的位置；
lower(string A)：将所有字符串变成小写；
regexp_replace(string INITIAL_STRING, string PATTERN, string REPLACEMENT)：按照Java正则表达式PATTERN将字符串INTIAL_STRING中符合条件的部分成REPLACEMENT所指定的字符串；
reverse(string A)：反转字符串；
trim(string A)：将字符串A中前后出现的空格去除；
levenshtein(string A, string B)：计算两个字符串之间的差异大小如：levenshtein('kitten', 'sitting') = 3；