#cd
#vi relations
Zhangsan Wangwu 01:01:01
Zhangsan Zhaoliu 00:11:21
Zhangsan Yuqi 00:19:01
Zhangsan Jingba 00:21:01
Zhangsan Wuxi 01:31:17
Wangwu Zhaoliu 00:51:01
Wangwu Zhaoliu 01:11:19
Wangwu Yuqi 00:00:21
Wangwu Yuqi 00:37:21
Yuqi Zhaoliu 00:23:01
Yuqi Wuxi 00:18:00
Jingba Wangwu 00:01:01
Jingba Wangwu 00:00:06
Jingba Wangwu 00:02:03
Jingba Wangwu 00:02:54
Wangwu Yuqi 01:00:19
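Each line holds a source name, a target name, and a duration in HH:MM:SS, separated by single spaces.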
#hadoop fs -put relations /
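(Optional) confirm the upload landed:
#hadoop fs -ls /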
#vi test_mapper.py
import sys

for line in sys.stdin:
    line = line.strip()
    # Hive TRANSFORM passes the selected columns to stdin tab-separated
    fromstr, tostr, time = line.split('\t')
    # convert HH:MM:SS into total seconds
    hours, minutes, seconds = time.split(':')
    newtime = int(hours) * 60 * 60 + int(minutes) * 60 + int(seconds)
    # canonicalize the pair: put the lexicographically larger name first,
    # so A-B and B-A records collapse onto the same key
    if cmp(fromstr, tostr) == -1:
        fromstr, tostr = tostr, fromstr
    # emit the key and the duration in seconds, space-separated to match
    # the field delimiter of relations_new
    print ' '.join([fromstr + tostr, str(newtime)])
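The script can be smoke-tested locally before wiring it into Hive (assuming python resolves to Python 2, which the print/cmp syntax requires), feeding it one tab-separated record:
#echo -e 'Zhangsan\tWangwu\t01:01:01' | python test_mapper.py
ZhangsanWangwu 3661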
#vi test.hive
CREATE TABLE relations_new (
fromtostr STRING,
duration INT)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ' ';
ADD FILE test_mapper.py;
INSERT OVERWRITE TABLE relations_new
SELECT
TRANSFORM (fromstr,tostr,duration)
USING 'python test_mapper.py'
AS (fromtostr,duration)
FROM relations;
#hive
create table relations (
fromstr string,
tostr string,
duration string)
row format delimited fields terminated by ' ';
load data inpath '/relations' overwrite into table relations;
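A quick sanity check before exiting (optional):
select fromstr, tostr, duration from relations limit 3;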
exit;
#hive -f test.hive
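The TRANSFORM step streams (fromstr, tostr, duration) to the Python script and writes its output into relations_new. For reference, the same canonical key and seconds conversion could also be produced in pure HiveQL, without an external script; a rough sketch, not used in this walkthrough:
select
  case when fromstr < tostr then concat(tostr, fromstr)
       else concat(fromstr, tostr) end as fromtostr,
  cast(split(duration, ':')[0] as int) * 3600
    + cast(split(duration, ':')[1] as int) * 60
    + cast(split(duration, ':')[2] as int) as duration
from relations;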
#hive
select
fromtostr,
duration,
rank() over(order by duration) rank,
dense_rank() over(order by duration) dense_rank,
row_number() over(order by duration) row_number
from relations_new;
OK
WangwuJingba 6 1 1 1
YuqiWangwu 21 2 2 2
WangwuJingba 61 3 3 3
WangwuJingba 123 4 4 4
WangwuJingba 174 5 5 5
ZhaoliuZhangsan 681 6 6 6
YuqiWuxi 1080 7 7 7
YuqiWangwu 1109 8 8 8
ZhangsanYuqi 1141 9 9 9
ZhangsanJingba 1261 10 10 10
ZhaoliuYuqi 1381 11 11 11
YuqiWangwu 2241 12 12 12
ZhaoliuWangwu 3061 13 13 13
YuqiWangwu 3619 14 14 14
ZhangsanWangwu 3661 15 15 15
ZhaoliuWangwu 4279 16 16 16
ZhangsanWangwu 4713 17 17 17
ZhangsanWuxi 5477 18 18 18
Time taken: 27.345 seconds, Fetched: 18 row(s)
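Because every pair ends up with a distinct total duration, rank(), dense_rank() and row_number() all return the same sequence 1 through 18 here. They diverge only on ties: if two rows shared the smallest duration, rank() would yield 1, 1, 3, dense_rank() 1, 1, 2, and row_number() 1, 2, 3.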