创建表格
-- Raw MovieLens ratings, stored as plain text with one tab-separated record per line.
CREATE TABLE u_data (
    userid   INT,
    movieid  INT,
    rating   INT,
    unixtime STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
下载并解压数据
# wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
# unzip ml-100k.zip
And load u.data into the table that was just created:
hive> LOAD DATA LOCAL INPATH './ml-100k/u.data'
      OVERWRITE INTO TABLE u_data;
Create weekday_mapper.py:
[root@master hive]# cat weekday_mapper.py
"""Hive TRANSFORM mapper: replace the unix-timestamp column with a weekday.

Reads tab-separated rows (userid, movieid, rating, unixtime) from stdin and
writes (userid, movieid, rating, weekday) rows to stdout. The weekday is the
ISO weekday number, 1 (Monday) through 7 (Sunday), evaluated in the local
timezone of the machine that runs the script.
"""
import sys
import datetime


def to_weekday_row(line):
    """Convert one input row into its output row.

    line: a single tab-separated record "userid<TAB>movieid<TAB>rating<TAB>unixtime",
        optionally surrounded by whitespace such as the trailing newline.
    Returns the record with the last field replaced by the ISO weekday number,
    joined by tabs and without a trailing newline.
    Raises ValueError if the record does not have exactly four fields or the
    timestamp is not numeric.
    """
    userid, movieid, rating, unixtime = line.strip().split('\t')
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    return '\t'.join([userid, movieid, rating, str(weekday)])


if __name__ == '__main__':
    for line in sys.stdin:
        # A parenthesised single-argument print behaves the same on Python 2 and 3.
        print(to_weekday_row(line))
创建 Hive 脚本如下:
Use the mapper script:
[root@master hive]# cat offical_new_sample.hive
-- Ratings with the unix timestamp replaced by a weekday number. The table is
-- filled by the INSERT OVERWRITE statement later in this script.
-- IF NOT EXISTS lets the script be re-run without failing on an existing table.
CREATE TABLE IF NOT EXISTS u_data_new (
    userid  INT,
    movieid INT,
    rating  INT,
    weekday INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t';
-- Register the mapper script as a session resource so the TRANSFORM below can run it.
ADD FILE weekday_mapper.py;

-- Stream every u_data row through the mapper and replace the contents of
-- u_data_new with the rows it emits.
INSERT OVERWRITE TABLE u_data_new
SELECT
    TRANSFORM (userid, movieid, rating, unixtime)
    USING 'python weekday_mapper.py'
    AS (userid, movieid, rating, weekday)
FROM u_data;
-- Number of ratings for each weekday value written by the mapper.
SELECT
    weekday,
    COUNT(*)
FROM u_data_new
GROUP BY weekday;
执行脚本
[root@master hive]# hive -f offical_new_sample.hive
网友评论