0.需求描述
统计 谷粒视频 网站的常规指标,各种 TopN 指标:
--统计视频观看数 Top10
--统计视频类别热度 Top10
--统计视频观看数 Top20 所属类别
--统计视频观看数 Top50 所关联视频的所属类别 Rank
--统计每个类别中的视频热度 Top10
--统计每个类别中视频流量 Top10
--统计上传视频最多的用户 Top10 以及他们上传的视频
--统计每个类别视频观看数 Top10
1.数据预处理
对将要处理的数据先进行一次数据清洗,过滤掉不合格的脏数据,同时调整数据的格式
image.png
1.1 将影片类型一栏->People & Blogs 中的空格去掉
1.2 将第9列以后的数据(关联的其他影片ID)用“&“符号连接 (因为在建表时这两列都需要设置为array类型,所以要统一分隔字符)
1.3 建立maven工程
pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.victor</groupId>
<artifactId>ETLUtils</artifactId>
<version>0.0.1-SNAPSHOT</version>
<build>
<plugins><plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>2.3.2</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin></plugins></build>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>RELEASE</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.8.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.7.2</version>
</dependency>
<dependency>
<groupId>jdk.tools</groupId>
<artifactId>jdk.tools</artifactId>
<version>1.8</version>
<scope>system</scope>
<systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
</dependency>
</dependencies>
</project>
ETLUtilMapper.java
package com.victor.ETLUtils;
import java.io.IOException;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class ETLUtilsMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
// PF_ZMlw4rHs DisneyUnleashed 729 Film&Animation 288 96 3.5 2 0 1 xbSFrHzFQ0 4
// VP4qSjDNQs RJgGeYiJrj0
Text k = new Text();
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context)
throws IOException, InterruptedException {
String line = value.toString();
// 将第四个字段中的空格去掉
String newline = util(line);
if (StringUtils.isNotBlank(newline)) {
k.set(newline);
context.write(k, NullWritable.get());
}
}
private String util(String line) {
StringBuffer sb = new StringBuffer();
String[] fields = line.split("\t");
if (fields.length < 9) {
return null;
}
fields[3] = fields[3].replaceAll(" ", "");
for (int i = 0; i < fields.length; i++) {
// 如果字段小于9 ,丢弃
if (i < 9) {
sb.append(fields[i] + "\t");
} else {
if (fields.length-1 == i) {
sb.append(fields[i]);
} else {
sb.append(fields[i] + "&");
}
}
}
return sb.toString();
}
}
ETLUtilDriver.java
package com.victor.ETLUtils;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.sun.jersey.core.impl.provider.entity.XMLJAXBElementProvider.Text;
public class ETLUtilsDriver {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
//args = new String[] {"2.txt","output"};
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(ETLUtilsDriver.class);
job.setMapperClass(ETLUtilsMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
job.setNumReduceTasks(0);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
boolean result = job.waitForCompletion(true);
System.exit(result?0:1);
}
}
1.4 启动集群,将工程打成Jar包,并与数据文件一并上传至集群中(省略)
1.5 清洗数据
[hadoop@hadoop131 software]$ yarn jar ETLUtils-0.0.1-SNAPSHOT.jar com.victor.ETLUtils.ETLUtilsDriver /videotable /videoOutput
处理前数据
image.png
处理后数据
image.png
2.Hive建表
思路:先分别建立video和user 的原始表,再建立video和user的ORC压缩表以提高传输执行效率
2.1 建立原始表
gulivideo_ori
CREATE TABLE gulivideo_ori(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>
)
row format delimited
fields terminated by '\t'
collection items terminated by '&';
guli_user_ori
create table guli_user_ori(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as textfile;
2.2 加载数据
load data inpath '/videotable' into table gulivideo_ori;
load data inpath '/usertable' into table guli_user_ori;
2.3 建立ORC表
create table gulivideo_orc(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>
)
row format delimited
fields terminated by '\t'
collection items terminated by '&'
stored as orc;
create table guli_user_orc(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as orc;
2.4 向ORC表加载数据
insert into gulivideo_orc
select * from gulivideo_ori;
insert into guli_user_orc
select * from guli_user_ori;
3. 业务分析
3.1 统计视频观看数 Top10
select videoid,views
from gulivideo_orc
order by views desc
limit 10;
+--------------+-----------+--+
| videoid | views |
+--------------+-----------+--+
| dMH0bHeiRNg | 42513417 |
| 0XxI-hvPRRA | 20282464 |
| 1dmVU08zVpA | 16087899 |
| RB-wUgnyGv0 | 15712924 |
| QjA5faZF1A8 | 15256922 |
| -_CSo1gOd48 | 13199833 |
| 49IDp76kjPw | 11970018 |
| tYnn51C3X_w | 11823701 |
| pv5zWaTEVkI | 11672017 |
| D2kJZOfq7zk | 11184051 |
+--------------+-----------+--+
3.2 统计视频类别热度 Top10
3.2.1 将表中category字段数组行转列
select views,hot from
gulivideo_orc lateral view explode (category) category_t as hot;t1
3.2.2 统计每个类别的观看总数
select hot,count(*)
from t1
group by hot;t2
3.2.3 获取观看前10的类别
select hot,total_view
from ()t2
order by total_view desc limit 10;
SELECT hot,
total_view
FROM (
SELECT hot,
Count(*) total_view
FROM (
SELECT views,
hot
FROM gulivideo_orc lateral view explode (category) category_t as hot)t1
GROUP BY hot)t2
ORDER BY total_view DESC limit 10;
+----------------+-------------+--+
| category_name | total_view |
+----------------+-------------+--+
| Music | 179049 |
| Entertainment | 127674 |
| Comedy | 87818 |
| Film | 73293 |
| Animation | 73293 |
| Sports | 67329 |
| Games | 59817 |
| Gadgets | 59817 |
| People | 48890 |
| Blogs | 48890 |
+----------------+-------------+--+
3.3 统计视频观看数 Top20 所属类别
3.3.1 观看数top20视频
select views,category
from gulivideo_orc
order by views desc
limit 20;t1
3.3.2 所属类别
select views,category
from t1 lateral view explode(category)ct as category_name;
SELECT views,
category_name
FROM (
SELECT views,
category
FROM gulivideo_orc
ORDER BY views DESC limit 20)t1 lateral VIEW explode(category)ct AS category_name ;
+-----------+----------------+--+
| views | category_name |
+-----------+----------------+--+
| 42513417 | Comedy |
| 20282464 | Comedy |
| 16087899 | Entertainment |
| 15712924 | Entertainment |
| 15256922 | Music |
| 13199833 | People |
| 13199833 | Blogs |
| 11970018 | Comedy |
| 11823701 | Music |
| 11672017 | Music |
| 11184051 | People |
| 11184051 | Blogs |
| 10786529 | Entertainment |
| 10334975 | Entertainment |
| 10107491 | Comedy |
| 9579911 | Music |
| 9566609 | Comedy |
| 8825788 | UNA |
| 7533070 | Music |
| 7456875 | Entertainment |
| 7066676 | Comedy |
| 6952767 | Entertainment |
+-----------+----------------+--+
3.4 统计视频观看数 Top50 所关联视频的所属类别 Rank
3.4.1 观看数top10,关联视频
select
videoid,views,category,relatedid
from
gulivideo_orc
order by
views desc
limit 50;t1
3.4.2 关联视频行转列
select distinct(r_id)
from
t1 lateral view explode(relatedid) relatedtable as r_id;t2
3.4.3 视频所属类别
select r_id,g.category
from
t2.join gulivideo_orc g on r_id = g.videoid;t3
select r_id,g.category
from
t2 join gulivideo_orc g on r_id = g.videoid;t3
3.4.4 类别展开
select category_name
from ()t3 lateral view explode(category)t as category_name;t4
3.4.5 统计类别个数
select category_name,count(*) hot
from
t4 group by category_name,t_sum;t5
3.4.6 所属类别排名
select * from
t5
order by hot desc;t6
SELECT *
FROM (
SELECT category_name,
Count(*) hot
FROM (
SELECT category_name
FROM (
SELECT r_id,
g.category
FROM (
SELECT DISTINCT(r_id)
FROM (
SELECT videoid,
views,
category,
relatedid
FROM gulivideo_orc
ORDER BY views DESC limit 50)t1 lateral VIEW explode(relatedid) relatedtable as r_id)t2
JOIN gulivideo_orc g
ON r_id = g.videoid)t3 lateral VIEW explode(category)t as category_name )t4
GROUP BY category_name)t5
ORDER BY hot DESC;
+-------------------+---------+--+
| t6.category_name | t6.hot |
+-------------------+---------+--+
| Comedy | 14 |
| Entertainment | 11 |
| Music | 10 |
| Film | 3 |
| Animation | 3 |
| People | 2 |
| Blogs | 2 |
| Travel | 1 |
| Sports | 1 |
| Howto | 1 |
| Places | 1 |
| DIY | 1 |
+-------------------+---------+--+
3.5 统计每个类别中的视频热度 Top10
SELECT category_name,
views,
videoid
FROM (
SELECT category_name,
videoid,
views,
Rank() OVER(partition BY category_name order by views) rank_no
FROM (
SELECT category_name,
videoid,
views
FROM gulivideo_orc lateral view explode(category) t as category_name
GROUP BY category_name,
videoid,
views
ORDER BY category_name,
views DESC)t1
ORDER BY category_name)t2
WHERE rank_no<=10;
3.6 统计每个类别中视频流量 Top10
3.7 统计上传视频最多的用户 Top10 以及他们上传的视频
1.找出上传前10的用户
select uploader,
videos
from
guli_user_orc
order by videos desc
limit 10;t1
2.找到上传的所有视频
select t1.uploader,
videoid,
views
from
()t1 join gulivideo_orc g
on
t.uploader=g.uploader
order by uploader,views desc; t2
SELECT t1.uploader,
videoid,
views
FROM (SELECT uploader,
videos
FROM guli_user_orc
ORDER BY videos DESC
LIMIT 10)t1
JOIN gulivideo_orc g
ON t1.uploader = g.uploader
order by uploader,views desc;
3.8 统计每个类别视频观看数 Top10
1.统计所有类别对应的视频
select
category_name,videoid,views
from
gulivideo_orc
lateral view explode(category) t as category_name;t1
2.对每个类观看数排名
select *,rank() over(partition by category_name order by views desc) rank_no
from
()t1;t2
3.取前十
select * from
()t2
where rank_no<=10;
SELECT *
FROM (
SELECT *,
Rank() OVER(partition BY category_name ORDER BY views DESC) rank_no
FROM (
SELECT category_name,
videoid,
views
FROM gulivideo_orc lateral view explode(category) t as category_name)t1 )t2
WHERE rank_no<=10;
+-------------------+--------------+-----------+-------------+--+
| t2.category_name | t2.videoid | t2.views | t2.rank_no |
+-------------------+--------------+-----------+-------------+--+
| Animals | 2GWPOPSXGYI | 3660009 | 1 |
| Animals | xmsV9R8FsDA | 3164582 | 2 |
| Animals | 12PsUW-8ge4 | 3133523 | 3 |
| Animals | OeNggIGSKH8 | 2457750 | 4 |
| Animals | WofFb_eOxxA | 2075728 | 5 |
| Animals | AgEmZ39EtFk | 1999469 | 6 |
| Animals | a-gW3RbJd8U | 1836870 | 7 |
| Animals | 8CL2hetqpfg | 1646808 | 8 |
| Animals | QmroaYVD_so | 1645984 | 9 |
| Animals | Sg9x5mUjbH8 | 1527238 | 10 |
| Animation | sdUUx5FdySs | 5840839 | 1 |
| Animation | 6B26asyGKDo | 5147533 | 2 |
| Animation | H20dhY01Xjk | 3772116 | 3 |
| Animation | 55YYaJIrmzo | 3356163 | 4 |
| Animation | JzqumbhfxRo | 3230774 | 5 |
| Animation | eAhfZUZiwSE | 3114215 | 6 |
| Animation | h7svw0m-wO0 | 2866490 | 7 |
| Animation | tAq3hWBlalU | 2830024 | 8 |
| Animation | AJzU3NjDikY | 2569611 | 9 |
| Animation | ElrldD02if0 | 2337238 | 10 |
网友评论