
Hive Hands-On Project: Data Analysis for a Video Website

Author: V1cttor | Published 2019-01-23 16:09

    0. Requirements

    Compute the routine metrics for the Guli Video (谷粒视频) website, including various Top-N metrics:

    --Top 10 videos by view count
    --Top 10 video categories by heat
    --Categories of the Top 20 most-viewed videos
    --Category ranking of the videos related to the Top 50 most-viewed videos
    --Top 10 hottest videos in each category
    --Top 10 videos by traffic in each category
    --Top 10 users by number of uploaded videos, and the videos they uploaded
    --Top 10 most-viewed videos in each category

    1. Data Preprocessing

    Before analysis, run a cleaning pass over the raw data: filter out malformed (dirty) records and normalize the data format.


    1.1 Remove the spaces inside the video category field (e.g. "People & Blogs" becomes "People&Blogs").
    1.2 Join the fields after the 9th column (the IDs of related videos) with the "&" character. Both the category and relatedId columns will be declared as array types when the tables are created, so they need one consistent collection delimiter.
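    For example, a raw record of the following shape (tabs written as \t; all values hypothetical):

    videoId1 \t uploader1 \t 100 \t People & Blogs \t 300 \t 50 \t 4.5 \t 10 \t 3 \t relId1 \t relId2

    would be rewritten by the ETL job as:

    videoId1 \t uploader1 \t 100 \t People&Blogs \t 300 \t 50 \t 4.5 \t 10 \t 3 \t relId1&relId2
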
    1.3 Create a Maven project

    pom.xml

    <project xmlns="http://maven.apache.org/POM/4.0.0"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>
        <groupId>com.victor</groupId>
        <artifactId>ETLUtils</artifactId>
        <version>0.0.1-SNAPSHOT</version>
        <build>
            <plugins>
                <plugin>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <version>2.3.2</version>
                    <configuration>
                        <source>1.8</source>
                        <target>1.8</target>
                    </configuration>
                </plugin>
            </plugins>
        </build>
        
        <dependencies>
            <dependency>
                <groupId>junit</groupId>
                <artifactId>junit</artifactId>
                <version>RELEASE</version>
            </dependency>
            <dependency>
                <groupId>org.apache.logging.log4j</groupId>
                <artifactId>log4j-core</artifactId>
                <version>2.8.2</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-common</artifactId>
                <version>2.7.2</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-client</artifactId>
                <version>2.7.2</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-hdfs</artifactId>
                <version>2.7.2</version>
            </dependency>
            <dependency>
                <groupId>jdk.tools</groupId>
                <artifactId>jdk.tools</artifactId>
                <version>1.8</version>
                <scope>system</scope>
                <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
            </dependency>
        </dependencies>
    </project>
    

    ETLUtilsMapper.java

    package com.victor.ETLUtils;
    
    import java.io.IOException;
    
    import org.apache.commons.lang.StringUtils;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    
    public class ETLUtilsMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    //  PF_ZMlw4rHs DisneyUnleashed 729 Film&Animation 288 96 3.5 2 0 1 xbSFrHzFQ0 4
    //  VP4qSjDNQs RJgGeYiJrj0
    
        Text k = new Text();
    
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
    
        // Remove the spaces in the 4th field (category) and normalize the related-ID delimiters
            String newline = util(line);
            if (StringUtils.isNotBlank(newline)) {
                k.set(newline);
                context.write(k, NullWritable.get());
            }
        }
    
    private String util(String line) {

        StringBuilder sb = new StringBuilder();
        String[] fields = line.split("\t");
        // Drop records that have fewer than 9 fields
        if (fields.length < 9) {
            return null;
        }
        // Remove the spaces in the 4th field (category)
        fields[3] = fields[3].replaceAll(" ", "");
        for (int i = 0; i < fields.length; i++) {
            if (i == fields.length - 1) {
                // Last field: no trailing separator
                sb.append(fields[i]);
            } else if (i < 9) {
                // The first 9 fields stay tab-separated
                sb.append(fields[i]).append("\t");
            } else {
                // Fields after the 9th (related video IDs) are joined with "&"
                sb.append(fields[i]).append("&");
            }
        }

        return sb.toString();
    }
    
    }
    

    ETLUtilsDriver.java

    package com.victor.ETLUtils;
    
    import java.io.IOException;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    
    import org.apache.hadoop.io.Text;
    
    public class ETLUtilsDriver {
    
        public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
            //args = new String[] {"2.txt","output"};
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf);
            job.setJarByClass(ETLUtilsDriver.class);
            job.setMapperClass(ETLUtilsMapper.class);
            
            
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(NullWritable.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(NullWritable.class);
            
            
            
        // Map-only job: the ETL needs no reduce phase
        job.setNumReduceTasks(0);
            
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            
            boolean result = job.waitForCompletion(true);
            System.exit(result?0:1);
        }
    
    }
    
    1.4 Start the cluster, package the project into a JAR, and upload it together with the data files to the cluster (omitted).
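    A rough sketch of this step (the local file names here are assumptions; the HDFS paths match the commands used below):

    mvn clean package
    # copy ETLUtils-0.0.1-SNAPSHOT.jar and the raw data files to a cluster node, then:
    hadoop fs -mkdir -p /videotable /usertable
    hadoop fs -put video/*.txt /videotable
    hadoop fs -put user.txt /usertable
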
    1.5 Clean the data
    [hadoop@hadoop131 software]$ yarn jar ETLUtils-0.0.1-SNAPSHOT.jar com.victor.ETLUtils.ETLUtilsDriver /videotable /videoOutput
    

    Data before processing: (screenshot omitted)

    Data after processing: (screenshot omitted)

    2. Creating the Hive Tables

    Approach: first create raw text tables for the video and user data, then create ORC-compressed copies of both to improve storage and query efficiency.

    2.1 Create the raw tables

    gulivideo_ori

    CREATE TABLE gulivideo_ori(
    videoId string, 
    uploader string, 
    age int, 
    category array<string>, 
    length int, 
    views int, 
    rate float, 
    ratings int, 
    comments int,
    relatedId array<string>
    )
    row format delimited 
    fields terminated by '\t'
    collection items terminated by '&';
    

    guli_user_ori

    create table guli_user_ori(
    uploader string,
    videos int,
    friends int)
    row format delimited
    fields terminated by "\t"
    stored as textfile;
    
    2.2 Load the data
    load data inpath '/videotable' into table gulivideo_ori;
    load data inpath '/usertable' into table guli_user_ori;
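    Optionally, sanity-check that the array columns parsed as expected:

    select videoId, category, relatedId from gulivideo_ori limit 5;
    select * from guli_user_ori limit 5;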
    
    2.3 Create the ORC tables
    create table gulivideo_orc(
    videoId string, 
    uploader string, 
    age int, 
    category array<string>, 
    length int, 
    views int, 
    rate float, 
    ratings int, 
    comments int,
    relatedId array<string>
    )
    row format delimited 
    fields terminated by '\t'
    collection items terminated by '&'
    stored as orc;
    
    create table guli_user_orc(
    uploader string,
    videos int,
    friends int)
    row format delimited
    fields terminated by "\t"
    stored as orc;
    
    2.4 Load data into the ORC tables
    insert into gulivideo_orc
    select * from gulivideo_ori;
    
    insert into guli_user_orc
    select * from guli_user_ori;
    

    3. Business Analysis

    3.1 Top 10 videos by view count
    select videoid,views 
    from gulivideo_orc 
    order by views desc 
    limit 10;
    
    +--------------+-----------+--+
    |   videoid    |   views   |
    +--------------+-----------+--+
    | dMH0bHeiRNg  | 42513417  |
    | 0XxI-hvPRRA  | 20282464  |
    | 1dmVU08zVpA  | 16087899  |
    | RB-wUgnyGv0  | 15712924  |
    | QjA5faZF1A8  | 15256922  |
    | -_CSo1gOd48  | 13199833  |
    | 49IDp76kjPw  | 11970018  |
    | tYnn51C3X_w  | 11823701  |
    | pv5zWaTEVkI  | 11672017  |
    | D2kJZOfq7zk  | 11184051  |
    +--------------+-----------+--+
    
    3.2 Top 10 video categories by heat

    3.2.1 Explode the category array column into rows
    select views,hot from
    gulivideo_orc lateral view explode (category) category_t as hot; -- t1

    3.2.2 Count the number of videos in each category (the heat metric)
    select hot,count(*) total_view
    from t1
    group by hot; -- t2

    3.2.3 Take the top 10 categories
    select hot,total_view
    from t2
    order by total_view desc limit 10;

    SELECT   hot, 
             total_view 
    FROM     ( 
                      SELECT   hot, 
                               Count(*) total_view 
                      FROM     ( 
                                      SELECT views, 
                                             hot 
                                      FROM   gulivideo_orc lateral view explode (category) category_t as hot)t1
                      GROUP BY hot)t2 
    ORDER BY total_view DESC limit 10;
    
    +----------------+-------------+--+
    | category_name  | total_view  |
    +----------------+-------------+--+
    | Music          | 179049      |
    | Entertainment  | 127674      |
    | Comedy         | 87818       |
    | Film           | 73293       |
    |  Animation     | 73293       |
    | Sports         | 67329       |
    |  Games         | 59817       |
    | Gadgets        | 59817       |
    | People         | 48890       |
    |  Blogs         | 48890       |
    +----------------+-------------+--+
    
    3.3 Categories of the Top 20 most-viewed videos

    3.3.1 The Top 20 videos by view count
    select views,category
    from gulivideo_orc
    order by views desc
    limit 20; -- t1
    3.3.2 Their categories
    select views,category
    from t1 lateral view explode(category)ct as category_name;

    SELECT views, 
           category_name 
    FROM   ( 
                    SELECT   views, 
                             category 
                    FROM     gulivideo_orc 
                    ORDER BY views DESC limit 20)t1 lateral VIEW explode(category)ct AS category_name ;
    
    +-----------+----------------+--+
    |   views   | category_name  |
    +-----------+----------------+--+
    | 42513417  | Comedy         |
    | 20282464  | Comedy         |
    | 16087899  | Entertainment  |
    | 15712924  | Entertainment  |
    | 15256922  | Music          |
    | 13199833  | People         |
    | 13199833  |  Blogs         |
    | 11970018  | Comedy         |
    | 11823701  | Music          |
    | 11672017  | Music          |
    | 11184051  | People         |
    | 11184051  |  Blogs         |
    | 10786529  | Entertainment  |
    | 10334975  | Entertainment  |
    | 10107491  | Comedy         |
    | 9579911   | Music          |
    | 9566609   | Comedy         |
    | 8825788   |  UNA           |
    | 7533070   | Music          |
    | 7456875   | Entertainment  |
    | 7066676   | Comedy         |
    | 6952767   | Entertainment  |
    +-----------+----------------+--+
    
    3.4 Category ranking of the videos related to the Top 50 most-viewed videos

    3.4.1 The Top 50 videos by view count, with their related video IDs
    select videoid,views,category,relatedid
    from gulivideo_orc
    order by views desc
    limit 50; -- t1

    3.4.2 Explode the related video IDs into rows (deduplicated)
    select distinct(r_id)
    from t1 lateral view explode(relatedid) relatedtable as r_id; -- t2

    3.4.3 Look up the categories of each related video
    select r_id,g.category
    from t2 join gulivideo_orc g on r_id = g.videoid; -- t3

    3.4.4 Explode the categories
    select category_name
    from t3 lateral view explode(category) t as category_name; -- t4

    3.4.5 Count each category
    select category_name,count(*) hot
    from t4
    group by category_name; -- t5

    3.4.6 Rank the categories
    select * from t5
    order by hot desc; -- t6

    SELECT   * 
    FROM     ( 
                      SELECT   category_name, 
                               Count(*) hot 
                      FROM     ( 
                                      SELECT category_name 
                                      FROM   ( 
                                                    SELECT r_id, 
                                                           g.category 
                                                    FROM   ( 
                                                                           SELECT DISTINCT(r_id) 
                                                                           FROM            ( 
                                                                                                    SELECT   videoid,
                                                                                                             views,
                                                                                                             category,
                                                                                                             relatedid
                                                                                                    FROM     gulivideo_orc
                                                                                                    ORDER BY views DESC limit 50)t1 lateral VIEW explode(relatedid) relatedtable as r_id)t2
                                                    JOIN   gulivideo_orc g 
                                                    ON     r_id = g.videoid)t3 lateral VIEW explode(category)t as category_name )t4
                      GROUP BY category_name)t5 
    ORDER BY hot DESC;
    
    +-------------------+---------+--+
    | t6.category_name  | t6.hot  |
    +-------------------+---------+--+
    | Comedy            | 14      |
    | Entertainment     | 11      |
    | Music             | 10      |
    | Film              | 3       |
    |  Animation        | 3       |
    | People            | 2       |
    |  Blogs            | 2       |
    | Travel            | 1       |
    | Sports            | 1       |
    | Howto             | 1       |
    |  Places           | 1       |
    |  DIY              | 1       |
    +-------------------+---------+--+
    
    3.5 Top 10 hottest videos in each category
    SELECT category_name, 
           views,
           videoid 
    FROM   ( 
                    SELECT   category_name, 
                             videoid, 
                             views,
                             Rank() OVER(partition BY category_name ORDER BY views DESC) rank_no 
                    FROM     ( 
                                      SELECT   category_name, 
                                               videoid, 
                                               views 
                                      FROM     gulivideo_orc lateral view explode(category) t as category_name
                                      GROUP BY category_name, 
                                               videoid, 
                                               views 
                                      ORDER BY category_name, 
                                               views DESC)t1 
                    ORDER BY category_name)t2 
    WHERE  rank_no<=10;
    
    3.6 Top 10 videos by traffic in each category
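    The article gives no query for this step. Below is a minimal sketch in the same shape as 3.8, assuming the ratings column is taken as the "traffic" measure:

    SELECT *
    FROM   (
                    SELECT   *,
                             Rank() OVER(partition BY category_name ORDER BY ratings DESC) rank_no
                    FROM     (
                                    SELECT category_name,
                                           videoid,
                                           ratings
                                    FROM   gulivideo_orc lateral view explode(category) t as category_name)t1 )t2
    WHERE  rank_no<=10;
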
    3.7 Top 10 users by number of uploaded videos, and the videos they uploaded

    1. Find the top 10 users by upload count
    select uploader, videos
    from guli_user_orc
    order by videos desc
    limit 10; -- t1

    2. Join against gulivideo_orc to find all the videos those users uploaded
    select t1.uploader, videoid, views
    from t1 join gulivideo_orc g
    on t1.uploader = g.uploader
    order by t1.uploader, views desc; -- t2

    SELECT t1.uploader, 
           videoid, 
           views 
    FROM   (SELECT uploader, 
                   videos 
            FROM   guli_user_orc 
            ORDER  BY videos DESC 
            LIMIT  10)t1 
           JOIN gulivideo_orc g 
             ON t1.uploader = g.uploader 
    order by t1.uploader, views desc; 
    
    3.8 Top 10 most-viewed videos in each category

    1. List every video under each of its categories
    select category_name,videoid,views
    from gulivideo_orc
    lateral view explode(category) t as category_name; -- t1

    2. Rank the videos in each category by view count
    select *,rank() over(partition by category_name order by views desc) rank_no
    from t1; -- t2

    3. Take the top 10 per category
    select * from t2
    where rank_no<=10;

    SELECT * 
    FROM   ( 
                    SELECT   *, 
                             Rank() OVER(partition BY category_name ORDER BY views DESC) rank_no 
                    FROM     ( 
                                    SELECT category_name, 
                                           videoid, 
                                           views 
                                    FROM   gulivideo_orc lateral view explode(category) t as category_name)t1 )t2
    WHERE  rank_no<=10;
    
    +-------------------+--------------+-----------+-------------+--+
    | t2.category_name  |  t2.videoid  | t2.views  | t2.rank_no  |
    +-------------------+--------------+-----------+-------------+--+
    |  Animals          | 2GWPOPSXGYI  | 3660009   | 1           |
    |  Animals          | xmsV9R8FsDA  | 3164582   | 2           |
    |  Animals          | 12PsUW-8ge4  | 3133523   | 3           |
    |  Animals          | OeNggIGSKH8  | 2457750   | 4           |
    |  Animals          | WofFb_eOxxA  | 2075728   | 5           |
    |  Animals          | AgEmZ39EtFk  | 1999469   | 6           |
    |  Animals          | a-gW3RbJd8U  | 1836870   | 7           |
    |  Animals          | 8CL2hetqpfg  | 1646808   | 8           |
    |  Animals          | QmroaYVD_so  | 1645984   | 9           |
    |  Animals          | Sg9x5mUjbH8  | 1527238   | 10          |
    |  Animation        | sdUUx5FdySs  | 5840839   | 1           |
    |  Animation        | 6B26asyGKDo  | 5147533   | 2           |
    |  Animation        | H20dhY01Xjk  | 3772116   | 3           |
    |  Animation        | 55YYaJIrmzo  | 3356163   | 4           |
    |  Animation        | JzqumbhfxRo  | 3230774   | 5           |
    |  Animation        | eAhfZUZiwSE  | 3114215   | 6           |
    |  Animation        | h7svw0m-wO0  | 2866490   | 7           |
    |  Animation        | tAq3hWBlalU  | 2830024   | 8           |
    |  Animation        | AJzU3NjDikY  | 2569611   | 9           |
    |  Animation        | ElrldD02if0  | 2337238   | 10          |
    
    
