开窗函数以及top3销售额统计案例实战

Spark 1.4.x版本以后，为Spark SQL和DataFrame引入了开窗函数，比如最经典，最常用的，row_number()，可以让我们实现分组取topn的逻辑。

案例：统计每个种类的销售额排名前3的产品

先说明一下，row_number()开窗函数的作用
其实，就是给每个分组的数据，按照其排序顺序，打上一个分组内的行号
比如说，有一个分组date=20181231，里面有3条数据，按组内排序后依次是1122，1121，1124，
那么对这个分组的每一行使用row_number()开窗函数以后，三行，依次会获得一个组内的行号
行号从1开始递增，比如1122 1，1121 2，1124 3

row_number()开窗函数的语法说明
首先可以，在SELECT查询时，使用row_number()函数
其次，row_number()函数后面先跟上OVER关键字
然后括号中，是PARTITION BY，也就是说根据哪个字段进行分组
接着，可以用ORDER BY进行组内排序
然后row_number()就可以给每个组内的行，一个组内行号
Java版本

/**
 * Example job for the row_number() window function: keeps the three
 * highest-revenue products of every category.
 *
 * The job (re)creates the Hive table "sales", loads it from a local text file,
 * ranks the rows of each category by revenue in descending order and stores
 * the rows numbered 1 to 3 in the Hive table "top3_sales".
 */
public class RowNumberWindowFunction {
    /**
     * Entry point of the job.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("RowNumberWindowFunctionJava");
        JavaSparkContext sparkContext = new JavaSparkContext(conf);

        try {
            // Create the sales table ("sales") from scratch.
            HiveContext hiveContext = new HiveContext(sparkContext.sc());
            hiveContext.sql("DROP TABLE IF EXISTS sales");
            // NOTE(review): no ROW FORMAT clause is given, so Hive's default field
            // delimiter applies — confirm that sales.txt is written with it.
            hiveContext.sql("CREATE TABLE IF NOT EXISTS sales ("
                    + "product STRING,"
                    + "category STRING,"
                    + "revenue BIGINT)");
            hiveContext.sql("LOAD DATA "
                    + "LOCAL INPATH '/opt/module/datas/sparkstudy/sql/resource/sales.txt' "
                    + "INTO TABLE sales");

            // The inner query numbers the rows of each category starting at 1,
            // highest revenue first; the outer query keeps row numbers 1, 2 and 3.
            DataFrame top3 = hiveContext.sql(
                    "select s.product, s.category, s.revenue " +
                            "from ( " +
                            "select product, category, revenue, " +
                            "row_number() over(partition by category order by revenue desc) rank " +
                            "from sales " +
                            ") s " +
                            "where s.rank < 4"
            );

            // Drop any previous result table before saving the new one.
            hiveContext.sql("DROP TABLE IF EXISTS top3_sales");
            // DataFrame.saveAsTable(String) is deprecated since Spark 1.4; go through
            // the DataFrameWriter, as the Scala version of this example does.
            top3.write().saveAsTable("top3_sales");
        } finally {
            // Always release the Spark context, even when one of the statements fails.
            sparkContext.stop();
        }
    }
}

Scala版本

/**
 * Example job for the row_number() window function: keeps the three
 * highest-revenue products of every category.
 *
 * The job (re)creates the Hive table "sales", loads it from a local text file,
 * ranks the rows of each category by revenue in descending order and stores
 * the rows numbered 1 to 3 in the Hive table "top3_sales".
 */
object RowNumberWindowFunction {
  /**
   * Entry point of the job.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("RowNumberWindowFunctionScala")
    val sparkContext = new SparkContext(conf)

    try {
      // Create the sales table ("sales") from scratch.
      val hiveContext = new HiveContext(sparkContext)
      hiveContext.sql("DROP TABLE IF EXISTS sales")
      // NOTE(review): no ROW FORMAT clause is given, so Hive's default field
      // delimiter applies — confirm that sales.txt is written with it.
      hiveContext.sql("CREATE TABLE IF NOT EXISTS sales (" +
        "product STRING," +
        "category STRING," +
        "revenue BIGINT)")
      hiveContext.sql("LOAD DATA " +
        "LOCAL INPATH '/opt/module/datas/sparkstudy/sql/resource/sales.txt' " +
        "INTO TABLE sales")

      // The inner query numbers the rows of each category starting at 1,
      // highest revenue first; the outer query keeps row numbers 1, 2 and 3.
      val top3 = hiveContext.sql(
        "select s.product, s.category, s.revenue " +
          "from ( " +
          "select product, category, revenue, " +
          "row_number() over(partition by category order by revenue desc) rank " +
          "from sales " +
          ") s " +
          "where s.rank < 4"
      )

      // Drop any previous result table before saving the new one.
      hiveContext.sql("DROP TABLE IF EXISTS top3_sales")
      top3.write.saveAsTable("top3_sales")
    } finally {
      // Always release the Spark context, even when one of the statements fails.
      sparkContext.stop()
    }
  }
}