测试数据
- customers.csv
- orders.csv
- order_items.csv
- products.csv
-
哪位用户买的商品最多?
- SQL
spark.sql("select c.customer_id, c.customer_fname, c.customer_lname, tc.order_nums from customers c left join (select t.order_customer_id as customer_id, count(t.order_customer_id) as order_nums from (select * from orders where order_status = 'COMPLETE') t group by t.order_customer_id) tc on tc.customer_id = c.customer_id order by tc.order_nums desc").show(3) +-----------+--------------+--------------+----------+ |customer_id|customer_fname|customer_lname|order_nums| +-----------+--------------+--------------+----------+ | 9337| Mary| Smith| 10| | 7802| Mary| Acevedo| 9| | 749| Jesse| Matthews| 9| +-----------+--------------+--------------+----------+ spark.sql("select c.customer_id, c.customer_fname, c.customer_lname,count(c.customer_id) as order_nums, sum(oi.order_item_subtotal) as total from customers c inner join orders o on c.customer_id = o.order_customer_id inner join order_items oi on o.order_id = oi.order_item_order_id group by c.customer_id, c.customer_fname, c.customer_lname order by total desc").show +-----------+--------------+--------------+----------+------------------+ |customer_id|customer_fname|customer_lname|order_nums| total| +-----------+--------------+--------------+----------+------------------+ | 791| Mary| Smith| 43|10524.169999999995| | 9371| Mary| Patterson| 44| 9299.029999999993| | 8766| Mary| Duncan| 38| 9296.139999999998| +-----------+--------------+--------------+----------+------------------+
- DataFrame API
dfCustomer.alias("c") .join(dfOrder.alias("o"), $"c.customer_id" === $"o.order_customer_id", "right_outer") .filter($"o.order_status" === "COMPLETE") .select("c.customer_id", "c.customer_fname", "c.customer_lname", "o.order_id") .groupBy($"customer_id",$"customer_fname", $"customer_lname") .agg(count($"order_id").as("order_nums")) .sort($"order_nums".desc).show(3) +-----------+--------------+--------------+----------+ |customer_id|customer_fname|customer_lname|order_nums| +-----------+--------------+--------------+----------+ | 9337| Mary| Smith| 10| | 7802| Mary| Acevedo| 9| | 3710| Ashley| Smith| 9| +-----------+--------------+--------------+----------+
-
那个商品卖的最多?
- SQL
// 不考虑是否交易完成 spark.sql("select p.product_id, p.product_name, p.product_category_id, p.product_price, tp.nums from products p left join (select order_item_product_id as product_id, count(*) as nums from order_items group by order_item_product_id) tp on p.product_id = tp.product_id order by tp.nums desc").show(3) +----------+--------------------+-------------------+-------------+-----+ |product_id| product_name|product_category_id|product_price| nums| +----------+--------------------+-------------------+-------------+-----+ | 365|Perfect Fitness P...| 17| 59.99|24515| | 403|Nike Men's CJ Eli...| 18| 129.99|22246| | 502|Nike Men's Dri-FI...| 24| 50.0|21035| +----------+--------------------+-------------------+-------------+-----+ // 只考虑已完成订单中的商品 spark.sql("select p.product_id, p.product_name, p.product_category_id, p.product_price, tp.nums from products p right join (select order_item_product_id as product_id, count(*) as nums from (select * from order_items ot left join orders o on o.order_id = ot.order_item_order_id ) s where s.order_status = 'COMPLETE' group by order_item_product_id) tp on p.product_id = tp.product_id order by tp.nums desc").show(3) +----------+--------------------+-------------+------------+ |product_id| product_name|product_price|product_nums| +----------+--------------------+-------------+------------+ | 365|Perfect Fitness P...| 59.99| 8071| | 403|Nike Men's CJ Eli...| 129.99| 7369| | 502|Nike Men's Dri-FI...| 50.0| 7036| +----------+--------------------+-------------+------------+
- DataFrame API
dfProduct.alias("p") .join(dfOrderItem.alias("ot"), $"p.product_id" === $"ot.order_item_product_id", "right_outer") .select($"product_id",$"product_name", $"product_price", $"order_item_order_id", $"product_category_id") // 可省略 .join(dfOrder.alias("o"), $"o.order_id" === $"order_item_order_id", "left_outer") .filter($"order_status" === "COMPLETE").groupBy($"product_id", $"product_name", $"product_price") .agg(count($"product_id").as("product_nums")) .sort($"product_nums".desc).show(3) +----------+--------------------+-------------+------------+ |product_id| product_name|product_price|product_nums| +----------+--------------------+-------------+------------+ | 365|Perfect Fitness P...| 59.99| 8071| | 403|Nike Men's CJ Eli...| 129.99| 7369| | 502|Nike Men's Dri-FI...| 50.0| 7036| +----------+--------------------+-------------+------------+
注:此结果只统计成功交易的订单中的商品
-
周一到周末的购物分布情况?
- SQL
spark.sql("select dayOfWeek(order_date) as dayofweek, count(order_id) as order_nums from orders where order_status = 'COMPLETE' group by dayofweek order by order_nums desc").show +---------+----------+ |dayofweek|order_nums| +---------+----------+ | 6| 3387| | 5| 3371| | 2| 3356| | 4| 3261| | 3| 3232| | 7| 3203| | 1| 3089| +---------+----------+
- DataFrame API
dfOrder.filter($"order_status" === "COMPLETE").withColumn("dayofweek", day_Of_Week($"order_date")) .select("dayofweek", "order_id").groupBy($"dayofweek").agg(count($"order_id").as("order_nums")) .sort($"order_nums".desc).show +---------+----------+ |dayofweek|order_nums| +---------+----------+ | 6| 3387| | 5| 3371| | 2| 3356| | 4| 3261| | 3| 3232| | 7| 3203| | 1| 3089| +---------+----------+
注:dayOfWeek和day_Of_Week是自定义函数
-
一月到十二月的购物分布情况?
- SQL
// 使用date_format函数 spark.sql("select t.month as month, count(t.month) as order_nums from (select order_id, date_format(order_date, 'MM') as month, order_customer_id from orders where order_status = 'COMPLETE') t group by t.month order by order_nums desc").show(12) +-----+----------+ |month|order_nums| +-----+----------+ | 11| 2141| | 03| 1967| | 07| 1934| | 09| 1933| | 04| 1932| | 01| 1911| | 12| 1898| | 08| 1880| | 02| 1869| | 05| 1854| | 06| 1797| | 10| 1783| +-----+----------+
- DataFrame API
dfOrder.filter($"order_status" === "COMPLETE").withColumn("month", month_Of_Year($"order_date")) .select("month", "order_id").groupBy($"month").agg(count($"order_id").as("order_nums")) .sort($"order_nums".desc).show +-----+----------+ |month|order_nums| +-----+----------+ | 11| 2141| | 3| 1967| | 7| 1934| | 9| 1933| | 4| 1932| | 1| 1911| | 12| 1898| | 8| 1880| | 2| 1869| | 5| 1854| | 6| 1797| | 10| 1783| +-----+----------+
注:month_Of_Year是自定义函数
网友评论