美文网首页
mysql sql优化之 优化GROUP BY 和 DISTIN

mysql sql优化之 优化GROUP BY 和 DISTIN

作者: 尹楷楷 | 来源:发表于2020-04-21 11:36 被阅读0次

为严将军头,为嵇侍中血

数据准备

创建表tb_point 表


SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for tb_point
-- ----------------------------
DROP TABLE IF EXISTS `tb_point`;
CREATE TABLE `tb_point`  (
  `id` varchar(36) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL COMMENT '主键',
  `create_by` varchar(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL COMMENT '创建人',
  `create_time` datetime(0) NULL DEFAULT NULL COMMENT '创建日期',
  `update_by` varchar(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL COMMENT '更新人',
  `update_time` datetime(0) NULL DEFAULT NULL COMMENT '更新日期',
  `sys_org_code` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL COMMENT '所属部门',
  `name` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL COMMENT '投放点',
  `address` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL COMMENT '地址',
  `contacts` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL COMMENT '联系人',
  `phone_number` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL COMMENT '联系方式',
  PRIMARY KEY (`id`) USING BTREE,
  INDEX `index_point`(`name`, `create_time`, `address`, `contacts`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_general_ci ROW_FORMAT = Dynamic;

-- ----------------------------
-- Records of tb_point
-- ----------------------------
INSERT INTO `tb_point` VALUES ('1249603071806279681', 'qiyeuser', '2020-04-13 15:39:23', NULL, NULL, 'A03', '东风东路小学', '东风东路756号', '李科', '13688889999');
INSERT INTO `tb_point` VALUES ('1249603416783589378', 'qiyeuser', '2020-04-13 15:40:46', 'qiyeuser', '2020-04-15 12:02:01', 'A03', '广交会展馆', '天河区琶洲广交会展馆', '刘经理', '13467891234');
INSERT INTO `tb_point` VALUES ('1249881947077873666', 'qiyeuser', '2020-04-14 10:07:33', 'qiyeuser', '2020-04-14 11:03:45', 'A03', '414钟ll创建的投放点', '深圳市盐田区', '张恒', '15578464565');
INSERT INTO `tb_point` VALUES ('1249899261022179329', 'qiyeuser', '2020-04-14 11:16:20', NULL, NULL, 'A03', 'zll创建的投放点', '广州市天河区', 'zll', '15512345678');
INSERT INTO `tb_point` VALUES ('1249992791816146945', 'qiyeuser', '2020-04-14 17:28:00', NULL, NULL, 'A03', '2020创建1号投放点', '厦门市', 'Zhang', '1234567899');
INSERT INTO `tb_point` VALUES ('1249996228356214785', 'qiyeuser', '2020-04-14 17:41:39', NULL, NULL, 'A03', '17:42测试', '深圳市宝安区', 'Zhang', '12345678');
INSERT INTO `tb_point` VALUES ('1249996578001784834', 'qiyeuser', '2020-04-14 17:43:03', NULL, NULL, 'A03', '17:43测试', '深圳市宝安区', 'Zhang', '15579487');
INSERT INTO `tb_point` VALUES ('1249997067632250882', 'qiyeuser', '2020-04-14 17:44:59', NULL, NULL, 'A03', '17:45测试', '宝安区', 'zhong', '12345678');

SET FOREIGN_KEY_CHECKS = 1;

准备空的tb_box表


SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for tb_box
-- ----------------------------
DROP TABLE IF EXISTS `tb_box`;
CREATE TABLE `tb_box`  (
  `id` varchar(36) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL COMMENT '主键',
  `create_by` varchar(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL COMMENT '创建人',
  `create_time` datetime(0) NULL DEFAULT NULL COMMENT '创建日期',
  `update_by` varchar(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL COMMENT '更新人',
  `update_time` datetime(0) NULL DEFAULT NULL COMMENT '更新日期',
  `sys_org_code` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL COMMENT '所属部门',
  `status` int(10) NULL DEFAULT 0 COMMENT '状态',
  `number` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL COMMENT '编号',
  `zi_number` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL COMMENT '自编号',
  `house_address` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL COMMENT '仓库地址',
  `sb_number` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL COMMENT '设备id',
  `point_id` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL COMMENT '投放点id',
  `point` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL COMMENT '投放点',
  `confirm` int(32) NULL DEFAULT 0 COMMENT '商户/企业用户确认入库,默认为0(未确认)1是已确认',
  `last_point` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL COMMENT '最近一次投放点名',
  PRIMARY KEY (`id`) USING BTREE,
  INDEX `index_sb_number`(`sb_number`) USING BTREE,
  INDEX `index_number`(`number`) USING BTREE,
  INDEX `index_point_id`(`point_id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_general_ci ROW_FORMAT = Dynamic;

SET FOREIGN_KEY_CHECKS = 1;

函数

CREATE DEFINER=`root`@`%` PROCEDURE `insert_tb_box`(in start int(10),in max_num int(10))
begin
 declare i int default 0;
 /*把autocommit设置成0*/
 set autocommit= 0;
 repeat
 set i=i+1;
 INSERT INTO `test`.`tb_box`(`id`, `create_by`, `create_time`, `update_by`, `update_time`, `sys_org_code`, `status`, `number`, `zi_number`, `house_address`, `sb_number`, `point_id`, `point`, `confirm`, `last_point`) VALUES ((start+i), rand_string(6), now(), rand_string(6), now(), 'A03', 0, rand_string(20), 'A001', '仓库1', rand_string(20), rand_string(20), NULL, 1, rand_string(30));
 until i=max_num end repeat;
commit;
end



CREATE DEFINER=`root`@`localhost` FUNCTION `rand_num`() RETURNS int(5)
begin
   declare i int default 0;
   set i=floor(100+rand()*10);
 return i;
 end



CREATE DEFINER=`root`@`localhost` FUNCTION `rand_string`(n int) RETURNS varchar(255) CHARSET utf8mb4 COLLATE utf8mb4_unicode_ci
begin
   declare chars_str varchar(100) default 'qwertyuiopasdfghjklzxcvbnm';
   declare return_str varchar(255) default '';
   declare i int default 0;
   while i<n do
   set return_str=concat(return_str,substring(chars_str,floor(1+rand()*52),1));
   set i=i+1;
   end while;
   return return_str;
 end

编写存储过程,给tb_box表添加100万条数据


delimiter $$
create procedure insert_tb_box(in start int(10),in max_num int(10))
begin
 declare i int default 0;
 /*把autocommit设置成0*/
 set autocommit= 0;
 repeat
 set i=i+1;
 INSERT INTO `test`.`tb_box`(`id`, `create_by`, `create_time`, `update_by`, `update_time`, `sys_org_code`, `status`, `number`, `zi_number`, `house_address`, `sb_number`, `point_id`, `point`, `confirm`, `last_point`) VALUES ((start+i), rand_string(6), now(), rand_string(6), now(), 'A03', 0, rand_string(20), 'A001', '仓库1', rand_string(20), rand_string(20), NULL, 1, rand_string(30));
 until i=max_num end repeat;
commit;
end $$

-- 生成 一百万条数据
call insert_tb_box(1,1000000);

修改关联数据

update tb_box SET point_id = '1249603071806279681'  WHERE MOD(id,4) = 0
update tb_box SET point_id = '1249603416783589378'  WHERE MOD(id,4) = 1
update tb_box SET point_id = '1249881947077873666'  WHERE MOD(id,4) = 2
update tb_box SET point_id = '1249899261022179329'  WHERE MOD(id,4) = 3

GROUP BY 优化

通常采用标识符做GROUP BY字段性能好于使用其他字段

这是使用 a.id分组优于使用a.name分组

SELECT
    a.id,
    a.NAME,
    count( b.id ) boxCount 
FROM
    tb_point a
    LEFT JOIN tb_box b ON a.id = b.point_id 
GROUP BY
    a.id
ORDER BY
    boxCount DESC

好于

SELECT
    a.id,
    a.NAME,
    count( b.id ) boxCount 
FROM
    tb_point a
    LEFT JOIN tb_box b ON a.id = b.point_id 
GROUP BY
    a.NAME
ORDER BY
    boxCount DESC
使用数据量小表的id作为GRUOPBY字段性能优于使用数据量大表id

尽管这里使用a.id 和b.id 当做分组字段,两个查询查出的数据不同

SELECT
    a.id,
    a.NAME,
    count( b.id ) boxCount 
FROM
    tb_point a
    LEFT JOIN tb_box b ON a.id = b.point_id 
GROUP BY
    a.id
ORDER BY
    boxCount DESC

优于

SELECT
    a.id,
    a.NAME,
    count( b.id ) boxCount 
FROM
    tb_point a
    LEFT JOIN tb_box b ON a.id = b.point_id 
GROUP BY
    b.id
ORDER BY
    boxCount DESC
GRUOPBY和sql_mode的 ONLY_FULL_GROUP_BY冲突

在执行以下语句时会报错:

1055 - Expression #1 of SELECT list is not in GROUP BY clause and contains nonaggregated column 'test.a.id' which is not functionally dependent on columns in GROUP BY clause; this is incompatible with sql_mode=only_full_group_by

  SELECT
    a.id,
    a.NAME,
    a.contacts,
    count( b.id ) boxCount 
FROM
    tb_point a
    LEFT JOIN tb_box b ON a.id = b.point_id 
    GROUP BY
    a.NAME
    

前面在https://www.jianshu.com/p/95e50fd017ea文章中有提到这个问题,是直接修改sql_mode将 ONLY_FULL_GROUP_BY直接干掉。但是在《高性能mysql》中有一段话是这样的:

image.png
作者并不提倡这么干,理由是: 当索引改变,或者优化器选择使用不同的优化策略时都可能导致结果不一样。并指出了不要因为偷懒而导致最后查询出现故障。要求始终使用含义明确的语法。这句话有点无法理解,文中提到的“故障”是什么呢?可能要到我遇到之后才会知道吧。。

那么既然指出不要直接修改 sql_mode,那么我们应该如何让冲突的GRUOPBY语句正确执行呢?

文中有提到,可以使用max()和min()函数来实现;但是这种方式使用max和min函数较真的人可能会说这样写的分组查询有问题,确实如此。但是如果更加在乎查询效率,这样做也无可厚非。

  SELECT
    max(a.id),
    a.NAME,
    max(a.contacts),
    count( b.id ) boxCount 
FROM
    tb_point a
    LEFT JOIN tb_box b ON a.id = b.point_id 
    GROUP BY
    a.NAME
    

如果,实在无法接受使用上面那种方式的话,可以这样使用子查询的方式来进行查询:

SELECT
    a.id,
    a.NAME,
    a.contacts,
    IFNULL(b.ct ,0) ct
FROM
    tb_point a
    LEFT JOIN ( SELECT point_id, count( id ) AS ct FROM tb_box GROUP BY point_id ) b ON a.id = b.point_id

书上对于这种方式有描述如下:
这样写更满足关系理论,但是成本有点高,因为子查询需要填充临时表,而子查询中创建的临时表是没有任何索引的。
作者认为这样写对性能有影响。

但是从我测得结果来看,子查询的耗时反而更少。性能反而更佳。这个子查询耗时0.4秒。而使用max方式耗时0.8秒。几乎一倍。我的mysql版本是 5.7.22-log

最终的方案选择还是要以自己测试结果为准。如果理论和实践冲突,我选择相信自己的实践

为了解其中的原因,我们查看它的执行计划:
可见,因为子查询而产生了一层 DERIVED 临时表,但是这个临时表的Extra字段有显示 Using index、key里面显示自建索引。说明用到了索引。这是查询性能可观的一个重要原因吧;


image.png

另外我分别使用 SHOW PROFILE命令查看各部分耗时,对比之下。没看到有哪部分耗时差别特别大,使用JOIN、MAX 耗时比上子查询耗时都差不多是1倍

GRUOP BY的排序优化

有些时候对一没有建立索引的字段,进行GRUOP BY时。会产生Using filesort 文件内排序。因为GRUOP BY是在排序的基础上进行分组的。

如下面sql:

    EXPLAIN
     SELECT
    a.id,
    a.NAME,
    a.contacts,
    count( b.id ) boxCount 
FROM
    tb_point a
    LEFT JOIN tb_box b ON a.id = b.point_id 
    GROUP BY
    a.contacts
 
image.png

如果业务上不对排序有要求。那么就可以禁止GRUOP BY的排序:

指定 ORDER BY NULL

    
    EXPLAIN
     SELECT
    a.id,
    a.NAME,
    a.contacts,
    count( b.id ) boxCount 
FROM
    tb_point a
    LEFT JOIN tb_box b ON a.id = b.point_id 
    GROUP BY
    a.contacts
ORDER BY NULL

这样就把Using filesort给干掉了! 执行时间 1.237


image.png

当然,多数情况是多排序有要求的。此时也可以在GRUOP BY后面使用DESC和ASC关键字,使分组的结果集按需要的方向排序。如下:

    EXPLAIN
     SELECT
    a.id,
    a.NAME,
    a.contacts,
    count( b.id ) boxCount 
FROM
    tb_point a
    LEFT JOIN tb_box b ON a.id = b.point_id 
    GROUP BY
    a.contacts DESC
GROUP BY WITH ROLLUP 优化

分组查询的一个变种就是要求mysql对分组结果再进行一次超级聚合。可以使用GROUP BY WITH ROLLUP 来实现这种逻辑,但可能性能不佳。因为通过查询计划分析出它是使用 Using temporary; Using filesort 来实现的。

EXPLAIN
SELECT
    a.id,
    a.NAME,
    count( b.id ) boxCount 
FROM
    tb_point a
    LEFT JOIN tb_box b ON a.id = b.point_id 
GROUP BY
    a.id  WITH ROLLUP  
image.png

使用WITH ROLLUP,查询时间2.531秒。不使用0.774 秒。

1、所以,很多时候。我们在应用程序中做超级聚合是最好的!

2、当然也可使用UNION ALL 来实现:

SELECT
    a.id,
    a.NAME,
    count( b.id ) boxCount 
FROM
    tb_point a
    LEFT JOIN tb_box b ON a.id = b.point_id 
GROUP BY
    a.id   
UNION ALL
 SELECT NULL id, null name ,COUNT(point_id) boxCount FROM tb_box 
        

时间 0.951秒

image.png

3、还可以通过FROM子句嵌套使用子查询:

SELECT
    a.id,
    a.NAME,
    count( b.id ) boxCount, 
    boxCounts
FROM
    tb_point a
    LEFT JOIN tb_box b ON a.id = b.point_id  JOIN ( SELECT NULL id, null name ,COUNT(point_id) boxCounts FROM tb_box ) tb 
GROUP BY
    a.id

时间0.944秒

image.png

相关文章

网友评论

      本文标题:mysql sql优化之 优化GROUP BY 和 DISTIN

      本文链接:https://www.haomeiwen.com/subject/ulvjihtx.html