Arksentinel简介
https://github.com/arkcontrol/arkcontrol
Arksentinel 是极数云舟科技有限公司开发的分布式哨兵,通过读取配置信息,获取当前运行实例的信息,并且每个哨兵针对每个实例都会创建对应的监控线程, 如果发现实例出现故障,则从配置库中获取备份实例信息,并进行切换,并根据用户设置调用对应脚本进行订制化操作。架构如下:
其中被监控集群可以为:
-
开启GTID的主从架构
-
Galera Cluster
-
多主模式的MGR
-
Arkproxy
-
Arkgate
-
Arketl
哨兵功特点
-
主备切换
- 数据一致的情况下,配置表中address_num越近,替换优先级越高
-
备节点切换
- 配置表中address_num越近,替换优先级越高
-
实例状态检测
-
哨兵集群自身高可用
-
跨机房检测,避免因为网络隔离导致误操作
-
配合Arkproxy可以做到跨机房高可用
failover切换时间说明
QQ20190315-203530.png当前文档部署环境说明
Arksentinel部署启动
1. 获取Arksentinel可执行文件
-
联系极数云舟获取执行文件(sunkai@cloud-ark.com)
-
或者添加微信(15717194004) 咨询
-
公司内部访问 https://git.cloud-ark.com/arkzone/ark-sentinel/tree/dev, target目录即为配置文件和可执行文件,详见路径下的readme.txt
-
获取文件列表
Arkcontrol 平台包: arkcontrol-v1.5.4-beta-community-linux-x86_64.tar.gz (也可能是其他版本)
Arkcontrol 体验版操作手册.pdf
Arksentinel 哨兵软件包: arksentinel.tar.gz
Arksentinel使用说明.pdf
2. 初始化配置库
- 如果基于Arkcontrol平台
根据Arkcontrol 体验版操作手册.pdf搭建好Arkcontrol平台就会自动初始化ark_sentinel及相关表
但是仍需手动配置sentinel_config表(数据格式如下)
# raft_nodes 为哨兵集群信息,根据实际情况配置ip,port等信息,哨兵集群至少3个节点
raft_config.raft_nodes.machine_type 字段用于区分私有或云机器
failover_timeout.failover_timeout 设置failover超时时间,切换超时后会重新发起选举,进行切换操作
# config_cluster 为Arkcontrol配置库信息, 根据情况配置账号信息
哨兵会逐个尝试连接每个实例,如果全部无法连接,则哨兵停止工作
# notifier_config 为配置报警设置,为'{}'时,哨兵不发送报警信息
# odown_timeout在这个时间内只要有一个哨兵认为实例没有挂掉就不会进行failover操作,超过这个时间只要超过半数哨兵任务实例挂掉就进行failover操作
# ping_interval哨兵检测实例心跳间隔,超过5次心跳没有响应,则该哨兵认为实例挂掉
# arksentinel_user,arksentinel_password 为哨兵连接要监控实例使用的账号信息,注意搭建的集群要对这个账号授权
# replication_user,replication_password 为哨兵切换时重新上实例切换master时用的复制或同步账号
# switchover_script,failover_script 分别表示发生switchover,failover时调用的脚本,为空时不调用
mysql> select * from sentinel_config\G;
*************************** 1\. row ***************************
id: 1
raft_config: {"raft_nodes": [{"ip": "10.0.0.111", "port": 7977, "address": "BeiJing", "machine_type": "private"}, {"ip": "10.0.0.89", "port": 7977, "address": "BeiJing", "machine_type": "private"}, {"ip": "10.0.0.86", "port": 7977, "address": "BeiJing", "machine_type": "private"}], "failover_timeout": 50000}
config_cluster: {"config_cluster": [{"ip": "10.0.0.87", "port": 3306, "user": "sunkai", "password": "sunkai"}]}
odown_timeout: 5000
ping_interval: 1000
notifier_config: {}
arksentinel_user: ark_sentinel
arksentinel_password: ark_sentinel
replication_user: replication
replication_password: replication
switchover_script:
failover_script:
update_time: 2019-02-27 10:50:09
create_time: 2019-01-07 22:46:16
- 如果是单独使用Arksentinel,则手动创建库表
create database ark_sentinel;
use ark_sentinel;
// 1\. 哨兵配置表
CREATE TABLE `sentinel_config` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`raft_config` json NOT NULL COMMENT '配置哨兵节点信息',
`config_cluster` json NOT NULL COMMENT '报错配置信息的配置库列表',
`odown_timeout` int(11) NOT NULL DEFAULT '5000' COMMENT '在这个时间内只要有一个哨兵认为实例没有挂掉就不会进行failover操作,超过这个时间只要超过半数哨兵任务实例挂掉就进行failover操作',
`ping_interval` int(11) NOT NULL DEFAULT '2000' COMMENT '哨兵检测实例心跳间隔,超过5次心跳没有响应则任务实例挂掉',
`notifier_config` json NOT NULL COMMENT '指定报警设置的数据库实例信息',
`arksentinel_user` varchar(255) NOT NULL DEFAULT '' COMMENT '哨兵连接数据库实例用的账号',
`arksentinel_password` varchar(255) NOT NULL DEFAULT '' COMMENT '哨兵连接数据库实例用的密码',
`replication_user` varchar(255) NOT NULL DEFAULT '' COMMENT '集群的复制或同步账号,哨兵切换重新指定master时使用这个复制数据',
`replication_password` varchar(255) NOT NULL DEFAULT '' COMMENT '集群的复制或同步密码',
`switchover_script` varchar(300) NOT NULL DEFAULT '' COMMENT '调用用户自定义switchover脚本,为空时不调用',
`failover_script` varchar(300) NOT NULL DEFAULT '' COMMENT '调用用户自定义failover脚本,为空时不调用',
`update_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
`create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4;
demo数据:
mysql> select * from sentinel_config\G;
*************************** 1\. row ***************************
id: 1
raft_config: {"raft_nodes": [{"ip": "10.0.0.111", "port": 7977, "address": "BeiJing", "machine_type": "private"}, {"ip": "10.0.0.89", "port": 7977, "address": "BeiJing", "machine_type": "private"}, {"ip": "10.0.0.86", "port": 7977, "address": "BeiJing", "machine_type": "private"}], "failover_timeout": 50000}
config_cluster: {"config_cluster": [{"ip": "10.0.0.87", "port": 3306, "user": "sunkai", "password": "sunkai"}]}
odown_timeout: 5000
ping_interval: 1000
notifier_config: {"enable": true, "configdb": [{"db": "arkmate", "ip": "10.0.0.87", "port": 3306, "user": "sunkai", "password": "sunkai"}], "receivers": ["sunkai@cloud-ark.com"]}
arksentinel_user: ark_sentinel
arksentinel_password: ark_sentinel
replication_user: replication
replication_password: replication
switchover_script:
failover_script:
update_time: 2019-02-27 10:50:09
create_time: 2019-01-07 22:46:16
// 2\. 高可用切换日志, 记录高可用切换操作信息,不用插入数据
CREATE TABLE `ha_log` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`namespace` varchar(100) NOT NULL COMMENT '集群标志名称, 从arkmate.mate_mysql_cluster通过cluster_name获取',
`operation` varchar(100) NOT NULL COMMENT '具体操作类型:Failover,MarkNormal,MarkMaintain,MarkOnline,AddNode,DelNode,Promotion,ForceMaintain',
`op_ip` varchar(300) NOT NULL COMMENT '操作实例ip',
`op_port` int(11) NOT NULL COMMENT '操作实例port',
`replace_ip` varchar(300) NOT NULL DEFAULT '' COMMENT 'promotion,failover等操作替换实例ip',
`replace_port` int(11) NOT NULL DEFAULT '0' COMMENT 'promotion,failover等操作替换实例port',
`role_flag` varchar(100) NOT NULL DEFAULT '' COMMENT '节点角色,master为写节点,standby为读节点并在写节点挂掉时提升为写,statistic为离线分析节点',
`type` varchar(100) NOT NULL DEFAULT 'Base' COMMENT '哨兵架构类型Base,MasterSlave,GaleraCluster,Arkgate,ETL',
`create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (`id`),
KEY `idx_namespace` (`namespace`),
KEY `idx_create_time` (`create_time`,`operation`)
) ENGINE=InnoDB AUTO_INCREMENT=1DEFAULT CHARSET=utf8mb4;
// 3\. 主从架构高可用切换配置表,Arkcontrol页面操作即可插入对应数据
CREATE TABLE `mysql_ms_info` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`name` varchar(500) NOT NULL COMMENT '哨兵的唯一名称,统一为 ip_port@namespace',
`namespace` varchar(100) NOT NULL COMMENT '集群标志名称',
`ip` varchar(15) NOT NULL COMMENT '实例ip',
`port` int(10) unsigned NOT NULL COMMENT '实例port',
`online_status` enum('offline','online') NOT NULL DEFAULT 'offline' COMMENT '标记示例是否在线,offline的节点不在受到监控',
`operation_status` enum('problem','normal','maintain','force_online') NOT NULL DEFAULT 'normal' COMMENT '标记节点状态',
`operation_comment` varchar(500) NOT NULL COMMENT '记录操作信息',
`role_flag` enum('master','statistic','standby') NOT NULL DEFAULT 'standby' COMMENT '节点角色,master为写节点,standby为读节点并在写节点挂掉时提升为写,statistic为离线分析节点',
`weight` int(10) unsigned NOT NULL DEFAULT '100' COMMENT '标记节点权重,将来用于划分query的比重,目前未使用',
`delay_threshold` int(10) unsigned DEFAULT '600' COMMENT '用于监控延迟情况,目前未使用',
`is_switching` tinyint(1) unsigned NOT NULL DEFAULT '0' COMMENT '标记是否正在切换',
`warning_ignored` tinyint(1) unsigned DEFAULT '0' COMMENT '是否忽略报警,目前未使用',
`warning_deadline_time` timestamp NOT NULL DEFAULT '1971-01-02 00:00:00' COMMENT '报警屏蔽截止时间,目前未使用',
`address` varchar(300) NOT NULL DEFAULT '' COMMENT '标记节点位置',
`address_num` int(11) NOT NULL DEFAULT '0' COMMENT '标记位置信息,数字约相近,表示两个节点距离越近',
`rw_domain` varchar(2000) NOT NULL DEFAULT '' COMMENT '实例绑定的写域名,可能有多个,英文逗号分隔',
`read_domain` varchar(2000) NOT NULL DEFAULT '' COMMENT '实例绑定的读域名,可能有多个,英文逗号分隔',
`domain_list` varchar(2000) NOT NULL DEFAULT '' COMMENT '实例绑定的域名,可能有多个,英文逗号分隔',
`update_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
`create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (`id`),
UNIQUE KEY `uniq_namespace_ip` (`namespace`,`ip`),
UNIQUE KEY `uniq_ip_port` (`ip`,`port`),
KEY `idx_name` (`name`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4
//4\. MGR架构高可用切换配置表,Arkcontrol页面操作即可插入对应数据
CREATE TABLE `mysql_mgr_info` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`name` varchar(500) NOT NULL COMMENT '哨兵的唯一名称,统一为 ip_port@namespace',
`namespace` varchar(100) NOT NULL COMMENT '集群标志名称',
`ip` varchar(15) NOT NULL COMMENT '实例ip',
`port` int(10) unsigned NOT NULL COMMENT '实例port',
`online_status` enum('offline','online') NOT NULL DEFAULT 'offline' COMMENT '标记示例是否在线,offline的节点不在受到监控',
`operation_status` enum('problem','normal','maintain','force_online') NOT NULL DEFAULT 'normal' COMMENT '标记节点状态',
`operation_comment` varchar(500) NOT NULL COMMENT '记录操作信息',
`role_flag` enum('master','statistic','standby') NOT NULL DEFAULT 'standby' COMMENT '节点角色,master为写节点,standby为读节点并在写节点挂掉时提升为写,statistic为离线分析节点',
`weight` int(10) unsigned NOT NULL DEFAULT '100' COMMENT '标记节点权重,将来用于划分query的比重,目前未使用',
`delay_threshold` int(10) unsigned DEFAULT '600' COMMENT '用于监控延迟情况,目前未使用',
`is_switching` tinyint(1) unsigned NOT NULL DEFAULT '0' COMMENT '标记是否正在切换',
`warning_ignored` tinyint(1) unsigned DEFAULT '0' COMMENT '是否忽略报警,目前未使用',
`warning_deadline_time` timestamp NOT NULL DEFAULT '1971-01-02 00:00:00' COMMENT '报警屏蔽截止时间,目前未使用',
`address` varchar(300) NOT NULL DEFAULT '' COMMENT '标记节点位置',
`address_num` int(11) NOT NULL DEFAULT '0' COMMENT '标记位置信息,数字约相近,表示两个节点距离越近',
`rw_domain` varchar(2000) NOT NULL DEFAULT '' COMMENT '实例绑定的写域名,可能有多个,英文逗号分隔',
`read_domain` varchar(2000) NOT NULL DEFAULT '' COMMENT '实例绑定的读域名,可能有多个,英文逗号分隔',
`domain_list` varchar(10000) NOT NULL DEFAULT '' COMMENT '实例绑定的域名,可能有多个,英文逗号分隔',
`update_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
`create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (`id`),
UNIQUE KEY `uniq_namespace_ip` (`namespace`,`ip`),
UNIQUE KEY `uniq_ip_port` (`ip`,`port`),
KEY `idx_name` (`name`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4
3. 启动Arksentinel
将arksentinel.tar.gz传送到目标机器(至少要3台),并解压
# 运行./ark-sentinel -h查看帮助信息
./ark-sentinel -h
Usage of ./ark-sentinel:
-env string
run env beta/prod (default "beta") //prod连接ark_sentinel库, beta连接ark_sentinel_beta库
-ip string
config_server ip (default "10.0.0.87") // 配置库ip
-pass string
config_server password (default "sunkai") //配置库密码
-port int
config_server port (default 3306) // 配置库端口
-sentinel_info
show sentinels info and exist // 可以查看哨兵集群其他节点状态
-user string
config_server user (default "sunkai") // 连接配置库用户
-v show Arksentinel version info and exist // 查看配置库版本
分别在各个目标机器运行ark-sentinel(demo如下)
./ark-sentinel -ip 10.0.0.87 -port 3306 -user sunkai -pass sunkai -env prod
查看哨兵运行状态
[root@acme ark_sentinel]# ./ark-sentinel -ip 10.0.0.87 -port 3306 -user sunkai -pass sunkai -env prod -sentinel_info
start sentinel
sentinel 10.0.0.111_7977 is ok
sentinel 10.0.0.89_7977 is ok
sentinel 10.0.0.86_7977 is ok
主从架构
1. 部署主从架构集群
在Arkcontrol平台首先录入搭建集群机器
然后在Arkcontrol平台的部署中心部署主从集群
image.png2. 搭建完成后登录主库10.0.0.131_3306对哨兵授权
登录主库10.0.0.131_3306
对设置的哨兵账号授权
// sentinel_config 中设置的replication_user, 如果与部署时设置的复制账号相同,则不需要再次对replication_user授权
GRANT REPLICATION SLAVE, REPLICATION CLIENT ON *.* TO 'replication'@'%' IDENTIFIED BY 'replication';
// sentinel_config 中设置的arksentinel_user
GRANT RELOAD, SUPER, REPLICATION CLIENT, SELECT ON *.* TO 'ark_sentinel'@'10.0.0.%' IDENTIFIED BY 'ark_sentinel';
3. 添加哨兵监控节点
- 可以通过页面添加
添加master节点(可以读写)
添加standby节点(用于读,并且在master节点挂掉后会替换master节点,本节点挂掉会寻找standby或者master节点替换)
添加statistic节点(仅仅用于离线读,发生故障的话只是标记下线,不会发生切换)
将所有节点上线,哨兵开始监控各个上线节点
image.png哨兵log/sentinel.log 显示如下内容
// 开始对10.0.0.131_3306监控
time="2019-03-05T12:01:05+08:00" level=info msg="add sentinel 10.0.0.131_3306@ms_demo1" _source="sentinel/sentinel_manager.go:134"
// 开始对10.0.0.132_3306监控
time="2019-03-05T12:01:05+08:00" level=info msg="add sentinel 10.0.0.132_3306@ms_demo1" _source="sentinel/sentinel_manager.go:134"
//开始对ms_demo1集群状态进行监控(主要是master节点read_only是否为off, slave节点read_only是否为on以及slave status状态是否正常)
time="2019-03-05T12:01:05+08:00" level=info msg="start ms_demo1 checker" _source="checker/base_checker.go:120"
// 打印license过期时间
time="2019-03-05T12:01:20+08:00" level=info msg="license will timeout at 2021-01-16 10:56:30.773275816 +0800 CST" _source="sentinel/sentinel_manager.go:110"
// 哨兵每间隔15s会读取数据库,判断是否有需要上下线的哨兵
time="2019-03-05T12:01:20+08:00" level=info msg="old sentinels 10.0.0.131_3306@ms_demo1 10.0.0.132_3306@ms_demo1 ; new sentinels 10.0.0.131_3306@ms_demo1 10.0.0.132_3306@ms_demo1 10.0.0.133_3306@ms_demo1 " _source="sentinel/sentinel_manager.go:219"
// 开始对10.0.0.133_3306监控
time="2019-03-05T12:01:20+08:00" level=info msg="add sentinel 10.0.0.133_3306@ms_demo1" _source="filename/filename.go:22"
- 如果没有部署Arkcontrol,也可以直接在对应的表中添加数据即可(根据实际情况调整ip,port以及namespace)
INSERT INTO `mysql_ms_info` VALUES (1,'10.0.0.131_3306@ms_demo1','ms_demo1','10.0.0.131',3306,'online','normal','','master',100,200,0,0,'1971-01-02 05:00:00','BeiJing',0,'','',now(),now()),(2,'10.0.0.132_3306@ms_demo1','ms_demo1','10.0.0.132',3306,'online','normal','','standby',100,200,0,0,'1971-01-02 05:00:00','BeiJing',0,'','',now(),now()),(3,'10.0.0.133_3306@ms_demo1','ms_demo1','10.0.0.133',3306,'online','normal','','statistic',100,200,0,0,'1971-01-02 05:00:00','BeiJing',0,'','',now(),now());
4. 测试主节点挂掉时切换
- 当前集群状态
-
kill 掉 10.0.0.131_3306 进程, 模拟数据库异常宕机
-
切换结果
- log/sentinel.log 日志内容
time="2019-03-05T20:27:59+08:00" level=info msg="delete sentinel 10.0.0.131_3306@ms_demo1" _source="filename/filename.go:22"
- 被选举哨兵的 log/10.0.0.131_3306@ms_demo1.log 日志内容
// 5次连接实例失败则认为节点挂掉(连接间隔为sentinel_config表中ping_interval,单位为ms)
ime="2019-03-05T20:27:50+08:00" level=error msg="try connect 10.0.0.131_3306 err dial tcp 10.0.0.131:3306: connect: connection refused" _source="filename/filename.go:22"
time="2019-03-05T20:27:51+08:00" level=error msg="try connect 10.0.0.131_3306 err dial tcp 10.0.0.131:3306: connect: connection refused" _source="filename/filename.go:22"
time="2019-03-05T20:27:52+08:00" level=error msg="try connect 10.0.0.131_3306 err dial tcp 10.0.0.131:3306: connect: connection refused" _source="filename/filename.go:22"
time="2019-03-05T20:27:53+08:00" level=error msg="try connect 10.0.0.131_3306 err dial tcp 10.0.0.131:3306: connect: connection refused" _source="filename/filename.go:22"
time="2019-03-05T20:27:54+08:00" level=error msg="try connect 10.0.0.131_3306 err dial tcp 10.0.0.131:3306: connect: connection refused" _source="filename/filename.go:22"
// 5次连接失败后,认为节点挂掉,标记节点为 Subject Down,开始咨询其他哨兵情况
time="2019-03-05T20:27:54+08:00" level=warning msg="+SDOWN 10.0.0.131_3306@ms_demo1" _source="filename/filename.go:22"
time="2019-03-05T20:27:54+08:00" level=warning msg="sdown=[{10.0.0.89 7977 BeiJing private}], not_sdown=[{10.0.0.111 7977 BeiJing private}], unknown=[{10.0.0.86 7977 BeiJing private}], quorum=2" _source="filename/filename.go:22"
time="2019-03-05T20:27:54+08:00" level=warning msg="sdown=[{10.0.0.89 7977 BeiJing private}], not_sdown=[{10.0.0.111 7977 BeiJing private}], unknown=[{10.0.0.86 7977 BeiJing private}], quorum=2" _source="filename/filename.go:22"
time="2019-03-05T20:27:54+08:00" level=warning msg="sdown=[{10.0.0.89 7977 BeiJing private}], not_sdown=[{10.0.0.111 7977 BeiJing private}], unknown=[{10.0.0.86 7977 BeiJing private}], quorum=2" _source="filename/filename.go:22"
time="2019-03-05T20:27:54+08:00" level=warning msg="sdown=[{10.0.0.89 7977 BeiJing private}], not_sdown=[{10.0.0.111 7977 BeiJing private}], unknown=[{10.0.0.86 7977 BeiJing private}], quorum=2" _source="filename/filename.go:22"
time="2019-03-05T20:27:54+08:00" level=warning msg="sdown=[{10.0.0.89 7977 BeiJing private}], not_sdown=[{10.0.0.111 7977 BeiJing private}], unknown=[{10.0.0.86 7977 BeiJing private}], quorum=2" _source="filename/filename.go:22"
time="2019-03-05T20:27:54+08:00" level=warning msg="sdown=[{10.0.0.89 7977 BeiJing private}], not_sdown=[{10.0.0.111 7977 BeiJing private}], unknown=[{10.0.0.86 7977 BeiJing private}], quorum=2" _source="filename/filename.go:22"
time="2019-03-05T20:27:54+08:00" level=warning msg="sdown=[{10.0.0.89 7977 BeiJing private}], not_sdown=[{10.0.0.111 7977 BeiJing private}], unknown=[{10.0.0.86 7977 BeiJing private}], quorum=2" _source="filename/filename.go:22"
// 超过半数哨兵认为节点SDown,则认为节点为 Object Down
time="2019-03-05T20:27:54+08:00" level=warning msg="sdown=[{10.0.0.111 7977 BeiJing private} {10.0.0.89 7977 BeiJing private}], not_sdown=[], unknown=[{10.0.0.86 7977 BeiJing private}], quorum=2" _source="filename/filename.go:22"
// 标记节点为ODown
time="2019-03-05T20:27:54+08:00" level=error msg="+ODOWN 10.0.0.131_3306@ms_demo1 10.0.0.131 3306" _source="filename/filename.go:22"
// 节点为ODown后发起选举操作,进行切换
time="2019-03-05T20:27:54+08:00" level=info msg="Campaign start" _source="filename/filename.go:22"
time="2019-03-05T20:27:54+08:00" level=info msg="self uuid 10.0.0.89_7977, candidateuuid 10.0.0.89_7977" _source="filename/filename.go:22"
// 选举自己为切换操作者
time="2019-03-05T20:27:54+08:00" level=info msg="Vote SELF 10.0.0.89_7977 on 10.0.0.131_3306@ms_demo1" _source="filename/filename.go:22"
time="2019-03-05T20:27:54+08:00" level=info msg="raft node info after compaign {RaftIP:10.0.0.89 RaftPort:7977 Epoch:0 UUID:10.0.0.89_7977 VotedUUID:10.0.0.89_7977 VotedTime:2019-03-05 20:27:54.533544807 +0800 CST m=+507.328783888 TimeOut:50s Logger:0xc00040db60 RWMutex:{w:{state:0 sema:0} writerSem:0 readerSem:0 readerCount:0 readerWait:0}}" _source="filename/filename.go:22"
// 竞选成功
time="2019-03-05T20:27:54+08:00" level=info msg="Campaign WIN, TimeCost=28.92864ms" _source="filename/filename.go:22"
// 开始进行切换操作
time="2019-03-05T20:27:54+08:00" level=info msg="Failover start" _source="filename/filename.go:22"
// 尝试连接挂掉的节点设置read_only为on,并且通知其他哨兵也尝试连接挂掉的节点设置read_only为on
time="2019-03-05T20:27:55+08:00" level=error msg="try connect 10.0.0.131_3306 err dial tcp 10.0.0.131:3306: connect: connection refused" _source="filename/filename.go:22"
time="2019-03-05T20:27:56+08:00" level=error msg="try connect 10.0.0.131_3306 err dial tcp 10.0.0.131:3306: connect: connection refused" _source="filename/filename.go:22"
time="2019-03-05T20:27:57+08:00" level=error msg="try connect 10.0.0.131_3306 err dial tcp 10.0.0.131:3306: connect: connection refused" _source="filename/filename.go:22"
time="2019-03-05T20:27:57+08:00" level=error msg="10.0.0.89 use ark_sentinel:ark_sentinel conn`t connect 10.0.0.131_3306, error dial tcp 10.0.0.131:3306: connect: connection refused\n\t{10.0.0.111 7977 BeiJing private} connect 10.0.0.131_3306 do sql set global read_only=on error rpc error: code = Unknown desc = 10.0.0.111 use ark_sentinel:ark_sentinel conn`t connect 10.0.0.131_3306, error dial tcp 10.0.0.131:3306: connect: connection refused\n\n{10.0.0.86 7977 BeiJing private} connect 10.0.0.131_3306 do sql set global read_only=on error rpc error: code = Unavailable desc = all SubConns are in TransientFailure, latest connection error: connection error: desc = \"transport: Error while dialing dial tcp 10.0.0.86:7977: connect: connection refused\"\n" _source="filename/filename.go:22"
// 尝试连接挂掉的节点设置read_only为on失败,继续进行后面的操作
time="2019-03-05T20:27:57+08:00" level=info msg="try change readonly result false, time cost 3.03069876s" _source="filename/filename.go:22"
// 保存新master节点的复制信息
time="2019-03-05T20:27:57+08:00" level=info msg="newmaster 10.0.0.132_3306 slave status map[Last_Errno:0 Replicate_Ignore_Server_Ids: Master_TLS_Version: Slave_SQL_Running:No Relay_Log_Space:1039 Last_SQL_Errno:0 Master_Log_File:mysql-bin.000039 Replicate_Wild_Do_Table: Until_Condition:None Read_Master_Log_Pos:538 Relay_Log_Pos:751 Exec_Master_Log_Pos:538 Until_Log_Pos:0 Master_SSL_Allowed:No Master_SSL_Key: Master_UUID:210382f3-3ef6-11e9-b77a-0800277107ff Auto_Position:1 Replicate_Ignore_DB: Until_Log_File: Master_Bind: Master_SSL_Crlpath: Replicate_Ignore_Table: Master_Server_Id:7195 Master_SSL_Crl: Relay_Log_File:relay-bin.000076 Replicate_Do_Table: Skip_Counter:0 Master_Retry_Count:86400 Master_SSL_CA_File: Last_IO_Error_Timestamp: Replicate_Do_DB: Master_SSL_CA_Path: Slave_IO_State: Relay_Master_Log_File:mysql-bin.000039 Master_SSL_Cert: Master_SSL_Verify_Server_Cert:No Last_IO_Errno:0 Last_SQL_Error: SQL_Remaining_Delay: Master_Port:3306 Connect_Retry:60 Slave_IO_Running:No Master_SSL_Cipher: Seconds_Behind_Master: Executed_Gtid_Set:210382f3-3ef6-11e9-b77a-0800277107ff:1-6 Replicate_Rewrite_DB: Channel_Name: Master_User:replication Master_Info_File:mysql.slave_master_info Last_SQL_Error_Timestamp: Last_Error: SQL_Delay:0 Slave_SQL_Running_State: Replicate_Wild_Ignore_Table: Master_Host:10.0.0.131 Last_IO_Error: Retrieved_Gtid_Set:210382f3-3ef6-11e9-b77a-0800277107ff:1-6]\n" _source="filename/filename.go:22"
// 切换master节点状态成功
time="2019-03-05T20:27:57+08:00" level=info msg="10.0.0.131_3306@ms_demo1 切换到 10.0.0.132:3306@ms_demo1 MASTER Failover Successed" _source="filename/filename.go:22"
time="2019-03-05T20:27:58+08:00" level=info msg="FinishFailover result=success" _source="filename/filename.go:22"
// 切换完成后,通知其他哨兵,切换完成,初始化选举,切换状态,如果有哨兵通信异常,则报错误日志
time="2019-03-05T20:27:58+08:00" level=error msg="CallFinishFailover {10.0.0.86 7977 BeiJing private} error rpc error: code = Unavailable desc = all SubConns are in TransientFailure, latest connection error: connection error: desc = \"transport: Error while dialing dial tcp 10.0.0.86:7977: connect: connection refused\" " _source="filename/filename.go:22"
time="2019-03-05T20:27:58+08:00" level=info msg="Failover ends, Result success, Msg [OK] ms_demo1 Master Failover Successed 10.0.0.131_3306 replaced by 10.0.0.132_3306 , TimeCost 3.493220206s" _source="filename/filename.go:22"
// 完成切换后,因为关闭哨兵还需要一段时间,因为还会报错连接失败
time="2019-03-05T20:27:58+08:00" level=error msg="try connect 10.0.0.131_3306 err dial tcp 10.0.0.131:3306: connect: connection refused" _source="filename/filename.go:22"
time="2019-03-05T20:27:59+08:00" level=warning msg="sdown=[{10.0.0.111 7977 BeiJing private} {10.0.0.89 7977 BeiJing private}], not_sdown=[], unknown=[{10.0.0.86 7977 BeiJing private}], quorum=2" _source="filename/filename.go:22"
time="2019-03-05T20:27:59+08:00" level=error msg="try connect 10.0.0.131_3306 err dial tcp 10.0.0.131:3306: connect: connection refused" _source="filename/filename.go:22"
// 哨兵成功关闭
time="2019-03-05T20:27:59+08:00" level=info msg="MasterSlave 10.0.0.131_3306@ms_demo1 stopped" _source="filename/filename.go:22"
- 其他哨兵的 log/10.0.0.131_3306@ms_demo1.log 日志内容(去除重复)
// 选举其他节点做切换操作
time="2019-03-05T20:27:54+08:00" level=info msg="Vote OTHER 10.0.0.89_7977 on 10.0.0.131_3306@ms_demo1" source="raft/node.go:84"
// 等待操作节点完成
time="2019-03-05T20:27:57+08:00" level=info msg="FinishFailover result=success" source="sentinel/base_sentinel.go:328"
// 切换完成后关闭哨兵
time="2019-03-05T20:28:06+08:00" level=info msg="MasterSlave 10.0.0.131_3306@ms_demo1 stopped" source="sentinel/base_sentinel.go:65"
5. 节点重新上线
-
重新启动10.0.0.131_3306 实例
-
开启10.0.0.131_3306对新的主节点10.0.0.132_3306的同步(否则无法将此节点 标记为正常)
- 开始同步新的主节点后标记为正常
也可以通过切换脚本进行
./maintain -ip 10.0.0.131 -port 3306 -role standby -namespace ms_demo1 -op MarkNormal -rpcport 7977 -type MasterSlave
- 上线该节点
也可以通过切换脚本进行
./maintain -ip 10.0.0.131 -port 3306 -role standby -namespace ms_demo1 -op MarkOnline -rpcport 7977 -type MasterSlave
6. 10.0.0.131_3306节点重新提升为写
也可以通过切换脚本进行
./maintain -ip 10.0.0.131 -port 3306 -role standby -namespace ms_demo1 -op Promotion -rpcport 7977 -type MasterSlave
切换后10.0.0.131_3306重新变为master节点
image.png7. 读节点down掉
-
kill掉10.0.0.132_3306进程
-
查看log/sentinel.log
time="2019-03-06T16:14:16+08:00" level=error msg="Check 10.0.0.132_3306 gtid error dial tcp 10.0.0.132:3306: connect: connection refused" _source="filename/
filename.go:22"
// 会优先选择相同角色的节点,如果没有相同角色,则使用master节点替换
time="2019-03-06T16:14:21+08:00" level=error msg="no other standby, try get master node" _source="filename/filename.go:22"
time="2019-03-06T16:14:27+08:00" level=info msg="license will timeout at 2021-01-16 10:56:30.773275816 +0800 CST" _source="filename/filename.go:22"
time="2019-03-06T16:14:27+08:00" level=info msg="old sentinels 10.0.0.131_3306@ms_demo1 10.0.0.132_3306@ms_demo1 10.0.0.133_3306@ms_demo1 ; new sentinels 10
.0.0.131_3306@ms_demo1 10.0.0.133_3306@ms_demo1 " _source="filename/filename.go:22"
time="2019-03-06T16:14:27+08:00" level=info msg="delete sentinel 10.0.0.132_3306@ms_demo1" _source="filename/filename.go:22"
- 查看log/10.0.0.132_3306@ms_demo1.log日志
// 跟主节点一样,依然是先标记为SDown,大多数节点认为10.0.0.132_3306挂掉后标记为ODOWN
time="2019-03-06T16:14:21+08:00" level=error msg="+ODOWN 10.0.0.132_3306@ms_demo1 10.0.0.132 3306" _source="filename/filename.go:22"
// 选取操作节点
time="2019-03-06T16:14:21+08:00" level=info msg="Campaign start" _source="filename/filename.go:22"
time="2019-03-06T16:14:21+08:00" level=info msg="self uuid 10.0.0.89_7977, candidateuuid 10.0.0.89_7977" _source="filename/filename.go:22"
time="2019-03-06T16:14:21+08:00" level=info msg="Vote SELF 10.0.0.89_7977 on 10.0.0.132_3306@ms_demo1" _source="filename/filename.go:22"
time="2019-03-06T16:14:21+08:00" level=info msg="raft node info after compaign {RaftIP:10.0.0.89 RaftPort:7977 Epoch:0 UUID:10.0.0.89_7977 VotedUUID:10.0.0.89_7977 VotedTime:2019-03-06 16:14:21.289495208 +0800 CST m=+71694.084734301 TimeOut:50s Logger:0xc0001a87e0 RWMutex:{w:{state:0 sema:0} writerSem:0 readerSem:0 readerCount:0 readerWait:0}}" _source="filename/filename.go:22"
time="2019-03-06T16:14:21+08:00" level=info msg="Campaign WIN, TimeCost=42.129034ms" _source="filename/filename.go:22"
time="2019-03-06T16:14:21+08:00" level=info msg="Failover start" _source="filename/filename.go:22"
// 会优先选择相同角色的节点,如果没有相同角色,则使用master节点替换
time="2019-03-06T16:14:21+08:00" level=info msg="10.0.0.132_3306@ms_demo1 切换到 10.0.0.131:3306@ms_demo1 Standby Failover Successed" _source="filename/filename.go:22"
time="2019-03-06T16:14:21+08:00" level=info msg="FinishFailover result=success" _source="filename/filename.go:22"
time="2019-03-06T16:14:21+08:00" level=error msg="CallFinishFailover {10.0.0.86 7977 BeiJing private} error rpc error: code = Unavailable desc = all SubConns are in TransientFailure, latest connection error: connection error: desc = \"transport: Error while dialing dial tcp 10.0.0.86:7977: connect: connection refused\" " _source="filename/filename.go:22"
time="2019-03-06T16:14:21+08:00" level=info msg="Failover ends, Result success, Msg [OK] ms_demo1 Standy Failover Successed 10.0.0.132_3306 replaced by 10.0.0.131_3306, TimeCost 80.75902ms" _source="filename/filename.go:22"
-
节点下线
image.png
8. statistic节点down掉
-
kill掉statistic节点
-
查看log/sentinel.log
time="2019-03-06T16:21:44+08:00" level=info msg="old sentinels 10.0.0.133_3306@ms_demo1 10.0.0.131_3306@ms_demo1 ; new sentinels 10.0.0.131_3306@ms_demo1 " _source="filename/filename.go:22"
time="2019-03-06T16:21:44+08:00" level=info msg="delete sentinel 10.0.0.133_3306@ms_demo1" _source="filename/filename.go:22"
- 节点会直接下线
9. 集群列表状态显示
- 如果有节点状态不正常,则显示集群异常
-
如果集群没有添加监控,则显示未添加,如上图
-
如果有个别节点没添加监控则显示部分节点添加
10. 删除节点监控
- 将节点标记为维护
- 删除节点
MGR Cluster
1. 搭建MGR集群
- 可以通过Arkcontrol直接搭建
- 也可以手动搭建,注意设置多主模式(group_replication_single_primary_mode = OFF)
2. 搭建完成后登录主节点10.0.0.131_3307对哨兵授权
// sentinel_config 中设置的replication_user, 如果与部署时设置的复制账号相同,则不需要再次对replication_user授权
GRANT REPLICATION SLAVE, REPLICATION CLIENT ON *.* TO 'replication'@'%' IDENTIFIED BY 'replication';
// sentinel_config 中设置的arksentinel_user
GRANT RELOAD, SUPER, REPLICATION CLIENT ON *.* TO 'ark_sentinel'@'10.0.0.%' IDENTIFIED BY 'ark_sentinel';
// 哨兵通过SELECT MEMBER_STATE FROM performance_schema.replication_group_members判断节点是否加入集群
GRANT SELECT ON `performance_schema`.* TO 'ark_sentinel'@'10.0.0.%';
3. 添加哨兵监控信息(如果配置了脚本,注意配置脚本配置, 添加过程同主从架构)
image.png4. 测试主节点挂掉
-
kill掉10.0.0.131_3307节点模拟实例挂掉
-
哨兵切换逻辑同主从架构
-
切换结果如下
Galera Cluster
Arkproxy
Arkgate
Arketl
扩展功能
Arksentinel 支持节点按照距离远近来调整切换优先级,哨兵切换节点时首先对比数据同步情况(主从架构,Galera,MGR默认一致),然后根据节点间距离进行优先排序(两个节点配置的address_num绝对值),距离越近优先级越高.
此外,用户可以在切换时配置调用脚本,进行proxy,vip或者域名等操作(当前提供两种切换方式:基于vip和基于Arkproxy,如果用户想要基于其他方式或者无法通过ifconfig进行vip绑定解绑则需要自己编写)
目前拓展功能只支持手动配置对应架构的数据库表.
基于VIP切换测试
初始化测试环境
- 搭建主从架构集群如下
- 配置切换脚本路径
// 登录哨兵配置库,设置切换脚本,在执行故障切换时会调用failover,手动维护操作时调用switchover
// 用户可以根据下面的接口定义编写符合自己场景的切换脚本
update sentinel_config set switchover_script='./switchover',failover_script='./failover';
// 哨兵故障切换时调用接口如下
-namespace string
cluster namespace: a name to mysql cluster (default "namespace")
-newip string
instance ip which replace old one (default "10.0.0.90")
-newport int
instance port which replace old one (default 3306)
-oldip string
instance ip which can`t work well (default "10.0.0.89")
-oldport int
instance port which can`t work well (default 3306)
-role string
instance role: master standby statistic (default "standby")
-type string
sentinel type:Base, MasterSlave, GaleraCluster, Arkgate, ETL (default "Base")
// 哨兵手动运维切换时调用接口如下
-namespace string
cluster namespace: a name to mysql cluster (default "namespace")
-op string
operation type:MarkNormal, MarkMaintain, AddNode, DelNode, Promotion, ForceMaintain (default "MarkNormal")
-opip string
operation instance ip (default "10.0.0.89")
-opport int
operation instance port (default 3306)
-replaceip string
old master ip when Promotion operation (default "10.0.0.90")
-replaceport int
old master port when Promotion operation (default 3306)
-role string
instance role: master standby statistic (default "standby")
-type string
sentinel type:Base, MasterSlave, GaleraCluster, Arkgate, ETL (default "Base")
- 设置切换配置
// 在ark_sentinel中添加 switch_config表
CREATE TABLE `switch_config` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`namespace` varchar(100) NOT NULL COMMENT '集群标志名称',
`switch_type` enum('vip','arkproxy','domain') NOT NULL DEFAULT 'arkproxy' COMMENT '切换方式,分别为虚拟ip,arkproxy,域名',
`config` json DEFAULT NULL COMMENT '不同切换方式对应不同的配置参数',
`update_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
`create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (`id`),
UNIQUE KEY `uniq_namespace` (`namespace`)
) ENGINE=InnoDB AUTO_INCREMENT=4 DEFAULT CHARSET=utf8mb4
// failover脚本使用ssh登录旧节点 执行 ifconfig down(失败的话则忽略,避免机器down掉影响流程)
// 然后登录新节点执行ifconfig up, 最后在新节点执行arping更新
// 因此需要配置ssh,网卡等信息
insert switch_config values(null,'ms_demo1','vip','{"ssh_config": {"nic": "enp0s3", "netmask": "255.255.255.0", "ssh_port": 22, "ssh_user": "root", "ssh_passwd": "duduadmin1234", "ssh_timeout": 5, "router_address": "10.0.0.1"}}',now(),now());
- 设置各个节点的vip
// 设置写节点的vip为10.0.0.135, 设置address_num为1, address_num越相近,表示两个节点位置越近,可以设置address_num为相同
update mysql_ms_info set address='beijing',address_num=1, rw_domain='10.0.0.135', read_domain='' where name='10.0.0.131_3306@ms_demo1';
// 设置读节点的vip
update mysql_ms_info set address='shanghai',address_num=100, rw_domain='', read_domain='10.0.0.136' where name='10.0.0.132_3306@ms_demo1';
update mysql_ms_info set address='shanghai',address_num=100, rw_domain='', read_domain='10.0.0.137' where name='10.0.0.133_3306@ms_demo1';
update mysql_ms_info set address='tianjin',address_num=20, rw_domain='', read_domain='10.0.0.138' where name='10.0.0.89_3306@ms_demo1';
设置完成后Arkcontrol显示如下
image.png- 配置切换脚本配置文件
# 提供的failover脚本默认使用failover.yaml,配置连接哨兵配置库的信息,读取哨兵的rw_domain,read_domain 以及切换配置表switch_config
ip: 10.0.0.87
port: 3306
user: sunkai
password: sunkai
db: ark_sentinel
# 提供的swtichover脚本默认使用swtichover.yaml,配置连接哨兵配置库的信息,读取哨兵的rw_domain,read_domain 以及切换配置表switch_config
ip: 10.0.0.87
port: 3306
user: sunkai
password: sunkai
db: ark_sentinel
-
重新启动Arksentinel(kill后重新启动即可)
-
当前vip还没有绑定到对应机器,可以通过switchover 的movevip操作进行
// 绑定rw_domain 到 master 节点
./switchover -op MoveVIP -domaintype rw_domain -namespace ms_demo1 -opip 10.0.0.131 -opport 3306 -replaceip 10.0.0.131 -replaceport 3306 -type MasterSlave -domain 10.0.0.135
// 绑定read_domain 到 只读节点
./switchover -op MoveVIP -domaintype read_domain -namespace ms_demo1 -opip 10.0.0.131 -opport 3306 -replaceip 10.0.0.132 -replaceport 3306 -type MasterSlave -domain 10.0.0.136
./switchover -op MoveVIP -domaintype read_domain -namespace ms_demo1 -opip 10.0.0.131 -opport 3306 -replaceip 10.0.0.133 -replaceport 3306 -type MasterSlave -domain 10.0.0.137
./switchover -op MoveVIP -domaintype read_domain -namespace ms_demo1 -opip 10.0.0.131 -opport 3306 -replaceip 10.0.0.89 -replaceport 3306 -type MasterSlave -domain 10.0.0.138
模拟线上业务写入发生down机测试
- 运行测试插入数据的脚本
会每隔0.5s插入一条数据,通过数据中断时间差可以检验切换对业务影响时间
python do_rw_sql.py
脚本do_rw_sql.py内容如下:
#!/usr/bin/python
# coding:utf-8
import MySQLdb
import threading
import time
import datetime
# 设置读写用户信息
USER = "rw_user"
PASSWD = "rw_pass"
# 读写vip
HOST = '10.0.0.135'
PORT = 3306
# 测试用库表
TESTDB = "test"
TESTTABLE = "sql_test"
TableStruct = """
create table {}.{} (
id int NOT NULL AUTO_INCREMENT primary key,
content varchar(300),
create_time TIMESTAMP(3)
)
""".format(TESTDB, TESTTABLE)
# 测试线程数
THREAD_NUM = 1
def RunSql(connection, sql):
cursor = connection.cursor()
cursor.execute(sql)
return cursor.fetchall()
class DoSQLThread(threading.Thread):
def __init__(self, thread_id, interval_time):
threading.Thread.__init__(self)
self.threadID = thread_id
self.interval_time = interval_time
self.connection = MySQLdb.connect(host=HOST, user=USER, passwd=PASSWD, port=PORT, connect_timeout=10)
self.connection.autocommit(1)
def run(self):
while True:
testsql = "insert into {}.{} values(null,'9999','{}')".format(TESTDB, TESTTABLE,
datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3])
try:
res = RunSql(self.connection, testsql)
print res
time.sleep(self.interval_time)
except Exception, e:
try:
print e
time.sleep(self.interval_time)
self.connection = MySQLdb.connect(host=HOST, user=USER, passwd=PASSWD, port=PORT, connect_timeout=10)
self.connection.autocommit(1)
except Exception:
time.sleep(self.interval_time)
def InitTestEnv():
connection = MySQLdb.connect(host=HOST, user=USER, passwd=PASSWD, port=PORT, connect_timeout=10)
sql = "create database if NOT EXISTS {}".format(TESTDB)
RunSql(connection, sql)
sql = "drop table if EXISTS {}.{}".format(TESTDB, TESTTABLE)
RunSql(connection, sql)
RunSql(connection, TableStruct)
connection.close()
if __name__ == "__main__":
InitTestEnv()
thread_list = []
for thread_id in range(THREAD_NUM):
thread = DoSQLThread(thread_id, 0.5)
thread_list.append(thread)
thread.start()
for thread_obj in thread_list:
thread_obj.join()
-
kill掉10.0.0.131_3306模拟数据库down机
-
do_sql脚本输出
()
(2013, 'Lost connection to MySQL server during query')
(2006, 'MySQL server has gone away')
(2006, 'MySQL server has gone away')
(2006, 'MySQL server has gone away')
(2006, 'MySQL server has gone away')
(2006, 'MySQL server has gone away')
(2006, 'MySQL server has gone away')
(2006, 'MySQL server has gone away')
(2006, 'MySQL server has gone away')
(2006, 'MySQL server has gone away')
()
- ark-sentinel 控制台输出
[mysql] 2019/03/07 11:37:59 packets.go:36: read tcp 10.0.0.89:33522->10.0.0.131:3306: read: connection reset by peer
// 会尝试调用各个哨兵rpc接口,设置旧master节点read_only为on
local change fail, try rpc chagne 2019-03-07 11:38:04.060176724 +0800 CST m=+52115.679563400
local change fail, try rpc chagne 2019-03-07 11:38:04.193253361 +0800 CST m=+52115.812640037
local change fail, try rpc chagne 2019-03-07 11:38:04.330489804 +0800 CST m=+52115.949876480
local change fail, try rpc chagne 2019-03-07 11:38:04.462595189 +0800 CST m=+52116.081981875
local change fail, try rpc chagne 2019-03-07 11:38:04.591546481 +0800 CST m=+52116.210933206
local change fail, try rpc chagne 2019-03-07 11:38:04.723728896 +0800 CST m=+52116.343115588
local change fail, try rpc chagne 2019-03-07 11:38:04.852748488 +0800 CST m=+52116.472135246
local change fail, try rpc chagne 2019-03-07 11:38:04.98334034 +0800 CST m=+52116.602727052
local change fail, try rpc chagne 2019-03-07 11:38:05.133631413 +0800 CST m=+52116.753018089
local change fail, try rpc chagne 2019-03-07 11:38:05.295634165 +0800 CST m=+52116.915020841
local change fail, try rpc chagne 2019-03-07 11:38:05.469536216 +0800 CST m=+52117.088922904
local change fail, try rpc chagne 2019-03-07 11:38:05.6345871 +0800 CST m=+52117.253973776
local change fail, try rpc chagne 2019-03-07 11:38:05.819021038 +0800 CST m=+52117.438407712
local change fail, try rpc chagne 2019-03-07 11:38:05.986973698 +0800 CST m=+52117.606360372
local change fail, try rpc chagne 2019-03-07 11:38:06.136234587 +0800 CST m=+52117.755621323
local change fail, try rpc chagne 2019-03-07 11:38:06.263471174 +0800 CST m=+52117.882857860
local change fail, try rpc chagne 2019-03-07 11:38:06.392140548 +0800 CST m=+52118.011527222
local change fail, try rpc chagne 2019-03-07 11:38:06.52626959 +0800 CST m=+52118.145656272
local change fail, try rpc chagne 2019-03-07 11:38:06.657717163 +0800 CST m=+52118.277103839
local change fail, try rpc chagne 2019-03-07 11:38:06.785464854 +0800 CST m=+52118.404851528
local change fail, try rpc chagne 2019-03-07 11:38:06.914982031 +0800 CST m=+52118.534368717
local change fail, try rpc chagne 2019-03-07 11:38:07.049208155 +0800 CST m=+52118.668594831
// 调用用户的设置的failover脚本,进行vip切换
do cmd ./failover -role=master -namespace=ms_demo1 -type=MasterSlave -oldip=10.0.0.131 -oldport=3306 -newip=10.0.0.89 -newport=3306
resulst true, msg do shell ./failover -role=master -namespace=ms_demo1 -type=MasterSlave -oldip=10.0.0.131 -oldport=3306 -newip=10.0.0.89 -newport=3306
ifconfig enp0s3:135 10.0.0.135 netmask 255.255.255.0 down
ifconfig enp0s3:135 10.0.0.135 netmask 255.255.255.0 up
- log/sentinel.log输出
time="2019-03-07T11:38:18+08:00" level=info msg="delete sentinel 10.0.0.131_3306@ms_demo1" _source="filename/filename.go:22"
- 查看集群中test.sql_test 表在切换时间段数据
| 59 | 9999 | 2019-03-07 11:37:57.628 |
| 60 | 9999 | 2019-03-07 11:37:58.134 |
// 切换点,大概中断10s左右
| 61 | 9999 | 2019-03-07 11:37:58.637 |
| 62 | 9999 | 2019-03-07 11:38:08.818 |
| 63 | 9999 | 2019-03-07 11:38:09.325 |
| 64 | 9999 | 2019-03-07 11:38:09.829 |
- 查看Arkcontrol页面
[图片上传失败...(image-e89ea9-1552650690784)]
- 多次切换,会发现一直是与10.0.0.131_3306 address_num 最近的 10.0.0.89_3306 替换为主
基于Arkproxy切换测试
初始测试环境
- 准备MGR集群
- 搭建对应Arkproxy
根据Arkproxy手册搭建对应Arkproxy,并启动
例如在10.0.0.132,10.0.0.133部署,Arkproxy proxy端口为3337, shell端口为3336
demo如下(注意要关闭trace,digest等可能对数据库有写入的操作,避免切换时因为proxy的操作导致集群数据不一致):
[arkproxy]
proxy_backend_passwd = arkproxy
proxy_backend_user = arkproxy
proxy_check_consistend_period = 10
proxy_check_slave_lag_period = 10
proxy_config_host =
proxy_config_passwd =
proxy_config_port = 0
proxy_config_user =
proxy_consistend_max_delay_time = 1000
proxy_digest_trace = OFF
proxy_enable_consistend_read = 0
proxy_flush_thread_num = 5
proxy_format_fullpath = OFF
proxy_format_num_per_time = 10
proxy_format_parameterize = ON
proxy_format_thread_num = 5
proxy_general_log = OFF
proxy_general_log_file = ./log/proxy.log
proxy_incoming_addresses = 10.0.0.132:3336,10.0.0.133:3336
proxy_init_connect =
proxy_license_key_path = ./key/arkproxy.key
proxy_log_error = ./log/error.log
proxy_max_connections = 151
proxy_multi_write_mode = ON
proxy_namespace =
proxy_non_encrypted_ips = 127.0.0.1
proxy_port = 3337
proxy_recv_queue_size = 1000
proxy_send_queue_size = 10
proxy_server_address = 10.0.0.132
proxy_server_heartbeat_period = 100
proxy_server_uuid = 3ccf91f1-40d1-11e9-9720-080027672d82
proxy_shell_listen_port = 3336
proxy_shell_password = sunkai
proxy_shell_username = sunkai
proxy_sql_trace = OFF
proxy_sql_trace_thread_num = 5
proxy_thread_pool_idle_timeout = 60
proxy_thread_pool_max_threads = 65536
proxy_thread_pool_oversubscribe = 3
proxy_thread_pool_size = 100
proxy_thread_pool_stall_limit = 10
proxy_trace_busy_strategy = 0
proxy_trace_busy_wait_time = 500
proxy_trace_client_queue_size = 1000
proxy_trace_flush_thread_sleep_msec = 500
proxy_trace_format_thread_sleep_msec = 500
proxy_trace_hash_mem_length = 134217728
proxy_trace_hash_size = 200
proxy_trace_max_delay_time = 3
proxy_trace_queue_array_length = 500
proxy_trace_sql_bucket_length = 100
proxy_trace_storage = MYSQL
proxy_trace_storage_host =
proxy_trace_storage_passwd =
proxy_trace_storage_port = 0
proxy_trace_storage_user =
proxy_username_encypted = ON
[server1]
proxy_type = server
backend_host = 10.0.0.131
backend_port = 3307
max_slave_lag = 200
server_status = OFFLINE
weight = 40
config_comment = day day up
[server2]
proxy_type = server
backend_host = 10.0.0.132
backend_port = 3307
max_slave_lag = 1000
server_status = ONLINE
weight = 30
[server3]
proxy_type = server
backend_host = 10.0.0.133
backend_port = 3307
max_slave_lag = 1000
server_status = ONLINE
weight = 30
[router1]
proxy_type = router
router_type = readwrite
router_servers = server1
config_comment = xxxxxxxxx
[router2]
proxy_type = router
router_type = readonly
router_servers = server1,server2,server3
- 配置切换脚本路径
// 登录哨兵配置库,设置切换脚本,在执行故障切换时会调用failover,手动维护操作时调用switchover
// 用户可以根据下面的接口定义编写符合自己场景的切换脚本
update sentinel_config set switchover_script='./switchover',failover_script='./failover';
// 哨兵故障切换时调用接口如下
-namespace string
cluster namespace: a name to mysql cluster (default "namespace")
-newip string
instance ip which replace old one (default "10.0.0.90")
-newport int
instance port which replace old one (default 3306)
-oldip string
instance ip which can`t work well (default "10.0.0.89")
-oldport int
instance port which can`t work well (default 3306)
-role string
instance role: master standby statistic (default "standby")
-type string
sentinel type:Base, MasterSlave, GaleraCluster, Arkgate, ETL (default "Base")
// 哨兵手动运维切换时调用接口如下
-namespace string
cluster namespace: a name to mysql cluster (default "namespace")
-op string
operation type:MarkNormal, MarkMaintain, AddNode, DelNode, Promotion, ForceMaintain (default "MarkNormal")
-opip string
operation instance ip (default "10.0.0.89")
-opport int
operation instance port (default 3306)
-replaceip string
old master ip when Promotion operation (default "10.0.0.90")
-replaceport int
old master port when Promotion operation (default 3306)
-role string
instance role: master standby statistic (default "standby")
-type string
sentinel type:Base, MasterSlave, GaleraCluster, Arkgate, ETL (default "Base")
- 插入切换配置
// 在 ark_sentinel库中添加switch_config表
CREATE TABLE `switch_config` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`namespace` varchar(100) NOT NULL COMMENT '集群标志名称',
`switch_type` enum('vip','arkproxy','domain') NOT NULL DEFAULT 'arkproxy' COMMENT '切换方式,分别为虚拟ip,arkproxy,域名',
`config` json DEFAULT NULL COMMENT '不同切换方式对应不同的配置参数',
`update_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
`create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (`id`),
UNIQUE KEY `uniq_namespace` (`namespace`)
) ENGINE=InnoDB AUTO_INCREMENT=7 DEFAULT CHARSET=utf8mb4
// 插入arkproxy对应配置
// proxy_list配置Arkproxy的shell连接信息
insert into switch_config values(null,'mgr_demo1','arkproxy','{"proxy_list": [{"proxy_ip": "10.0.0.132", "proxy_port": 3336, "proxy_user": "sunkai", "proxy_passwd": "sunkai"}, {"proxy_ip": "10.0.0.133", "proxy_port": 3336, "proxy_user": "sunkai", "proxy_passwd": "sunkai"}]}',now(),now());
- 修改do_rw_sql.py中的连接信息(脚本内容见主从架构do_rw_sql.py),连接Arkproxy
USER = "rw_user"
PASSWD = "rw_pass"
HOST = '10.0.0.132'
PORT = 3337
TESTDB = "test"
- 通过Arkproxy对do_rw_sql.py使用账号授权
# 10.0.0.89 为 脚本运行机器ip
grant create,drop,select,insert,update,delete on *.* to 'rw_user'@'10.0.0.89' identified by 'rw_pass';
模拟线上业务写入发生down机测试
- 当前节点状态
- 当前Arkproxy shell状态
mysql> show backend servers;
+----+---------+------------+------+--------+---------------+--------+--------+------------+
| Id | Name | Host | Port | Weight | Max_Slave_Lag | Routed | Status | Comments |
+----+---------+------------+------+--------+---------------+--------+--------+------------+
| 1 | server1 | 10.0.0.131 | 3307 | 40 | 200 | Yes | ONLINE | day day up |
| 2 | server2 | 10.0.0.132 | 3307 | 30 | 1000 | Yes | ONLINE | NULL |
| 3 | server3 | 10.0.0.133 | 3307 | 30 | 1000 | Yes | ONLINE | NULL |
+----+---------+------------+------+--------+---------------+--------+--------+------------+
3 rows in set (0.00 sec)
mysql> show backend routes;
+----+---------+------------+------+------------+-----------+
| Id | Name | Host | Port | Route_Type | Comments |
+----+---------+------------+------+------------+-----------+
| 1 | server1 | 10.0.0.131 | 3307 | Write | xxxxxxxxx |
| 2 | server1 | 10.0.0.131 | 3307 | Read | NULL |
| 3 | server2 | 10.0.0.132 | 3307 | Read | NULL |
| 4 | server3 | 10.0.0.133 | 3307 | Read | NULL |
+----+---------+------------+------+------------+-----------+
4 rows in set (0.00 sec)
- 执行写入脚本(每隔0.5s写入一条数据)
python do_rw_proxy_sql.py
-
kill 节点10.0.0.131_3307 模拟机器实例down机
-
do_rw_proxy_sql.py控制台输出
()
()
//切换过程中发生插入中断
(2006, 'MySQL server has gone away')
(1184, "Aborted connection 9097 to db: 'unconnected' user: 'd86b657c8b0ea0fd' host: '10.0.0.89' (Can not find available backend mysql server)")
(1184, "Aborted connection 9097 to db: 'unconnected' user: 'd86b657c8b0ea0fd' host: '10.0.0.89' (Can not find available backend mysql server)")
(1184, "Aborted connection 9097 to db: 'unconnected' user: 'd86b657c8b0ea0fd' host: '10.0.0.89' (Can not find available backend mysql server)")
(1184, "Aborted connection 9097 to db: 'unconnected' user: 'd86b657c8b0ea0fd' host: '10.0.0.89' (Can not find available backend mysql server)")
(1184, "Aborted connection 9097 to db: 'unconnected' user: 'd86b657c8b0ea0fd' host: '10.0.0.89' (Can not find available backend mysql server)")
(1184, "Aborted connection 9097 to db: 'unconnected' user: 'd86b657c8b0ea0fd' host: '10.0.0.89' (Can not find available backend mysql server)")
(1184, "Aborted connection 9097 to db: 'unconnected' user: 'd86b657c8b0ea0fd' host: '10.0.0.89' (Can not find available backend mysql server)")
(1184, "Aborted connection 9097 to db: 'unconnected' user: 'd86b657c8b0ea0fd' host: '10.0.0.89' (Can not find available backend mysql server)")
(1184, "Aborted connection 9097 to db: 'unconnected' user: 'd86b657c8b0ea0fd' host: '10.0.0.89' (Can not find available backend mysql server)")
(1184, "Aborted connection 9097 to db: 'unconnected' user: 'd86b657c8b0ea0fd' host: '10.0.0.89' (Can not find available backend mysql server)")
(1184, "Aborted connection 9097 to db: 'unconnected' user: 'd86b657c8b0ea0fd' host: '10.0.0.89' (Can not find available backend mysql server)")
()
()
- 切换后节点状态
- Arkproxy状态
// 可以发现原来旧节点下线,会上线添加新的写节点
mysql> show backend servers;
+----+----------------------------------+------------+------+--------+---------------+--------+---------+------------+
| Id | Name | Host | Port | Weight | Max_Slave_Lag | Routed | Status | Comments |
+----+----------------------------------+------------+------+--------+---------------+--------+---------+------------+
| 1 | server1 | 10.0.0.131 | 3307 | 40 | 200 | Yes | OFFLINE | day day up |
| 2 | server2 | 10.0.0.132 | 3307 | 30 | 1000 | Yes | ONLINE | NULL |
| 3 | server3 | 10.0.0.133 | 3307 | 30 | 1000 | Yes | ONLINE | NULL |
| 4 | 5797006bb76008f7b929299e6c5c6132 | 10.0.0.133 | 3307 | 1 | 1000 | Yes | ONLINE | NULL |
+----+----------------------------------+------------+------+--------+---------------+--------+---------+------------+
4 rows in set (0.00 sec)
mysql> show backend routes;
+----+----------------------------------+------------+------+------------+-----------+
| Id | Name | Host | Port | Route_Type | Comments |
+----+----------------------------------+------------+------+------------+-----------+
| 1 | server1 | 10.0.0.131 | 3307 | Write | xxxxxxxxx |
| 2 | 5797006bb76008f7b929299e6c5c6132 | 10.0.0.133 | 3307 | Write | xxxxxxxxx |
| 3 | server1 | 10.0.0.131 | 3307 | Read | NULL |
| 4 | server2 | 10.0.0.132 | 3307 | Read | NULL |
| 5 | server3 | 10.0.0.133 | 3307 | Read | NULL |
+----+----------------------------------+------------+------+------------+-----------+
- 查看test.sql_test在切换时间段数据
| 453 | 9999 | 2019-03-07 23:14:40.399 |
| 460 | 9999 | 2019-03-07 23:14:40.907 |
| 467 | 9999 | 2019-03-07 23:14:41.413 |
// 大概影响了12s写入
| 472 | 9999 | 2019-03-07 23:14:53.704 |
| 479 | 9999 | 2019-03-07 23:14:54.208 |
网友评论