How to fix Alertmanager cluster members constantly joining and disconnecting
I run my monitoring servers as Docker containers hosting Prometheus, Alertmanager, Grafana, and other services. Alertmanager is currently set up with two cluster members for a high-availability setup (more members will definitely be added later for true high availability). However, the Alertmanager cluster members keep flapping: they both join the cluster for a while, and then one of them drops out of the cluster again.
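For context, the membership can also be watched from Alertmanager's v2 API (the endpoint below is the standard status endpoint; localhost:9093 is just the port published in the compose files further down):

curl -s http://localhost:9093/api/v2/status | jq '.cluster.peers'

On each server, the peer list in that output should mirror the join/leave cycle visible in the logs below.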
Alertmanager-1
docker logs -f --tail 1 alertmanager
level=debug ts=2020-10-02T13:15:02.447Z caller=cluster.go:441 component=cluster msg=refresh result=success addr=<ip-removed>:6783
level=debug ts=2020-10-02T13:15:04.421Z caller=cluster.go:306 component=cluster memberlist="2020/10/02 13:15:04 [INFO] memberlist: Marking 01EKB1BEPKRT4VDQ4T1B1SVC1F as failed, suspect timeout reached (0 peer confirmations)\n"
level=debug ts=2020-10-02T13:15:04.422Z caller=delegate.go:236 component=cluster received=NotifyLeave node=01EKB1BEPKRT4VDQ4T1B1SVC1F addr=172.18.0.4:6783
level=debug ts=2020-10-02T13:15:04.422Z caller=cluster.go:492 component=cluster msg="peer left" peer=01EKB1BEPKRT4VDQ4T1B1SVC1F
level=debug ts=2020-10-02T13:15:07.442Z caller=cluster.go:306 component=cluster memberlist="2020/10/02 13:15:07 [DEBUG] memberlist: Failed to join 172.18.0.4: dial tcp 172.18.0.4:6783: connect: connection refused\n"
level=debug ts=2020-10-02T13:15:07.443Z caller=cluster.go:408 component=cluster msg=reconnect result=failure peer=01EKB1BEPKRT4VDQ4T1B1SVC1F addr=172.18.0.4:6783 err="1 error occurred:\n\t* Failed to join 172.18.0.4: dial tcp 172.18.0.4:6783: connect: connection refused\n\n"
level=debug ts=2020-10-02T13:15:09.420Z caller=cluster.go:306 component=cluster memberlist="2020/10/02 13:15:09 [INFO] memberlist: Suspect 01EKB1BEPKRT4VDQ4T1B1SVC1F has failed, no acks received\n"
level=debug ts=2020-10-02T13:15:12.459Z caller=cluster.go:306 component=cluster memberlist="2020/10/02 13:15:12 [DEBUG] memberlist: Stream connection from=<ip-removed>:46990\n"
level=debug ts=2020-10-02T13:15:12.461Z caller=cluster.go:306 component=cluster memberlist="2020/10/02 13:15:12 [WARN] memberlist: Refuting a suspect message (from: 01EKB1J0N7W0T6YT74WFP47TBG)\n"
level=debug ts=2020-10-02T13:15:12.461Z caller=delegate.go:230 component=cluster received=NotifyJoin node=01EKB1BEPKRT4VDQ4T1B1SVC1F addr=172.18.0.4:6783
level=debug ts=2020-10-02T13:15:12.462Z caller=cluster.go:470 component=cluster msg="peer rejoined" peer=01EKB1BEPKRT4VDQ4T1B1SVC1F
level=debug ts=2020-10-02T13:15:14.920Z caller=cluster.go:306 component=cluster memberlist="2020/10/02 13:15:14 [DEBUG] memberlist: Failed ping: 01EKB1BEPKRT4VDQ4T1B1SVC1F (timeout reached)\n"
level=debug ts=2020-10-02T13:15:17.444Z caller=cluster.go:306 component=cluster memberlist="2020/10/02 13:15:17 [DEBUG] memberlist: Initiating push/pull sync with: <ip-removed>:6783\n"
level=debug ts=2020-10-02T13:15:17.447Z caller=cluster.go:441 component=cluster msg=refresh result=success addr=<ip-removed>:6783
level=debug ts=2020-10-02T13:15:22.420Z caller=cluster.go:306 component=cluster memberlist="2020/10/02 13:15:22 [INFO] memberlist: Suspect 01EKB1BEPKRT4VDQ4T1B1SVC1F has failed, no acks received\n"
level=debug ts=2020-10-02T13:15:23.498Z caller=dispatch.go:138 component=dispatcher msg="Received alert" alert=InstanceDown[e5b6eec][active]
level=debug ts=2020-10-02T13:15:23.504Z caller=dispatch.go:138 component=dispatcher msg="Received alert" alert=InstanceDown[e5b6eec][active]
level=debug ts=2020-10-02T13:15:23.505Z caller=dispatch.go:138 component=dispatcher msg="Received alert" alert=InstanceDown[d13bda9][active]
level=debug ts=2020-10-02T13:15:23.920Z caller=cluster.go:306 component=cluster memberlist="2020/10/02 13:15:23 [DEBUG] memberlist: Failed ping: 01EKB1BEPKRT4VDQ4T1B1SVC1F (timeout reached)\n"
level=debug ts=2020-10-02T13:15:24.903Z caller=dispatch.go:473 component=dispatcher aggrGroup="{}/{severity=\"critical\"}:{alertname=\"InstanceDown\"}" msg=flushing alerts="[InstanceDown[d13bda9][active] InstanceDown[e5b6eec][active]]"
level=debug ts=2020-10-02T13:15:26.421Z caller=cluster.go:306 component=cluster memberlist="2020/10/02 13:15:26 [INFO] memberlist: Marking 01EKB1BEPKRT4VDQ4T1B1SVC1F as failed, suspect timeout reached (0 peer confirmations)\n"
level=debug ts=2020-10-02T13:15:26.422Z caller=delegate.go:236 component=cluster received=NotifyLeave node=01EKB1BEPKRT4VDQ4T1B1SVC1F addr=172.18.0.4:6783
level=debug ts=2020-10-02T13:15:26.423Z caller=cluster.go:492 component=cluster msg="peer left" peer=01EKB1BEPKRT4VDQ4T1B1SVC1F
Alertmanager-2
docker logs -f --tail 1 alertmanager
level=debug ts=2020-10-02T13:04:03.923Z caller=cluster.go:306 component=cluster memberlist="2020/10/02 13:04:03 [DEBUG] memberlist: Failed ping: 01EKB1J0N7W0T6YT74WFP47TBG (timeout reached)\n"
level=debug ts=2020-10-02T13:04:11.423Z caller=cluster.go:306 component=cluster memberlist="2020/10/02 13:04:11 [INFO] memberlist: Suspect 01EKB1J0N7W0T6YT74WFP47TBG has failed, no acks received\n"
level=debug ts=2020-10-02T13:04:12.454Z caller=cluster.go:306 component=cluster memberlist="2020/10/02 13:04:12 [DEBUG] memberlist: Initiating push/pull sync with: <ip-removed>:6783\n"
level=debug ts=2020-10-02T13:04:12.457Z caller=cluster.go:441 component=cluster msg=refresh result=success addr=<ip-removed>:6783
level=debug ts=2020-10-02T13:04:12.923Z caller=cluster.go:306 component=cluster memberlist="2020/10/02 13:04:12 [DEBUG] memberlist: Failed ping: 01EKB1J0N7W0T6YT74WFP47TBG (timeout reached)\n"
level=debug ts=2020-10-02T13:04:15.424Z caller=cluster.go:306 component=cluster memberlist="2020/10/02 13:04:15 [INFO] memberlist: Marking 01EKB1J0N7W0T6YT74WFP47TBG as failed, suspect timeout reached (0 peer confirmations)\n"
level=debug ts=2020-10-02T13:04:15.424Z caller=delegate.go:236 component=cluster received=NotifyLeave node=01EKB1J0N7W0T6YT74WFP47TBG addr=172.18.0.2:6783
level=debug ts=2020-10-02T13:04:15.425Z caller=cluster.go:492 component=cluster msg="peer left" peer=01EKB1J0N7W0T6YT74WFP47TBG
level=debug ts=2020-10-02T13:04:17.444Z caller=cluster.go:306 component=cluster memberlist="2020/10/02 13:04:17 [DEBUG] memberlist: Stream connection from=<ip-removed>:54340\n"
level=debug ts=2020-10-02T13:04:17.446Z caller=delegate.go:230 component=cluster received=NotifyJoin node=01EKB1J0N7W0T6YT74WFP47TBG addr=<ip-removed>:6783
level=debug ts=2020-10-02T13:04:17.446Z caller=cluster.go:470 component=cluster msg="peer rejoined" peer=01EKB1J0N7W0T6YT74WFP47TBG
level=debug ts=2020-10-02T13:04:20.423Z caller=cluster.go:306 component=cluster memberlist="2020/10/02 13:04:20 [INFO] memberlist: Suspect 01EKB1J0N7W0T6YT74WFP47TBG has failed, no acks received\n"
level=debug ts=2020-10-02T13:04:21.923Z caller=cluster.go:306 component=cluster memberlist="2020/10/02 13:04:21 [DEBUG] memberlist: Failed ping: 01EKB1J0N7W0T6YT74WFP47TBG (timeout reached)\n"
level=debug ts=2020-10-02T13:04:24.708Z caller=dispatch.go:473 component=dispatcher aggrGroup="{}/{severity=\"critical\"}:{alertname=\"InstanceDown\"}" msg=flushing alerts="[InstanceDown[d13bda9][active] InstanceDown[e5b6eec][active]]"
The alertmanager-1 service in the first server's docker-compose.yml:
alertmanager:
  image: prom/alertmanager:v0.21.0
  container_name: alertmanager
  ports:
    - 9093:9093
    - 6783:6783
  command:
    - '--log.level=debug'
    - '--config.file=/etc/alertmanager/alertmanager_config.yml'
    - '--storage.path=/alertmanager'
    - '--cluster.listen-address=:6783'
    - '--cluster.peer=ip:6783' # ip of the 2nd monitoring server
  volumes:
    - ./alertmanager:/etc/alertmanager
  restart: always
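One thing worth noting about this config: Alertmanager's cluster gossip (HashiCorp memberlist) needs both TCP and UDP on the cluster port, and a Docker mapping written as 6783:6783 publishes TCP only. A sketch of the ports section with the UDP side published as well (assuming the same port number is used for both):

  ports:
    - 9093:9093
    - 6783:6783       # cluster gossip over TCP
    - 6783:6783/udp   # memberlist failure-detector pings travel over UDP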
The alertmanager-2 service on the second server:
alertmanager:
  image: prom/alertmanager:v0.21.0
  container_name: alertmanager
  ports:
    - 9093:9093
    - 6783:6783
  command:
    - '--log.level=debug'
    - '--config.file=/etc/alertmanager/alertmanager_config.yml'
    - '--storage.path=/alertmanager'
    - '--cluster.listen-address=:6783'
    - '--cluster.peer=ip-1:6783' # ip of the 1st monitoring server
  volumes:
    - ./alertmanager:/etc/alertmanager
  restart: always
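Also possibly relevant to the logs above: each container advertises its Docker bridge address (e.g. addr=172.18.0.4:6783), which is not routable between two separate hosts. Alertmanager has a --cluster.advertise-address flag for this case; a sketch of the command section using it, where <public-ip-of-this-server> is a hypothetical placeholder for each host's externally reachable IP:

  command:
    - '--cluster.listen-address=0.0.0.0:6783'
    - '--cluster.advertise-address=<public-ip-of-this-server>:6783' # hypothetical placeholder
    - '--cluster.peer=ip-1:6783' # ip of the other monitoring server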
What could be the problem?