From 3a8d7a43305a64f1585fb3b2aa5357304f4b2edf Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Tue, 3 Jan 2023 16:01:31 -0600 Subject: [PATCH 1/3] Adding custom failover verification process. --- Dockerfile | 2 ++ cmd/failover_validation/main.go | 35 +++++++++++++++++++++++++++++++++ pkg/flypg/repmgr.go | 1 + 3 files changed, 38 insertions(+) create mode 100644 cmd/failover_validation/main.go diff --git a/Dockerfile b/Dockerfile index add7deb0..95f99c85 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,6 +7,8 @@ WORKDIR /go/src/github.com/fly-examples/fly-postgres COPY . . RUN CGO_ENABLED=0 GOOS=linux go build -v -o /fly/bin/event_handler ./cmd/event_handler +RUN CGO_ENABLED=0 GOOS=linux go build -v -o /fly/bin/failover_validation ./cmd/failover_validation + RUN CGO_ENABLED=0 GOOS=linux go build -v -o /fly/bin/standby_cleaner ./cmd/standby_cleaner RUN CGO_ENABLED=0 GOOS=linux go build -v -o /fly/bin/start ./cmd/start COPY ./bin/* /fly/bin/ diff --git a/cmd/failover_validation/main.go b/cmd/failover_validation/main.go new file mode 100644 index 00000000..2addd286 --- /dev/null +++ b/cmd/failover_validation/main.go @@ -0,0 +1,35 @@ +package main + +import ( + "flag" + "fmt" + "os" +) + +func main() { + nodeID := flag.Int("node-id", 0, "the node id") + nodeName := flag.String("node-name", "", "the name of the proposed leader") + visibleNodes := flag.Int("visable-nodes", 0, "Total visible nodes from the perspective of the proposed leader") + totalNodes := flag.Int("total-nodes", 0, "The total number of nodes registered") + flag.Parse() + + fmt.Printf("Verifying failover candidate %d: %s\n", nodeID, *nodeName) + + // If there are no visible nodes, then we can't accept leadership as we are not able to + // confirm a network partition. + if *visibleNodes == 0 { + fmt.Println("Zero visible nodes detected. Enabling read-only mode.") + os.Exit(1) + } + + // TODO - This will remove HA from a 2-node cluster setup until we can workout how to + // differentiate between a down node and a network partition. + + // We have visible nodes, but not enough to meet quorum. + if *visibleNodes < (*totalNodes/2 + 1) { + fmt.Printf("Quorum not met. Total nodes: %d, Visible nodes: %d\n", *totalNodes, *visibleNodes) + os.Exit(1) + } + + os.Exit(0) +} diff --git a/pkg/flypg/repmgr.go b/pkg/flypg/repmgr.go index 48bb92d9..0d03a155 100644 --- a/pkg/flypg/repmgr.go +++ b/pkg/flypg/repmgr.go @@ -135,6 +135,7 @@ func (r *RepMgr) setDefaults() { "event_notifications": "'repmgrd_failover_promote,standby_promote,standby_follow'", "location": r.Region, "primary_visibility_consensus": true, + "failover_validation_command": fmt.Sprintf("'/usr/local/bin/failover_validation -node-id %%n -node-name %%a -visable-nodes %%v -total-nodes %%t'"), } if !r.eligiblePrimary() { From 8f541c95fb41e7e9e35e23623d139ce1edc19fe8 Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Tue, 3 Jan 2023 16:23:12 -0600 Subject: [PATCH 2/3] Cleanup --- cmd/failover_validation/main.go | 13 +++++-------- pkg/flypg/repmgr.go | 2 +- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/cmd/failover_validation/main.go b/cmd/failover_validation/main.go index 2addd286..03564672 100644 --- a/cmd/failover_validation/main.go +++ b/cmd/failover_validation/main.go @@ -7,23 +7,20 @@ import ( ) func main() { - nodeID := flag.Int("node-id", 0, "the node id") - nodeName := flag.String("node-name", "", "the name of the proposed leader") - visibleNodes := flag.Int("visable-nodes", 0, "Total visible nodes from the perspective of the proposed leader") + visibleNodes := flag.Int("visible-nodes", 0, "Total visible nodes from the perspective of the proposed leader") totalNodes := flag.Int("total-nodes", 0, "The total number of nodes registered") flag.Parse() - fmt.Printf("Verifying failover candidate %d: %s\n", nodeID, *nodeName) - // If there are no visible nodes, then we can't accept leadership as we are not able to // confirm a network partition. if *visibleNodes == 0 { - fmt.Println("Zero visible nodes detected. Enabling read-only mode.") + fmt.Println("Zero visible nodes detected.") os.Exit(1) } - // TODO - This will remove HA from a 2-node cluster setup until we can workout how to - // differentiate between a down node and a network partition. + // TODO - This will ultimately remove HA from a 2-node cluster setup. + // This will be the case until we come up with a strategy to differentiate + // between a down node and a network partition. // We have visible nodes, but not enough to meet quorum. if *visibleNodes < (*totalNodes/2 + 1) { diff --git a/pkg/flypg/repmgr.go b/pkg/flypg/repmgr.go index 0d03a155..a3df972e 100644 --- a/pkg/flypg/repmgr.go +++ b/pkg/flypg/repmgr.go @@ -135,7 +135,7 @@ func (r *RepMgr) setDefaults() { "event_notifications": "'repmgrd_failover_promote,standby_promote,standby_follow'", "location": r.Region, "primary_visibility_consensus": true, - "failover_validation_command": fmt.Sprintf("'/usr/local/bin/failover_validation -node-id %%n -node-name %%a -visable-nodes %%v -total-nodes %%t'"), + "failover_validation_command": fmt.Sprintf("'/usr/local/bin/failover_validation -visible-nodes %%v -total-nodes %%t'"), } if !r.eligiblePrimary() { From f956e8a38671ea4e038d1678d1768caecbf2856a Mon Sep 17 00:00:00 2001 From: Shaun Davis Date: Wed, 4 Jan 2023 09:36:03 -0600 Subject: [PATCH 3/3] Cleanup --- cmd/failover_validation/main.go | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/cmd/failover_validation/main.go b/cmd/failover_validation/main.go index 03564672..2a2eefaf 100644 --- a/cmd/failover_validation/main.go +++ b/cmd/failover_validation/main.go @@ -11,20 +11,12 @@ func main() { totalNodes := flag.Int("total-nodes", 0, "The total number of nodes registered") flag.Parse() - // If there are no visible nodes, then we can't accept leadership as we are not able to - // confirm a network partition. - if *visibleNodes == 0 { - fmt.Println("Zero visible nodes detected.") - os.Exit(1) - } - // TODO - This will ultimately remove HA from a 2-node cluster setup. // This will be the case until we come up with a strategy to differentiate // between a down node and a network partition. - // We have visible nodes, but not enough to meet quorum. - if *visibleNodes < (*totalNodes/2 + 1) { - fmt.Printf("Quorum not met. Total nodes: %d, Visible nodes: %d\n", *totalNodes, *visibleNodes) + if *visibleNodes == 0 || *visibleNodes < (*totalNodes/2+1) { + fmt.Printf("Unable to perform failover as quorum can not be met. Total nodes: %d, Visible nodes: %d\n", *totalNodes, *visibleNodes) os.Exit(1) }