diff --git a/changelog/18.0/18.0.0/summary.md b/changelog/18.0/18.0.0/summary.md index c33ef258cec..e1270eaf797 100644 --- a/changelog/18.0/18.0.0/summary.md +++ b/changelog/18.0/18.0.0/summary.md @@ -8,7 +8,7 @@ - [VTOrc flag `--allow-emergency-reparent`](#new-flag-toggle-ers) - **[Deprecations and Deletions](#deprecations-and-deletions)** - [Deleted `k8stopo`](#deleted-k8stopo) - + - [Deleted `vtgr`](#deleted-vtgr) ## Major Changes @@ -26,4 +26,8 @@ By default, VTOrc will be able to run `EmergencyReparentShard`. The users must s #### Deleted `k8stopo` -The `k8stopo` has been deprecated in Vitess 17, also see https://github.com/vitessio/vitess/issues/13298. With Vitess 18 the `k8stopo` has been removed. \ No newline at end of file +The `k8stopo` has been deprecated in Vitess 17, also see https://github.com/vitessio/vitess/issues/13298. With Vitess 18 the `k8stopo` has been removed. + +#### Deleted `vtgr` + +`vtgr` was deprecated in Vitess 17 (see https://github.com/vitessio/vitess/issues/13300). It has been removed in Vitess 18. 
\ No newline at end of file diff --git a/go.mod b/go.mod index 4a3fea3e3c7..36fe98d7ec4 100644 --- a/go.mod +++ b/go.mod @@ -89,9 +89,7 @@ require ( google.golang.org/protobuf v1.28.1 gopkg.in/DataDog/dd-trace-go.v1 v1.47.0 gopkg.in/asn1-ber.v1 v1.0.0-20181015200546-f715ec2f112d // indirect - gopkg.in/gcfg.v1 v1.2.3 gopkg.in/ldap.v2 v2.5.1 - gopkg.in/warnings.v0 v0.1.2 // indirect gotest.tools v2.2.0+incompatible sigs.k8s.io/yaml v1.3.0 ) diff --git a/go.sum b/go.sum index d642b4b7d1c..746b7012d48 100644 --- a/go.sum +++ b/go.sum @@ -1039,8 +1039,6 @@ gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntN gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= -gopkg.in/gcfg.v1 v1.2.3 h1:m8OOJ4ccYHnx2f4gQwpno8nAX5OGOh7RLaaz0pj3Ogs= -gopkg.in/gcfg.v1 v1.2.3/go.mod h1:yesOnuUOFQAhST5vPY4nbZsb/huCgGGXlipJsBn0b3o= gopkg.in/ini.v1 v1.41.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= @@ -1048,8 +1046,6 @@ gopkg.in/ldap.v2 v2.5.1 h1:wiu0okdNfjlBzg6UWvd1Hn8Y+Ux17/u/4nlk4CQr6tU= gopkg.in/ldap.v2 v2.5.1/go.mod h1:oI0cpe/D7HRtBQl8aTg+ZmzFUAvu4lsv3eLXMLGFxWk= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= -gopkg.in/warnings.v0 v0.1.2 h1:wFXVbFY8DY5/xOe1ECiWdKCzZlxgshcYVNkBHstARME= -gopkg.in/warnings.v0 v0.1.2/go.mod h1:jksf8JmL6Qr/oQM2OXTHunEvvTAsrWBLb6OOjuVWRNI= gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 
gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= diff --git a/go/cmd/vtgr/main.go b/go/cmd/vtgr/main.go deleted file mode 100644 index bc403f2aa67..00000000000 --- a/go/cmd/vtgr/main.go +++ /dev/null @@ -1,51 +0,0 @@ -/* -Copyright 2021 The Vitess Authors. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package main - -import ( - "context" - "fmt" - - "github.com/spf13/pflag" - - "vitess.io/vitess/go/acl" - "vitess.io/vitess/go/vt/log" - "vitess.io/vitess/go/vt/servenv" - "vitess.io/vitess/go/vt/vtgr" -) - -const deprecationMsg = "vtgr is deprecated and will be removed in Vitess 18. We recommend using VTOrc with semi-sync replication instead." - -func main() { - fmt.Println(deprecationMsg) - - var clustersToWatch []string - servenv.OnParseFor("vtgr", func(fs *pflag.FlagSet) { - fs.StringSliceVar(&clustersToWatch, "clusters_to_watch", nil, `Comma-separated list of keyspaces or keyspace/shards that this instance will monitor and repair. Defaults to all clusters in the topology. 
Example: "ks1,ks2/-80"`) - - acl.RegisterFlags(fs) - }) - servenv.ParseFlags("vtgr") - - log.Warning(deprecationMsg) - - // openTabletDiscovery will open up a connection to topo server - // and populate the tablets in memory - vtgr := vtgr.OpenTabletDiscovery(context.Background(), nil, clustersToWatch) - vtgr.RefreshCluster() - vtgr.ScanAndRepair() - - // block here so that we don't exit directly - select {} -} diff --git a/go/flags/endtoend/flags_test.go b/go/flags/endtoend/flags_test.go index 61bc1dacfc3..ee24fd6a36d 100644 --- a/go/flags/endtoend/flags_test.go +++ b/go/flags/endtoend/flags_test.go @@ -50,9 +50,6 @@ var ( //go:embed vtgate.txt vtgateTxt string - //go:embed vtgr.txt - vtgrTxt string - //go:embed vttablet.txt vttabletTxt string @@ -92,7 +89,6 @@ var ( "vtaclcheck": vtaclcheckTxt, "vtexplain": vtexplainTxt, "vtgate": vtgateTxt, - "vtgr": vtgrTxt, "vttablet": vttabletTxt, "vttlstest": vttlstestTxt, "vtctld": vtctldTxt, diff --git a/go/flags/endtoend/vtgr.txt b/go/flags/endtoend/vtgr.txt deleted file mode 100644 index 4eaccf56b1f..00000000000 --- a/go/flags/endtoend/vtgr.txt +++ /dev/null @@ -1,91 +0,0 @@ -vtgr is deprecated and will be removed in Vitess 18. We recommend using VTOrc with semi-sync replication instead. -Usage of vtgr: - --abort_rebootstrap Don't allow vtgr to rebootstrap an existing group. - --alsologtostderr log to standard error as well as files - --clusters_to_watch strings Comma-separated list of keyspaces or keyspace/shards that this instance will monitor and repair. Defaults to all clusters in the topology. Example: "ks1,ks2/-80" - --config-file string Full path of the config file (with extension) to use. If set, --config-path, --config-type, and --config-name are ignored. - --config-file-not-found-handling ConfigFileNotFoundHandling Behavior when a config file is not found. (Options: error, exit, ignore, warn) (default warn) - --config-name string Name of the config file (without extension) to search for. 
(default "vtconfig") - --config-path strings Paths to search for config files in. (default [{{ .Workdir }}]) - --config-persistence-min-interval duration minimum interval between persisting dynamic config changes back to disk (if no change has occurred, nothing is done). (default 1s) - --config-type string Config file type (omit to infer config type from file extension). - --consul_auth_static_file string JSON File to read the topos/tokens from. - --db-credentials-file string db credentials file; send SIGHUP to reload this file - --db-credentials-server string db credentials server type ('file' - file implementation; 'vault' - HashiCorp Vault implementation) (default "file") - --db-credentials-vault-addr string URL to Vault server - --db-credentials-vault-path string Vault path to credentials JSON blob, e.g.: secret/data/prod/dbcreds - --db-credentials-vault-role-mountpoint string Vault AppRole mountpoint; can also be passed using VAULT_MOUNTPOINT environment variable (default "approle") - --db-credentials-vault-role-secretidfile string Path to file containing Vault AppRole secret_id; can also be passed using VAULT_SECRETID environment variable - --db-credentials-vault-roleid string Vault AppRole id; can also be passed using VAULT_ROLEID environment variable - --db-credentials-vault-timeout duration Timeout for vault API operations (default 10s) - --db-credentials-vault-tls-ca string Path to CA PEM for validating Vault server certificate - --db-credentials-vault-tokenfile string Path to file containing Vault auth token; token can also be passed using VAULT_TOKEN environment variable - --db-credentials-vault-ttl duration How long to cache DB credentials from the Vault server (default 30m0s) - --db_config string Full path to db config file that will be used by VTGR. - --db_flavor string MySQL flavor override. (default "MySQL56") - --db_port int Local mysql port, set this to enable local fast check. 
- --emit_stats If set, emit stats to push-based monitoring and stats backends - --enable_heartbeat_check Enable heartbeat checking, set together with --group_heartbeat_threshold. - --gr_port int Port to bootstrap a MySQL group. (default 33061) - --group_heartbeat_threshold int VTGR will trigger backoff on inconsistent state if the group heartbeat staleness exceeds this threshold (in seconds). Should be used along with --enable_heartbeat_check. - --grpc_auth_static_client_creds string When using grpc_static_auth in the server, this file provides the credentials to use to authenticate with server. - --grpc_compression string Which protocol to use for compressing gRPC. Default: nothing. Supported: snappy - --grpc_enable_tracing Enable gRPC tracing. - --grpc_initial_conn_window_size int gRPC initial connection window size - --grpc_initial_window_size int gRPC initial window size - --grpc_keepalive_time duration After a duration of this time, if the client doesn't see any activity, it pings the server to see if the transport is still alive. (default 10s) - --grpc_keepalive_timeout duration After having pinged for keepalive check, the client waits for a duration of Timeout and if no activity is seen even after that the connection is closed. (default 10s) - --grpc_max_message_size int Maximum allowed RPC message size. Larger messages will be rejected by gRPC with the error 'exceeding the max size'. (default 16777216) - --grpc_prometheus Enable gRPC monitoring with Prometheus. 
- -h, --help display usage and exit - --keep_logs duration keep logs for this long (using ctime) (zero to keep forever) - --keep_logs_by_mtime duration keep logs for this long (using mtime) (zero to keep forever) - --lock-timeout duration Maximum time for which a shard/keyspace lock can be acquired for (default 45s) - --log_backtrace_at traceLocation when logging hits line file:N, emit a stack trace (default :0) - --log_dir string If non-empty, write log files in this directory - --log_err_stacks log stack traces for errors - --log_rotate_max_size uint size in bytes at which logs are rotated (glog.MaxSize) (default 1887436800) - --logtostderr log to standard error instead of files - --ping_tablet_timeout duration time to wait when we ping a tablet (default 2s) - --pprof strings enable profiling - --purge_logs_interval duration how often try to remove old logs (default 1h0m0s) - --refresh_interval duration Refresh interval to load tablets. (default 10s) - --remote_operation_timeout duration time to wait for a remote operation (default 15s) - --scan_interval duration Scan interval to diagnose and repair. (default 3s) - --scan_repair_timeout duration Time to wait for a Diagnose and repair operation. (default 3s) - --security_policy string the name of a registered security policy to use for controlling access to URLs - empty means allow all for anyone (built-in policies: deny-all, read-only) - --stats_backend string The name of the registered push-based monitoring/stats backend to use - --stats_combine_dimensions string List of dimensions to be combined into a single "all" value in exported stats vars - --stats_common_tags strings Comma-separated list of common tags for the stats backend. It provides both label and values. Example: label1:value1,label2:value2 - --stats_drop_variables string Variables to be dropped from the list of exported variables. 
- --stats_emit_period duration Interval between emitting stats to all registered backends (default 1m0s) - --stderrthreshold severity logs at or above this threshold go to stderr (default 1) - --tablet_manager_grpc_ca string the server ca to use to validate servers when connecting - --tablet_manager_grpc_cert string the cert to use to connect - --tablet_manager_grpc_concurrency int concurrency to use to talk to a vttablet server for performance-sensitive RPCs (like ExecuteFetchAs{Dba,AllPrivs,App}) (default 8) - --tablet_manager_grpc_connpool_size int number of tablets to keep tmclient connections open to (default 100) - --tablet_manager_grpc_crl string the server crl to use to validate server certificates when connecting - --tablet_manager_grpc_key string the key to use to connect - --tablet_manager_grpc_server_name string the server name to use to validate server certificate - --tablet_manager_protocol string Protocol to use to make tabletmanager RPCs to vttablets. (default "grpc") - --topo_consul_lock_delay duration LockDelay for consul session. (default 15s) - --topo_consul_lock_session_checks string List of checks for consul session. (default "serfHealth") - --topo_consul_lock_session_ttl string TTL for consul session. - --topo_consul_watch_poll_duration duration time of the long poll for watch queries. (default 30s) - --topo_etcd_lease_ttl int Lease TTL for locks and leader election. The client will use KeepAlive to keep the lease going. 
(default 30) - --topo_etcd_tls_ca string path to the ca to use to validate the server cert when connecting to the etcd topo server - --topo_etcd_tls_cert string path to the client cert to use to connect to the etcd topo server, requires topo_etcd_tls_key, enables TLS - --topo_etcd_tls_key string path to the client key to use to connect to the etcd topo server, enables TLS - --topo_global_root string the path of the global topology data in the global topology server - --topo_global_server_address string the address of the global topology server - --topo_implementation string the topology implementation to use - --topo_zk_auth_file string auth to use when connecting to the zk topo server, file contents should be :, e.g., digest:user:pass - --topo_zk_base_timeout duration zk base timeout (see zk.Connect) (default 30s) - --topo_zk_max_concurrency int maximum number of pending requests to send to a Zookeeper server. (default 64) - --topo_zk_tls_ca string the server ca to use to validate servers when connecting to the zk topo server - --topo_zk_tls_cert string the cert to use to connect to the zk topo server, requires topo_zk_tls_key, enables TLS - --topo_zk_tls_key string the key to use to connect to the zk topo server, enables TLS - --v Level log level for V logs - -v, --version print binary version - --vmodule moduleSpec comma-separated list of pattern=N settings for file-filtered logging - --vtgr_config string Config file for vtgr. 
diff --git a/go/test/endtoend/cluster/cluster_process.go b/go/test/endtoend/cluster/cluster_process.go index dfcfcbc4947..9c1f879a832 100644 --- a/go/test/endtoend/cluster/cluster_process.go +++ b/go/test/endtoend/cluster/cluster_process.go @@ -145,7 +145,6 @@ type Vttablet struct { MysqlctlProcess MysqlctlProcess MysqlctldProcess MysqlctldProcess VttabletProcess *VttabletProcess - VtgrProcess *VtgrProcess } // Keyspace : Cluster accepts keyspace to launch it @@ -1244,19 +1243,6 @@ func (cluster *LocalProcessCluster) NewVTOrcProcess(config VTOrcConfiguration) * } } -// NewVtgrProcess creates a new VtgrProcess object -func (cluster *LocalProcessCluster) NewVtgrProcess(clusters []string, config string, grPort int) *VtgrProcess { - base := VtctlProcessInstance(cluster.TopoProcess.Port, cluster.Hostname) - base.Binary = "vtgr" - return &VtgrProcess{ - VtctlProcess: *base, - LogDir: cluster.TmpDirectory, - clusters: clusters, - config: config, - grPort: grPort, - } -} - // VtprocessInstanceFromVttablet creates a new vttablet object func (cluster *LocalProcessCluster) VtprocessInstanceFromVttablet(tablet *Vttablet, shardName string, ksName string) *VttabletProcess { return VttabletProcessInstance( diff --git a/go/test/endtoend/cluster/vtgr_process.go b/go/test/endtoend/cluster/vtgr_process.go deleted file mode 100644 index 1960e469489..00000000000 --- a/go/test/endtoend/cluster/vtgr_process.go +++ /dev/null @@ -1,106 +0,0 @@ -/* -Copyright 2021 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -*/ - -package cluster - -import ( - "fmt" - "os" - "os/exec" - "path" - "strings" - "syscall" - "time" - - "vitess.io/vitess/go/vt/log" -) - -// VtgrProcess represents the vtgr process -type VtgrProcess struct { - VtctlProcess - LogDir string - ExtraArgs []string - clusters []string - config string - grPort int - proc *exec.Cmd - exit chan error -} - -// Start starts vtgr process with required arguements -func (vtgr *VtgrProcess) Start(alias string) (err error) { - /* minimal command line arguments: - $ vtgr --topo_implementation etcd2 \ - --topo_global_server_address localhost:2379 \ - --topo_global_root /vitess/global \ - --clusters_to_watch ks/0 - */ - vtgr.proc = exec.Command( - vtgr.Binary, - "--topo_implementation", vtgr.TopoImplementation, - "--topo_global_server_address", vtgr.TopoGlobalAddress, - "--topo_global_root", vtgr.TopoGlobalRoot, - "--tablet_manager_protocol", "grpc", - "--scan_repair_timeout", "50s", - "--clusters_to_watch", strings.Join(vtgr.clusters, ","), - ) - if vtgr.config != "" { - vtgr.proc.Args = append(vtgr.proc.Args, fmt.Sprintf("--config=%s", vtgr.config)) - } - if vtgr.grPort != 0 { - vtgr.proc.Args = append(vtgr.proc.Args, fmt.Sprintf("--gr_port=%d", vtgr.grPort)) - } - vtgr.proc.Args = append(vtgr.proc.Args, vtgr.ExtraArgs...) - errFile, _ := os.Create(path.Join(vtgr.LogDir, fmt.Sprintf("vtgr-stderr-%v.txt", alias))) - vtgr.proc.Stderr = errFile - vtgr.proc.Env = append(vtgr.proc.Env, os.Environ()...) 
- log.Infof("Running vtgr with command: %v", strings.Join(vtgr.proc.Args, " ")) - err = vtgr.proc.Start() - if err != nil { - return - } - - vtgr.exit = make(chan error) - go func() { - if vtgr.proc != nil { - vtgr.exit <- vtgr.proc.Wait() - close(vtgr.exit) - } - }() - - return nil -} - -// TearDown shuts down the running vtgr service -func (vtgr *VtgrProcess) TearDown() error { - if vtgr.proc == nil || vtgr.exit == nil { - return nil - } - // Attempt graceful shutdown with SIGTERM first - _ = vtgr.proc.Process.Signal(syscall.SIGTERM) - - select { - case <-vtgr.exit: - vtgr.proc = nil - return nil - - case <-time.After(10 * time.Second): - vtgr.proc.Process.Kill() - err := <-vtgr.exit - vtgr.proc = nil - return err - } -} diff --git a/go/test/endtoend/vtgr/my.cnf b/go/test/endtoend/vtgr/my.cnf deleted file mode 100644 index 14185182e5a..00000000000 --- a/go/test/endtoend/vtgr/my.cnf +++ /dev/null @@ -1,41 +0,0 @@ -[mysqld] -innodb_log_file_size=4GB -innodb_flush_neighbors=0 -innodb_log_buffer_size=67108864 -innodb_buffer_pool_size=96GB -innodb_buffer_pool_instances=16 -innodb_io_capacity=100 - -log_error_verbosity=3 - -# binlog appliers -slave_parallel_type=LOGICAL_CLOCK -slave_preserve_commit_order=1 -binlog_transaction_dependency_tracking=WRITESET_SESSION -slave_parallel_workers=32 -sync_relay_log=0 -relay_log_recovery=1 - -plugin-load-add='mysql_clone.so' -plugin-load-add='group_replication.so' - -gtid_mode=ON -enforce_gtid_consistency=ON -log_slave_updates=ON -binlog_format=ROW - -# Group replication -loose_group_replication_start_on_boot=OFF -loose_group_replication_bootstrap_group=OFF -# use auto-rejoin instead of expel timeout so that we can remove the group member -# loose_group_replication_member_expel_timeout=0 -loose_group_replication_autorejoin_tries=3 -loose_group_replication_exit_state_action=OFFLINE_MODE -loose_group_replication_communication_debug_options='GCS_DEBUG_BASIC,XCOM_DEBUG_BASIC' -loose_group-replication-recovery-retry-count=3 
-loose-group_replication_ssl_mode = REQUIRED -loose-group_replication_recovery_use_ssl = 1 -loose-group_replication_ip_whitelist = "0.0.0.0/0" - -# Set multi-primary mode -loose-group_replication_single_primary_mode = ON \ No newline at end of file diff --git a/go/test/endtoend/vtgr/test_config.json b/go/test/endtoend/vtgr/test_config.json deleted file mode 100644 index 03cf0e49701..00000000000 --- a/go/test/endtoend/vtgr/test_config.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "MySQLTopologyUser": "orc_client_user", - "MySQLTopologyPassword": "orc_client_user_password", - "MySQLReplicaUser": "vt_repl", - "MySQLReplicaPassword": "", - "InstancePollSeconds": 1, - "MySQLConnectTimeoutSeconds": 50, - "MySQLTopologyReadTimeoutSeconds": 50 -} diff --git a/go/test/endtoend/vtgr/vtgr_test.go b/go/test/endtoend/vtgr/vtgr_test.go deleted file mode 100644 index 64bc5ba655e..00000000000 --- a/go/test/endtoend/vtgr/vtgr_test.go +++ /dev/null @@ -1,366 +0,0 @@ -/* -Copyright 2021 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package vtgr - -import ( - "fmt" - "os" - "os/exec" - "path" - "strconv" - "strings" - "testing" - "time" - - "vitess.io/vitess/go/sqltypes" - - "github.com/stretchr/testify/require" - "gotest.tools/assert" - - "vitess.io/vitess/go/json2" - "vitess.io/vitess/go/test/endtoend/cluster" - topodatapb "vitess.io/vitess/go/vt/proto/topodata" -) - -// To run this test locally on MacOS, set hostname to localhost first: -// $ sudo scutil --set HostName localhost - -func createCluster(t *testing.T, numReplicas int) *cluster.LocalProcessCluster { - keyspaceName := "ks" - shardName := "0" - keyspace := &cluster.Keyspace{Name: keyspaceName} - shard0 := &cluster.Shard{Name: shardName} - hostname := "localhost" - cell1 := "zone1" - tablets := []*cluster.Vttablet{} - clusterInstance := cluster.NewCluster(cell1, hostname) - - os.Setenv("EXTRA_MY_CNF", path.Join(os.Getenv("PWD"), "my.cnf")) - - // Start topo server - err := clusterInstance.StartTopo() - require.NoError(t, err) - - uidBase := 100 - for i := 0; i < numReplicas; i++ { - tablet := clusterInstance.NewVttabletInstance("replica", uidBase+i, cell1) - tablets = append(tablets, tablet) - } - - // Initialize Cluster - shard0.Vttablets = tablets - err = clusterInstance.SetupCluster(keyspace, []cluster.Shard{*shard0}) - require.NoError(t, err) - - // Start MySql - var mysqlCtlProcessList []*exec.Cmd - for _, tablet := range shard0.Vttablets { - proc, err := tablet.MysqlctlProcess.StartProcess() - require.NoError(t, err) - mysqlCtlProcessList = append(mysqlCtlProcessList, proc) - } - - // Wait for mysql processes to start - for _, proc := range mysqlCtlProcessList { - err := proc.Wait() - require.NoError(t, err) - } - for _, tablet := range shard0.Vttablets { - // Reset status, don't wait for the tablet status. 
We will check it later - tablet.VttabletProcess.ServingStatus = "" - tablet.VttabletProcess.DbFlavor = "MysqlGR" - // If we enable backup the GR setup is a bit wacky - tablet.VttabletProcess.SupportsBackup = false - // Start the tablet - err := tablet.VttabletProcess.Setup() - require.NoError(t, err) - } - - // Start vtgr - we deploy vtgr on the tablet node in the test - baseGrPort := 33061 - for i, tablet := range shard0.Vttablets { - tablet.VtgrProcess = clusterInstance.NewVtgrProcess( - []string{fmt.Sprintf("%s/%s", keyspaceName, shardName)}, - path.Join(os.Getenv("PWD"), "test_config.json"), - baseGrPort+i, - ) - } - - for _, tablet := range shard0.Vttablets { - err := tablet.VttabletProcess.WaitForTabletTypes([]string{"NOT_SERVING"}) - require.NoError(t, err) - } - return clusterInstance -} - -func killTablets(t *testing.T, shard *cluster.Shard) { - for _, tablet := range shard.Vttablets { - if tablet.VtgrProcess != nil { - err := tablet.VtgrProcess.TearDown() - require.NoError(t, err) - } - err := tablet.VttabletProcess.TearDown() - require.NoError(t, err) - } -} - -func TestBasicSetup(t *testing.T) { - defer cluster.PanicHandler(t) - clusterInstance := createCluster(t, 2) - keyspace := &clusterInstance.Keyspaces[0] - shard0 := &keyspace.Shards[0] - defer func() { - clusterInstance.Teardown() - killTablets(t, shard0) - }() - for _, tablet := range shard0.Vttablets { - // Until there is a primary, all tablets are replica and should all be NOT_SERVING status - tab := getTablet(t, clusterInstance, tablet.Alias) - assert.Equal(t, tab.Type.String(), "REPLICA") - assert.Equal(t, tablet.VttabletProcess.GetTabletStatus(), "NOT_SERVING") - } - _, err := getPrimaryTablet(t, clusterInstance, keyspace.Name, shard0.Name) - assert.ErrorContains(t, err, "timeout looking for primary tablet") - - tablet1 := shard0.Vttablets[0] - query := `select count(*) - from performance_schema.replication_group_members - where MEMBER_STATE='ONLINE'` - var count int - err = getSQLResult(t, 
tablet1, query, func(values []sqltypes.Value) bool { - cnt, err := values[0].ToInt64() - if err != nil { - return false - } - count = int(cnt) - return true - }) - require.NoError(t, err) - require.NoError(t, err) - // without vtgr, tablet process will not create a mysql group - // and all the nodes are replicas type in NOT_SERVING state - assert.Equal(t, 0, int(count)) -} - -func TestVTGRSetup(t *testing.T) { - defer cluster.PanicHandler(t) - clusterInstance := createCluster(t, 2) - keyspace := &clusterInstance.Keyspaces[0] - shard0 := &keyspace.Shards[0] - defer func() { - clusterInstance.Teardown() - killTablets(t, shard0) - }() - for _, tablet := range shard0.Vttablets { - // Until there is a primary, all tablets are replica and should all be NOT_SERVING status - tab := getTablet(t, clusterInstance, tablet.Alias) - assert.Equal(t, tab.Type.String(), "REPLICA") - assert.Equal(t, tablet.VttabletProcess.GetTabletStatus(), "NOT_SERVING") - } - - // start VTGR processes - for _, tablet := range shard0.Vttablets { - err := tablet.VtgrProcess.Start(tablet.Alias) - require.NoError(t, err) - } - - // VTGR will pick one tablet as the primary - primaryAlias, err := getPrimaryTablet(t, clusterInstance, keyspace.Name, shard0.Name) - require.NoError(t, err) - require.NotEqual(t, nil, primaryAlias) - - tablet1 := shard0.Vttablets[0] - query := `select count(*) - from performance_schema.replication_group_members - where MEMBER_STATE='ONLINE'` - err = getSQLResult(t, tablet1, query, func(values []sqltypes.Value) bool { - cnt, err := values[0].ToInt64() - if err != nil { - return false - } - // VTGR should bootstrap the group and put the replica into the group - return cnt == 2 - }) - require.NoError(t, err) -} - -func TestVTGRWrongPrimaryTablet(t *testing.T) { - defer cluster.PanicHandler(t) - clusterInstance := createCluster(t, 2) - keyspace := &clusterInstance.Keyspaces[0] - shard0 := &keyspace.Shards[0] - defer func() { - clusterInstance.Teardown() - killTablets(t, shard0) - 
}() - for _, tablet := range shard0.Vttablets { - // Until there is a primary, all tablets are replica and should all be NOT_SERVING status - tab := getTablet(t, clusterInstance, tablet.Alias) - assert.Equal(t, tab.Type.String(), "REPLICA") - assert.Equal(t, tablet.VttabletProcess.GetTabletStatus(), "NOT_SERVING") - } - // start VTGR processes - for _, tablet := range shard0.Vttablets { - err := tablet.VtgrProcess.Start(tablet.Alias) - require.NoError(t, err) - } - // VTGR will pick one tablet as the primary - primaryAlias, err := getPrimaryTablet(t, clusterInstance, keyspace.Name, shard0.Name) - require.NoError(t, err) - require.NotEqual(t, nil, primaryAlias) - tablet := shard0.Vttablets[0] - query := `select member_id - from performance_schema.replication_group_members - where member_role='SECONDARY' and member_state='ONLINE'` - var member string - err = getSQLResult(t, tablet, query, func(values []sqltypes.Value) bool { - member = values[0].ToString() - return true - }) - require.NoError(t, err) - query = fmt.Sprintf(`select group_replication_set_as_primary('%s')`, member) - _, err = tablet.VttabletProcess.QueryTabletWithDB(query, "") - require.NoError(t, err) - - // Verify the mysql primary changed, and also the primary tablet changed as well - query = fmt.Sprintf(`select member_role from performance_schema.replication_group_members where member_id='%s'`, member) - err = getSQLResult(t, tablet, query, func(values []sqltypes.Value) bool { - return values[0].ToString() == "PRIMARY" - }) - require.NoError(t, err) - err = verifyPrimaryChange(t, clusterInstance, keyspace.Name, shard0.Name, primaryAlias) - require.NoError(t, err) -} - -func TestVTGRFailover(t *testing.T) { - defer cluster.PanicHandler(t) - clusterInstance := createCluster(t, 3) - keyspace := &clusterInstance.Keyspaces[0] - shard0 := &keyspace.Shards[0] - defer func() { - clusterInstance.Teardown() - killTablets(t, shard0) - }() - for _, tablet := range shard0.Vttablets { - // Until there is a 
primary, all tablets are replica and should all be NOT_SERVING status - tab := getTablet(t, clusterInstance, tablet.Alias) - assert.Equal(t, tab.Type.String(), "REPLICA") - assert.Equal(t, tablet.VttabletProcess.GetTabletStatus(), "NOT_SERVING") - } - // start VTGR processes - for _, tablet := range shard0.Vttablets { - err := tablet.VtgrProcess.Start(tablet.Alias) - require.NoError(t, err) - } - primaryAlias, err := getPrimaryTablet(t, clusterInstance, keyspace.Name, shard0.Name) - require.NoError(t, err) - // VTGR has init the cluster - require.NotEqual(t, "", primaryAlias) - primaryTablet := findTabletByAlias(shard0.Vttablets, primaryAlias) - require.NotNil(t, primaryTablet) - // Wait until there are two nodes in the group - query := `select count(*) from - performance_schema.replication_group_members - where MEMBER_STATE='ONLINE'` - err = getSQLResult(t, primaryTablet, query, func(values []sqltypes.Value) bool { - return values[0].ToString() == "3" - }) - require.NoError(t, err) - - // Now kill the primary - // VTGR should move mysql primary to a different node and change failover primary tablet - err = primaryTablet.VttabletProcess.TearDown() - require.NoError(t, err) - err = verifyPrimaryChange(t, clusterInstance, keyspace.Name, shard0.Name, primaryAlias) - require.NoError(t, err) - // now the primary has changed - primaryAlias, err = getPrimaryTablet(t, clusterInstance, keyspace.Name, shard0.Name) - require.NoError(t, err) - // verify on the _new_ primary node, we are running the mysql primary as well - primaryTablet = findTabletByAlias(shard0.Vttablets, primaryAlias) - require.NotNil(t, primaryTablet) - query = `SELECT count(*) FROM - performance_schema.replication_group_members - WHERE MEMBER_STATE='ONLINE' AND MEMBER_ROLE='PRIMARY' AND MEMBER_PORT=@@port` - err = getSQLResult(t, primaryTablet, query, func(values []sqltypes.Value) bool { - return values[0].ToString() == "1" - }) - require.NoError(t, err) -} - -func getTablet(t *testing.T, cluster 
*cluster.LocalProcessCluster, alias string) *topodatapb.Tablet { - result, err := cluster.VtctlclientProcess.ExecuteCommandWithOutput("GetTablet", alias) - require.NoError(t, err) - var tabletInfo topodatapb.Tablet - err = json2.Unmarshal([]byte(result), &tabletInfo) - require.NoError(t, err) - return &tabletInfo -} - -func findTabletByAlias(tablets []*cluster.Vttablet, alias *topodatapb.TabletAlias) *cluster.Vttablet { - for _, tablet := range tablets { - if tablet.Cell == alias.Cell && strings.HasSuffix(tablet.Alias, strconv.Itoa(int(alias.Uid))) { - return tablet - } - } - return nil -} - -func verifyPrimaryChange(t *testing.T, cluster *cluster.LocalProcessCluster, ks, shard string, old *topodatapb.TabletAlias) error { - timeToWait := time.Now().Add(180 * time.Second) - for time.Now().Before(timeToWait) { - time.Sleep(1 * time.Second) - result, err := cluster.VtctlclientProcess.ExecuteCommandWithOutput("GetShard", fmt.Sprintf("%s/%s", ks, shard)) - require.NoError(t, err) - var shardInfo topodatapb.Shard - err = json2.Unmarshal([]byte(result), &shardInfo) - require.NoError(t, err) - if shardInfo.PrimaryAlias.String() != old.String() { - return nil - } - } - return fmt.Errorf("fail to verify primary change") -} - -func getPrimaryTablet(t *testing.T, cluster *cluster.LocalProcessCluster, ks, shard string) (*topodatapb.TabletAlias, error) { - timeToWait := time.Now().Add(180 * time.Second) - for time.Now().Before(timeToWait) { - time.Sleep(1 * time.Second) - result, err := cluster.VtctlclientProcess.ExecuteCommandWithOutput("GetShard", fmt.Sprintf("%s/%s", ks, shard)) - require.NoError(t, err) - var shardInfo topodatapb.Shard - err = json2.Unmarshal([]byte(result), &shardInfo) - require.NoError(t, err) - if shardInfo.PrimaryAlias != nil { - return shardInfo.PrimaryAlias, nil - } - } - return nil, fmt.Errorf("timeout looking for primary tablet") -} - -func getSQLResult(t *testing.T, tablet *cluster.Vttablet, query string, check func([]sqltypes.Value) bool) error { - 
timeToWait := time.Now().Add(180 * time.Second) - for time.Now().Before(timeToWait) { - time.Sleep(1 * time.Second) - qr, err := tablet.VttabletProcess.QueryTabletWithDB(query, "") - require.NoError(t, err) - if len(qr.Rows) == 1 && check(qr.Rows[0]) { - return nil - } - } - return fmt.Errorf("timeout waiting for sql result") -} diff --git a/go/vt/dbconfigs/credentials.go b/go/vt/dbconfigs/credentials.go index 5a5dbc1c1a1..4e0e5518869 100644 --- a/go/vt/dbconfigs/credentials.go +++ b/go/vt/dbconfigs/credentials.go @@ -61,7 +61,6 @@ var ( "mysqlctld", "vtbackup", "vtcombo", - "vtgr", "vttablet", } ) diff --git a/go/vt/grpcclient/client.go b/go/vt/grpcclient/client.go index d3865c88c84..b2ef0d4fb28 100644 --- a/go/vt/grpcclient/client.go +++ b/go/vt/grpcclient/client.go @@ -56,7 +56,6 @@ var ( "vtctld", "vtgate", "vtgateclienttest", - "vtgr", "vtorc", "vttablet", "vttestserver", diff --git a/go/vt/servenv/servenv.go b/go/vt/servenv/servenv.go index 662e4da5207..1944a39453d 100644 --- a/go/vt/servenv/servenv.go +++ b/go/vt/servenv/servenv.go @@ -425,7 +425,6 @@ func init() { "vtctld", "vtgate", "vtgateclienttest", - "vtgr", "vtorc", "vttablet", "vttestserver", @@ -439,7 +438,6 @@ func init() { "vtcombo", "vtctld", "vtgate", - "vtgr", "vttablet", "vtorc", } { @@ -461,7 +459,6 @@ func RegisterFlagsForTopoBinaries(registerFlags func(fs *pflag.FlagSet)) { "vtctl", "vtctld", "vtgate", - "vtgr", "vttablet", "vttestserver", "zk", diff --git a/go/vt/topo/server.go b/go/vt/topo/server.go index 62162fc06df..1995e8b6ec4 100644 --- a/go/vt/topo/server.go +++ b/go/vt/topo/server.go @@ -174,7 +174,7 @@ var ( } FlagBinaries = []string{"vttablet", "vtctl", "vtctld", "vtcombo", "vtgate", - "vtgr", "vtorc", "vtbackup"} + "vtorc", "vtbackup"} ) func init() { diff --git a/go/vt/vtgr/config/vtgr_config.go b/go/vt/vtgr/config/vtgr_config.go deleted file mode 100644 index 0386bd42541..00000000000 --- a/go/vt/vtgr/config/vtgr_config.go +++ /dev/null @@ -1,604 +0,0 @@ -/* -Copyright 2021 The 
Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package config - -import ( - "encoding/json" - "fmt" - "net/url" - "os" - "regexp" - "strings" - - "gopkg.in/gcfg.v1" - - "vitess.io/vitess/go/vt/vttls" - - "vitess.io/vitess/go/vt/log" -) - -// VTGRConfig is the config for VTGR -type VTGRConfig struct { - DisableReadOnlyProtection bool - BootstrapGroupSize int - MinNumReplica int - BackoffErrorWaitTimeSeconds int - BootstrapWaitTimeSeconds int -} - -var vtgrCfg = newVTGRConfig() - -func newVTGRConfig() *VTGRConfig { - config := &VTGRConfig{ - DisableReadOnlyProtection: false, - BootstrapGroupSize: 5, - MinNumReplica: 3, - BackoffErrorWaitTimeSeconds: 10, - BootstrapWaitTimeSeconds: 10 * 60, - } - return config -} - -// ReadVTGRConfig reads config for VTGR -func ReadVTGRConfig(file string) (*VTGRConfig, error) { - vtgrFile, err := os.Open(file) - if err != nil { - return nil, err - } - decoder := json.NewDecoder(vtgrFile) - err = decoder.Decode(vtgrCfg) - if err != nil { - return nil, err - } - return vtgrCfg, nil -} - -/* - Everything below has been copied over from the VTOrc package -*/ - -var ( - envVariableRegexp = regexp.MustCompile("[$][{](.*)[}]") -) - -const ( - DefaultStatusAPIEndpoint = "/api/status" -) - -const ( - MySQLTopologyMaxPoolConnections = 3 -) - -// Configuration makes for orchestrator configuration input, which can be provided by user via JSON formatted file. 
-// Some of the parameteres have reasonable default values, and some (like database credentials) are -// strictly expected from user. -// TODO(sougou): change this to yaml parsing, and possible merge with tabletenv. -type Configuration struct { - Debug bool // set debug mode (similar to --debug option) - EnableSyslog bool // Should logs be directed (in addition) to syslog daemon? - ListenAddress string // Where orchestrator HTTP should listen for TCP - ListenSocket string // Where orchestrator HTTP should listen for unix socket (default: empty; when given, TCP is disabled) - HTTPAdvertise string // optional, for raft setups, what is the HTTP address this node will advertise to its peers (potentially use where behind NAT or when rerouting ports; example: "http://11.22.33.44:3030") - AgentsServerPort string // port orchestrator agents talk back to - MySQLTopologyUser string // The user VTOrc will use to connect to MySQL instances - MySQLTopologyPassword string // The password VTOrc will use to connect to MySQL instances - MySQLReplicaUser string // User to set on replica MySQL instances while configuring replication settings on them. If set, use this credential instead of discovering from mysql. TODO(sougou): deprecate this in favor of fetching from vttablet - MySQLReplicaPassword string // Password to set on replica MySQL instances while configuring replication settings on them. - MySQLTopologyCredentialsConfigFile string // my.cnf style configuration file from where to pick credentials. 
Expecting `user`, `password` under `[client]` section - MySQLTopologySSLPrivateKeyFile string // Private key file used to authenticate with a Topology mysql instance with TLS - MySQLTopologySSLCertFile string // Certificate PEM file used to authenticate with a Topology mysql instance with TLS - MySQLTopologySSLCAFile string // Certificate Authority PEM file used to authenticate with a Topology mysql instance with TLS - MySQLTopologySSLSkipVerify bool // If true, do not strictly validate mutual TLS certs for Topology mysql instances - MySQLTopologyUseMutualTLS bool // Turn on TLS authentication with the Topology MySQL instances - MySQLTopologyUseMixedTLS bool // Mixed TLS and non-TLS authentication with the Topology MySQL instances - MySQLTopologyTLSMinVersion string // Configures the minimal required TLS version for a topology MySQL instance with TLS. Defaults to TLSv1.2. Options: TLSv1.0, TLSv1.1, TLSv1.2, TLSv1.3. - TLSCacheTTLFactor uint // Factor of InstancePollSeconds that we set as TLS info cache expiry - BackendDB string // EXPERIMENTAL: type of backend db; either "mysql" or "sqlite" - SQLite3DataFile string // when BackendDB == "sqlite", full path to sqlite3 datafile - SkipOrchestratorDatabaseUpdate bool // When true, do not check backend database schema nor attempt to update it. Useful when you may be running multiple versions of orchestrator, and you only wish certain boxes to dictate the db structure (or else any time a different orchestrator version runs it will rebuild database schema) - PanicIfDifferentDatabaseDeploy bool // When true, and this process finds the orchestrator backend DB was provisioned by a different version, panic - RaftEnabled bool // When true, setup orchestrator in a raft consensus layout. 
When false (default) all Raft* variables are ignored - RaftBind string - RaftAdvertise string - RaftDataDir string - DefaultRaftPort int // if a RaftNodes entry does not specify port, use this one - RaftNodes []string // Raft nodes to make initial connection with - ExpectFailureAnalysisConcensus bool - MySQLOrchestratorHost string - MySQLOrchestratorMaxPoolConnections int // The maximum size of the connection pool to the Orchestrator backend. - MySQLOrchestratorPort uint - MySQLOrchestratorDatabase string - MySQLOrchestratorUser string - MySQLOrchestratorPassword string - MySQLOrchestratorCredentialsConfigFile string // my.cnf style configuration file from where to pick credentials. Expecting `user`, `password` under `[client]` section - MySQLOrchestratorSSLPrivateKeyFile string // Private key file used to authenticate with the Orchestrator mysql instance with TLS - MySQLOrchestratorSSLCertFile string // Certificate PEM file used to authenticate with the Orchestrator mysql instance with TLS - MySQLOrchestratorSSLCAFile string // Certificate Authority PEM file used to authenticate with the Orchestrator mysql instance with TLS - MySQLOrchestratorSSLSkipVerify bool // If true, do not strictly validate mutual TLS certs for the Orchestrator mysql instances - MySQLOrchestratorUseMutualTLS bool // Turn on TLS authentication with the Orchestrator MySQL instance - MySQLOrchestratorTLSMinVersion string // Configures the minimal required TLS version for the Orchestrator MySQL instance with TLS. Defaults to TLSv1.2. Options: TLSv1.0, TLSv1.1, TLSv1.2, TLSv1.3. 
- MySQLOrchestratorReadTimeoutSeconds int // Number of seconds before backend mysql read operation is aborted (driver-side) - MySQLOrchestratorRejectReadOnly bool // Reject read only connections https://github.com/go-sql-driver/mysql#rejectreadonly - MySQLConnectTimeoutSeconds int // Number of seconds before connection is aborted (driver-side) - MySQLDiscoveryReadTimeoutSeconds int // Number of seconds before topology mysql read operation is aborted (driver-side). Used for discovery queries. - MySQLTopologyReadTimeoutSeconds int // Number of seconds before topology mysql read operation is aborted (driver-side). Used for all but discovery queries. - MySQLConnectionLifetimeSeconds int // Number of seconds the mysql driver will keep database connection alive before recycling it - DefaultInstancePort int // In case port was not specified on command line - ReplicationLagQuery string // custom query to check on replica lg (e.g. heartbeat table). Must return a single row with a single numeric column, which is the lag. - ReplicationCredentialsQuery string // custom query to get replication credentials. Must return a single row, with two text columns: 1st is username, 2nd is password. This is optional, and can be used by orchestrator to configure replication after primary takeover or setup of co-primary. 
You need to ensure the orchestrator user has the privileges to run this query - DiscoverByShowSlaveHosts bool // Attempt SHOW SLAVE HOSTS before PROCESSLIST - UseSuperReadOnly bool // Should orchestrator super_read_only any time it sets read_only - InstancePollSeconds uint // Number of seconds between instance reads - InstanceWriteBufferSize int // Instance write buffer size (max number of instances to flush in one INSERT ODKU) - BufferInstanceWrites bool // Set to 'true' for write-optimization on backend table (compromise: writes can be stale and overwrite non stale data) - InstanceFlushIntervalMilliseconds int // Max interval between instance write buffer flushes - UnseenInstanceForgetHours uint // Number of hours after which an unseen instance is forgotten - SnapshotTopologiesIntervalHours uint // Interval in hour between snapshot-topologies invocation. Default: 0 (disabled) - DiscoveryMaxConcurrency uint // Number of goroutines doing hosts discovery - DiscoveryQueueCapacity uint // Buffer size of the discovery queue. Should be greater than the number of DB instances being discovered - DiscoveryQueueMaxStatisticsSize int // The maximum number of individual secondly statistics taken of the discovery queue - DiscoveryCollectionRetentionSeconds uint // Number of seconds to retain the discovery collection information - DiscoverySeeds []string // Hard coded array of hostname:port, ensuring orchestrator discovers these hosts upon startup, assuming not already known to orchestrator - InstanceBulkOperationsWaitTimeoutSeconds uint // Time to wait on a single instance when doing bulk (many instances) operation - HostnameResolveMethod string // Method by which to "normalize" hostname ("none"/"default"/"cname") - MySQLHostnameResolveMethod string // Method by which to "normalize" hostname via MySQL server. 
("none"/"@@hostname"/"@@report_host"; default "@@hostname") - SkipBinlogServerUnresolveCheck bool // Skip the double-check that an unresolved hostname resolves back to same hostname for binlog servers - ExpiryHostnameResolvesMinutes int // Number of minutes after which to expire hostname-resolves - RejectHostnameResolvePattern string // Regexp pattern for resolved hostname that will not be accepted (not cached, not written to db). This is done to avoid storing wrong resolves due to network glitches. - ReasonableReplicationLagSeconds int // Above this value is considered a problem - ProblemIgnoreHostnameFilters []string // Will minimize problem visualization for hostnames matching given regexp filters - VerifyReplicationFilters bool // Include replication filters check before approving topology refactoring - ReasonableMaintenanceReplicationLagSeconds int // Above this value move-up and move-below are blocked - CandidateInstanceExpireMinutes uint // Minutes after which a suggestion to use an instance as a candidate replica (to be preferably promoted on primary failover) is expired. - AuditLogFile string // Name of log file for audit operations. Disabled when empty. - AuditToSyslog bool // If true, audit messages are written to syslog - AuditToBackendDB bool // If true, audit messages are written to the backend DB's `audit` table (default: true) - AuditPurgeDays uint // Days after which audit entries are purged from the database - RemoveTextFromHostnameDisplay string // Text to strip off the hostname on cluster/clusters pages - ReadOnly bool - AuthenticationMethod string // Type of autherntication to use, if any. 
"" for none, "basic" for BasicAuth, "multi" for advanced BasicAuth, "proxy" for forwarded credentials via reverse proxy, "token" for token based access - OAuthClientID string - OAuthClientSecret string - OAuthScopes []string - HTTPAuthUser string // Username for HTTP Basic authentication (blank disables authentication) - HTTPAuthPassword string // Password for HTTP Basic authentication - AuthUserHeader string // HTTP header indicating auth user, when AuthenticationMethod is "proxy" - PowerAuthUsers []string // On AuthenticationMethod == "proxy", list of users that can make changes. All others are read-only. - PowerAuthGroups []string // list of unix groups the authenticated user must be a member of to make changes. - AccessTokenUseExpirySeconds uint // Time by which an issued token must be used - AccessTokenExpiryMinutes uint // Time after which HTTP access token expires - ClusterNameToAlias map[string]string // map between regex matching cluster name to a human friendly alias - DetectClusterAliasQuery string // Optional query (executed on topology instance) that returns the alias of a cluster. Query will only be executed on cluster primary (though until the topology's primary is resovled it may execute on other/all replicas). If provided, must return one row, one column - DetectClusterDomainQuery string // Optional query (executed on topology instance) that returns the VIP/CNAME/Alias/whatever domain name for the primary of this cluster. Query will only be executed on cluster primary (though until the topology's primary is resovled it may execute on other/all replicas). If provided, must return one row, one column - DetectInstanceAliasQuery string // Optional query (executed on topology instance) that returns the alias of an instance. If provided, must return one row, one column - DetectPromotionRuleQuery string // Optional query (executed on topology instance) that returns the promotion rule of an instance. If provided, must return one row, one column. 
- DataCenterPattern string // Regexp pattern with one group, extracting the datacenter name from the hostname - RegionPattern string // Regexp pattern with one group, extracting the region name from the hostname - PhysicalEnvironmentPattern string // Regexp pattern with one group, extracting physical environment info from hostname (e.g. combination of datacenter & prod/dev env) - DetectDataCenterQuery string // Optional query (executed on topology instance) that returns the data center of an instance. If provided, must return one row, one column. Overrides DataCenterPattern and useful for installments where DC cannot be inferred by hostname - DetectRegionQuery string // Optional query (executed on topology instance) that returns the region of an instance. If provided, must return one row, one column. Overrides RegionPattern and useful for installments where Region cannot be inferred by hostname - DetectPhysicalEnvironmentQuery string // Optional query (executed on topology instance) that returns the physical environment of an instance. If provided, must return one row, one column. Overrides PhysicalEnvironmentPattern and useful for installments where env cannot be inferred by hostname - DetectSemiSyncEnforcedQuery string // Optional query (executed on topology instance) to determine whether semi-sync is fully enforced for primary writes (async fallback is not allowed under any circumstance). If provided, must return one row, one column, value 0 or 1. - SupportFuzzyPoolHostnames bool // Should "submit-pool-instances" command be able to pass list of fuzzy instances (fuzzy means non-fqdn, but unique enough to recognize). 
Defaults 'true', implies more queries on backend db - InstancePoolExpiryMinutes uint // Time after which entries in database_instance_pool are expired (resubmit via `submit-pool-instances`) - PromotionIgnoreHostnameFilters []string // Orchestrator will not promote replicas with hostname matching pattern (via -c recovery; for example, avoid promoting dev-dedicated machines) - ServeAgentsHTTP bool // Spawn another HTTP interface dedicated for orchestrator-agent - AgentsUseSSL bool // When "true" orchestrator will listen on agents port with SSL as well as connect to agents via SSL - AgentsUseMutualTLS bool // When "true" Use mutual TLS for the server to agent communication - AgentSSLSkipVerify bool // When using SSL for the Agent, should we ignore SSL certification error - AgentSSLPrivateKeyFile string // Name of Agent SSL private key file, applies only when AgentsUseSSL = true - AgentSSLCertFile string // Name of Agent SSL certification file, applies only when AgentsUseSSL = true - AgentSSLCAFile string // Name of the Agent Certificate Authority file, applies only when AgentsUseSSL = true - AgentSSLValidOUs []string // Valid organizational units when using mutual TLS to communicate with the agents - UseSSL bool // Use SSL on the server web port - UseMutualTLS bool // When "true" Use mutual TLS for the server's web and API connections - SSLSkipVerify bool // When using SSL, should we ignore SSL certification error - SSLPrivateKeyFile string // Name of SSL private key file, applies only when UseSSL = true - SSLCertFile string // Name of SSL certification file, applies only when UseSSL = true - SSLCAFile string // Name of the Certificate Authority file, applies only when UseSSL = true - SSLValidOUs []string // Valid organizational units when using mutual TLS - StatusEndpoint string // Override the status endpoint. Defaults to '/api/status' - StatusOUVerify bool // If true, try to verify OUs when Mutual TLS is on. 
Defaults to false - AgentPollMinutes uint // Minutes between agent polling - UnseenAgentForgetHours uint // Number of hours after which an unseen agent is forgotten - StaleSeedFailMinutes uint // Number of minutes after which a stale (no progress) seed is considered failed. - SeedAcceptableBytesDiff int64 // Difference in bytes between seed source & target data size that is still considered as successful copy - SeedWaitSecondsBeforeSend int64 // Number of seconds for waiting before start send data command on agent - BinlogEventsChunkSize int // Chunk size (X) for SHOW BINLOG|RELAYLOG EVENTS LIMIT ?,X statements. Smaller means less locking and mroe work to be done - ReduceReplicationAnalysisCount bool // When true, replication analysis will only report instances where possibility of handled problems is possible in the first place (e.g. will not report most leaf nodes, that are mostly uninteresting). When false, provides an entry for every known instance - FailureDetectionPeriodBlockMinutes int // The time for which an instance's failure discovery is kept "active", so as to avoid concurrent "discoveries" of the instance's failure; this preceeds any recovery process, if any. 
- RecoveryPeriodBlockMinutes int // (supported for backwards compatibility but please use newer `RecoveryPeriodBlockSeconds` instead) The time for which an instance's recovery is kept "active", so as to avoid concurrent recoveries on smae instance as well as flapping - RecoveryPeriodBlockSeconds int // (overrides `RecoveryPeriodBlockMinutes`) The time for which an instance's recovery is kept "active", so as to avoid concurrent recoveries on smae instance as well as flapping - RecoveryIgnoreHostnameFilters []string // Recovery analysis will completely ignore hosts matching given patterns - RecoverPrimaryClusterFilters []string // Only do primary recovery on clusters matching these regexp patterns (of course the ".*" pattern matches everything) - RecoverIntermediatePrimaryClusterFilters []string // Only do IM recovery on clusters matching these regexp patterns (of course the ".*" pattern matches everything) - ProcessesShellCommand string // Shell that executes command scripts - OnFailureDetectionProcesses []string // Processes to execute when detecting a failover scenario (before making a decision whether to failover or not). May and should use some of these placeholders: {failureType}, {instanceType}, {isPrimary}, {isCoPrimary}, {failureDescription}, {command}, {failedHost}, {failureCluster}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {successorAlias}, {countReplicas}, {replicaHosts}, {isDowntimed}, {autoPrimaryRecovery}, {autoIntermediatePrimaryRecovery} - PreFailoverProcesses []string // Processes to execute before doing a failover (aborting operation should any once of them exits with non-zero code; order of execution undefined). 
May and should use some of these placeholders: {failureType}, {instanceType}, {isPrimary}, {isCoPrimary}, {failureDescription}, {command}, {failedHost}, {failureCluster}, {failureClusterDomain}, {failedPort}, {countReplicas}, {replicaHosts}, {isDowntimed} - PostFailoverProcesses []string // Processes to execute after doing a failover (order of execution undefined). May and should use some of these placeholders: {failureType}, {instanceType}, {isPrimary}, {isCoPrimary}, {failureDescription}, {command}, {failedHost}, {failureCluster}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {successorAlias}, {countReplicas}, {replicaHosts}, {isDowntimed}, {isSuccessful}, {lostReplicas}, {countLostReplicas} - PostUnsuccessfulFailoverProcesses []string // Processes to execute after a not-completely-successful failover (order of execution undefined). May and should use some of these placeholders: {failureType}, {instanceType}, {isPrimary}, {isCoPrimary}, {failureDescription}, {command}, {failedHost}, {failureCluster}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {successorAlias}, {countReplicas}, {replicaHosts}, {isDowntimed}, {isSuccessful}, {lostReplicas}, {countLostReplicas} - PostPrimaryFailoverProcesses []string // Processes to execute after doing a primary failover (order of execution undefined). Uses same placeholders as PostFailoverProcesses - PostIntermediatePrimaryFailoverProcesses []string // Processes to execute after doing a primary failover (order of execution undefined). Uses same placeholders as PostFailoverProcesses - PostTakePrimaryProcesses []string // Processes to execute after a successful Take-Primary event has taken place - CoPrimaryRecoveryMustPromoteOtherCoPrimary bool // When 'false', anything can get promoted (and candidates are prefered over others). 
When 'true', orchestrator will promote the other co-primary or else fail - DetachLostReplicasAfterPrimaryFailover bool // Should replicas that are not to be lost in primary recovery (i.e. were more up-to-date than promoted replica) be forcibly detached - ApplyMySQLPromotionAfterPrimaryFailover bool // Should orchestrator take upon itself to apply MySQL primary promotion: set read_only=0, detach replication, etc. - PreventCrossDataCenterPrimaryFailover bool // When true (default: false), cross-DC primary failover are not allowed, orchestrator will do all it can to only fail over within same DC, or else not fail over at all. - PreventCrossRegionPrimaryFailover bool // When true (default: false), cross-region primary failover are not allowed, orchestrator will do all it can to only fail over within same region, or else not fail over at all. - PrimaryFailoverLostInstancesDowntimeMinutes uint // Number of minutes to downtime any server that was lost after a primary failover (including failed primary & lost replicas). 0 to disable - PrimaryFailoverDetachReplicaPrimaryHost bool // Should orchestrator issue a detach-replica-primary-host on newly promoted primary (this makes sure the new primary will not attempt to replicate old primary if that comes back to life). Defaults 'false'. Meaningless if ApplyMySQLPromotionAfterPrimaryFailover is 'true'. - FailPrimaryPromotionOnLagMinutes uint // when > 0, fail a primary promotion if the candidate replica is lagging >= configured number of minutes. 
- FailPrimaryPromotionIfSQLThreadNotUpToDate bool // when true, and a primary failover takes place, if candidate primary has not consumed all relay logs, promotion is aborted with error - DelayPrimaryPromotionIfSQLThreadNotUpToDate bool // when true, and a primary failover takes place, if candidate primary has not consumed all relay logs, delay promotion until the sql thread has caught up - PostponeReplicaRecoveryOnLagMinutes uint // On crash recovery, replicas that are lagging more than given minutes are only resurrected late in the recovery process, after primary/IM has been elected and processes executed. Value of 0 disables this feature - OSCIgnoreHostnameFilters []string // OSC replicas recommendation will ignore replica hostnames matching given patterns - URLPrefix string // URL prefix to run orchestrator on non-root web path, e.g. /orchestrator to put it behind nginx. - DiscoveryIgnoreReplicaHostnameFilters []string // Regexp filters to apply to prevent auto-discovering new replicas. Usage: unreachable servers due to firewalls, applications which trigger binlog dumps - DiscoveryIgnorePrimaryHostnameFilters []string // Regexp filters to apply to prevent auto-discovering a primary. Usage: pointing your primary temporarily to replicate seom data from external host - DiscoveryIgnoreHostnameFilters []string // Regexp filters to apply to prevent discovering instances of any kind - WebMessage string // If provided, will be shown on all web pages below the title bar - MaxConcurrentReplicaOperations int // Maximum number of concurrent operations on replicas - InstanceDBExecContextTimeoutSeconds int // Timeout on context used while calling ExecContext on instance database - LockShardTimeoutSeconds int // Timeout on context used to lock shard. Should be a small value because we should fail-fast - WaitReplicasTimeoutSeconds int // Timeout on amount of time to wait for the replicas in case of ERS. Should be a small value because we should fail-fast. 
Should not be larger than LockShardTimeoutSeconds since that is the total time we use for an ERS. -} - -// ToJSONString will marshal this configuration as JSON -func (config *Configuration) ToJSONString() string { - b, _ := json.Marshal(config) - return string(b) -} - -// Config is *the* configuration instance, used globally to get configuration data -var Config = newConfiguration() - -func newConfiguration() *Configuration { - return &Configuration{ - Debug: false, - EnableSyslog: false, - ListenAddress: ":3000", - ListenSocket: "", - HTTPAdvertise: "", - AgentsServerPort: ":3001", - StatusEndpoint: DefaultStatusAPIEndpoint, - StatusOUVerify: false, - BackendDB: "sqlite", - SQLite3DataFile: "file::memory:?mode=memory&cache=shared", - SkipOrchestratorDatabaseUpdate: false, - PanicIfDifferentDatabaseDeploy: false, - RaftBind: "127.0.0.1:10008", - RaftAdvertise: "", - RaftDataDir: "", - DefaultRaftPort: 10008, - RaftNodes: []string{}, - ExpectFailureAnalysisConcensus: true, - MySQLOrchestratorMaxPoolConnections: 128, // limit concurrent conns to backend DB - MySQLOrchestratorPort: 3306, - MySQLTopologyUseMutualTLS: false, - MySQLTopologyUseMixedTLS: true, - MySQLOrchestratorUseMutualTLS: false, - MySQLConnectTimeoutSeconds: 2, - MySQLOrchestratorReadTimeoutSeconds: 30, - MySQLOrchestratorRejectReadOnly: false, - MySQLDiscoveryReadTimeoutSeconds: 10, - MySQLTopologyReadTimeoutSeconds: 600, - MySQLConnectionLifetimeSeconds: 0, - DefaultInstancePort: 3306, - TLSCacheTTLFactor: 100, - InstancePollSeconds: 5, - InstanceWriteBufferSize: 100, - BufferInstanceWrites: false, - InstanceFlushIntervalMilliseconds: 100, - UnseenInstanceForgetHours: 240, - SnapshotTopologiesIntervalHours: 0, - DiscoverByShowSlaveHosts: false, - UseSuperReadOnly: false, - DiscoveryMaxConcurrency: 300, - DiscoveryQueueCapacity: 100000, - DiscoveryQueueMaxStatisticsSize: 120, - DiscoveryCollectionRetentionSeconds: 120, - DiscoverySeeds: []string{}, - InstanceBulkOperationsWaitTimeoutSeconds: 10, - 
HostnameResolveMethod: "default", - MySQLHostnameResolveMethod: "none", - SkipBinlogServerUnresolveCheck: true, - ExpiryHostnameResolvesMinutes: 60, - RejectHostnameResolvePattern: "", - ReasonableReplicationLagSeconds: 10, - ProblemIgnoreHostnameFilters: []string{}, - VerifyReplicationFilters: false, - ReasonableMaintenanceReplicationLagSeconds: 20, - CandidateInstanceExpireMinutes: 60, - AuditLogFile: "", - AuditToSyslog: false, - AuditToBackendDB: false, - AuditPurgeDays: 7, - RemoveTextFromHostnameDisplay: "", - ReadOnly: false, - AuthenticationMethod: "", - HTTPAuthUser: "", - HTTPAuthPassword: "", - AuthUserHeader: "X-Forwarded-User", - PowerAuthUsers: []string{"*"}, - PowerAuthGroups: []string{}, - AccessTokenUseExpirySeconds: 60, - AccessTokenExpiryMinutes: 1440, - ClusterNameToAlias: make(map[string]string), - DetectClusterAliasQuery: "", - DetectClusterDomainQuery: "", - DetectInstanceAliasQuery: "", - DetectPromotionRuleQuery: "", - DataCenterPattern: "", - PhysicalEnvironmentPattern: "", - DetectDataCenterQuery: "", - DetectPhysicalEnvironmentQuery: "", - DetectSemiSyncEnforcedQuery: "", - SupportFuzzyPoolHostnames: true, - InstancePoolExpiryMinutes: 60, - PromotionIgnoreHostnameFilters: []string{}, - ServeAgentsHTTP: false, - AgentsUseSSL: false, - AgentsUseMutualTLS: false, - AgentSSLValidOUs: []string{}, - AgentSSLSkipVerify: false, - AgentSSLPrivateKeyFile: "", - AgentSSLCertFile: "", - AgentSSLCAFile: "", - UseSSL: false, - UseMutualTLS: false, - SSLValidOUs: []string{}, - SSLSkipVerify: false, - SSLPrivateKeyFile: "", - SSLCertFile: "", - SSLCAFile: "", - AgentPollMinutes: 60, - UnseenAgentForgetHours: 6, - StaleSeedFailMinutes: 60, - SeedAcceptableBytesDiff: 8192, - SeedWaitSecondsBeforeSend: 2, - BinlogEventsChunkSize: 10000, - ReduceReplicationAnalysisCount: true, - FailureDetectionPeriodBlockMinutes: 60, - RecoveryPeriodBlockMinutes: 60, - RecoveryPeriodBlockSeconds: 3600, - RecoveryIgnoreHostnameFilters: []string{}, - 
RecoverPrimaryClusterFilters: []string{"*"}, - RecoverIntermediatePrimaryClusterFilters: []string{}, - ProcessesShellCommand: "bash", - OnFailureDetectionProcesses: []string{}, - PreFailoverProcesses: []string{}, - PostPrimaryFailoverProcesses: []string{}, - PostIntermediatePrimaryFailoverProcesses: []string{}, - PostFailoverProcesses: []string{}, - PostUnsuccessfulFailoverProcesses: []string{}, - PostTakePrimaryProcesses: []string{}, - CoPrimaryRecoveryMustPromoteOtherCoPrimary: true, - DetachLostReplicasAfterPrimaryFailover: true, - ApplyMySQLPromotionAfterPrimaryFailover: true, - PreventCrossDataCenterPrimaryFailover: false, - PreventCrossRegionPrimaryFailover: false, - PrimaryFailoverLostInstancesDowntimeMinutes: 0, - PrimaryFailoverDetachReplicaPrimaryHost: false, - FailPrimaryPromotionOnLagMinutes: 0, - FailPrimaryPromotionIfSQLThreadNotUpToDate: false, - DelayPrimaryPromotionIfSQLThreadNotUpToDate: true, - PostponeReplicaRecoveryOnLagMinutes: 0, - OSCIgnoreHostnameFilters: []string{}, - URLPrefix: "", - DiscoveryIgnoreReplicaHostnameFilters: []string{}, - WebMessage: "", - MaxConcurrentReplicaOperations: 5, - InstanceDBExecContextTimeoutSeconds: 30, - LockShardTimeoutSeconds: 30, - WaitReplicasTimeoutSeconds: 30, - } -} - -func (config *Configuration) MySQLOrchestratorTLSMinVersionNumber() uint16 { - // We can ignore the error here, we already checked for valid options if it's set. - // If it's not set, we get a safe default back here. - minVersion, _ := vttls.TLSVersionToNumber(config.MySQLOrchestratorTLSMinVersion) - return minVersion -} - -func (config *Configuration) MySQLTopologyTLSMinVersionNumber() uint16 { - // We can ignore the error here, we already checked for valid options if it's set. - // If it's not set, we get a safe default back here. 
- minVersion, _ := vttls.TLSVersionToNumber(config.MySQLTopologyTLSMinVersion) - return minVersion -} - -func (config *Configuration) postReadAdjustments() error { - if config.MySQLOrchestratorCredentialsConfigFile != "" { - mySQLConfig := struct { - Client struct { - User string - Password string - } - }{} - err := gcfg.ReadFileInto(&mySQLConfig, config.MySQLOrchestratorCredentialsConfigFile) - if err != nil { - log.Fatalf("Failed to parse gcfg data from file: %+v", err) - } else { - log.Infof("Parsed orchestrator credentials from %s", config.MySQLOrchestratorCredentialsConfigFile) - config.MySQLOrchestratorUser = mySQLConfig.Client.User - config.MySQLOrchestratorPassword = mySQLConfig.Client.Password - } - } - { - // We accept password in the form "${SOME_ENV_VARIABLE}" in which case we pull - // the given variable from os env - submatch := envVariableRegexp.FindStringSubmatch(config.MySQLOrchestratorPassword) - if len(submatch) > 1 { - config.MySQLOrchestratorPassword = os.Getenv(submatch[1]) - } - } - if config.MySQLTopologyCredentialsConfigFile != "" { - mySQLConfig := struct { - Client struct { - User string - Password string - } - }{} - err := gcfg.ReadFileInto(&mySQLConfig, config.MySQLTopologyCredentialsConfigFile) - if err != nil { - log.Fatalf("Failed to parse gcfg data from file: %+v", err) - } else { - log.Infof("Parsed topology credentials from %s", config.MySQLTopologyCredentialsConfigFile) - config.MySQLTopologyUser = mySQLConfig.Client.User - config.MySQLTopologyPassword = mySQLConfig.Client.Password - } - } - { - // We accept password in the form "${SOME_ENV_VARIABLE}" in which case we pull - // the given variable from os env - submatch := envVariableRegexp.FindStringSubmatch(config.MySQLTopologyPassword) - if len(submatch) > 1 { - config.MySQLTopologyPassword = os.Getenv(submatch[1]) - } - } - - if config.RecoveryPeriodBlockSeconds == 0 && config.RecoveryPeriodBlockMinutes > 0 { - // RecoveryPeriodBlockSeconds is a newer addition that overrides 
RecoveryPeriodBlockMinutes - // The code does not consider RecoveryPeriodBlockMinutes anymore, but RecoveryPeriodBlockMinutes - // still supported in config file for backwards compatibility - config.RecoveryPeriodBlockSeconds = config.RecoveryPeriodBlockMinutes * 60 - } - - if config.FailPrimaryPromotionIfSQLThreadNotUpToDate && config.DelayPrimaryPromotionIfSQLThreadNotUpToDate { - return fmt.Errorf("Cannot have both FailPrimaryPromotionIfSQLThreadNotUpToDate and DelayPrimaryPromotionIfSQLThreadNotUpToDate enabled") - } - if config.FailPrimaryPromotionOnLagMinutes > 0 && config.ReplicationLagQuery == "" { - return fmt.Errorf("nonzero FailPrimaryPromotionOnLagMinutes requires ReplicationLagQuery to be set") - } - - if config.URLPrefix != "" { - // Ensure the prefix starts with "/" and has no trailing one. - config.URLPrefix = strings.TrimLeft(config.URLPrefix, "/") - config.URLPrefix = strings.TrimRight(config.URLPrefix, "/") - config.URLPrefix = "/" + config.URLPrefix - } - - if config.IsSQLite() && config.SQLite3DataFile == "" { - return fmt.Errorf("SQLite3DataFile must be set when BackendDB is sqlite") - } - if config.RaftEnabled && config.RaftDataDir == "" { - return fmt.Errorf("RaftDataDir must be defined since raft is enabled (RaftEnabled)") - } - if config.RaftEnabled && config.RaftBind == "" { - return fmt.Errorf("RaftBind must be defined since raft is enabled (RaftEnabled)") - } - if config.RaftAdvertise == "" { - config.RaftAdvertise = config.RaftBind - } - if config.HTTPAdvertise != "" { - u, err := url.Parse(config.HTTPAdvertise) - if err != nil { - return fmt.Errorf("Failed parsing HTTPAdvertise %s: %s", config.HTTPAdvertise, err.Error()) - } - if u.Scheme == "" { - return fmt.Errorf("If specified, HTTPAdvertise must include scheme (http:// or https://)") - } - if u.Hostname() == "" { - return fmt.Errorf("If specified, HTTPAdvertise must include host name") - } - if u.Port() == "" { - return fmt.Errorf("If specified, HTTPAdvertise must include port 
number") - } - if u.Path != "" { - return fmt.Errorf("If specified, HTTPAdvertise must not specify a path") - } - if config.InstanceWriteBufferSize <= 0 { - config.BufferInstanceWrites = false - } - } - - if config.MySQLOrchestratorTLSMinVersion != "" { - _, err := vttls.TLSVersionToNumber(config.MySQLOrchestratorTLSMinVersion) - if err != nil { - return fmt.Errorf("If specified, MySQLOrchestratorTLSMinVersion must be one of TLSv1.0, TLSv1.1, TLSv1.2, TLSv1.3") - } - } - - if config.MySQLTopologyTLSMinVersion != "" { - _, err := vttls.TLSVersionToNumber(config.MySQLTopologyTLSMinVersion) - if err != nil { - return fmt.Errorf("If specified, MySQLTopologyTLSMinVersion must be one of TLSv1.0, TLSv1.1, TLSv1.2, TLSv1.3") - } - } - - return nil -} - -func (config *Configuration) IsSQLite() bool { - return strings.Contains(config.BackendDB, "sqlite") -} - -func (config *Configuration) IsMySQL() bool { - return config.BackendDB == "mysql" || config.BackendDB == "" -} - -// read reads configuration from given file, or silently skips if the file does not exist. -// If the file does exist, then it is expected to be in valid JSON format or the function bails out. 
-func read(fileName string) (*Configuration, error) { - if fileName == "" { - return Config, fmt.Errorf("Empty file name") - } - file, err := os.Open(fileName) - if err != nil { - return Config, err - } - decoder := json.NewDecoder(file) - err = decoder.Decode(Config) - if err == nil { - log.Infof("Read config: %s", fileName) - } else { - log.Fatal("Cannot read config file:", fileName, err) - } - if err := Config.postReadAdjustments(); err != nil { - log.Fatal(err) - } - return Config, err -} - -// ForceRead reads configuration from given file name or bails out if it fails -func ForceRead(fileName string) *Configuration { - _, err := read(fileName) - if err != nil { - log.Fatal("Cannot read config file:", fileName, err) - } - return Config -} - -// CLIFlags stores some command line flags that are globally available in the process' lifetime -type CLIFlags struct { - Noop *bool - SkipUnresolve *bool - SkipUnresolveCheck *bool - BinlogFile *string - GrabElection *bool - Version *bool - Statement *string - PromotionRule *string - ConfiguredVersion string - SkipContinuousRegistration *bool - EnableDatabaseUpdate *bool - IgnoreRaftSetup *bool - Tag *string -} - -var RuntimeCLIFlags CLIFlags diff --git a/go/vt/vtgr/config/vtgr_config.json b/go/vt/vtgr/config/vtgr_config.json deleted file mode 100644 index 1c1ecae562a..00000000000 --- a/go/vt/vtgr/config/vtgr_config.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "BackoffErrorWaitTimeSeconds": 5, - "BootstrapGroupSize": 3 -} \ No newline at end of file diff --git a/go/vt/vtgr/config/vtgr_config_test.go b/go/vt/vtgr/config/vtgr_config_test.go deleted file mode 100644 index ec4312096a9..00000000000 --- a/go/vt/vtgr/config/vtgr_config_test.go +++ /dev/null @@ -1,37 +0,0 @@ -/* -Copyright 2021 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package config - -import ( - "os" - "path/filepath" - "testing" - - "github.com/stretchr/testify/assert" -) - -func TestReadConfig(t *testing.T) { - path, _ := os.Getwd() - config, err := ReadVTGRConfig(filepath.Join(path, "vtgr_config.json")) - assert.NoError(t, err) - // Make sure VTGR config honors the default setting - assert.Equal(t, false, config.DisableReadOnlyProtection) - assert.Equal(t, 600, config.BootstrapWaitTimeSeconds) - // Make sure the config is load correctly - assert.Equal(t, 3, config.BootstrapGroupSize) - assert.Equal(t, 5, config.BackoffErrorWaitTimeSeconds) -} diff --git a/go/vt/vtgr/controller/controller.go b/go/vt/vtgr/controller/controller.go deleted file mode 100644 index 2b2c36cd320..00000000000 --- a/go/vt/vtgr/controller/controller.go +++ /dev/null @@ -1,26 +0,0 @@ -/* -Copyright 2021 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package controller - -import ( - "math/rand" - "time" -) - -func init() { - rand.Seed(time.Now().UnixNano()) -} diff --git a/go/vt/vtgr/controller/diagnose.go b/go/vt/vtgr/controller/diagnose.go deleted file mode 100644 index b0896f4555a..00000000000 --- a/go/vt/vtgr/controller/diagnose.go +++ /dev/null @@ -1,586 +0,0 @@ -/* -Copyright 2021 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package controller - -import ( - "context" - "errors" - "fmt" - "math/rand" - "os" - "sort" - "strings" - "sync" - "time" - - "github.com/spf13/pflag" - - "vitess.io/vitess/go/mysql" - "vitess.io/vitess/go/vt/concurrency" - "vitess.io/vitess/go/vt/servenv" - "vitess.io/vitess/go/vt/topo" - "vitess.io/vitess/go/vt/vterrors" - "vitess.io/vitess/go/vt/vtgr/db" -) - -var pingTabletTimeout = 2 * time.Second - -func init() { - servenv.OnParseFor("vtgr", func(fs *pflag.FlagSet) { - fs.DurationVar(&pingTabletTimeout, "ping_tablet_timeout", 2*time.Second, "time to wait when we ping a tablet") - }) -} - -// DiagnoseType is the types of Diagnose result -type DiagnoseType string - -type instanceGTIDSet struct { - gtids mysql.GTIDSet - instance *grInstance -} - -// groupGTIDRecorder is used to help us query all the instance in parallel and record the result -// it helps us to take care of the consistency / synchronization among go routines -type groupGTIDRecorder struct { - name string - gtidWithInstances []*instanceGTIDSet - hasActive bool - sync.Mutex -} - -const ( - // 
DiagnoseTypeError represents an DiagnoseTypeError status - DiagnoseTypeError DiagnoseType = "error" - // DiagnoseTypeHealthy represents everything is DiagnoseTypeHealthy - DiagnoseTypeHealthy = "Healthy" - // DiagnoseTypeShardHasNoGroup represents the cluster has not init yet - DiagnoseTypeShardHasNoGroup = "ShardHasNoGroup" - // DiagnoseTypeShardHasInactiveGroup represents the status where we have a group name but no member in it - DiagnoseTypeShardHasInactiveGroup = "ShardHasInactiveGroup" - // DiagnoseTypeInsufficientGroupSize represents the cluster has insufficient group members - DiagnoseTypeInsufficientGroupSize = "InsufficientGroupSize" - // DiagnoseTypeReadOnlyShard represents the cluster who has a read only node - DiagnoseTypeReadOnlyShard = "ReadOnlyShard" - // DiagnoseTypeUnreachablePrimary represents the primary tablet is unreachable - DiagnoseTypeUnreachablePrimary = "UnreachablePrimary" - // DiagnoseTypeWrongPrimaryTablet represents the primary tablet is incorrect based on mysql group - DiagnoseTypeWrongPrimaryTablet = "WrongPrimaryTablet" - // DiagnoseTypeUnconnectedReplica represents cluster with primary tablet, but a node is not connected to it - DiagnoseTypeUnconnectedReplica = "UnconnectedReplica" - // DiagnoseTypeBackoffError represents a transient error e.g., the primary is unreachable - DiagnoseTypeBackoffError = "BackoffError" - // DiagnoseTypeBootstrapBackoff represents an ongoing bootstrap - DiagnoseTypeBootstrapBackoff = "BootstrapBackoff" - - // diagnoseTypeUnknown represents a unclear intermediate diagnose state - diagnoseTypeUnknown = "Unknown" -) - -// ScanAndRepairShard scans a particular shard by first Diagnose the shard with info from grShard -// and then repair the probelm if the shard is unhealthy -func (shard *GRShard) ScanAndRepairShard(ctx context.Context) { - status, err := shard.Diagnose(ctx) - if err != nil { - shard.logger.Errorf("fail to scanAndRepairShard %v/%v because of Diagnose error: %v", shard.KeyspaceShard.Keyspace, 
shard.KeyspaceShard.Shard, err) - return - } - // We are able to get Diagnose without error - // - // Note: all the recovery function should first try to grab a shard level lock - // and check the trigger conditions before doing anything. This is to avoid - // other VTGR instance try to do the same thing - shard.logger.Infof("%v status is %v", formatKeyspaceShard(shard.KeyspaceShard), status) - if _, err := shard.Repair(ctx, status); err != nil { - shard.logger.Errorf("failed to repair %v: %v", status, err) - } -} - -// Diagnose the shard in the following order: -// TODO: use FSM to make sure the status transition is correct -// 1. if the shard has a group that every node agreed on -// 2. if the group has any active (online / recovering) member -// 3. if the shard has initialized a Vitess primary -// 4. if primary tablet is reachable -// 5. if Vitess primary and mysql primary reconciled -// 6. if we have enough group members -// 7. if the primary node has read_only=OFF -// 8. if there is a node that is not in Mysql group -func (shard *GRShard) Diagnose(ctx context.Context) (DiagnoseType, error) { - shard.Lock() - defer shard.Unlock() - diagnoseResult, err := shard.diagnoseLocked(ctx) - shard.shardStatusCollector.recordDiagnoseResult(diagnoseResult) - shard.populateVTGRStatusLocked() - if diagnoseResult != DiagnoseTypeHealthy { - shard.logger.Warningf(`VTGR diagnose shard as unhealthy for %s/%s: result=%v, last_result=%v, instances=%v, primary=%v, primary_tablet=%v, problematics=%v, unreachables=%v,\n%v`, - shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard, - shard.shardStatusCollector.status.DiagnoseResult, - shard.lastDiagnoseResult, - shard.shardStatusCollector.status.Instances, - shard.shardStatusCollector.status.Primary, - shard.primaryTabletAlias(), - shard.shardStatusCollector.status.Problematics, - shard.shardStatusCollector.status.Unreachables, - shard.sqlGroup.ToString()) - } - if diagnoseResult != shard.lastDiagnoseResult { - 
shard.lastDiagnoseResult = diagnoseResult - shard.lastDiagnoseSince = time.Now() - } - return diagnoseResult, err -} - -func (shard *GRShard) diagnoseLocked(ctx context.Context) (DiagnoseType, error) { - // fast path only diagnose problem Vitess primary - // which does not needed if the shard is inactive - if shard.localDbPort != 0 && shard.isActive.Load() { - localView := shard.getLocalView() - if localView != nil { - fastDiagnose := shard.fastPathDiagnose(ctx, localView) - if fastDiagnose != diagnoseTypeUnknown { - // If we can use local sql group info to diagnose - // we should record the view as well. This view is all we need - // later VTGR needs to find group name, primary etc from - // SQLGroup for repairing instead of getting nil - shard.sqlGroup.overrideView([]*db.GroupView{localView}) - shard.logger.Infof("Diagnose %v from fast path", fastDiagnose) - return fastDiagnose, nil - } - } - } - // fast path is disabled or cannot diagnose the shard - // fall back to the normal strategy where we fetch info from all the nodes - err := shard.refreshSQLGroup() - if err != nil { - if errors.Is(err, db.ErrGroupBackoffError) { - return DiagnoseTypeBackoffError, nil - } - if errors.Is(err, db.ErrGroupOngoingBootstrap) { - return DiagnoseTypeBootstrapBackoff, nil - } - return DiagnoseTypeError, vterrors.Wrap(err, "fail to refreshSQLGroup") - } - // First, we check if there is any group in the shard - // if no, we should bootstrap one - mysqlGroup := shard.shardAgreedGroupName() - if mysqlGroup == "" { - if len(shard.sqlGroup.views) != shard.sqlGroup.expectedBootstrapSize { - return DiagnoseTypeError, fmt.Errorf("fail to diagnose ShardHasNoGroup with %v nodes", len(shard.sqlGroup.views)) - } - return DiagnoseTypeShardHasNoGroup, nil - } - // We handle the case where the shard has an agreed group name but all nodes are offline - // In this situation, instead of bootstrap a group, we should re-build the - // old group for the shard - if shard.isAllOfflineOrError() { - 
shard.logger.Info("Found all members are OFFLINE or ERROR") - // On rebootstrap, we always want to make sure _all_ the nodes in topo are reachable - // unless we override the rebootstrap size - desiredRebootstrapSize := len(shard.instances) - if shard.sqlGroup.rebootstrapSize != 0 { - desiredRebootstrapSize = shard.sqlGroup.rebootstrapSize - } - if len(shard.sqlGroup.views) != desiredRebootstrapSize { - return DiagnoseTypeError, fmt.Errorf("fail to diagnose ShardHasInactiveGroup with %v nodes expecting %v", len(shard.sqlGroup.views), desiredRebootstrapSize) - } - return DiagnoseTypeShardHasInactiveGroup, nil - } - - // We only check Vitess primary iff shard is active. - // Otherwise VTGR will only make sure there is a mysql group in the shard. - if shard.isActive.Load() { - // Secondly, we check if there is a primary tablet. - // If there is a group but we cannot find a primary tablet - // we should set it based on mysql group - hasWrongPrimary, err := shard.hasWrongPrimaryTablet(ctx) - if err != nil { - // errMissingGroup means we cannot find a mysql group for the shard - // we are in DiagnoseTypeShardHasNoGroup state - if err == errMissingGroup { - shard.logger.Warning("Missing mysql group") - return DiagnoseTypeShardHasNoGroup, nil - } - // errMissingPrimaryTablet means we cannot find a tablet based on mysql primary - // which means the tablet disconnected from topo server and we cannot find it - if err == errMissingPrimaryTablet { - return DiagnoseTypeUnreachablePrimary, nil - } - return DiagnoseTypeError, vterrors.Wrap(err, "fail to diagnose shardNeedsInitialized") - } - if hasWrongPrimary { - return DiagnoseTypeWrongPrimaryTablet, nil - } - - // Thirdly, we check if primary tablet is reachable - isPrimaryReachable, err := shard.isPrimaryReachable(ctx) - if err != nil { - return DiagnoseTypeError, vterrors.Wrap(err, "fail to diagnose isPrimaryReachable") - } - if !isPrimaryReachable { - return DiagnoseTypeUnreachablePrimary, nil - } - } - - // At this point, 
the primary tablet should be consistent with mysql primary - // so the view from priamry tablet should be accurate - onlineMembers, isReadOnly := shard.getOnlineGroupInfo() - // If we found a writable shard in the inactive shard - // we should consider the shard as InsufficientGroupSize to set read only - if !isReadOnly && !shard.isActive.Load() { - return DiagnoseTypeInsufficientGroupSize, nil - } - // Then we check if we satisfy the minimum replica requirement - if shard.minNumReplicas > 0 { - if onlineMembers >= shard.minNumReplicas && isReadOnly && shard.isActive.Load() { - return DiagnoseTypeReadOnlyShard, nil - } - // If we disable readonly protection and still found we have a read only shard, - // we should return DiagnoseTypeReadOnlyShard so that VTGR can turn off read only - if shard.disableReadOnlyProtection && isReadOnly && shard.isActive.Load() { - return DiagnoseTypeReadOnlyShard, nil - } - // We don't check isActive here since if it is inactive, VTGR should already return InsufficientGroupSize - if !shard.disableReadOnlyProtection && onlineMembers < shard.minNumReplicas && !isReadOnly { - return DiagnoseTypeInsufficientGroupSize, nil - } - } - - // Lastly, we check if there is a replica that is not connected to primary node - disconnectedInstance, err := shard.disconnectedInstance() - if err != nil { - return DiagnoseTypeError, vterrors.Wrap(err, "fail to diagnose disconnectedInstance") - } - if disconnectedInstance != nil { - return DiagnoseTypeUnconnectedReplica, nil - } - - // If we get here, shard is DiagnoseTypeHealthy - return DiagnoseTypeHealthy, nil -} - -func (shard *GRShard) getLocalView() *db.GroupView { - localHostname, _ := os.Hostname() - localInst := shard.findTabletByHostAndPort(localHostname, shard.localDbPort) - if localInst == nil { - return nil - } - // TODO: consider using -db_socket to read local info - view, err := shard.dbAgent.FetchGroupView(localInst.alias, localInst.instanceKey) - // We still have the fallback logic if this 
failed, therefore we don't raise error - // but try to get local view with best effort - if err != nil { - shard.logger.Errorf("failed to fetch local group view: %v", err) - } - return view -} - -func (shard *GRShard) fastPathDiagnose(ctx context.Context, view *db.GroupView) DiagnoseType { - pHost, pPort, isOnline := view.GetPrimaryView() - primaryTablet := shard.findShardPrimaryTablet() - if !isOnline || pHost == "" || pPort == 0 || primaryTablet == nil { - return diagnoseTypeUnknown - } - // VTGR will only bootstrap a group when it observes same number of views as group_size - // it means if we can find an ONLINE primary, we should be able to trust the view reported locally - // together with the primary tablet from topo server, we can determine: - // - if we need to failover vitess - // - if we need to failover mysql - if primaryTablet.instanceKey.Hostname != pHost || primaryTablet.instanceKey.Port != pPort { - // we find a mismatch but if the reported mysql primary is not in - // topology we should consider it as unreachable. 
- if shard.findTabletByHostAndPort(pHost, pPort) == nil { - return DiagnoseTypeUnreachablePrimary - } - return DiagnoseTypeWrongPrimaryTablet - } - if !shard.instanceReachable(ctx, primaryTablet) { - return DiagnoseTypeUnreachablePrimary - } - return diagnoseTypeUnknown -} - -func (shard *GRShard) shardAgreedGroupName() string { - if len(shard.instances) == 0 { - return "" - } - return shard.sqlGroup.GetGroupName() -} - -func (shard *GRShard) isAllOfflineOrError() bool { - return shard.sqlGroup.IsAllOfflineOrError() -} - -func (shard *GRShard) getOnlineGroupInfo() (int, bool) { - return shard.sqlGroup.GetOnlineGroupInfo() -} - -func (shard *GRShard) hasWrongPrimaryTablet(ctx context.Context) (bool, error) { - // Find out the hostname and port of the primary in mysql group - // we try to use local instance and then fallback to a random instance to check mysqld - // in case the primary is unreachable - host, port, _ := shard.sqlGroup.GetPrimary() - if !isHostPortValid(host, port) { - shard.logger.Warningf("Invalid address for primary %v:%v", host, port) - return false, errMissingGroup - } - // Make sure we have a tablet available - // findTabletByHostAndPort returns nil when we cannot find a tablet - // that is running on host:port, which means the tablet get stuck - // or when the tablet is not reachable - // we retrun errMissingPrimaryTablet so that VTGR will trigger a failover - tablet := shard.findTabletByHostAndPort(host, port) - if tablet == nil || !shard.instanceReachable(ctx, tablet) { - shard.logger.Errorf("Failed to find tablet that is running with mysql on %v:%v", host, port) - return false, errMissingPrimaryTablet - } - // Now we know we have a valid mysql primary in the group - // we should make sure tablets are aligned with it - primary := shard.findShardPrimaryTablet() - // If we failed to find primary for shard, it mostly means we are initializing the shard - // return true directly so that VTGR will set primary tablet according to MySQL group - if 
primary == nil { - shard.logger.Infof("unable to find primary tablet for %v", formatKeyspaceShard(shard.KeyspaceShard)) - return true, nil - } - return (host != primary.instanceKey.Hostname) || (port != primary.instanceKey.Port), nil -} - -func (shard *GRShard) isPrimaryReachable(ctx context.Context) (bool, error) { - primaryTablet := shard.findShardPrimaryTablet() - if primaryTablet == nil { - return false, fmt.Errorf("unable to find primary for %v", formatKeyspaceShard(shard.KeyspaceShard)) - } - return shard.instanceReachable(ctx, primaryTablet), nil -} - -func (shard *GRShard) instanceReachable(ctx context.Context, instance *grInstance) bool { - pingCtx, cancel := context.WithTimeout(context.Background(), pingTabletTimeout) - defer cancel() - c := make(chan error, 1) - // tmc.Ping create grpc client connection first without timeout via dial - // then call the grpc endpoint using the context with timeout - // this is problematic if the host is really unreachable, we have to wait the - // all the retries inside grpc.dial with exponential backoff - go func() { c <- shard.tmc.Ping(pingCtx, instance.tablet) }() - select { - case <-pingCtx.Done(): - shard.logger.Errorf("Ping abort timeout %v", pingTabletTimeout) - return false - case err := <-c: - if err != nil { - shard.logger.Errorf("Ping error host=%v: %v", instance.instanceKey.Hostname, err) - } - return err == nil - } -} - -// findShardPrimaryTablet returns the primary for the shard -// it is either based on shard info from global topo or based on tablet types -// from local topo -func (shard *GRShard) findShardPrimaryTablet() *grInstance { - var primaryInstance *grInstance - for _, instance := range shard.instances { - if shard.primaryAlias == instance.alias { - return instance - } - } - return primaryInstance -} - -func (shard *GRShard) primaryTabletAlias() string { - primary := shard.findShardPrimaryTablet() - if primary == nil { - return "UNKNOWN" - } - return primary.alias -} - -// disconnectedInstance 
iterates all known the replica records -// and checks mysql to see if the group replication is setup on it -func (shard *GRShard) disconnectedInstance() (*grInstance, error) { - primaryInstance := shard.findShardPrimaryTablet() - // if there is no primary, we should recover from DiagnoseTypeWrongPrimaryTablet - if primaryInstance == nil { - return nil, fmt.Errorf("%v does not have primary", formatKeyspaceShard(shard.KeyspaceShard)) - } - // Up to this check, we know: - // - shard has an agreed group - // - shard has a primary tablet - // - shard primary tablet is running on the same node as mysql - rand.Shuffle(len(shard.instances), func(i, j int) { - shard.instances[i], shard.instances[j] = shard.instances[j], shard.instances[i] - }) - for _, instance := range shard.instances { - // Skip instance without hostname because they are not up and running - // also skip instances that raised unrecoverable errors - if shard.shardStatusCollector.isUnreachable(instance) { - shard.logger.Infof("Skip %v to check disconnectedInstance because it is unhealthy", instance.alias) - continue - } - isUnconnected := shard.sqlGroup.IsUnconnectedReplica(instance.instanceKey) - if isUnconnected { - return instance, nil - } - } - return nil, nil -} - -func (recorder *groupGTIDRecorder) recordGroupStatus(name string, isActive bool) error { - recorder.Lock() - defer recorder.Unlock() - if recorder.name != "" && recorder.name != name { - return fmt.Errorf("group has more than one group name") - } - recorder.name = name - // hasActive records true if any node finds an active member - if isActive { - recorder.hasActive = true - } - return nil -} - -func (recorder *groupGTIDRecorder) recordGroupGTIDs(gtids mysql.GTIDSet, instance *grInstance) { - recorder.Lock() - defer recorder.Unlock() - recorder.gtidWithInstances = append(recorder.gtidWithInstances, &instanceGTIDSet{gtids: gtids, instance: instance}) -} - -func (recorder *groupGTIDRecorder) sort() { - 
sort.SliceStable(recorder.gtidWithInstances, func(i, j int) bool { - return recorder.gtidWithInstances[i].instance.alias < recorder.gtidWithInstances[j].instance.alias - }) -} - -func (collector *shardStatusCollector) recordDiagnoseResult(result DiagnoseType) { - collector.Lock() - defer collector.Unlock() - collector.status.DiagnoseResult = result -} - -func (collector *shardStatusCollector) recordUnreachables(instance *grInstance) { - collector.Lock() - defer collector.Unlock() - // dedup - // the list size is at most same as number instances in a shard so iterate to dedup is not terrible - for _, alias := range collector.status.Unreachables { - if alias == instance.alias { - return - } - } - collector.status.Unreachables = append(collector.status.Unreachables, instance.alias) -} - -func (collector *shardStatusCollector) clear() { - collector.Lock() - defer collector.Unlock() - collector.status.Unreachables = nil - collector.status.Problematics = nil -} - -func (collector *shardStatusCollector) recordProblematics(instance *grInstance) { - collector.Lock() - defer collector.Unlock() - // dedup - // the list size is at most same as number instances in a shard so iterate to dedup is not terrible - for _, alias := range collector.status.Problematics { - if alias == instance.alias { - return - } - } - collector.status.Problematics = append(collector.status.Problematics, instance.alias) -} - -func formatKeyspaceShard(keyspaceShard *topo.KeyspaceShard) string { - return fmt.Sprintf("%v/%v", keyspaceShard.Keyspace, keyspaceShard.Shard) -} - -func isHostPortValid(host string, port int) bool { - return host != "" && port != 0 -} - -// We use forAllInstances in two cases: -// 1. FetchGroupView GTIDs to find a candidate for failover. -// If a node is not healthy it should not be considered as a failover candidate -// -// 2. FetchGroupView group member status to see if we need to bootstrap a group, -// either for the first time or rebuild a group after all the nodes are died. 
-// -// caller will be responsible to decide if they want to tolerate errors from the forAllInstances call -func (shard *GRShard) forAllInstances(task func(instance *grInstance, wg *sync.WaitGroup, er concurrency.ErrorRecorder)) *concurrency.AllErrorRecorder { - errorRecord := concurrency.AllErrorRecorder{} - shard.shardStatusCollector.clear() - var wg sync.WaitGroup - for _, instance := range shard.instances { - wg.Add(1) - go task(instance, &wg, &errorRecord) - } - wg.Wait() - if len(errorRecord.Errors) > 0 { - shard.logger.Errorf("get errors in forAllInstances call: %v", errorRecord.Error()) - } - return &errorRecord -} - -func unreachableError(err error) bool { - contains := []string{ - // "no such host"/"no route to host" is the error when a host is not reachalbe - "no such host", - "no route to host", - // "connect: connection refused" is the error when a mysqld refused the connection - "connect: connection refused", - // "invalid mysql instance key" is the error when a tablet does not populate mysql hostname or port - // this can happen if the tablet crashed. We keep them in the grShard.instances list to compute - // quorum but consider it as an unreachable host. 
- "invalid mysql instance key", - } - for _, k := range contains { - if strings.Contains(err.Error(), k) { - return true - } - } - return false -} - -// refreshSQLGroup hits all instances and renders a SQL group locally for later diagnoses -// the SQL group contains a list of "views" for the group from all the available nodes -func (shard *GRShard) refreshSQLGroup() error { - // reset views in sql group - shard.sqlGroup.clear() - er := shard.forAllInstances(func(instance *grInstance, wg *sync.WaitGroup, er concurrency.ErrorRecorder) { - defer wg.Done() - view, err := shard.dbAgent.FetchGroupView(instance.alias, instance.instanceKey) - // We just log error here because we rely on mysql tells us if it is happy or not - // If the node is unreachable - if err != nil { - er.RecordError(err) - shard.shardStatusCollector.recordProblematics(instance) - if unreachableError(err) { - shard.shardStatusCollector.recordUnreachables(instance) - } - shard.logger.Errorf("%v get error while fetch group info: %v", instance.alias, err) - return - } - shard.sqlGroup.recordView(view) - }) - // Only raise error if we failed to get any data from mysql - // otherwise, we will use what we get from mysql directly - if len(er.Errors) == len(shard.instances) { - shard.logger.Errorf("fail to fetch any data for mysql") - return db.ErrGroupBackoffError - } - return shard.sqlGroup.Resolve() -} diff --git a/go/vt/vtgr/controller/diagnose_test.go b/go/vt/vtgr/controller/diagnose_test.go deleted file mode 100644 index c8b81bb70da..00000000000 --- a/go/vt/vtgr/controller/diagnose_test.go +++ /dev/null @@ -1,900 +0,0 @@ -/* -Copyright 2021 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package controller - -import ( - "context" - "errors" - "math" - "os" - "strconv" - "strings" - "testing" - "time" - - "github.com/golang/mock/gomock" - "github.com/stretchr/testify/assert" - - "vitess.io/vitess/go/mysql" - "vitess.io/vitess/go/vt/topo" - "vitess.io/vitess/go/vt/topo/memorytopo" - "vitess.io/vitess/go/vt/vtctl/grpcvtctldserver/testutil" - "vitess.io/vitess/go/vt/vtgr/config" - "vitess.io/vitess/go/vt/vtgr/db" - "vitess.io/vitess/go/vt/vtgr/inst" - - topodatapb "vitess.io/vitess/go/vt/proto/topodata" -) - -const diagnoseGroupSize = 3 - -var ( - testHost, _ = os.Hostname() - alias0 = "test_cell-0000000000" - alias1 = "test_cell-0000000001" - alias2 = "test_cell-0000000002" - testPort0 = 17000 - testPort1 = 17001 - testPort2 = 17002 -) - -type testGroupInput struct { - groupName string - readOnly bool - checkResult int - groupState []db.TestGroupState - gtid mysql.GTIDSet -} - -func TestShardIsHealthy(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - ctx := context.Background() - ts := memorytopo.NewServer("test_cell") - defer ts.Close() - ts.CreateKeyspace(ctx, "ks", &topodatapb.Keyspace{}) - ts.CreateShard(ctx, "ks", "0") - tmc := NewMockGRTmcClient(ctrl) - dbAgent := db.NewMockAgent(ctrl) - tablet1 := buildTabletInfo(uint32(testPort0), testHost, testPort0, topodatapb.TabletType_PRIMARY, time.Now()) - tablet2 := buildTabletInfo(uint32(testPort1), testHost, testPort1, topodatapb.TabletType_SPARE, time.Time{}) - tablet3 := buildTabletInfo(uint32(testPort2), testHost, testPort2, topodatapb.TabletType_REPLICA, time.Time{}) - 
testutil.AddTablet(ctx, t, ts, tablet1.Tablet, nil) - testutil.AddTablet(ctx, t, ts, tablet2.Tablet, nil) - testutil.AddTablet(ctx, t, ts, tablet3.Tablet, nil) - ts.UpdateShardFields(ctx, "ks", "0", func(si *topo.ShardInfo) error { - si.PrimaryAlias = tablet1.Alias - return nil - }) - dbAgent. - EXPECT(). - FetchGroupView(gomock.Any(), gomock.Any()). - DoAndReturn(func(alias string, target *inst.InstanceKey) (*db.GroupView, error) { - return db.BuildGroupView(alias, "group", testHost, testPort0, false, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }), nil - }). - AnyTimes() - tmc.EXPECT().Ping(gomock.Any(), gomock.Any()).Return(nil).AnyTimes() - cfg := &config.VTGRConfig{BootstrapGroupSize: 3, MinNumReplica: 2, BackoffErrorWaitTimeSeconds: 1, BootstrapWaitTimeSeconds: 1} - shard := NewGRShard("ks", "0", nil, tmc, ts, dbAgent, cfg, testPort0, true) - shard.refreshTabletsInShardLocked(ctx) - diagnose, _ := shard.Diagnose(ctx) - assert.Equal(t, DiagnoseTypeHealthy, string(diagnose)) -} - -func TestTabletIssueDiagnoses(t *testing.T) { - type data struct { - pingable bool - ttype topodatapb.TabletType - } - var tablettests = []struct { - name string - expected DiagnoseType - errMessage string - primaryAlias string - inputs []data - }{ - {name: "healthy shard", expected: DiagnoseTypeHealthy, errMessage: "", primaryAlias: "test_cell-0000017000", inputs: []data{ - {true, topodatapb.TabletType_PRIMARY}, - {true, topodatapb.TabletType_REPLICA}, - {true, topodatapb.TabletType_REPLICA}, - }}, - {name: "non primary tablet is not pingable", expected: DiagnoseTypeHealthy, errMessage: "", primaryAlias: "test_cell-0000017000", inputs: []data{ // vtgr should do nothing - {true, 
topodatapb.TabletType_PRIMARY}, - {false, topodatapb.TabletType_REPLICA}, - {false, topodatapb.TabletType_REPLICA}, - }}, - {name: "primary tablet is not pingable", expected: DiagnoseTypeUnreachablePrimary, errMessage: "", primaryAlias: "test_cell-0000017000", inputs: []data{ // vtgr should trigger a failover - {false, topodatapb.TabletType_PRIMARY}, - {true, topodatapb.TabletType_REPLICA}, - {true, topodatapb.TabletType_REPLICA}, - }}, - {name: "no primary tablet", expected: DiagnoseTypeWrongPrimaryTablet, errMessage: "", primaryAlias: "", inputs: []data{ // vtgr should create one based on mysql - {true, topodatapb.TabletType_REPLICA}, - {true, topodatapb.TabletType_REPLICA}, - {true, topodatapb.TabletType_REPLICA}, - }}, - {name: "wrong primary in tablet types", expected: DiagnoseTypeWrongPrimaryTablet, errMessage: "", primaryAlias: "test_cell-0000017001", inputs: []data{ // shard info returns differently comparing with tablet type - {true, topodatapb.TabletType_PRIMARY}, - {true, topodatapb.TabletType_REPLICA}, - {true, topodatapb.TabletType_REPLICA}, - }}, - {name: "mysql and vttablet has different primary", expected: DiagnoseTypeWrongPrimaryTablet, errMessage: "", primaryAlias: "test_cell-0000017001", inputs: []data{ // vtgr should fix vttablet - {true, topodatapb.TabletType_REPLICA}, - {true, topodatapb.TabletType_PRIMARY}, - {true, topodatapb.TabletType_REPLICA}, - }}, - {name: "unreachable wrong vttablet primary", expected: DiagnoseTypeWrongPrimaryTablet, errMessage: "", primaryAlias: "test_cell-0000017001", inputs: []data{ // vtgr should fix vttablet - {true, topodatapb.TabletType_REPLICA}, - {false, topodatapb.TabletType_PRIMARY}, - {true, topodatapb.TabletType_REPLICA}, - }}, - {name: "unreachable uninitialized primary vttablet", expected: DiagnoseTypeUnreachablePrimary, errMessage: "", inputs: []data{ // vtgr should failover - {false, topodatapb.TabletType_REPLICA}, - {true, topodatapb.TabletType_REPLICA}, - {true, topodatapb.TabletType_REPLICA}, - }}, 
- } - for _, tt := range tablettests { - t.Run(tt.name, func(t *testing.T) { - expected := tt.expected - ctrl := gomock.NewController(t) - defer ctrl.Finish() - ts := NewMockGRTopo(ctrl) - tmc := NewMockGRTmcClient(ctrl) - dbAgent := db.NewMockAgent(ctrl) - tablets := make(map[string]*topo.TabletInfo) - if tt.primaryAlias == "" { - ts. - EXPECT(). - GetShard(gomock.Any(), gomock.Eq("ks"), gomock.Eq("0")). - Return(&topo.ShardInfo{Shard: &topodatapb.Shard{}}, nil) - } - for i, input := range tt.inputs { - id := uint32(testPort0 + i) - tablet := buildTabletInfo(id, testHost, testPort0+i, input.ttype, time.Now()) - tablets[tablet.AliasString()] = tablet - var response = struct { - pingable bool - }{input.pingable} - if tt.primaryAlias == tablet.AliasString() { - si := &topo.ShardInfo{ - Shard: &topodatapb.Shard{ - PrimaryAlias: tablet.Alias, - }, - } - ts. - EXPECT(). - GetShard(gomock.Any(), gomock.Eq("ks"), gomock.Eq("0")). - Return(si, nil) - } - dbAgent. - EXPECT(). - FetchGroupView(gomock.Any(), gomock.Any()). - DoAndReturn(func(alias string, target *inst.InstanceKey) (*db.GroupView, error) { - if target.Hostname == "" || target.Port == 0 { - return nil, errors.New("invalid mysql instance key") - } - return db.BuildGroupView(alias, "group", testHost, testPort0, false, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }), nil - }). - AnyTimes() - tmc. - EXPECT(). 
- Ping(gomock.Any(), &topodatapb.Tablet{ - Alias: tablet.Alias, - Hostname: tablet.Hostname, - Keyspace: tablet.Keyspace, - Shard: tablet.Shard, - Type: tablet.Type, - Tags: tablet.Tags, - MysqlHostname: tablet.MysqlHostname, - MysqlPort: tablet.MysqlPort, - PrimaryTermStartTime: tablet.PrimaryTermStartTime, - }). - DoAndReturn(func(_ context.Context, t *topodatapb.Tablet) error { - if !response.pingable { - return errors.New("unreachable") - } - return nil - }). - AnyTimes() - } - ts. - EXPECT(). - GetTabletMapForShardByCell(gomock.Any(), gomock.Eq("ks"), gomock.Eq("0"), gomock.Any()). - Return(tablets, nil) - - ctx := context.Background() - cfg := &config.VTGRConfig{BootstrapGroupSize: diagnoseGroupSize, MinNumReplica: 2, BackoffErrorWaitTimeSeconds: 1, BootstrapWaitTimeSeconds: 1} - shard := NewGRShard("ks", "0", nil, tmc, ts, dbAgent, cfg, testPort0, true) - shard.refreshTabletsInShardLocked(ctx) - diagnose, err := shard.Diagnose(ctx) - assert.Equal(t, expected, diagnose) - if tt.errMessage == "" { - assert.NoError(t, err) - } else { - assert.Error(t, err) - assert.True(t, strings.Contains(err.Error(), tt.errMessage), err.Error()) - } - }) - } -} - -func TestMysqlIssueDiagnoses(t *testing.T) { - cfg := &config.VTGRConfig{BootstrapGroupSize: diagnoseGroupSize, MinNumReplica: 2, BackoffErrorWaitTimeSeconds: 1, BootstrapWaitTimeSeconds: 1} - disableProtectionCfg := &config.VTGRConfig{BootstrapGroupSize: diagnoseGroupSize, MinNumReplica: 2, DisableReadOnlyProtection: true, BackoffErrorWaitTimeSeconds: 1, BootstrapWaitTimeSeconds: 1} - heartbeatThreshold = 10 - defer func() { - heartbeatThreshold = math.MaxInt64 - }() - type data struct { - alias string - groupName string - readOnly bool - checkResult int - groupInput []db.TestGroupState - ttype topodatapb.TabletType - } - var sqltests = []struct { - name string - expected DiagnoseType - errMessage string - config *config.VTGRConfig - inputs []data - removeTablets []string // to simulate missing tablet in topology - 
}{ - {name: "healthy shard", expected: DiagnoseTypeHealthy, errMessage: "", inputs: []data{ - {alias0, "group", false, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias1, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias2, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {name: "recovering primary shard", expected: DiagnoseTypeBackoffError, errMessage: "", inputs: []data{ - {alias0, "group", false, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "RECOVERING", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias1, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: 
strconv.Itoa(testPort0), MemberState: "RECOVERING", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias2, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "RECOVERING", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {name: "no group in shard", expected: DiagnoseTypeShardHasNoGroup, errMessage: "", inputs: []data{ - {alias0, "", true, 0, []db.TestGroupState{ - {MemberHost: "", MemberPort: "", MemberState: "OFFLINE", MemberRole: ""}, - }, topodatapb.TabletType_REPLICA}, - {alias1, "", true, 0, []db.TestGroupState{ - {MemberHost: "", MemberPort: "", MemberState: "OFFLINE", MemberRole: ""}, - }, topodatapb.TabletType_REPLICA}, - {alias2, "", true, 0, []db.TestGroupState{ - {MemberHost: "", MemberPort: "", MemberState: "OFFLINE", MemberRole: ""}, - }, topodatapb.TabletType_REPLICA}, - }}, - {name: "fail to bootstrap with incorrect number of nodes", expected: DiagnoseTypeError, errMessage: "fail to diagnose ShardHasNoGroup with 3 nodes", inputs: []data{ - {alias0, "", true, 0, []db.TestGroupState{ - {MemberHost: "", MemberPort: "", MemberState: "OFFLINE", MemberRole: ""}, - }, topodatapb.TabletType_REPLICA}, - {alias1, "", true, 0, []db.TestGroupState{ - {MemberHost: "", MemberPort: "", MemberState: "OFFLINE", MemberRole: ""}, - }, topodatapb.TabletType_REPLICA}, - {alias2, "", true, 0, []db.TestGroupState{ - {MemberHost: "", MemberPort: "", MemberState: "OFFLINE", MemberRole: ""}, - }, 
topodatapb.TabletType_REPLICA}, - }, config: &config.VTGRConfig{BootstrapGroupSize: 2, MinNumReplica: 2, BackoffErrorWaitTimeSeconds: 1, BootstrapWaitTimeSeconds: 1}}, - {name: "unreachable node", expected: DiagnoseTypeBackoffError, errMessage: "", inputs: []data{ - {alias0, "group", false, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "UNREACHABLE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias1, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "UNREACHABLE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias2, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "UNREACHABLE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {name: "mysql and tablet has different primary", expected: DiagnoseTypeWrongPrimaryTablet, errMessage: "", inputs: []data{ // vtgr should failover vttablet - {alias0, "group", false, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias1, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: 
strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias2, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {name: "mysql primary out of topology", expected: DiagnoseTypeUnreachablePrimary, errMessage: "", inputs: []data{ // vtgr should failover mysql - {alias0, "group", false, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias1, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias2, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: 
"SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }, removeTablets: []string{alias0}}, - {name: "one error node", expected: DiagnoseTypeUnconnectedReplica, errMessage: "", inputs: []data{ - {alias0, "group", false, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ERROR", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias1, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ERROR", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias2, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ERROR", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {name: "inactive group with divergent state", expected: DiagnoseTypeShardHasInactiveGroup, errMessage: "", inputs: []data{ - {alias0, "group", true, 11, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "OFFLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ERROR", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias1, "group", true, 11, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "OFFLINE", MemberRole: ""}, - }, topodatapb.TabletType_REPLICA}, - {alias2, "group", true, 11, 
[]db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "OFFLINE", MemberRole: ""}, - }, topodatapb.TabletType_REPLICA}, - }}, - {name: "two error node", expected: DiagnoseTypeInsufficientGroupSize, errMessage: "", inputs: []data{ - {alias0, "group", false, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ERROR", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ERROR", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias1, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ERROR", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias2, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ERROR", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {name: "insufficient group member", expected: DiagnoseTypeInsufficientGroupSize, errMessage: "", inputs: []data{ - {alias0, "group", false, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias1, "group", true, 0, []db.TestGroupState{}, topodatapb.TabletType_REPLICA}, - {alias2, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {name: "unconnected node", expected: DiagnoseTypeBackoffError, errMessage: "", 
inputs: []data{ - {alias0, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "UNREACHABLE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias1, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "UNREACHABLE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias2, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "UNREACHABLE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {name: "unreachable primary", expected: DiagnoseTypeBackoffError, errMessage: "", inputs: []data{ - {alias0, "group", false, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "UNREACHABLE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias1, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "UNREACHABLE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, 
topodatapb.TabletType_REPLICA}, - {alias2, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "UNREACHABLE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {name: "more than one group name", expected: DiagnoseTypeError, errMessage: "fail to refreshSQLGroup: group has split brain", inputs: []data{ // vtgr should raise error - {alias0, "group", false, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias1, "group_xxx", false, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias2, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {name: "different primary", expected: DiagnoseTypeError, errMessage: "fail to refreshSQLGroup: group has split brain", inputs: []data{ // vtgr should raise error - {alias0, "group", false, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias1, "group", false, 0, []db.TestGroupState{ - {MemberHost: 
testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias2, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {name: "insufficient members in group", expected: DiagnoseTypeInsufficientGroupSize, errMessage: "", inputs: []data{ - {alias0, "group", false, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias1, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias2, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ERROR", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - // the shard has insufficient member, but the primary is already read_only - // we should try to connect the replica node - {name: "insufficient members in read only shard", expected: DiagnoseTypeUnconnectedReplica, errMessage: "", inputs: []data{ - {alias0, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "RECOVERING", 
MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias1, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias2, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ERROR", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {name: "insufficient members in group with disable read only protection", expected: DiagnoseTypeUnconnectedReplica, errMessage: "", config: disableProtectionCfg, inputs: []data{ - {alias0, "group", false, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias1, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias2, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ERROR", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {name: "read only with disable read only protection", expected: DiagnoseTypeReadOnlyShard, errMessage: "", config: disableProtectionCfg, inputs: []data{ - {alias0, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "RECOVERING", MemberRole: 
"SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias1, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias2, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ERROR", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {name: "read only healthy shard", expected: DiagnoseTypeReadOnlyShard, errMessage: "", inputs: []data{ - {alias0, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias1, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias2, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {name: "inconsistent member state", expected: 
DiagnoseTypeBackoffError, errMessage: "", inputs: []data{ - {alias0, "group", true, 11, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "OFFLINE", MemberRole: ""}, - }, topodatapb.TabletType_REPLICA}, - {alias1, "group", true, 12, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias2, "group", true, math.MaxInt64, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "OFFLINE", MemberRole: ""}, - }, topodatapb.TabletType_REPLICA}, - }}, - {name: "network partition", expected: DiagnoseTypeBackoffError, errMessage: "", inputs: []data{ - {alias0, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "UNREACHABLE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "UNREACHABLE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias1, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "OFFLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias2, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "OFFLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {name: "start bootstrap in progress", expected: DiagnoseTypeBootstrapBackoff, errMessage: "", inputs: []data{ - {alias0, "group", true, 0, []db.TestGroupState{ - {MemberHost: testHost, 
MemberPort: strconv.Itoa(testPort1), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias1, "", true, 0, []db.TestGroupState{}, topodatapb.TabletType_REPLICA}, - {alias2, "", true, 0, []db.TestGroupState{ - {MemberHost: "", MemberPort: "", MemberState: "OFFLINE", MemberRole: ""}, - }, topodatapb.TabletType_REPLICA}, - }}, - } - for _, tt := range sqltests { - t.Run(tt.name, func(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - ts := NewMockGRTopo(ctrl) - tmc := NewMockGRTmcClient(ctrl) - dbAgent := db.NewMockAgent(ctrl) - tablets := make(map[string]*topo.TabletInfo) - expected := tt.expected - inputMap := make(map[string]testGroupInput) - if tt.config == nil { - tt.config = cfg - } - conf := tt.config - hasPrimary := false - for i, input := range tt.inputs { - id := uint32(i) - //id := uint32(testPort0 + i) - tablet := buildTabletInfo(id, testHost, testPort0+i, input.ttype, time.Now()) - tablets[tablet.AliasString()] = tablet - inputMap[input.alias] = testGroupInput{ - input.groupName, - input.readOnly, - input.checkResult, - input.groupInput, - nil, - } - if tablet.Type == topodatapb.TabletType_PRIMARY { - si := &topo.ShardInfo{ - Shard: &topodatapb.Shard{ - PrimaryAlias: tablet.Alias, - }, - } - ts. - EXPECT(). - GetShard(gomock.Any(), gomock.Eq("ks"), gomock.Eq("0")). - Return(si, nil) - hasPrimary = true - } - dbAgent. - EXPECT(). - FetchGroupView(gomock.Any(), gomock.Any()). - DoAndReturn(func(alias string, target *inst.InstanceKey) (*db.GroupView, error) { - if target.Hostname == "" || target.Port == 0 { - return nil, errors.New("invalid mysql instance key") - } - s := inputMap[alias] - view := db.BuildGroupView(alias, s.groupName, target.Hostname, target.Port, s.readOnly, s.checkResult, s.groupState) - return view, nil - }). - AnyTimes() - } - if !hasPrimary { - ts. - EXPECT(). - GetShard(gomock.Any(), gomock.Eq("ks"), gomock.Eq("0")). 
- Return(&topo.ShardInfo{Shard: &topodatapb.Shard{}}, nil) - } - for _, tid := range tt.removeTablets { - delete(tablets, tid) - } - ts. - EXPECT(). - GetTabletMapForShardByCell(gomock.Any(), gomock.Eq("ks"), gomock.Eq("0"), gomock.Any()). - Return(tablets, nil) - tmc.EXPECT().Ping(gomock.Any(), gomock.Any()).Return(nil).AnyTimes() - - ctx := context.Background() - shard := NewGRShard("ks", "0", nil, tmc, ts, dbAgent, conf, testPort0, true) - shard.refreshTabletsInShardLocked(ctx) - diagnose, err := shard.Diagnose(ctx) - assert.Equal(t, expected, diagnose) - if tt.errMessage == "" { - assert.NoError(t, err) - } else { - assert.Error(t, err) - assert.True(t, strings.Contains(err.Error(), tt.errMessage), err.Error()) - } - }) - } -} - -func TestDiagnoseWithInactive(t *testing.T) { - cfg := &config.VTGRConfig{BootstrapGroupSize: diagnoseGroupSize, MinNumReplica: 2, BackoffErrorWaitTimeSeconds: 1, BootstrapWaitTimeSeconds: 1} - type data struct { - alias string - groupName string - readOnly bool - pingable bool - groupInput []db.TestGroupState - ttype topodatapb.TabletType - } - var sqltests = []struct { - name string - expected DiagnoseType - errMessage string - config *config.VTGRConfig - inputs []data - rebootstrapGroupSize int - removeTablets []string // to simulate missing tablet in topology - }{ - // although mysql and vitess has different primary, but since this is an active shard, VTGR won't fix that - {name: "mysql and tablet has different primary", expected: DiagnoseTypeHealthy, errMessage: "", inputs: []data{ - {alias0, "group", true, true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias1, "group", true, true, 
[]db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias2, "group", true, true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {name: "different primary with unconnected node", expected: DiagnoseTypeUnconnectedReplica, errMessage: "", inputs: []data{ - {alias0, "group", true, true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias1, "group", true, true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias2, "group", true, true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "OFFLINE", MemberRole: ""}, - }, topodatapb.TabletType_REPLICA}, - }}, - {name: "primary tablet is not pingable", expected: DiagnoseTypeHealthy, errMessage: "", inputs: []data{ - {alias0, "group", true, false, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: 
"ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias1, "group", true, true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias2, "group", true, true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - // This is a read only shard, but since it's an inactive shard we will diagnose it as healthy - {name: "read only healthy shard", expected: DiagnoseTypeHealthy, errMessage: "", inputs: []data{ - {alias0, "group", true, true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias1, "group", true, true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: 
"ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias2, "group", true, true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {name: "writable shard", expected: DiagnoseTypeInsufficientGroupSize, errMessage: "", inputs: []data{ - {alias0, "group", false, true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias1, "group", true, true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias2, "group", true, true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {name: 
"error when there are only two nodes", expected: DiagnoseTypeError, errMessage: "fail to diagnose ShardHasInactiveGroup with 3 nodes expecting 2", inputs: []data{ - {alias0, "group", true, true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "OFFLINE", MemberRole: ""}, - }, topodatapb.TabletType_REPLICA}, - {alias1, "group", true, true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "OFFLINE", MemberRole: ""}, - }, topodatapb.TabletType_REPLICA}, - {alias2, "group", true, true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "OFFLINE", MemberRole: ""}, - }, topodatapb.TabletType_REPLICA}, - }, rebootstrapGroupSize: 2}, - } - for _, tt := range sqltests { - t.Run(tt.name, func(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - ctx := context.Background() - ts := memorytopo.NewServer("test_cell") - defer ts.Close() - ts.CreateKeyspace(ctx, "ks", &topodatapb.Keyspace{}) - ts.CreateShard(ctx, "ks", "0") - tmc := NewMockGRTmcClient(ctrl) - dbAgent := db.NewMockAgent(ctrl) - expected := tt.expected - inputMap := make(map[string]testGroupInput) - pingable := make(map[string]bool) - if tt.config == nil { - tt.config = cfg - } - conf := tt.config - for i, input := range tt.inputs { - tablet := buildTabletInfo(uint32(i), testHost, testPort0+i, input.ttype, time.Now()) - testutil.AddTablet(ctx, t, ts, tablet.Tablet, nil) - inputMap[input.alias] = testGroupInput{ - input.groupName, - input.readOnly, - 0, - input.groupInput, - nil, - } - pingable[input.alias] = input.pingable - if tablet.Type == topodatapb.TabletType_PRIMARY { - ts.UpdateShardFields(ctx, "ks", "0", func(si *topo.ShardInfo) error { - si.PrimaryAlias = tablet.Alias - return nil - }) - } - dbAgent. - EXPECT(). - FetchGroupView(gomock.Any(), gomock.Any()). 
- DoAndReturn(func(alias string, target *inst.InstanceKey) (*db.GroupView, error) { - if target.Hostname == "" || target.Port == 0 { - return nil, errors.New("invalid mysql instance key") - } - s := inputMap[alias] - view := db.BuildGroupView(alias, s.groupName, target.Hostname, target.Port, s.readOnly, s.checkResult, s.groupState) - return view, nil - }). - AnyTimes() - tmc. - EXPECT(). - Ping(gomock.Any(), &topodatapb.Tablet{ - Alias: tablet.Alias, - Hostname: tablet.Hostname, - Keyspace: tablet.Keyspace, - Shard: tablet.Shard, - Type: tablet.Type, - Tags: tablet.Tags, - MysqlHostname: tablet.MysqlHostname, - MysqlPort: tablet.MysqlPort, - PrimaryTermStartTime: tablet.PrimaryTermStartTime, - }). - DoAndReturn(func(_ context.Context, t *topodatapb.Tablet) error { - if !pingable[tablet.Alias.String()] { - return errors.New("unreachable") - } - return nil - }). - AnyTimes() - } - shard := NewGRShard("ks", "0", nil, tmc, ts, dbAgent, conf, testPort0, false) - if tt.rebootstrapGroupSize != 0 { - shard.OverrideRebootstrapGroupSize(tt.rebootstrapGroupSize) - } - shard.refreshTabletsInShardLocked(ctx) - diagnose, err := shard.Diagnose(ctx) - assert.Equal(t, expected, diagnose) - if tt.errMessage == "" { - assert.NoError(t, err) - } else { - assert.Error(t, err) - assert.True(t, strings.Contains(err.Error(), tt.errMessage), err.Error()) - } - }) - } -} - -func TestGroupStatusRecorder(t *testing.T) { - r := &groupGTIDRecorder{} - - err := r.recordGroupStatus("group1", true) - assert.NoError(t, err) - assert.Equal(t, r.name, "group1") - assert.Equal(t, r.hasActive, true) - - err = r.recordGroupStatus("group2", false) - assert.Error(t, err, "group has more than one group name") - assert.Equal(t, r.name, "group1") - - err = r.recordGroupStatus("group1", false) - assert.NoError(t, err) - assert.Equal(t, r.name, "group1") - assert.Equal(t, r.hasActive, true) - - pos1, err := mysql.ParsePosition(mysql.Mysql56FlavorID, "264a8230-67d2-11eb-acdd-0a8d91f24125:1-22:1000019-1000021") 
- assert.NoError(t, err) - inst1 := &grInstance{alias: "alias1"} - r.recordGroupGTIDs(pos1.GTIDSet, inst1) - pos2, err := mysql.ParsePosition(mysql.Mysql56FlavorID, "264a8230-67d2-11eb-acdd-0a8d91f24125:1-1000021") - assert.NoError(t, err) - inst2 := &grInstance{alias: "alias2"} - r.recordGroupGTIDs(pos2.GTIDSet, inst2) - assert.Equal(t, len(r.gtidWithInstances), 2) - assert.Equal(t, r.gtidWithInstances[0].instance, inst1) - assert.Equal(t, pos1.GTIDSet.Equal(r.gtidWithInstances[0].gtids), true) - assert.Equal(t, r.gtidWithInstances[1].instance, inst2) - assert.Equal(t, pos2.GTIDSet.Equal(r.gtidWithInstances[1].gtids), true) -} diff --git a/go/vt/vtgr/controller/error.go b/go/vt/vtgr/controller/error.go deleted file mode 100644 index 5613c802524..00000000000 --- a/go/vt/vtgr/controller/error.go +++ /dev/null @@ -1,25 +0,0 @@ -/* -Copyright 2021 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package controller - -import "errors" - -var ( - errMissingPrimaryTablet = errors.New("no primary tablet available") - errMissingGroup = errors.New("no mysql group") - errForceAbortBootstrap = errors.New("force abort bootstrap") -) diff --git a/go/vt/vtgr/controller/group.go b/go/vt/vtgr/controller/group.go deleted file mode 100644 index 3469d63acbb..00000000000 --- a/go/vt/vtgr/controller/group.go +++ /dev/null @@ -1,443 +0,0 @@ -/* -Copyright 2021 The Vitess Authors. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package controller - -import ( - "fmt" - "math" - "sort" - "strings" - "sync" - - "github.com/spf13/pflag" - - "vitess.io/vitess/go/stats" - "vitess.io/vitess/go/vt/servenv" - "vitess.io/vitess/go/vt/vtgr/db" - "vitess.io/vitess/go/vt/vtgr/inst" - "vitess.io/vitess/go/vt/vtgr/log" -) - -var ( - groupOnlineSize = stats.NewGaugesWithMultiLabels("MysqlGroupOnlineSize", "Online MySQL server in the group", []string{"Keyspace", "Shard"}) - isLostQuorum = stats.NewGaugesWithMultiLabels("MysqlGroupLostQuorum", "If MySQL group lost quorum", []string{"Keyspace", "Shard"}) - - heartbeatThreshold int -) - -func init() { - servenv.OnParseFor("vtgr", func(fs *pflag.FlagSet) { - fs.IntVar(&heartbeatThreshold, "group_heartbeat_threshold", 0, "VTGR will trigger backoff on inconsistent state if the group heartbeat staleness exceeds this threshold (in seconds). 
Should be used along with --enable_heartbeat_check.") - }) -} - -// SQLGroup contains views from all the nodes within the shard -type SQLGroup struct { - views []*db.GroupView - resolvedView *ResolvedView - logger *log.Logger - expectedBootstrapSize int - // rebootstrapSize is init to 0 - // when it is not 0, we allow some nodes to be unhealthy during a rebootstrap - rebootstrapSize int - singlePrimary bool - heartbeatThreshold int - statsTags []string - sync.Mutex -} - -// NewSQLGroup creates a new SQLGroup -func NewSQLGroup(size int, singlePrimary bool, keyspace, shard string) *SQLGroup { - return &SQLGroup{ - expectedBootstrapSize: size, - rebootstrapSize: 0, - singlePrimary: singlePrimary, - statsTags: []string{keyspace, shard}, - logger: log.NewVTGRLogger(keyspace, shard), - heartbeatThreshold: heartbeatThreshold, - } -} - -// ResolvedView is the resolved view -type ResolvedView struct { - groupName string - view map[inst.InstanceKey]db.GroupMember - logger *log.Logger -} - -// recordView adds a view to the group -func (group *SQLGroup) recordView(view *db.GroupView) { - group.Lock() - defer group.Unlock() - group.views = append(group.views, view) -} - -// overrideView overrides a view to the group -func (group *SQLGroup) overrideView(views []*db.GroupView) { - group.Lock() - defer group.Unlock() - group.views = views - group.resolveLocked() -} - -// clear reset the views -func (group *SQLGroup) clear() { - group.Lock() - defer group.Unlock() - group.views = nil - group.resolvedView = nil -} - -// GetViews returns views from everyone in the group -func (group *SQLGroup) GetViews() []*db.GroupView { - group.Lock() - defer group.Unlock() - return group.views -} - -// GetGroupName returns the group name -func (group *SQLGroup) GetGroupName() string { - group.Lock() - defer group.Unlock() - rv := group.resolvedView - return rv.groupName -} - -// GetOnlineGroupInfo returns number of online members in the group and also if the primary is read only -func (group 
*SQLGroup) GetOnlineGroupInfo() (int, bool) { - group.Lock() - defer group.Unlock() - rv := group.resolvedView - view := rv.view - onlineSize := 0 - isPrimaryReadOnly := false - for _, status := range view { - if status.State == db.ONLINE { - onlineSize++ - } - if status.Role == db.PRIMARY { - isPrimaryReadOnly = isPrimaryReadOnly || status.ReadOnly - } - } - return onlineSize, isPrimaryReadOnly -} - -// IsUnconnectedReplica checks if the node is connected to a group -func (group *SQLGroup) IsUnconnectedReplica(instanceKey *inst.InstanceKey) bool { - if instanceKey == nil { - return false - } - group.Lock() - defer group.Unlock() - rv := group.resolvedView - view := rv.view - status, ok := view[*instanceKey] - if !ok { - return true - } - return status.State != db.ONLINE && status.State != db.RECOVERING -} - -// IsAllOfflineOrError returns true if all the nodes are in offline mode -func (group *SQLGroup) IsAllOfflineOrError() bool { - group.Lock() - defer group.Unlock() - rv := group.resolvedView - view := rv.view - for _, status := range view { - if status.State != db.OFFLINE && status.State != db.ERROR { - return false - } - } - return true -} - -// GetStatus returns GroupMember status for given a host -func (group *SQLGroup) GetStatus(instanceKey *inst.InstanceKey) *db.GroupMember { - if instanceKey == nil { - return nil - } - group.Lock() - defer group.Unlock() - rv := group.resolvedView - view := rv.view - status, ok := view[*instanceKey] - if !ok { - return nil - } - return &status -} - -// IsSafeToBootstrap checks if it is safe to bootstrap a mysql group -func (group *SQLGroup) IsSafeToBootstrap() bool { - group.Lock() - defer group.Unlock() - // for bootstrap we require group at least has quorum number of views - // this is to make sure we don't bootstrap a group improperly - if len(group.views) < group.expectedBootstrapSize { - group.logger.Errorf("[sql_group] cannot bootstrap because we only have %v views | expected %v", len(group.views), 
group.expectedBootstrapSize) - return false - } - return group.isSafeToRebootstrapLocked() -} - -// IsSafeToRebootstrap checks if it is safe to rebootstrap a group -// It does not check group size as IsSafeToBootstrap, since when we -// reach here it means VTGR already checked there were group expectedBootstrapSize -// number of nodes in topo server, therefore we just rebootstrap -// as long as we can reach all the nodes in topo server -func (group *SQLGroup) IsSafeToRebootstrap() bool { - group.Lock() - defer group.Unlock() - return group.isSafeToRebootstrapLocked() -} - -func (group *SQLGroup) isSafeToRebootstrapLocked() bool { - // we think it is safe to bootstrap a group if all the views don't have a primary host - host, port, _ := group.getPrimaryLocked() - if host != "" || port != 0 { - group.logger.Warningf("not safe to bootstrap sql group because %v/%v might already be primary", host, port) - } - return host == "" && port == 0 -} - -// GetPrimary returns the hostname, port of the primary that everyone agreed on -// isActive bool indicates if there is any node in the group whose primary is "ONLINE" -func (group *SQLGroup) GetPrimary() (string, int, bool) { - group.Lock() - defer group.Unlock() - return group.getPrimaryLocked() -} - -func (group *SQLGroup) getPrimaryLocked() (string, int, bool) { - rv := group.resolvedView - view := rv.view - for instance, status := range view { - if status.Role == db.PRIMARY { - return instance.Hostname, instance.Port, status.State == db.ONLINE - } - } - return "", 0, false -} - -// Resolve merges the views into a map -func (group *SQLGroup) Resolve() error { - group.Lock() - defer group.Unlock() - return group.resolveLocked() -} -func (group *SQLGroup) resolveLocked() error { - rv := &ResolvedView{logger: group.logger} - group.resolvedView = rv - // a node that is not in the group might be outlier with big lag - // iterate over all views to get global minStalenessResult first - minStalenessResult := math.MaxInt32 - for _, 
view := range group.views { - if view.HeartbeatStaleness < minStalenessResult { - minStalenessResult = view.HeartbeatStaleness - } - } - m := make(map[inst.InstanceKey]db.GroupMember) - for _, view := range group.views { - if rv.groupName == "" && view.GroupName != "" { - rv.groupName = view.GroupName - } - if view.GroupName != "" && rv.groupName != view.GroupName { - group.logger.Errorf("previous group name %v found %v", rv.groupName, view.GroupName) - return db.ErrGroupSplitBrain - } - for _, member := range view.UnresolvedMembers { - instance := view.CreateInstanceKey(member) - memberState := member.State - memberRole := member.Role - isReadOnly := member.ReadOnly - st, ok := m[instance] - if !ok { - m[instance] = db.GroupMember{ - HostName: instance.Hostname, - Port: instance.Port, - State: memberState, - Role: memberRole, - ReadOnly: isReadOnly, - } - continue - } - if st.State == memberState && st.Role == memberRole && st.ReadOnly == isReadOnly { - continue - } - // Members in a group should eventually converge on a state - // if there is a partition, then a node should be removed from - // a group. 
If a node is reported as ONLINE together with - // some other state, we back off if we see a node with diverged state - if memberState != db.UNKNOWNSTATE && - st.State != db.UNKNOWNSTATE && - st.State != memberState && - (st.State == db.ONLINE || memberState == db.ONLINE) { - group.logger.Warningf("found inconsistent member state for %v: %v vs %v", instance.Hostname, st.State, memberState) - if group.heartbeatThreshold != 0 && - // Check minStalenessResult among the group is not math.MaxInt32 - // which means at least one node returns the lag from _vt.heartbeat table - // otherwise we don't trigger backoff on inconsistent state - minStalenessResult != math.MaxInt32 && - minStalenessResult >= group.heartbeatThreshold { - group.logger.Warningf("ErrGroupBackoffError by staled heartbeat check %v", minStalenessResult) - var sb strings.Builder - for _, view := range group.views { - sb.WriteString(fmt.Sprintf("%v staleness=%v\n", view.MySQLHost, view.HeartbeatStaleness)) - } - group.logger.Warningf("%v", sb.String()) - return db.ErrGroupBackoffError - } - } - m[instance] = db.GroupMember{ - HostName: instance.Hostname, - Port: instance.Port, - State: group.mergeState(st.State, memberState), - Role: group.mergeRole(st.Role, memberRole), - ReadOnly: st.ReadOnly || isReadOnly, - } - } - } - rv.view = m - return group.resolvedView.validate(group.singlePrimary, group.statsTags) -} - -func (rv *ResolvedView) validate(singlePrimary bool, statsTags []string) error { - if !rv.hasGroup() { - rv.logger.Info("Resolved view does not have a group") - return nil - } - hasPrimary := false - primaryState := db.UNKNOWNSTATE - var onlineCount, recoveringCount, unreachableCount, offlineCount, errorCount int - for _, status := range rv.view { - if status.Role == db.PRIMARY { - if singlePrimary && hasPrimary { - rv.logger.Errorf("Found more than one primary in the group") - return db.ErrGroupSplitBrain - } - hasPrimary = true - primaryState = status.State - if status.State != db.ONLINE { - 
rv.logger.Warningf("Found a PRIMARY not ONLINE (%v)", status.State) - } - } - switch status.State { - case db.ONLINE: - onlineCount++ - case db.UNREACHABLE: - unreachableCount++ - case db.OFFLINE: - offlineCount++ - case db.ERROR: - errorCount++ - case db.RECOVERING: - recoveringCount++ - } - } - groupOnlineSize.Set(statsTags, int64(onlineCount)) - if unreachableCount > 0 || errorCount > 0 || offlineCount > 0 { - rv.logger.Warningf("Some of nodes are unconnected in the group. hasPrimary=%v (%v), online_count=%v, recovering_count=%v, unreachable_count=%v, offline_count=%v, error_count=%v", hasPrimary, primaryState, onlineCount, recoveringCount, unreachableCount, offlineCount, errorCount) - } - if unreachableCount >= len(rv.view)/2+1 { - rv.logger.Errorf("Backoff error by quorum unreachable: found %v number of UNREACHABLE nodes while quorum is %v", unreachableCount, len(rv.view)/2+1) - isLostQuorum.Set(statsTags, 1) - } else { - isLostQuorum.Set(statsTags, 0) - } - // In theory there should be no UNREACHABLE nodes - // raise ErrGroupBackoffError to backoff and wait - // If we lost quorum, then the group is not writable - // If we still have a functioning group, we can backoff and wait - // the unreachable node should either be expelled or we have a frozen view - // Note: this means we should set group_replication_unreachable_majority_timeout - // greater than 0. Otherwise VTGR can see all nodes are ONLINE when a single node - // is partitioned and end up doing nothing. 
- if unreachableCount > 0 { - return db.ErrGroupBackoffError - } - // Ongoing bootstrap, we should backoff and wait - if recoveringCount == 1 && (offlineCount+recoveringCount == len(rv.view)) { - rv.logger.Warningf("Group has one recovery node with all others in offline mode") - return db.ErrGroupOngoingBootstrap - } - // We don't have quorum number of unreachable, but the primary is not online - // This most likely means there is a failover in the group we should back off and wait - if hasPrimary && primaryState != db.ONLINE { - rv.logger.Warningf("Found a PRIMARY that is not ONLINE (%v)", primaryState) - return db.ErrGroupBackoffError - } - // If all the node in view are OFFLINE or ERROR, it is an inactive group - // It is expected to have no primary in this case - if !hasPrimary && (offlineCount+errorCount != len(rv.view)) { - rv.logger.Warningf("Group is NOT all offline or error without a primary node") - return db.ErrGroupBackoffError - } - return nil -} - -func (rv *ResolvedView) hasGroup() bool { - return rv.groupName != "" -} - -func (group *SQLGroup) mergeState(s1, s2 db.MemberState) db.MemberState { - return db.MemberState(group.maxStatus(int(s1), int(s2))) -} - -func (group *SQLGroup) mergeRole(r1, r2 db.MemberRole) db.MemberRole { - return db.MemberRole(group.maxStatus(int(r1), int(r2))) -} - -func (group *SQLGroup) maxStatus(a, b int) int { - if a > b { - return a - } - return b -} - -// ToString returns a string representatino of the sql group -func (group *SQLGroup) ToString() string { - group.Lock() - defer group.Unlock() - var sb strings.Builder - views := group.views - for _, view := range views { - sb.WriteString(fmt.Sprintf("[%s] SQLGroup group=%s", view.TabletAlias, view.GroupName)) - for _, member := range view.UnresolvedMembers { - sb.WriteString(fmt.Sprintf(" | %s %s %s readonly=%v", member.HostName, member.Role, member.State, member.ReadOnly)) - } - sb.WriteString("\n") - } - rv := group.resolvedView - if rv != nil { - 
sb.WriteString("[resolved_view]\n") - sb.WriteString(fmt.Sprintf("group_name=%v\n", rv.groupName)) - keys := make([]inst.InstanceKey, 0, len(rv.view)) - for k := range rv.view { - keys = append(keys, k) - } - sort.Slice(keys, func(i, j int) bool { - return keys[i].Hostname < keys[j].Hostname - }) - for _, instance := range keys { - status := rv.view[instance] - sb.WriteString(fmt.Sprintf("[%s] state=%v role=%v readonly=%v\n", instance.Hostname, status.State, status.Role, status.ReadOnly)) - - } - } - return sb.String() -} diff --git a/go/vt/vtgr/controller/group_test.go b/go/vt/vtgr/controller/group_test.go deleted file mode 100644 index edfeca14500..00000000000 --- a/go/vt/vtgr/controller/group_test.go +++ /dev/null @@ -1,454 +0,0 @@ -/* -Copyright 2021 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package controller - -import ( - "math" - "testing" - - "vitess.io/vitess/go/vt/vtgr/log" - - "vitess.io/vitess/go/vt/vtgr/db" - "vitess.io/vitess/go/vt/vtgr/inst" - - "github.com/stretchr/testify/assert" -) - -func TestSQLGroupToString(t *testing.T) { - group := NewSQLGroup(2, true, "ks", "0") - v1 := db.NewGroupView("v1", "host1", 10) - v1.GroupName = "group_name" - var l1 []*db.GroupMember - var l2 []*db.GroupMember - m1 := db.NewGroupMember("ONLINE", "PRIMARY", "host1", 10, false) - m2 := db.NewGroupMember("ONLINE", "SECONDARY", "host2", 10, true) - m3 := db.NewGroupMember("OFFLINE", "SECONDARY", "host3", 10, true) - l1 = append(l1, m1) - l1 = append(l1, m2) - v1.UnresolvedMembers = l1 - l2 = append(l2, m3) - v2 := db.NewGroupView("v2", "host2", 10) - v2.GroupName = "group_name" - v2.UnresolvedMembers = l2 - group.recordView(v2) - group.recordView(v1) - assert.Equal(t, `[v2] SQLGroup group=group_name | host3 SECONDARY OFFLINE readonly=true -[v1] SQLGroup group=group_name | host1 PRIMARY ONLINE readonly=false | host2 SECONDARY ONLINE readonly=true -`, group.ToString()) - group.Resolve() - assert.Equal(t, `[v2] SQLGroup group=group_name | host3 SECONDARY OFFLINE readonly=true -[v1] SQLGroup group=group_name | host1 PRIMARY ONLINE readonly=false | host2 SECONDARY ONLINE readonly=true -[resolved_view] -group_name=group_name -[host1] state=ONLINE role=PRIMARY readonly=false -[host2] state=ONLINE role=SECONDARY readonly=true -[host3] state=OFFLINE role=SECONDARY readonly=true -`, group.ToString()) -} - -func TestGetGroupName(t *testing.T) { - group := NewSQLGroup(3, true, "ks", "0") - v1 := db.NewGroupView("v1", "host1", 10) - v1.GroupName = "group" - v1.UnresolvedMembers = []*db.GroupMember{ - db.NewGroupMember("OFFLINE", "", "host1", 10, true), - } - group.recordView(v1) - v2 := db.NewGroupView("v2", "host2", 10) - v2.GroupName = "group" - v2.UnresolvedMembers = []*db.GroupMember{ - db.NewGroupMember("OFFLINE", "", "", 0, true), - } - group.recordView(v2) - 
err := group.Resolve() - assert.NoError(t, err) - name := group.GetGroupName() - assert.Equal(t, "group", name) - v3 := db.NewGroupView("v3", "host3", 10) - v3.GroupName = "group_foo" - group.recordView(v3) - err = group.Resolve() - assert.Errorf(t, err, "group has split brain") - name = group.GetGroupName() - // group keeps the group name before finding a divergent group name - assert.Equal(t, "group", name) -} - -func TestIsActiveWithMultiplePrimary(t *testing.T) { - group := NewSQLGroup(2, true, "ks", "0") - v1 := db.NewGroupView("v1", "host1", 10) - v1.GroupName = "group" - v1.UnresolvedMembers = []*db.GroupMember{ - db.NewGroupMember("ONLINE", "PRIMARY", "host1", 10, false), - db.NewGroupMember("ONLINE", "SECONDARY", "host2", 10, true), - } - group.recordView(v1) - v2 := db.NewGroupView("v2", "host2", 10) - v2.GroupName = "group" - v2.UnresolvedMembers = []*db.GroupMember{ - db.NewGroupMember("ONLINE", "SECONDARY", "host1", 10, true), - db.NewGroupMember("ONLINE", "PRIMARY", "host2", 10, false), - } - group.recordView(v2) - err := group.Resolve() - assert.Errorf(t, err, "group network partition") -} - -func TestIsSafeToBootstrap(t *testing.T) { - group := NewSQLGroup(1, true, "ks", "0") - isSafe := group.IsSafeToBootstrap() - assert.False(t, isSafe) - v1 := db.NewGroupView("v1", "host1", 10) - v1.GroupName = "group" - v1.UnresolvedMembers = []*db.GroupMember{ - db.NewGroupMember("OFFLINE", "", "", 0, true), - db.NewGroupMember("OFFLINE", "", "", 0, true), - } - group.recordView(v1) - group.Resolve() - isSafe = group.IsSafeToBootstrap() - assert.True(t, isSafe) -} - -func TestIsSafeToBootstrapWithPrimary(t *testing.T) { - group := NewSQLGroup(1, true, "ks", "0") - v1 := db.NewGroupView("v1", "host1", 10) - v1.GroupName = "group" - // it is not safe to bootstrap if we see a primary node in group - v1.UnresolvedMembers = []*db.GroupMember{ - db.NewGroupMember("ONLINE", "PRIMARY", "host1", 0, false), - db.NewGroupMember("OFFLINE", "", "", 0, true), - } - 
group.recordView(v1) - group.Resolve() - isSafe := group.IsSafeToBootstrap() - assert.False(t, isSafe) -} - -func TestIsUnconnectedReplica(t *testing.T) { - group := NewSQLGroup(1, true, "ks", "0") - isSafe := group.IsSafeToBootstrap() - assert.False(t, isSafe) - v1 := db.NewGroupView("v1", "host1", 10) - v1.GroupName = "group" - v1.UnresolvedMembers = []*db.GroupMember{ - db.NewGroupMember("ONLINE", "PRIMARY", "host1", 10, false), - db.NewGroupMember("ONLINE", "SECONDARY", "host2", 10, true), - } - group.recordView(v1) - group.Resolve() - isUnconnected := group.IsUnconnectedReplica(&inst.InstanceKey{Hostname: "host2", Port: 10}) - assert.False(t, isUnconnected) -} - -func TestGetOnlineGroupSizeFromPrimary(t *testing.T) { - group := NewSQLGroup(1, true, "ks", "0") - isSafe := group.IsSafeToBootstrap() - assert.False(t, isSafe) - v1 := db.NewGroupView("v1", "host1", 10) - v1.GroupName = "group" - v1.UnresolvedMembers = []*db.GroupMember{ - db.NewGroupMember("ONLINE", "PRIMARY", "host1", 10, false), - db.NewGroupMember("ONLINE", "SECONDARY", "host2", 10, true), - db.NewGroupMember("RECOVERING", "SECONDARY", "host3", 10, true), - } - v2 := db.NewGroupView("v2", "host2", 10) - v2.GroupName = "group" - v2.UnresolvedMembers = []*db.GroupMember{} - group.recordView(v1) - group.recordView(v2) - group.Resolve() - size, readOnly := group.GetOnlineGroupInfo() - assert.Equal(t, 2, size) - assert.False(t, readOnly) -} - -func TestNetworkPartition(t *testing.T) { - group := NewSQLGroup(3, true, "ks", "0") - v1 := db.NewGroupView("v1", "host1", 10) - v1.GroupName = "group" - v1.UnresolvedMembers = []*db.GroupMember{ - db.NewGroupMember("ONLINE", "PRIMARY", "host1", 10, false), - db.NewGroupMember("UNREACHABLE", "SECONDARY", "host2", 10, true), - db.NewGroupMember("UNREACHABLE", "SECONDARY", "host3", 10, true), - } - v2 := db.NewGroupView("v2", "host2", 10) - v2.GroupName = "group" - v2.UnresolvedMembers = []*db.GroupMember{ - db.NewGroupMember("OFFLINE", "", "host2", 10, true), - 
} - v3 := db.NewGroupView("v3", "host3", 10) - v3.GroupName = "group" - v3.UnresolvedMembers = []*db.GroupMember{ - db.NewGroupMember("OFFLINE", "", "host3", 10, true), - } - group.recordView(v1) - group.recordView(v2) - group.recordView(v3) - err := group.Resolve() - assert.EqualErrorf(t, err, "group backoff error", err.Error()) - rv := group.resolvedView - assert.Equal(t, "group", rv.groupName) - assert.Equal(t, map[inst.InstanceKey]db.GroupMember{ - {Hostname: "host1", Port: 10}: {HostName: "host1", Port: 10, Role: db.PRIMARY, State: db.ONLINE, ReadOnly: false}, - {Hostname: "host2", Port: 10}: {HostName: "host2", Port: 10, Role: db.SECONDARY, State: db.UNREACHABLE, ReadOnly: true}, - {Hostname: "host3", Port: 10}: {HostName: "host3", Port: 10, Role: db.SECONDARY, State: db.UNREACHABLE, ReadOnly: true}, - }, rv.view) -} - -func TestInconsistentState(t *testing.T) { - group := NewSQLGroup(3, true, "ks", "0") - v1 := db.NewGroupView("v1", "host1", 10) - v1.GroupName = "group" - v1.HeartbeatStaleness = 11 - v1.UnresolvedMembers = []*db.GroupMember{ - db.NewGroupMember("ONLINE", "PRIMARY", "host1", 10, false), - db.NewGroupMember("ONLINE", "SECONDARY", "host2", 10, true), - db.NewGroupMember("ONLINE", "SECONDARY", "host3", 10, true), - } - v2 := db.NewGroupView("v2", "host2", 10) - v2.GroupName = "group" - v2.HeartbeatStaleness = 11 - v2.UnresolvedMembers = []*db.GroupMember{ - db.NewGroupMember("OFFLINE", "", "host2", 10, true), - } - v3 := db.NewGroupView("v3", "host3", 10) - v3.GroupName = "group" - v3.HeartbeatStaleness = 13 - v3.UnresolvedMembers = []*db.GroupMember{ - db.NewGroupMember("OFFLINE", "", "host3", 10, true), - } - group.recordView(v1) - group.recordView(v2) - group.recordView(v3) - group.heartbeatThreshold = 10 - err := group.Resolve() - assert.EqualErrorf(t, err, "group backoff error", err.Error()) - rv := group.resolvedView - assert.Equal(t, "group", rv.groupName) - assert.Nil(t, rv.view) -} - -func TestInconsistentStateWithInvalidStaleResult(t 
*testing.T) { - group := NewSQLGroup(3, true, "ks", "0") - v1 := db.NewGroupView("v1", "host1", 10) - v1.GroupName = "group" - v1.HeartbeatStaleness = math.MaxInt32 - v1.UnresolvedMembers = []*db.GroupMember{ - db.NewGroupMember("ONLINE", "PRIMARY", "host1", 10, false), - db.NewGroupMember("ONLINE", "SECONDARY", "host2", 10, true), - db.NewGroupMember("ONLINE", "SECONDARY", "host3", 10, true), - } - v2 := db.NewGroupView("v2", "host2", 10) - v2.GroupName = "group" - v2.HeartbeatStaleness = math.MaxInt32 - v2.UnresolvedMembers = []*db.GroupMember{ - db.NewGroupMember("OFFLINE", "", "host2", 10, true), - } - v3 := db.NewGroupView("v3", "host3", 10) - v3.GroupName = "group" - v3.HeartbeatStaleness = math.MaxInt32 - v3.UnresolvedMembers = []*db.GroupMember{ - db.NewGroupMember("OFFLINE", "", "host3", 10, true), - } - group.recordView(v1) - group.recordView(v2) - group.recordView(v3) - group.heartbeatThreshold = 10 - err := group.Resolve() - // Same setup as TestInconsistentState but because HeartbeatStaleness are all MaxInt32 - // the backoff is not triggered - assert.NoError(t, err) - rv := group.resolvedView - assert.Equal(t, "group", rv.groupName) -} - -func TestInconsistentUnknownState(t *testing.T) { - group := NewSQLGroup(3, true, "ks", "0") - v1 := db.NewGroupView("v1", "host1", 10) - v1.GroupName = "group" - v1.UnresolvedMembers = []*db.GroupMember{ - db.NewGroupMember("ONLINE", "PRIMARY", "host1", 10, false), - db.NewGroupMember("RECOVERING", "SECONDARY", "host2", 10, true), - db.NewGroupMember("ONLINE", "SECONDARY", "host3", 10, true), - } - v2 := db.NewGroupView("v2", "host2", 10) - v2.GroupName = "group" - v2.UnresolvedMembers = []*db.GroupMember{ - db.NewGroupMember("", "", "host2", 10, true), - } - v3 := db.NewGroupView("v3", "host3", 10) - v3.GroupName = "group" - v3.UnresolvedMembers = []*db.GroupMember{ - db.NewGroupMember("ONLINE", "SECONDARY", "host3", 10, true), - } - group.recordView(v1) - group.recordView(v2) - group.recordView(v3) - err := 
group.Resolve() - // host 2 reports itself with empty state - // therefore we shouldn't raise error even with inconsistent state - assert.NoError(t, err) - rv := group.resolvedView - assert.Equal(t, "group", rv.groupName) - assert.Equal(t, map[inst.InstanceKey]db.GroupMember{ - {Hostname: "host1", Port: 10}: {HostName: "host1", Port: 10, Role: db.PRIMARY, State: db.ONLINE, ReadOnly: false}, - {Hostname: "host2", Port: 10}: {HostName: "host2", Port: 10, Role: db.SECONDARY, State: db.RECOVERING, ReadOnly: true}, - {Hostname: "host3", Port: 10}: {HostName: "host3", Port: 10, Role: db.SECONDARY, State: db.ONLINE, ReadOnly: true}, - }, rv.view) -} - -func TestIsBootstrapInProcess(t *testing.T) { - group := NewSQLGroup(3, true, "ks", "0") - v1 := db.NewGroupView("v1", "host1", 10) - v1.GroupName = "group" - v1.UnresolvedMembers = []*db.GroupMember{ - db.NewGroupMember("RECOVERING", "SECONDARY", "host1", 10, false), - } - v2 := db.NewGroupView("v2", "host2", 10) - v2.GroupName = "group" - v2.UnresolvedMembers = []*db.GroupMember{ - db.NewGroupMember("OFFLINE", "", "host2", 10, false), - } - v3 := db.NewGroupView("v3", "host", 10) - v3.GroupName = "group" - v3.UnresolvedMembers = []*db.GroupMember{} - group.recordView(v1) - group.recordView(v2) - group.recordView(v3) - err := group.Resolve() - assert.Errorf(t, err, "group transient error") -} - -func TestResolve(t *testing.T) { - healthyView := []*db.GroupMember{ - {HostName: "host1", Port: 10, Role: db.PRIMARY, State: db.ONLINE, ReadOnly: false}, - {HostName: "host2", Port: 10, Role: db.SECONDARY, State: db.ONLINE, ReadOnly: true}, - {HostName: "host3", Port: 10, Role: db.SECONDARY, State: db.ONLINE, ReadOnly: true}, - } - var testCases = []struct { - testName string - views []*db.GroupView - expected *ResolvedView - errorMsg string - }{ - {"test healthy shard", []*db.GroupView{ - {MySQLHost: "host1", MySQLPort: 10, GroupName: "group", UnresolvedMembers: healthyView}, - {MySQLHost: "host2", MySQLPort: 10, GroupName: 
"group", UnresolvedMembers: healthyView}, - {MySQLHost: "host3", MySQLPort: 10, GroupName: "group", UnresolvedMembers: healthyView}, - }, &ResolvedView{"group", map[inst.InstanceKey]db.GroupMember{ - {Hostname: "host1", Port: 10}: {HostName: "host1", Port: 10, Role: db.PRIMARY, State: db.ONLINE, ReadOnly: false}, - {Hostname: "host2", Port: 10}: {HostName: "host2", Port: 10, Role: db.SECONDARY, State: db.ONLINE, ReadOnly: true}, - {Hostname: "host3", Port: 10}: {HostName: "host3", Port: 10, Role: db.SECONDARY, State: db.ONLINE, ReadOnly: true}, - }, nil}, ""}, - {"test readonly with unreachable primary", []*db.GroupView{ // host1 is unreachable - {MySQLHost: "host2", MySQLPort: 10, GroupName: "group", UnresolvedMembers: []*db.GroupMember{ - {HostName: "host1", Port: 10, Role: db.PRIMARY, State: db.ONLINE, ReadOnly: false}, - {HostName: "host2", Port: 10, Role: db.SECONDARY, State: db.ONLINE, ReadOnly: true}, - {HostName: "host3", Port: 10, Role: db.SECONDARY, State: db.ONLINE, ReadOnly: false}, - }}, - {MySQLHost: "host3", MySQLPort: 10, GroupName: "group", UnresolvedMembers: []*db.GroupMember{ - {HostName: "host1", Port: 10, Role: db.PRIMARY, State: db.ONLINE, ReadOnly: false}, - {HostName: "host2", Port: 10, Role: db.SECONDARY, State: db.ONLINE, ReadOnly: false}, - {HostName: "host3", Port: 10, Role: db.SECONDARY, State: db.ONLINE, ReadOnly: true}, - }}, - }, &ResolvedView{"group", map[inst.InstanceKey]db.GroupMember{ - {Hostname: "host1", Port: 10}: {HostName: "host1", Port: 10, Role: db.PRIMARY, State: db.ONLINE, ReadOnly: false}, - {Hostname: "host2", Port: 10}: {HostName: "host2", Port: 10, Role: db.SECONDARY, State: db.ONLINE, ReadOnly: true}, - {Hostname: "host3", Port: 10}: {HostName: "host3", Port: 10, Role: db.SECONDARY, State: db.ONLINE, ReadOnly: true}, - }, nil}, ""}, - {"test split brain by group name", []*db.GroupView{ - {MySQLHost: "host1", MySQLPort: 10, GroupName: "group", UnresolvedMembers: healthyView}, - {MySQLHost: "host2", MySQLPort: 10, 
GroupName: "group1", UnresolvedMembers: healthyView}, - {MySQLHost: "host3", MySQLPort: 10, GroupName: "group", UnresolvedMembers: healthyView}, - }, nil, "group has split brain"}, - {"test empty hostname", []*db.GroupView{ - {MySQLHost: "host1", MySQLPort: 10, GroupName: "group", UnresolvedMembers: []*db.GroupMember{ - {HostName: "", Port: 0, Role: db.UNKNOWNROLE, State: db.OFFLINE, ReadOnly: true}, - }}, - {MySQLHost: "host2", MySQLPort: 10, GroupName: "", UnresolvedMembers: []*db.GroupMember{ - {HostName: "host2", Port: 10, Role: db.UNKNOWNROLE, State: db.OFFLINE, ReadOnly: true}, - }}, - {MySQLHost: "host3", MySQLPort: 10, GroupName: "group", UnresolvedMembers: []*db.GroupMember{ - {HostName: "host3", Port: 10, Role: db.UNKNOWNROLE, State: db.OFFLINE, ReadOnly: true}, - }}, - }, &ResolvedView{"group", map[inst.InstanceKey]db.GroupMember{ - {Hostname: "host1", Port: 10}: {HostName: "host1", Port: 10, Role: db.UNKNOWNROLE, State: db.OFFLINE, ReadOnly: true}, - {Hostname: "host2", Port: 10}: {HostName: "host2", Port: 10, Role: db.UNKNOWNROLE, State: db.OFFLINE, ReadOnly: true}, - {Hostname: "host3", Port: 10}: {HostName: "host3", Port: 10, Role: db.UNKNOWNROLE, State: db.OFFLINE, ReadOnly: true}, - }, nil}, ""}, - {"test network partition by majority unreachable", []*db.GroupView{ - {MySQLHost: "host1", MySQLPort: 10, GroupName: "group", UnresolvedMembers: []*db.GroupMember{ - {HostName: "host1", Port: 10, Role: db.PRIMARY, State: db.UNREACHABLE, ReadOnly: false}, - {HostName: "host2", Port: 10, Role: db.SECONDARY, State: db.ONLINE, ReadOnly: true}, - {HostName: "host3", Port: 10, Role: db.SECONDARY, State: db.UNREACHABLE, ReadOnly: true}, - }}, - }, nil, "group backoff error"}, - {"test no network partition with less then majority unreachable", []*db.GroupView{ - {MySQLHost: "host1", MySQLPort: 10, GroupName: "group", UnresolvedMembers: []*db.GroupMember{ - {HostName: "host1", Port: 10, Role: db.PRIMARY, State: db.ONLINE, ReadOnly: false}, - {HostName: "host2", 
Port: 10, Role: db.SECONDARY, State: db.ONLINE, ReadOnly: true}, - {HostName: "host3", Port: 10, Role: db.SECONDARY, State: db.UNREACHABLE, ReadOnly: false}, - }}, - {MySQLHost: "host2", MySQLPort: 10, GroupName: "group", UnresolvedMembers: []*db.GroupMember{ - {HostName: "host1", Port: 10, Role: db.PRIMARY, State: db.ONLINE, ReadOnly: false}, - {HostName: "host2", Port: 10, Role: db.SECONDARY, State: db.ONLINE, ReadOnly: true}, - {HostName: "host3", Port: 10, Role: db.SECONDARY, State: db.UNREACHABLE, ReadOnly: false}, - }}, - }, &ResolvedView{"group", map[inst.InstanceKey]db.GroupMember{ - {Hostname: "host1", Port: 10}: {HostName: "host1", Port: 10, Role: db.PRIMARY, State: db.ONLINE, ReadOnly: false}, - {Hostname: "host2", Port: 10}: {HostName: "host2", Port: 10, Role: db.SECONDARY, State: db.ONLINE, ReadOnly: true}, - {Hostname: "host3", Port: 10}: {HostName: "host3", Port: 10, Role: db.SECONDARY, State: db.UNREACHABLE, ReadOnly: false}, - }, nil}, "group backoff error"}, - {"test network partition by unreachable primary", []*db.GroupView{ - {MySQLHost: "host2", MySQLPort: 10, GroupName: "group", UnresolvedMembers: []*db.GroupMember{ - {HostName: "host1", Port: 10, Role: db.PRIMARY, State: db.UNREACHABLE}, - {HostName: "host2", Port: 10, Role: db.SECONDARY, State: db.ONLINE}, - {HostName: "host3", Port: 10, Role: db.SECONDARY, State: db.ONLINE}, - }}, - {MySQLHost: "host3", MySQLPort: 10, GroupName: "group", UnresolvedMembers: []*db.GroupMember{ - {HostName: "host1", Port: 10, Role: db.PRIMARY, State: db.UNREACHABLE}, - {HostName: "host2", Port: 10, Role: db.SECONDARY, State: db.ONLINE}, - {HostName: "host3", Port: 10, Role: db.SECONDARY, State: db.ONLINE}, - }}, - }, nil, "group backoff error"}, - {"test bootstrap ongoing", []*db.GroupView{ - {MySQLHost: "host1", MySQLPort: 10, GroupName: "group", UnresolvedMembers: []*db.GroupMember{ - {HostName: "", Port: 0, Role: db.SECONDARY, State: db.RECOVERING, ReadOnly: true}, - }}, - {MySQLHost: "host2", MySQLPort: 
10, GroupName: "group", UnresolvedMembers: []*db.GroupMember{}}, - {MySQLHost: "host3", MySQLPort: 10, GroupName: "group", UnresolvedMembers: []*db.GroupMember{}}, - }, nil, "group ongoing bootstrap"}, - } - for _, testCase := range testCases { - t.Run(testCase.testName, func(t *testing.T) { - group := SQLGroup{views: testCase.views, statsTags: []string{"ks", "0"}, logger: log.NewVTGRLogger("ks", "0")} - err := group.Resolve() - if testCase.errorMsg != "" { - assert.EqualError(t, err, testCase.errorMsg) - } else { - assert.NoError(t, err) - } - if testCase.expected != nil { - rv := group.resolvedView - expected := testCase.expected - assert.Equal(t, expected.view, rv.view) - assert.Equal(t, expected.groupName, rv.groupName) - } - }) - } -} diff --git a/go/vt/vtgr/controller/mock_refresh.go b/go/vt/vtgr/controller/mock_refresh.go deleted file mode 100644 index 30ed5a187e7..00000000000 --- a/go/vt/vtgr/controller/mock_refresh.go +++ /dev/null @@ -1,148 +0,0 @@ -// Code generated by MockGen. DO NOT EDIT. -// Source: go/vt/vtgr/controller/refresh.go -package controller - -import ( - reflect "reflect" - - gomock "github.com/golang/mock/gomock" - context "golang.org/x/net/context" - - topodata "vitess.io/vitess/go/vt/proto/topodata" - topo "vitess.io/vitess/go/vt/topo" -) - -// MockGRTopo is a mock of GRTopo interface. -type MockGRTopo struct { - ctrl *gomock.Controller - recorder *MockGRTopoMockRecorder -} - -// MockGRTopoMockRecorder is the mock recorder for MockGRTopo. -type MockGRTopoMockRecorder struct { - mock *MockGRTopo -} - -// NewMockGRTopo creates a new mock instance. -func NewMockGRTopo(ctrl *gomock.Controller) *MockGRTopo { - mock := &MockGRTopo{ctrl: ctrl} - mock.recorder = &MockGRTopoMockRecorder{mock} - return mock -} - -// EXPECT returns an object that allows the caller to indicate expected use. -func (m *MockGRTopo) EXPECT() *MockGRTopoMockRecorder { - return m.recorder -} - -// GetShard mocks base method. 
-func (m *MockGRTopo) GetShard(ctx context.Context, keyspace, shard string) (*topo.ShardInfo, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "GetShard", ctx, keyspace, shard) - ret0, _ := ret[0].(*topo.ShardInfo) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// GetShard indicates an expected call of GetShard. -func (mr *MockGRTopoMockRecorder) GetShard(ctx, keyspace, shard any) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetShard", reflect.TypeOf((*MockGRTopo)(nil).GetShard), ctx, keyspace, shard) -} - -// GetShardNames mocks base method. -func (m *MockGRTopo) GetShardNames(ctx context.Context, keyspace string) ([]string, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "GetShardNames", ctx, keyspace) - ret0, _ := ret[0].([]string) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// GetShardNames indicates an expected call of GetShardNames. -func (mr *MockGRTopoMockRecorder) GetShardNames(ctx, keyspace any) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetShardNames", reflect.TypeOf((*MockGRTopo)(nil).GetShardNames), ctx, keyspace) -} - -// GetTabletMapForShardByCell mocks base method. -func (m *MockGRTopo) GetTabletMapForShardByCell(ctx context.Context, keyspace, shard string, cells []string) (map[string]*topo.TabletInfo, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "GetTabletMapForShardByCell", ctx, keyspace, shard, cells) - ret0, _ := ret[0].(map[string]*topo.TabletInfo) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// GetTabletMapForShardByCell indicates an expected call of GetTabletMapForShardByCell. 
-func (mr *MockGRTopoMockRecorder) GetTabletMapForShardByCell(ctx, keyspace, shard, cells any) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetTabletMapForShardByCell", reflect.TypeOf((*MockGRTopo)(nil).GetTabletMapForShardByCell), ctx, keyspace, shard, cells) -} - -// LockShard mocks base method. -func (m *MockGRTopo) LockShard(ctx context.Context, keyspace, shard, action string) (context.Context, func(*error), error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "LockShard", ctx, keyspace, shard, action) - ret0, _ := ret[0].(context.Context) - ret1, _ := ret[1].(func(*error)) - ret2, _ := ret[2].(error) - return ret0, ret1, ret2 -} - -// LockShard indicates an expected call of LockShard. -func (mr *MockGRTopoMockRecorder) LockShard(ctx, keyspace, shard, action any) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "LockShard", reflect.TypeOf((*MockGRTopo)(nil).LockShard), ctx, keyspace, shard, action) -} - -// MockGRTmcClient is a mock of GRTmcClient interface. -type MockGRTmcClient struct { - ctrl *gomock.Controller - recorder *MockGRTmcClientMockRecorder -} - -// MockGRTmcClientMockRecorder is the mock recorder for MockGRTmcClient. -type MockGRTmcClientMockRecorder struct { - mock *MockGRTmcClient -} - -// NewMockGRTmcClient creates a new mock instance. -func NewMockGRTmcClient(ctrl *gomock.Controller) *MockGRTmcClient { - mock := &MockGRTmcClient{ctrl: ctrl} - mock.recorder = &MockGRTmcClientMockRecorder{mock} - return mock -} - -// EXPECT returns an object that allows the caller to indicate expected use. -func (m *MockGRTmcClient) EXPECT() *MockGRTmcClientMockRecorder { - return m.recorder -} - -// ChangeType mocks base method. 
-func (m *MockGRTmcClient) ChangeType(ctx context.Context, tablet *topodata.Tablet, dbType topodata.TabletType, semiSync bool) error { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "ChangeType", ctx, tablet, dbType) - ret0, _ := ret[0].(error) - return ret0 -} - -// ChangeType indicates an expected call of ChangeType. -func (mr *MockGRTmcClientMockRecorder) ChangeType(ctx, tablet, dbType any) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ChangeType", reflect.TypeOf((*MockGRTmcClient)(nil).ChangeType), ctx, tablet, dbType) -} - -// Ping mocks base method. -func (m *MockGRTmcClient) Ping(ctx context.Context, tablet *topodata.Tablet) error { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "Ping", ctx, tablet) - ret0, _ := ret[0].(error) - return ret0 -} - -// Ping indicates an expected call of Ping. -func (mr *MockGRTmcClientMockRecorder) Ping(ctx, tablet any) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Ping", reflect.TypeOf((*MockGRTmcClient)(nil).Ping), ctx, tablet) -} diff --git a/go/vt/vtgr/controller/refresh.go b/go/vt/vtgr/controller/refresh.go deleted file mode 100644 index 25e56ad21e6..00000000000 --- a/go/vt/vtgr/controller/refresh.go +++ /dev/null @@ -1,360 +0,0 @@ -/* -Copyright 2021 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package controller - -import ( - "fmt" - "strconv" - "sync" - "sync/atomic" - "time" - - "vitess.io/vitess/go/vt/topo/topoproto" - - "golang.org/x/net/context" - - "vitess.io/vitess/go/stats" - "vitess.io/vitess/go/vt/logutil" - topodatapb "vitess.io/vitess/go/vt/proto/topodata" - "vitess.io/vitess/go/vt/topo" - "vitess.io/vitess/go/vt/vtgr/config" - "vitess.io/vitess/go/vt/vtgr/db" - "vitess.io/vitess/go/vt/vtgr/inst" - "vitess.io/vitess/go/vt/vtgr/log" -) - -var ( - lockShardTimingsMs = stats.NewMultiTimings("lockShard", "time vtgr takes to lock the shard", []string{"operation", "success"}) -) - -// grInstance represents an instance that's running MySQL GR -// it wraps a InstanceKey plus some tablet related information -type grInstance struct { - instanceKey *inst.InstanceKey - tablet *topodatapb.Tablet - primaryTimeStamp time.Time - alias string -} - -// GRTopo is VTGR wrapper for topo server -type GRTopo interface { - GetShardNames(ctx context.Context, keyspace string) ([]string, error) - GetShard(ctx context.Context, keyspace, shard string) (*topo.ShardInfo, error) - GetTabletMapForShardByCell(ctx context.Context, keyspace, shard string, cells []string) (map[string]*topo.TabletInfo, error) - LockShard(ctx context.Context, keyspace, shard, action string) (context.Context, func(*error), error) -} - -// GRTmcClient is VTGR wrapper for tmc client -type GRTmcClient interface { - ChangeType(ctx context.Context, tablet *topodatapb.Tablet, dbType topodatapb.TabletType, semiSync bool) error - Ping(ctx context.Context, tablet *topodatapb.Tablet) error -} - -// GRShard stores the information about a Vitess shard that's running MySQL GR -type GRShard struct { - KeyspaceShard *topo.KeyspaceShard - cells []string - instances []*grInstance - primaryAlias string - shardStatusCollector *shardStatusCollector - sqlGroup *SQLGroup - ts GRTopo - tmc GRTmcClient - dbAgent db.Agent - - // Every GRShard tracks a unlock function after it grab a topo lock for the shard - // VTGR 
needs to release the topo lock before gracefully shutdown - unlock func(*error) - // mutex to protect unlock function access - unlockMu sync.Mutex - - // configuration - minNumReplicas int - localDbPort int - disableReadOnlyProtection bool - - transientErrorWaitTime time.Duration - bootstrapWaitTime time.Duration - - lastDiagnoseResult DiagnoseType - lastDiagnoseSince time.Time - - isActive atomic.Bool - - logger *log.Logger - - // lock prevents multiple go routine fights with each other - sync.Mutex -} - -// shardStatusCollector is used for collecting shard status -type shardStatusCollector struct { - status *ShardStatus - sync.Mutex -} - -// ShardStatus is used for debugging purpose to get current status of a shard -type ShardStatus struct { - Keyspace string - Shard string - Instances []string - Unreachables []string - Problematics []string - Primary string - DiagnoseResult DiagnoseType -} - -func newShardStatusCollector(keyspace, shard string) *shardStatusCollector { - return &shardStatusCollector{ - status: &ShardStatus{Keyspace: keyspace, Shard: shard}, - } -} - -// NewGRShard creates a new GRShard -func NewGRShard( - keyspace, shard string, - cells []string, - tmc GRTmcClient, - ts GRTopo, - dbAgent db.Agent, - config *config.VTGRConfig, - localDbPort int, - isActive bool) *GRShard { - grShard := &GRShard{ - KeyspaceShard: &topo.KeyspaceShard{Keyspace: keyspace, Shard: shard}, - cells: cells, - shardStatusCollector: newShardStatusCollector(keyspace, shard), - tmc: tmc, - ts: ts, - dbAgent: dbAgent, - unlock: nil, - sqlGroup: NewSQLGroup(config.BootstrapGroupSize, true, keyspace, shard), - minNumReplicas: config.MinNumReplica, - disableReadOnlyProtection: config.DisableReadOnlyProtection, - localDbPort: localDbPort, - logger: log.NewVTGRLogger(keyspace, shard), - transientErrorWaitTime: time.Duration(config.BackoffErrorWaitTimeSeconds) * time.Second, - bootstrapWaitTime: time.Duration(config.BootstrapWaitTimeSeconds) * time.Second, - } - 
grShard.isActive.Store(isActive) - return grShard -} - -// refreshTabletsInShardLocked is called by repair to get a fresh view of the shard -// The caller is responsible to make sure the lock on GRShard -func (shard *GRShard) refreshTabletsInShardLocked(ctx context.Context) { - instances, err := shard.refreshTabletsInShardInternal(ctx) - if err == nil { - shard.instances = instances - } - primary, err := shard.refreshPrimaryShard(ctx) - if err == nil { - shard.primaryAlias = primary - return - } - // If we failed to refreshPrimaryShard, use primary from local tablets - shard.primaryAlias = shard.findPrimaryFromLocalCell() -} - -// UpdateTabletsInShardWithLock updates the shard instances with a lock -func (shard *GRShard) UpdateTabletsInShardWithLock(ctx context.Context) { - instances, err := shard.refreshTabletsInShardInternal(ctx) - if err == nil { - // Take a per shard lock here when we actually refresh the data to avoid - // race conditions bewteen controller and repair tasks - shard.Lock() - shard.instances = instances - shard.Unlock() - } - primary, err := shard.refreshPrimaryShard(ctx) - // We set primary separately from instances so that if global topo is not available - // VTGR can still discover the new tablets from local cell - shard.Lock() - defer shard.Unlock() - if err == nil { - shard.primaryAlias = primary - return - } - shard.primaryAlias = shard.findPrimaryFromLocalCell() -} - -func (shard *GRShard) refreshTabletsInShardInternal(ctx context.Context) ([]*grInstance, error) { - keyspace, shardName := shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard - tablets, err := shard.ts.GetTabletMapForShardByCell(ctx, keyspace, shardName, shard.cells) - if err != nil { - shard.logger.Errorf("Error fetching tablets for keyspace/shardName %v/%v: %v", keyspace, shardName, err) - return nil, err - } - return parseTabletInfos(tablets), nil -} - -func (shard *GRShard) refreshPrimaryShard(ctx context.Context) (string, error) { - keyspace, shardName := 
shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard - si, err := shard.ts.GetShard(ctx, keyspace, shardName) - if err != nil { - shard.logger.Errorf("Error calling GetShard: %v", err) - return "", err - } - return topoproto.TabletAliasString(si.PrimaryAlias), nil -} - -// findPrimaryFromLocalCell iterates through the replicas stored in grShard and returns -// the one that's marked as primary -func (shard *GRShard) findPrimaryFromLocalCell() string { - var latestPrimaryTimestamp time.Time - var primaryInstance *grInstance - for _, instance := range shard.instances { - if instance.tablet.Type == topodatapb.TabletType_PRIMARY { - // It is possible that there are more than one master in topo server - // we should compare timestamp to pick the latest one - if latestPrimaryTimestamp.Before(instance.primaryTimeStamp) { - latestPrimaryTimestamp = instance.primaryTimeStamp - primaryInstance = instance - } - } - } - if primaryInstance != nil { - return primaryInstance.alias - } - return "" -} - -// parseTabletInfos replaces the replica reports for the shard key -// Note: this is not thread-safe -func parseTabletInfos(tablets map[string]*topo.TabletInfo) []*grInstance { - // collect all replicas - var newReplicas []*grInstance - for alias, tabletInfo := range tablets { - tablet := tabletInfo.Tablet - // Only monitor primary, replica and ronly tablet types - switch tablet.Type { - case topodatapb.TabletType_PRIMARY, topodatapb.TabletType_REPLICA, topodatapb.TabletType_RDONLY: - // mysql hostname and port might be empty here if tablet is not running - // we will treat them as unreachable - instanceKey := inst.InstanceKey{ - Hostname: tablet.MysqlHostname, - Port: int(tablet.MysqlPort), - } - grInstance := grInstance{ - instanceKey: &instanceKey, - tablet: tablet, - primaryTimeStamp: logutil.ProtoToTime(tablet.PrimaryTermStartTime), - alias: alias, - } - newReplicas = append(newReplicas, &grInstance) - } - } - return newReplicas -} - -// LockShard locks the keyspace-shard on 
topo server to prevent others from executing conflicting actions. -func (shard *GRShard) LockShard(ctx context.Context, action string) (context.Context, error) { - if shard.KeyspaceShard.Keyspace == "" || shard.KeyspaceShard.Shard == "" { - return nil, fmt.Errorf("try to grab lock with incomplete information: %v/%v", shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard) - } - shard.unlockMu.Lock() - defer shard.unlockMu.Unlock() - if shard.unlock != nil { - return nil, fmt.Errorf("try to grab lock for %s/%s while the shard holds an unlock function", shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard) - } - start := time.Now() - ctx, unlock, err := shard.ts.LockShard(ctx, shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard, fmt.Sprintf("VTGR repairing %s", action)) - lockShardTimingsMs.Record([]string{action, strconv.FormatBool(err == nil)}, start) - if err != nil { - return nil, err - } - shard.unlock = unlock - return ctx, nil -} - -// UnlockShard unlocks the keyspace-shard on topo server -// and set the unlock function to nil in the container -func (shard *GRShard) UnlockShard() { - shard.unlockMu.Lock() - defer shard.unlockMu.Unlock() - if shard.unlock == nil { - shard.logger.Warningf("Shard %s/%s does not hold a lock", shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard) - return - } - var err error - shard.unlock(&err) - shard.unlock = nil -} - -func (shard *GRShard) findTabletByHostAndPort(host string, port int) *grInstance { - for _, instance := range shard.instances { - if instance.instanceKey.Hostname == host && instance.instanceKey.Port == port { - return instance - } - } - return nil -} - -func (shard *GRShard) populateVTGRStatusLocked() { - var instanceList []string - for _, instance := range shard.instances { - instanceList = append(instanceList, instance.alias) - } - shard.shardStatusCollector.status.Instances = instanceList - if primary := shard.findShardPrimaryTablet(); primary != nil { - shard.shardStatusCollector.status.Primary = 
primary.alias - } -} - -// GetCurrentShardStatuses returns the status collector has -func (shard *GRShard) GetCurrentShardStatuses() ShardStatus { - shard.Lock() - collector := shard.shardStatusCollector - // dereference status so that we return a copy of the struct - status := *collector.status - shard.Unlock() - return status -} - -// OverrideRebootstrapGroupSize force override the group expectedBootstrapSize used in safety check for rebootstrap -func (shard *GRShard) OverrideRebootstrapGroupSize(groupSize int) error { - shard.Lock() - defer shard.Unlock() - shard.logger.Infof("Override rebootstrap group size=%v", groupSize) - shard.sqlGroup.rebootstrapSize = groupSize - return nil -} - -// GetUnlock returns the unlock function for the shard for testing -func (shard *GRShard) GetUnlock() func(*error) { - shard.unlockMu.Lock() - defer shard.unlockMu.Unlock() - return shard.unlock -} - -// SetIsActive sets isActive for the shard -func (shard *GRShard) SetIsActive(isActive bool) { - shard.logger.Infof("Setting is active to %v", isActive) - shard.isActive.Store(isActive) -} - -func (collector *shardStatusCollector) isUnreachable(instance *grInstance) bool { - if instance.instanceKey == nil || instance.instanceKey.Hostname == "" { - return true - } - for _, alias := range collector.status.Unreachables { - if instance.alias == alias { - return true - } - } - return false -} diff --git a/go/vt/vtgr/controller/refresh_test.go b/go/vt/vtgr/controller/refresh_test.go deleted file mode 100644 index a1bbef74fc7..00000000000 --- a/go/vt/vtgr/controller/refresh_test.go +++ /dev/null @@ -1,159 +0,0 @@ -/* -Copyright 2021 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package controller - -import ( - "context" - "fmt" - "sort" - "testing" - "time" - - "github.com/golang/mock/gomock" - "github.com/stretchr/testify/assert" - - "vitess.io/vitess/go/vt/logutil" - topodatapb "vitess.io/vitess/go/vt/proto/topodata" - "vitess.io/vitess/go/vt/topo" - "vitess.io/vitess/go/vt/topo/memorytopo" - "vitess.io/vitess/go/vt/vtctl/grpcvtctldserver/testutil" - "vitess.io/vitess/go/vt/vtgr/config" -) - -func TestRefreshTabletsInShard(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - ctx := context.Background() - ts := memorytopo.NewServer("test_cell") - defer ts.Close() - ts.CreateKeyspace(ctx, "ks", &topodatapb.Keyspace{}) - ts.CreateShard(ctx, "ks", "0") - tablet1 := buildTabletInfo(uint32(0), testHost, testPort0, topodatapb.TabletType_PRIMARY, time.Time{}) - tablet2 := buildTabletInfo(uint32(1), testHost, testPort1, topodatapb.TabletType_SPARE, time.Time{}) - tablet3 := buildTabletInfo(uint32(2), testHost, 0, topodatapb.TabletType_REPLICA, time.Time{}) - testutil.AddTablet(ctx, t, ts, tablet1.Tablet, nil) - testutil.AddTablet(ctx, t, ts, tablet2.Tablet, nil) - testutil.AddTablet(ctx, t, ts, tablet3.Tablet, nil) - cfg := &config.VTGRConfig{BootstrapGroupSize: 3, MinNumReplica: 0, BackoffErrorWaitTimeSeconds: 1, BootstrapWaitTimeSeconds: 1} - shard := NewGRShard("ks", "0", nil, nil, ts, nil, cfg, testPort0, true) - assert.Equal(t, "ks", shard.shardStatusCollector.status.Keyspace) - assert.Equal(t, "0", shard.shardStatusCollector.status.Shard) - shard.refreshTabletsInShardLocked(context.Background()) - instances := 
shard.instances - // only have 2 instances here because we filter out the spare tablet - assert.Equal(t, 2, len(instances)) - sort.Slice(instances[:], func(i, j int) bool { - return instances[i].alias < instances[j].alias - }) - assert.Equal(t, testHost, instances[0].tablet.Hostname) - assert.Equal(t, int32(testPort0), instances[0].tablet.MysqlPort) - assert.Equal(t, topodatapb.TabletType_PRIMARY, instances[0].tablet.Type) - // host 3 is missing mysql host but we still put it in the instances list here - assert.Equal(t, testHost, instances[1].instanceKey.Hostname) - assert.Equal(t, int32(0), instances[1].tablet.MysqlPort) - assert.Equal(t, topodatapb.TabletType_REPLICA, instances[1].tablet.Type) -} - -func TestRefreshWithCells(t *testing.T) { - ctx := context.Background() - ts := memorytopo.NewServer("cell1", "cell2", "cell3") - defer ts.Close() - ts.CreateKeyspace(ctx, "ks", &topodatapb.Keyspace{}) - ts.CreateShard(ctx, "ks", "0") - tablet1 := buildTabletInfoWithCell(uint32(0), testHost, "cell1", testPort0, topodatapb.TabletType_REPLICA, time.Time{}) - tablet2 := buildTabletInfoWithCell(uint32(1), testHost, "cell2", testPort1, topodatapb.TabletType_REPLICA, time.Time{}) - tablet3 := buildTabletInfoWithCell(uint32(2), testHost, "cell3", testPort2, topodatapb.TabletType_REPLICA, time.Time{}) - testutil.AddTablet(ctx, t, ts, tablet1.Tablet, nil) - testutil.AddTablet(ctx, t, ts, tablet2.Tablet, nil) - testutil.AddTablet(ctx, t, ts, tablet3.Tablet, nil) - cfg := &config.VTGRConfig{BootstrapGroupSize: 3, MinNumReplica: 0, BackoffErrorWaitTimeSeconds: 1, BootstrapWaitTimeSeconds: 1} - shard := NewGRShard("ks", "0", []string{"cell1", "cell3"}, nil, ts, nil, cfg, testPort0, true) - shard.refreshTabletsInShardLocked(context.Background()) - instances := shard.instances - // only have 2 instances here because we are not watching cell2 - assert.Equal(t, 2, len(instances)) - sort.Slice(instances[:], func(i, j int) bool { - return instances[i].alias < instances[j].alias - }) - 
assert.Equal(t, "cell1-0000000000", instances[0].alias) - assert.Equal(t, "cell3-0000000002", instances[1].alias) -} - -func TestRefreshWithEmptyCells(t *testing.T) { - ctx := context.Background() - ts := memorytopo.NewServer("cell1", "cell2", "cell3") - defer ts.Close() - ts.CreateKeyspace(ctx, "ks", &topodatapb.Keyspace{}) - ts.CreateShard(ctx, "ks", "0") - tablet1 := buildTabletInfoWithCell(uint32(0), testHost, "cell1", testPort0, topodatapb.TabletType_REPLICA, time.Time{}) - tablet2 := buildTabletInfoWithCell(uint32(1), testHost, "cell2", testPort1, topodatapb.TabletType_REPLICA, time.Time{}) - tablet3 := buildTabletInfoWithCell(uint32(2), testHost, "cell3", testPort2, topodatapb.TabletType_REPLICA, time.Time{}) - testutil.AddTablet(ctx, t, ts, tablet1.Tablet, nil) - testutil.AddTablet(ctx, t, ts, tablet2.Tablet, nil) - testutil.AddTablet(ctx, t, ts, tablet3.Tablet, nil) - cfg := &config.VTGRConfig{BootstrapGroupSize: 3, MinNumReplica: 0, BackoffErrorWaitTimeSeconds: 1, BootstrapWaitTimeSeconds: 1} - shard := NewGRShard("ks", "0", nil, nil, ts, nil, cfg, testPort0, true) - shard.refreshTabletsInShardLocked(context.Background()) - instances := shard.instances - // nil cell will return everything - assert.Equal(t, 3, len(instances)) - sort.Slice(instances[:], func(i, j int) bool { - return instances[i].alias < instances[j].alias - }) - assert.Equal(t, "cell1-0000000000", instances[0].alias) - assert.Equal(t, "cell2-0000000001", instances[1].alias) - assert.Equal(t, "cell3-0000000002", instances[2].alias) -} - -func TestLockRelease(t *testing.T) { - ctx := context.Background() - ts := memorytopo.NewServer("cell1", "cell2", "cell3") - defer ts.Close() - ts.CreateKeyspace(ctx, "ks", &topodatapb.Keyspace{}) - ts.CreateShard(ctx, "ks", "0") - cfg := &config.VTGRConfig{BootstrapGroupSize: 3, MinNumReplica: 0, BackoffErrorWaitTimeSeconds: 1, BootstrapWaitTimeSeconds: 1} - shard := NewGRShard("ks", "0", nil, nil, ts, nil, cfg, testPort0, true) - ctx, err := 
shard.LockShard(ctx, "") - assert.NoError(t, err) - // make sure we get the lock - err = shard.checkShardLocked(ctx) - assert.NoError(t, err) - assert.NotNil(t, shard.unlock) - shard.UnlockShard() - assert.Nil(t, shard.unlock) - err = shard.checkShardLocked(ctx) - assert.EqualError(t, err, "lost topology lock; aborting: shard ks/0 is not locked (no lockInfo in map)") -} - -func buildTabletInfo(id uint32, host string, mysqlPort int, ttype topodatapb.TabletType, primaryTermTime time.Time) *topo.TabletInfo { - return buildTabletInfoWithCell(id, host, "test_cell", mysqlPort, ttype, primaryTermTime) -} - -func buildTabletInfoWithCell(id uint32, host, cell string, mysqlPort int, ttype topodatapb.TabletType, primaryTermTime time.Time) *topo.TabletInfo { - alias := &topodatapb.TabletAlias{Cell: cell, Uid: id} - return &topo.TabletInfo{Tablet: &topodatapb.Tablet{ - Alias: alias, - Hostname: host, - MysqlHostname: host, - MysqlPort: int32(mysqlPort), - Keyspace: "ks", - Shard: "0", - Type: ttype, - PrimaryTermStartTime: logutil.TimeToProto(primaryTermTime), - Tags: map[string]string{"hostname": fmt.Sprintf("host_%d", id)}, - }} -} diff --git a/go/vt/vtgr/controller/repair.go b/go/vt/vtgr/controller/repair.go deleted file mode 100644 index a7fa64d7c97..00000000000 --- a/go/vt/vtgr/controller/repair.go +++ /dev/null @@ -1,767 +0,0 @@ -/* -Copyright 2021 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package controller - -import ( - "context" - "errors" - "fmt" - "sort" - "strconv" - "sync" - "time" - - "github.com/spf13/pflag" - - "vitess.io/vitess/go/mysql" - "vitess.io/vitess/go/stats" - "vitess.io/vitess/go/vt/concurrency" - "vitess.io/vitess/go/vt/servenv" - "vitess.io/vitess/go/vt/topo" - "vitess.io/vitess/go/vt/vterrors" - "vitess.io/vitess/go/vt/vtgr/db" - - topodatapb "vitess.io/vitess/go/vt/proto/topodata" -) - -var ( - repairTimingsMs = stats.NewMultiTimings("repairTimingsMs", "time vtgr takes to repair", []string{"status", "success"}) - unexpectedLockLost = stats.NewCountersWithMultiLabels("unexpectedLockLost", "unexpected lost of the lock", []string{"Keyspace", "Shard"}) - - abortRebootstrap bool -) - -func init() { - servenv.OnParseFor("vtgr", func(fs *pflag.FlagSet) { - fs.BoolVar(&abortRebootstrap, "abort_rebootstrap", false, "Don't allow vtgr to rebootstrap an existing group.") - }) -} - -// RepairResultCode is the code for repair -type RepairResultCode string - -const ( - // Success means successfully repaired - Success RepairResultCode = "Success" - // Fail means failed to repaire - Fail RepairResultCode = "Fail" - // Noop means do nothing - Noop RepairResultCode = "Noop" -) - -// Repair tries to fix shard based on the diagnose type -func (shard *GRShard) Repair(ctx context.Context, status DiagnoseType) (RepairResultCode, error) { - shard.Lock() - defer shard.Unlock() - var err error - code := Noop - switch status { - case DiagnoseTypeShardHasNoGroup: - code, err = shard.repairShardHasNoGroup(ctx) - case DiagnoseTypeShardHasInactiveGroup: - code, err = shard.repairShardHasInactiveGroup(ctx) - case DiagnoseTypeWrongPrimaryTablet: - code, err = shard.repairWrongPrimaryTablet(ctx) - case DiagnoseTypeUnconnectedReplica: - code, err = shard.repairUnconnectedReplica(ctx) - case DiagnoseTypeUnreachablePrimary: - code, err = shard.repairUnreachablePrimary(ctx) - case DiagnoseTypeInsufficientGroupSize: - code, err = 
shard.repairInsufficientGroupSize(ctx) - case DiagnoseTypeReadOnlyShard: - code, err = shard.repairReadOnlyShard(ctx) - case DiagnoseTypeBootstrapBackoff, DiagnoseTypeBackoffError: - code, err = shard.repairBackoffError(ctx, status) - case DiagnoseTypeError: - shard.logger.Errorf("%v is %v", formatKeyspaceShard(shard.KeyspaceShard), status) - case DiagnoseTypeHealthy: - start := time.Now() - repairTimingsMs.Record([]string{string(status), "true"}, start) - } - if status != DiagnoseTypeHealthy { - shard.logger.Infof("VTGR repaired %v status=%v | code=%v", formatKeyspaceShard(shard.KeyspaceShard), status, code) - } - return code, vterrors.Wrap(err, "vtgr repair") -} - -func (shard *GRShard) repairShardHasNoGroup(ctx context.Context) (RepairResultCode, error) { - ctx, err := shard.LockShard(ctx, "repairShardHasNoGroup") - if err != nil { - shard.logger.Warningf("repairShardHasNoPrimaryTablet fails to grab lock for the shard %v: %v", shard.KeyspaceShard, err) - return Noop, err - } - defer shard.UnlockShard() - shard.refreshTabletsInShardLocked(ctx) - // Diagnose() will call shardAgreedGroup as the first thing - // which will update mysqlGroup stored in the shard - status, err := shard.diagnoseLocked(ctx) - if err != nil { - shard.logger.Errorf("Failed to diagnose: %v", err) - return Fail, err - } - if status != DiagnoseTypeShardHasNoGroup { - shard.logger.Infof("Shard %v is no longer in DiagnoseTypeShardHasNoGroup: %v", formatKeyspaceShard(shard.KeyspaceShard), status) - return Noop, nil - } - start := time.Now() - err = shard.repairShardHasNoGroupAction(ctx) - repairTimingsMs.Record([]string{DiagnoseTypeShardHasNoGroup, strconv.FormatBool(err == nil)}, start) - if err != nil { - return Fail, err - } - return Success, nil -} - -func (shard *GRShard) repairShardHasNoGroupAction(ctx context.Context) error { - // If group is not empty AND there is at least one active group member - // we don't need to bootstrap. 
Instead we should try to join the group - mysqlGroup := shard.shardAgreedGroupName() - isAllOffline := shard.isAllOfflineOrError() - if mysqlGroup != "" { - shard.logger.Infof("Shard %v already have a group %v", formatKeyspaceShard(shard.KeyspaceShard), mysqlGroup) - return nil - } - // This should not really happen in reality - if mysqlGroup == "" && !isAllOffline { - return fmt.Errorf("shard %v has empty group name but some node is not OFFLINE", formatKeyspaceShard(shard.KeyspaceShard)) - } - - // Now we know group is null and there is no active node - // we should bootstrap the group - replicas := shard.instances - // Sanity check to make sure there is at least one instance - if len(replicas) == 0 { - shard.logger.Warningf("Cannot find any instance for the shard %v", formatKeyspaceShard(shard.KeyspaceShard)) - return nil - } - if !shard.sqlGroup.IsSafeToBootstrap() { - return errors.New("unsafe to bootstrap group") - } - var candidate *grInstance - sort.SliceStable(replicas, func(i, j int) bool { - return replicas[i].alias < replicas[j].alias - }) - for _, replica := range replicas { - if !shard.shardStatusCollector.isUnreachable(replica) { - candidate = replica - break - } - } - if candidate == nil { - return errors.New("fail to find any candidate to bootstrap") - } - // Bootstrap the group - shard.logger.Infof("Bootstrapping the group for %v on host=%v", formatKeyspaceShard(shard.KeyspaceShard), candidate.instanceKey.Hostname) - // Make sure we still hold the topo server lock before moving on - if err := shard.checkShardLocked(ctx); err != nil { - return err - } - if err := shard.dbAgent.BootstrapGroupLocked(candidate.instanceKey); err != nil { - // if bootstrap failed, the next one that gets the lock will try to do it again - shard.logger.Errorf("Failed to bootstrap mysql group on %v: %v", candidate.instanceKey.Hostname, err) - return err - } - shard.logger.Infof("Bootstrapped the group for %v", formatKeyspaceShard(shard.KeyspaceShard)) - return nil -} - 
-func (shard *GRShard) repairShardHasInactiveGroup(ctx context.Context) (RepairResultCode, error) { - ctx, err := shard.LockShard(ctx, "repairShardHasInactiveGroup") - if err != nil { - shard.logger.Warningf("repairShardHasInactiveGroup fails to grab lock for the shard %v: %v", shard.KeyspaceShard, err) - return Noop, err - } - defer shard.UnlockShard() - shard.refreshTabletsInShardLocked(ctx) - // Diagnose() will call shardAgreedGroup as the first thing - // which will update mysqlGroup stored in the shard - status, err := shard.diagnoseLocked(ctx) - if err != nil { - shard.logger.Errorf("Failed to diagnose: %v", err) - return Fail, err - } - if status != DiagnoseTypeShardHasInactiveGroup { - shard.logger.Infof("Shard %v is no longer in DiagnoseTypeShardHasInactiveGroup: %v", formatKeyspaceShard(shard.KeyspaceShard), status) - return Noop, nil - } - // Now we know the shard has an agreed group but no member in it - // We should find one with the largest GTID set as the - // new mysql primary to bootstrap the group - start := time.Now() - err = shard.stopAndRebootstrap(ctx) - repairTimingsMs.Record([]string{DiagnoseTypeShardHasInactiveGroup, strconv.FormatBool(err == nil)}, start) - if err != nil { - return Fail, err - } - return Success, nil -} - -func (shard *GRShard) repairBackoffError(ctx context.Context, diagnose DiagnoseType) (RepairResultCode, error) { - ctx, err := shard.LockShard(ctx, "repairBackoffError") - if err != nil { - shard.logger.Warningf("repairBackoffError fails to grab lock for the shard %v: %v", shard.KeyspaceShard, err) - return Noop, err - } - defer shard.UnlockShard() - shard.refreshTabletsInShardLocked(ctx) - status, err := shard.diagnoseLocked(ctx) - if err != nil { - shard.logger.Errorf("Failed to diagnose: %v", err) - return Fail, err - } - if status != diagnose { - shard.logger.Infof("Shard %v is no longer in %v: %v", formatKeyspaceShard(shard.KeyspaceShard), diagnose, status) - return Noop, nil - } - if shard.lastDiagnoseResult != 
diagnose { - shard.logger.Infof("diagnose shard as %v but last diagnose result was %v", diagnose, shard.lastDiagnoseResult) - return Noop, nil - } - now := time.Now() - var waitTime time.Duration - switch diagnose { - case DiagnoseTypeBackoffError: - waitTime = shard.transientErrorWaitTime - case DiagnoseTypeBootstrapBackoff: - waitTime = shard.bootstrapWaitTime - default: - return Fail, fmt.Errorf("unsupported diagnose for repairBackoffError: %v", diagnose) - } - if now.Sub(shard.lastDiagnoseSince) < waitTime { - shard.logger.Infof("Detected %v at %v. In wait time for network partition", diagnose, shard.lastDiagnoseSince) - return Noop, nil - } - shard.logger.Infof("Detected %v at %v. Start repairing after %v", diagnose, shard.lastDiagnoseSince, shard.transientErrorWaitTime) - err = shard.stopAndRebootstrap(ctx) - repairTimingsMs.Record([]string{DiagnoseTypeBackoffError, strconv.FormatBool(err == nil)}, now) - if err != nil { - return Fail, err - } - return Success, nil -} - -func (shard *GRShard) stopAndRebootstrap(ctx context.Context) error { - // Make sure we still hold the topo server lock before moving on - if err := shard.checkShardLocked(ctx); err != nil { - return err - } - // Before bootstrap the group, we need to stop group first - // abort aggressively here as soon as we encounter an error - // StopGroupLocked will check if instance is NOT in "ONLINE"/"RECOVERING" state (i.e., UNREACHABLE, ERROR or OFFLINE) - errorRecorder := shard.forAllInstances(func(instance *grInstance, wg *sync.WaitGroup, er concurrency.ErrorRecorder) { - defer wg.Done() - status := shard.sqlGroup.GetStatus(instance.instanceKey) - if status != nil && status.State == db.OFFLINE { - shard.logger.Infof("stop group replication on %v skipped because it is already OFFLINE", instance.alias) - return - } - shard.logger.Infof("stop group replication on %v", instance.alias) - err := shard.dbAgent.StopGroupLocked(instance.instanceKey) - if err != nil { - if !unreachableError(err) { - 
er.RecordError(err) - } - shard.logger.Warningf("Error during stop group replication on %v: %v", instance.instanceKey.Hostname, err) - } - }) - // We don't check allowPartialUnhealthyNodes here because we don't record unreachableError here - // hence if errorRecorder has error, it indicates the mysqld is still reachable but there is nothing - // else went wrong. - if errorRecorder.HasErrors() { - shard.logger.Errorf("Failed to stop group replication %v", errorRecorder.Error()) - return errorRecorder.Error() - } - shard.logger.Infof("Stop the group for %v", formatKeyspaceShard(shard.KeyspaceShard)) - shard.logger.Info("Start find candidate to rebootstrap") - candidate, err := shard.findRebootstrapCandidate(ctx) - if err != nil { - shard.logger.Errorf("Failed to find rebootstrap candidate: %v", err) - return err - } - shard.refreshSQLGroup() - if !shard.sqlGroup.IsSafeToRebootstrap() { - return errors.New("unsafe to bootstrap group") - } - if abortRebootstrap { - shard.logger.Warningf("Abort stopAndRebootstrap because rebootstrap hook override") - return errForceAbortBootstrap - } - shard.logger.Infof("Rebootstrap %v on %v", formatKeyspaceShard(shard.KeyspaceShard), candidate.instanceKey.Hostname) - // Make sure we still hold the topo server lock before moving on - if err := shard.checkShardLocked(ctx); err != nil { - return err - } - uuid := shard.sqlGroup.GetGroupName() - if uuid == "" { - return errors.New("trying to rebootstrap without uuid") - } - return shard.dbAgent.RebootstrapGroupLocked(candidate.instanceKey, uuid) -} - -// allowPartialUnhealthyNodes returns true if rebootstrapSize is set to non-zero -// and the error we get is less than (total_num_tablet - rebootstrapSize) -func (shard *GRShard) allowPartialUnhealthyNodes(errorRecorder *concurrency.AllErrorRecorder) bool { - if shard.sqlGroup.rebootstrapSize != 0 && len(shard.instances)-shard.sqlGroup.rebootstrapSize >= len(errorRecorder.GetErrors()) { - shard.logger.Warningf("Allow unhealthy nodes during 
the reboot group_size=%v, rebootstrap_config=%v, error=%v", shard.sqlGroup.expectedBootstrapSize, shard.sqlGroup.rebootstrapSize, len(errorRecorder.GetErrors())) - return true - } - return false -} - -func (shard *GRShard) getGTIDSetFromAll(skipPrimary bool) (*groupGTIDRecorder, *concurrency.AllErrorRecorder, error) { - if len(shard.instances) == 0 { - return nil, nil, fmt.Errorf("%v has 0 instance", formatKeyspaceShard(shard.KeyspaceShard)) - } - // Before we do failover, we first verify if there is no one agreed group name. - // If not, VTGR is not smart enough to figure out how to failover - // Note: the caller should make sure the mysqlGroup is refreshed after we grab a shard level lock - mysqlGroup := shard.shardAgreedGroupName() - if mysqlGroup == "" { - return nil, nil, fmt.Errorf("unable to find an agreed group name in %v", formatKeyspaceShard(shard.KeyspaceShard)) - } - primary := shard.findShardPrimaryTablet() - var mysqlPrimaryHost string - var mysqlPrimaryPort int - // skipPrimary is true when we manual failover or if there is a unreachalbe primary tablet - // in both case, there should be a reconciled primary tablet - if skipPrimary && primary != nil { - status := shard.sqlGroup.GetStatus(primary.instanceKey) - mysqlPrimaryHost, mysqlPrimaryPort = status.HostName, status.Port - shard.logger.Infof("Found primary instance from MySQL on %v", mysqlPrimaryHost) - } - gtidRecorder := &groupGTIDRecorder{} - // Iterate through all the instances in the shard and find the one with largest GTID set with best effort - // We wrap it with forAllInstances so that the failover can continue if there is a host - // that is unreachable - errorRecorder := shard.forAllInstances(func(instance *grInstance, wg *sync.WaitGroup, er concurrency.ErrorRecorder) { - defer wg.Done() - if skipPrimary && instance.instanceKey.Hostname == mysqlPrimaryHost && instance.instanceKey.Port == mysqlPrimaryPort { - shard.logger.Infof("Skip %v to failover to a non-primary node", 
mysqlPrimaryHost) - return - } - gtids, err := shard.dbAgent.FetchApplierGTIDSet(instance.instanceKey) - if err != nil { - er.RecordError(err) - shard.logger.Errorf("%v get error while fetch applier GTIDs: %v", instance.alias, err) - shard.shardStatusCollector.recordProblematics(instance) - if unreachableError(err) { - shard.shardStatusCollector.recordUnreachables(instance) - } - return - } - if gtids == nil { - shard.logger.Warningf("[failover candidate] skip %s with empty gtid", instance.alias) - return - } - gtidRecorder.recordGroupGTIDs(gtids, instance) - }) - return gtidRecorder, errorRecorder, nil -} - -func (shard *GRShard) findRebootstrapCandidate(ctx context.Context) (*grInstance, error) { - gtidRecorder, errorRecorder, err := shard.getGTIDSetFromAll(false) - if err != nil { - shard.logger.Errorf("Failed to get gtid from all: %v", err) - return nil, err - } - err = errorRecorder.Error() - // We cannot tolerate any error from mysql during a rebootstrap. - if err != nil && !shard.allowPartialUnhealthyNodes(errorRecorder) { - shard.logger.Errorf("Failed to fetch all GTID with forAllInstances for rebootstrap: %v", err) - return nil, err - } - candidate, err := shard.findFailoverCandidateFromRecorder(ctx, gtidRecorder, nil) - if err != nil { - shard.logger.Errorf("Failed to find rebootstrap candidate by GTID after forAllInstances: %v", err) - return nil, err - } - if candidate == nil { - return nil, fmt.Errorf("failed to find rebootstrap candidate for %v", formatKeyspaceShard(shard.KeyspaceShard)) - } - if !shard.instanceReachable(ctx, candidate) { - shard.logger.Errorf("rebootstrap candidate %v (%v) is not reachable via ping", candidate.alias, candidate.instanceKey.Hostname) - return nil, fmt.Errorf("%v is unreachable", candidate.alias) - } - shard.logger.Infof("%v is the rebootstrap candidate", candidate.alias) - return candidate, nil -} - -// Caller of this function should make sure it gets the shard lock and it has the -// latest view of a shard. 
Otherwise, we might skip the wrong node when we locate the candidate -func (shard *GRShard) findFailoverCandidate(ctx context.Context) (*grInstance, error) { - gtidRecorder, errorRecorder, err := shard.getGTIDSetFromAll(true) - if err != nil { - shard.logger.Errorf("Failed to get gtid from all: %v", err) - return nil, err - } - err = errorRecorder.Error() - // During the repair for unreachable primary we still have a mysql group. - // Failover within the group is safe, finding the largest GTID is an optimization. - // therefore we don't check error from errorRecorder just log it - if err != nil { - shard.logger.Warningf("Errors when fetch all GTID with forAllInstances for failover: %v", err) - } - shard.forAllInstances(func(instance *grInstance, wg *sync.WaitGroup, er concurrency.ErrorRecorder) { - defer wg.Done() - if !shard.instanceReachable(ctx, instance) { - shard.logger.Errorf("%v is not reachable via ping", instance.alias) - shard.shardStatusCollector.recordProblematics(instance) - shard.shardStatusCollector.recordUnreachables(instance) - } - }) - var candidate *grInstance - candidate, err = shard.findFailoverCandidateFromRecorder(ctx, gtidRecorder, func(c context.Context, instance *grInstance) bool { - return !shard.shardStatusCollector.isUnreachable(instance) - }) - if err != nil { - shard.logger.Errorf("Failed to find failover candidate by GTID after forAllInstances: %v", err) - return nil, err - } - if candidate == nil { - return nil, fmt.Errorf("failed to find failover candidate for %v", formatKeyspaceShard(shard.KeyspaceShard)) - } - shard.logger.Infof("%v is the failover candidate", candidate.alias) - return candidate, nil -} - -func (shard *GRShard) repairWrongPrimaryTablet(ctx context.Context) (RepairResultCode, error) { - ctx, err := shard.LockShard(ctx, "repairWrongPrimaryTablet") - if err != nil { - shard.logger.Warningf("repairWrongPrimaryTablet fails to grab lock for the shard %v: %v", shard.KeyspaceShard, err) - return Noop, err - } - defer 
shard.UnlockShard() - // We grab shard level lock and check again if there is no primary - // to avoid race conditions - shard.refreshTabletsInShardLocked(ctx) - status, err := shard.diagnoseLocked(ctx) - if err != nil { - shard.logger.Errorf("Failed to diagnose: %v", err) - return Fail, err - } - if status != DiagnoseTypeWrongPrimaryTablet { - shard.logger.Infof("Shard %v is no longer in DiagnoseTypeWrongPrimaryTablet: %v", formatKeyspaceShard(shard.KeyspaceShard), status) - return Noop, nil - } - start := time.Now() - err = shard.fixPrimaryTabletLocked(ctx) - repairTimingsMs.Record([]string{DiagnoseTypeWrongPrimaryTablet, strconv.FormatBool(err == nil)}, start) - if err != nil { - return Fail, err - } - return Success, nil -} - -// fixPrimaryTabletLocked changes Vitess primary tablet based on mysql group -func (shard *GRShard) fixPrimaryTabletLocked(ctx context.Context) error { - host, port, isActive := shard.sqlGroup.GetPrimary() - if !isActive { - return db.ErrGroupInactive - } - // Primary tablet does not run mysql primary, we need to change it accordingly - candidate := shard.findTabletByHostAndPort(host, port) - if candidate == nil { - return errMissingPrimaryTablet - } - // Make sure we still hold the topo server lock before moving on - if err := shard.checkShardLocked(ctx); err != nil { - return err - } - err := shard.tmc.ChangeType(ctx, candidate.tablet, topodatapb.TabletType_PRIMARY, false) - if err != nil { - return fmt.Errorf("failed to change type to primary on %v: %v", candidate.alias, err) - } - shard.logger.Infof("Successfully make %v the primary tablet", candidate.alias) - return nil -} - -// repairUnconnectedReplica usually handle the case when there is a DiagnoseTypeHealthy tablet and -// it is not connected to mysql primary node -func (shard *GRShard) repairUnconnectedReplica(ctx context.Context) (RepairResultCode, error) { - ctx, err := shard.LockShard(ctx, "repairUnconnectedReplica") - if err != nil { - 
shard.logger.Warningf("repairUnconnectedReplica fails to grab lock for the shard %v: %v", formatKeyspaceShard(shard.KeyspaceShard), err) - return Noop, err - } - defer shard.UnlockShard() - shard.refreshTabletsInShardLocked(ctx) - status, err := shard.diagnoseLocked(ctx) - if err != nil { - shard.logger.Errorf("Failed to diagnose: %v", err) - return Fail, err - } - if status != DiagnoseTypeUnconnectedReplica { - shard.logger.Infof("Shard %v is no longer in DiagnoseTypeUnconnectedReplica: %v", formatKeyspaceShard(shard.KeyspaceShard), status) - return Noop, nil - } - start := time.Now() - err = shard.repairUnconnectedReplicaAction(ctx) - repairTimingsMs.Record([]string{DiagnoseTypeUnconnectedReplica, strconv.FormatBool(err == nil)}, start) - if err != nil { - return Fail, err - } - return Success, nil -} - -func (shard *GRShard) repairUnconnectedReplicaAction(ctx context.Context) error { - primaryInstance := shard.findShardPrimaryTablet() - target, err := shard.disconnectedInstance() - if err != nil { - return err - } - if target == nil { - shard.logger.Infof("there is no instance without group for %v", formatKeyspaceShard(shard.KeyspaceShard)) - return nil - } - shard.logger.Infof("Connecting replica %v to %v", target.instanceKey.Hostname, primaryInstance.instanceKey.Hostname) - status := shard.sqlGroup.GetStatus(target.instanceKey) - // Make sure we still hold the topo server lock before moving on - if err := shard.checkShardLocked(ctx); err != nil { - return err - } - if status != nil && status.State != db.OFFLINE { - shard.logger.Infof("stop group replication on %v (%v) before join the group", target.alias, status.State) - err := shard.dbAgent.StopGroupLocked(target.instanceKey) - if err != nil { - shard.logger.Errorf("Failed to stop group replication on %v: %v", target.instanceKey.Hostname, err) - return err - } - // Make sure we still hold the topo server lock before moving on - if err := shard.checkShardLocked(ctx); err != nil { - return err - } - } - return 
shard.dbAgent.JoinGroupLocked(target.instanceKey, primaryInstance.instanceKey) -} - -func (shard *GRShard) repairUnreachablePrimary(ctx context.Context) (RepairResultCode, error) { - ctx, err := shard.LockShard(ctx, "repairUnreachablePrimary") - if err != nil { - shard.logger.Warningf("repairUnreachablePrimary fails to grab lock for the shard %v: %v", formatKeyspaceShard(shard.KeyspaceShard), err) - return Noop, err - } - defer shard.UnlockShard() - shard.refreshTabletsInShardLocked(ctx) - status, err := shard.diagnoseLocked(ctx) - if err != nil { - shard.logger.Errorf("Failed to diagnose: %v", err) - return Fail, err - } - if status != DiagnoseTypeUnreachablePrimary { - shard.logger.Infof("Shard %v is no longer in DiagnoseTypeUnreachablePrimary: %v", formatKeyspaceShard(shard.KeyspaceShard), status) - return Noop, nil - } - // We are here because either: - // 1. we have a primary tablet, but it's not reachable - // 2. we cannot find primary tablet but we do have a mysql group - // we need to failover mysql manually - // - // other case will be handled by different testGroupInput, e.g., - // has reachable primary tablet, but run on different node than mysql -> DiagnoseTypeWrongPrimaryTablet - start := time.Now() - err = shard.failoverLocked(ctx) - repairTimingsMs.Record([]string{DiagnoseTypeUnreachablePrimary, strconv.FormatBool(err == nil)}, start) - if err != nil { - return Fail, err - } - return Success, nil -} - -func (shard *GRShard) repairInsufficientGroupSize(ctx context.Context) (RepairResultCode, error) { - ctx, err := shard.LockShard(ctx, "repairInsufficientGroupSize") - if err != nil { - shard.logger.Warningf("repairInsufficientGroupSize fails to grab lock for the shard %v: %v", formatKeyspaceShard(shard.KeyspaceShard), err) - return Noop, err - } - defer shard.UnlockShard() - shard.refreshTabletsInShardLocked(ctx) - status, err := shard.diagnoseLocked(ctx) - if err != nil { - shard.logger.Errorf("Failed to diagnose: %v", err) - return Fail, err - } - if 
status != DiagnoseTypeInsufficientGroupSize { - shard.logger.Infof("Shard %v is no longer in DiagnoseTypeInsufficientGroupSize: %v", formatKeyspaceShard(shard.KeyspaceShard), status) - return Noop, nil - } - // We check primary tablet is consistent with sql primary before InsufficientGroupSize - // therefore primary we found here is correct and healthy - primary := shard.findShardPrimaryTablet() - // Make sure we still hold the topo server lock before moving on - if err := shard.checkShardLocked(ctx); err != nil { - return Fail, err - } - // mysql group will set super_read_only properly automatically - // https://mysqlhighavailability.com/protecting-your-data-fail-safe-enhancements-to-group-replication/ - // since Vitess only knows one writable node (primary tablet) if we want to make sure there is no write - // after there is insufficient members, we can just set primary mysql node to be read only - err = shard.dbAgent.SetReadOnly(primary.instanceKey, true) - if err != nil { - return Fail, err - } - return Success, nil -} - -func (shard *GRShard) repairReadOnlyShard(ctx context.Context) (RepairResultCode, error) { - ctx, err := shard.LockShard(ctx, "repairReadOnlyShard") - if err != nil { - shard.logger.Warningf("repairReadOnlyShard fails to grab lock for the shard %v: %v", formatKeyspaceShard(shard.KeyspaceShard), err) - return Noop, err - } - defer shard.UnlockShard() - shard.refreshTabletsInShardLocked(ctx) - status, err := shard.diagnoseLocked(ctx) - if err != nil { - shard.logger.Errorf("Failed to diagnose: %v", err) - return Fail, err - } - if status != DiagnoseTypeReadOnlyShard { - shard.logger.Infof("Shard %v is no longer in DiagnoseTypeReadOnlyShard: %v", formatKeyspaceShard(shard.KeyspaceShard), status) - return Noop, nil - } - primary := shard.findShardPrimaryTablet() - // Make sure we still hold the topo server lock before moving on - if err := shard.checkShardLocked(ctx); err != nil { - return Fail, err - } - // undo what we did 
repairInsufficientGroupSize - err = shard.dbAgent.SetReadOnly(primary.instanceKey, false) - if err != nil { - return Fail, err - } - return Success, nil -} - -// Failover takes a shard and find an node with largest GTID as the mysql primary of the group -func (shard *GRShard) Failover(ctx context.Context) error { - ctx, err := shard.LockShard(ctx, "Failover") - if err != nil { - shard.logger.Warningf("Failover fails to grab lock for the shard %v: %v", formatKeyspaceShard(shard.KeyspaceShard), err) - return err - } - defer shard.UnlockShard() - shard.refreshTabletsInShardLocked(ctx) - return shard.failoverLocked(ctx) -} - -func (shard *GRShard) failoverLocked(ctx context.Context) error { - candidate, err := shard.findFailoverCandidate(ctx) - if err != nil { - shard.logger.Errorf("Failed to find failover candidate: %v", err) - return err - } - // Make sure we still hold the topo server lock before moving on - if err := shard.checkShardLocked(ctx); err != nil { - return err - } - err = shard.dbAgent.Failover(candidate.instanceKey) - if err != nil { - shard.logger.Errorf("Failed to failover mysql to %v", candidate.alias) - return err - } - shard.logger.Infof("Successfully failover MySQL to %v for %v", candidate.instanceKey.Hostname, formatKeyspaceShard(shard.KeyspaceShard)) - if !shard.isActive.Load() { - shard.logger.Infof("Skip vttablet failover on an inactive shard %v", formatKeyspaceShard(shard.KeyspaceShard)) - return nil - } - // Make sure we still hold the topo server lock before moving on - if err := shard.checkShardLocked(ctx); err != nil { - return err - } - err = shard.tmc.ChangeType(ctx, candidate.tablet, topodatapb.TabletType_PRIMARY, false) - if err != nil { - shard.logger.Errorf("Failed to failover Vitess %v", candidate.alias) - return err - } - shard.logger.Infof("Successfully failover Vitess to %v for %v", candidate.alias, formatKeyspaceShard(shard.KeyspaceShard)) - return nil -} - -func (shard *GRShard) findFailoverCandidateFromRecorder(ctx 
context.Context, recorder *groupGTIDRecorder, check func(context.Context, *grInstance) bool) (*grInstance, error) { - if len(recorder.gtidWithInstances) == 0 { - return nil, fmt.Errorf("empty failover candidate list for %v", formatKeyspaceShard(shard.KeyspaceShard)) - } - // Sort the gtidWithInstances slice so that we have consistent candidate - // in case they have same gtid set - recorder.sort() - for _, gtidInst := range recorder.gtidWithInstances { - shard.logger.Infof("[failover candidates] %s gtid %s", gtidInst.instance.alias, gtidInst.gtids.String()) - } - var largestGTIDs mysql.GTIDSet - var candidate *grInstance - var divergentCandidates []string - // All the instances in the recorder have a reachable mysqld - // hence anyone is a valid failover candidate - for _, elem := range recorder.gtidWithInstances { - gtids := elem.gtids - inst := elem.instance - if check != nil && !check(ctx, inst) { - shard.logger.Warningf("Skip %v as candidate with gtid %v because it failed the check", inst.alias, gtids.String()) - continue - } - if largestGTIDs == nil { - largestGTIDs = gtids - candidate = inst - continue - } - // If largestGTIDs is subset of current gtids, it means instance has larger GTID than candidate - // we need to swap them out - isSubset, isSuperset := compareGTIDSet(largestGTIDs, gtids) - if isSubset { - largestGTIDs = gtids - candidate = inst - continue - } - // largestGTIDs is neither subset nor super set of gtids - // we log and append to candidates so that we know there is a problem in the group - // after the iteration - if !isSuperset { - shard.logger.Errorf("FetchGroupView divergent GITD set from host=%v GTIDSet=%v", inst.instanceKey.Hostname, gtids) - divergentCandidates = append(divergentCandidates, inst.alias) - } - } - // unless GTID set diverged, the candidates should be empty - if len(divergentCandidates) > 0 { - divergentCandidates = append(divergentCandidates, candidate.alias) - return nil, fmt.Errorf("found more than one failover 
candidates by GTID set for %v: %v", formatKeyspaceShard(shard.KeyspaceShard), divergentCandidates) - } - return candidate, nil -} - -func compareGTIDSet(set1, set2 mysql.GTIDSet) (bool, bool) { - isSubset := set2.Contains(set1) - // If set1 is subset of set2 we find a GTID super set and just need to record it - if isSubset { - return true, false - } - // If set1 is not a subset of set2 we need to see if set1 is actually a super set of set2 - // this is to controller GTID set divergence - isSubset = set1.Contains(set2) - // We know set1 is not subset of set2 if set2 is also not subset of set1, it means - // there is a divergent in GTID sets - return false, isSubset -} - -func (shard *GRShard) checkShardLocked(ctx context.Context) error { - if err := topo.CheckShardLocked(ctx, shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard); err != nil { - labels := []string{shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard} - unexpectedLockLost.Add(labels, 1) - shard.logger.Errorf("lost topology lock; aborting") - return vterrors.Wrap(err, "lost topology lock; aborting") - } - return nil -} diff --git a/go/vt/vtgr/controller/repair_test.go b/go/vt/vtgr/controller/repair_test.go deleted file mode 100644 index ada1def2cff..00000000000 --- a/go/vt/vtgr/controller/repair_test.go +++ /dev/null @@ -1,1355 +0,0 @@ -/* -Copyright 2021 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package controller - -import ( - "context" - "errors" - "fmt" - "math/rand" - "strconv" - "strings" - "sync" - "testing" - "time" - - "vitess.io/vitess/go/mysql" - topodatapb "vitess.io/vitess/go/vt/proto/topodata" - "vitess.io/vitess/go/vt/topo" - "vitess.io/vitess/go/vt/topo/memorytopo" - "vitess.io/vitess/go/vt/vtctl/grpcvtctldserver/testutil" - "vitess.io/vitess/go/vt/vtgr/config" - "vitess.io/vitess/go/vt/vtgr/db" - "vitess.io/vitess/go/vt/vtgr/inst" - - gomock "github.com/golang/mock/gomock" - "github.com/stretchr/testify/assert" -) - -const repairGroupSize = 3 - -func TestRepairShardHasNoGroup(t *testing.T) { - type data struct { - mysqlhost string - mysqlport int - groupName string - readOnly bool - groupInput []db.TestGroupState - ttype topodatapb.TabletType - } - var testcases = []struct { - name string - expectedCalls int - errorMsg string - inputs []data - }{ - {"shard without group", 1, "", []data{ - {testHost, testPort0, "", true, []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, topodatapb.TabletType_REPLICA}, - {testHost, testPort1, "", true, []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, topodatapb.TabletType_REPLICA}, - {testHost, testPort2, "", true, []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, topodatapb.TabletType_REPLICA}, - }}, - {"healthy shard", 0, "", []data{ - {testHost, testPort0, "group", false, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {testHost, testPort1, "group", true, []db.TestGroupState{ - {MemberHost: 
testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {testHost, testPort2, "group", true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {"no active member for group", 0, "", []data{ // this should rebootstrap a group by DiagnoseTypeShardHasInactiveGroup - {testHost, testPort0, "group", true, []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, topodatapb.TabletType_REPLICA}, - {testHost, testPort1, "", false, []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "ERROR", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {testHost, testPort2, "", true, []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {"raise error for unreachable primary", 0, "", []data{ // shoud be ShardHasInactiveGroup - {testHost, testPort0, "group", true, []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, topodatapb.TabletType_REPLICA}, - {testHost, testPort1, "", true, []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "ERROR", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {testHost, testPort2, "", true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: 
strconv.Itoa(testPort0), MemberState: "UNREACHABLE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {"raise error without bootstrap with only one reachable node", 0, "vtgr repair: fail to diagnose ShardHasNoGroup with 1 nodes", []data{ - {"", 0, "group", true, []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, topodatapb.TabletType_REPLICA}, - {testHost, testPort1, "", true, []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, topodatapb.TabletType_REPLICA}, - {"", testPort2, "", true, []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, topodatapb.TabletType_REPLICA}, - }}, - {"raise error when there are not enough members", 0, "vtgr repair: fail to diagnose ShardHasNoGroup with 1 nodes", []data{ - {testHost, testPort0, "", true, []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, topodatapb.TabletType_REPLICA}, - }}, - } - tablets := make(map[string]*topo.TabletInfo) - for _, tt := range testcases { - t.Run(tt.name, func(t *testing.T) { - ctx := context.Background() - ctrl := gomock.NewController(t) - defer ctrl.Finish() - ts := memorytopo.NewServer("test_cell") - defer ts.Close() - ts.CreateKeyspace(ctx, "ks", &topodatapb.Keyspace{}) - ts.CreateShard(ctx, "ks", "0") - tmc := NewMockGRTmcClient(ctrl) - dbAgent := db.NewMockAgent(ctrl) - inputMap := make(map[int]testGroupInput) - dbAgent. - EXPECT(). - // RepairShardHasNoGroup is fixed by calling BootstrapGroupLocked - BootstrapGroupLocked(gomock.Any()). 
- DoAndReturn(func(target *inst.InstanceKey) error { - if target.Hostname == "" || target.Port == 0 { - return errors.New("invalid mysql instance key") - } - input := inputMap[target.Port] - groupState := input.groupState - if len(groupState) == 1 && groupState[0].MemberState == "OFFLINE" { - groupState[0].MemberState = "ONLINE" - groupState[0].MemberRole = "PRIMARY" - groupState[0].MemberHost = target.Hostname - groupState[0].MemberPort = strconv.Itoa(target.Port) - input.groupState = groupState - } else { - for i, s := range groupState { - if s.MemberHost == target.Hostname { - s.MemberState = "ONLINE" - s.MemberRole = "PRIMARY" - groupState[i] = s - } - input.groupState = groupState - } - } - inputMap[target.Port] = input - return nil - }). - Times(tt.expectedCalls) - for i, input := range tt.inputs { - tablet := buildTabletInfo(uint32(testPort0+i), input.mysqlhost, testPort0+i, input.ttype, time.Now()) - testutil.AddTablet(ctx, t, ts, tablet.Tablet, nil) - if tablet.Type == topodatapb.TabletType_PRIMARY { - ts.UpdateShardFields(ctx, "ks", "0", func(si *topo.ShardInfo) error { - si.PrimaryAlias = tablet.Alias - return nil - }) - } - tablets[tablet.AliasString()] = tablet - inputMap[input.mysqlport] = testGroupInput{ - input.groupName, - input.readOnly, - 0, - input.groupInput, - nil, - } - dbAgent. - EXPECT(). - FetchGroupView(gomock.Eq(tablet.AliasString()), gomock.Any()). - DoAndReturn(func(alias string, target *inst.InstanceKey) (*db.GroupView, error) { - if target.Hostname == "" || target.Port == 0 { - return nil, errors.New("invalid mysql instance key") - } - s := inputMap[target.Port] - view := db.BuildGroupView(alias, s.groupName, target.Hostname, target.Port, s.readOnly, s.checkResult, s.groupState) - return view, nil - }). - AnyTimes() - } - tmc. - EXPECT(). - Ping(gomock.Any(), gomock.Any()). - Return(nil). 
- AnyTimes() - cfg := &config.VTGRConfig{BootstrapGroupSize: repairGroupSize, MinNumReplica: 2, BackoffErrorWaitTimeSeconds: 1, BootstrapWaitTimeSeconds: 1} - shard := NewGRShard("ks", "0", nil, tmc, ts, dbAgent, cfg, testPort0, true) - shard.UpdateTabletsInShardWithLock(ctx) - _, err := shard.Repair(ctx, DiagnoseTypeShardHasNoGroup) - if tt.errorMsg == "" { - assert.NoError(t, err) - } else { - assert.EqualError(t, err, tt.errorMsg) - } - }) - } -} - -func TestRepairShardHasInactiveGroup(t *testing.T) { - type data struct { - mysqlhost string - mysqlport int - groupName string - groupInput []db.TestGroupState - pingable bool - gtid mysql.GTIDSet - ttype topodatapb.TabletType - } - sid1 := "3e11fa47-71ca-11e1-9e33-c80aa9429562" - var testcases = []struct { - name string - errorMsg string - expectedCandidatePort int - rebootstrapSize int - inputs []data - }{ - {"shard has inactive group", "", testPort0, 0, []data{ - {testHost, testPort0, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid1, "1-10"), topodatapb.TabletType_REPLICA}, - {testHost, testPort1, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid1, "1-9"), topodatapb.TabletType_PRIMARY}, - {testHost, testPort2, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid1, "1-9"), topodatapb.TabletType_REPLICA}, - }}, - {"shard has inactive group and partial group name", "", testPort0, 0, []data{ - {testHost, testPort0, "", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid1, "1-10"), topodatapb.TabletType_REPLICA}, - {testHost, testPort1, "", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, 
getMysql56GTIDSet(sid1, "1-9"), topodatapb.TabletType_PRIMARY}, - {testHost, testPort2, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid1, "1-9"), topodatapb.TabletType_REPLICA}, - }}, - {"unreachable rebootstrap candidate", "vtgr repair: test_cell-0000017000 is unreachable", 0, 0, []data{ - {testHost, testPort0, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, false, getMysql56GTIDSet(sid1, "1-10"), topodatapb.TabletType_REPLICA}, - {testHost, testPort1, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid1, "1-9"), topodatapb.TabletType_PRIMARY}, - {testHost, testPort2, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid1, "1-9"), topodatapb.TabletType_REPLICA}, - }}, - {"inactive shard with empty gtid", "", testPort0, 0, []data{ - {testHost, testPort0, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid1, "1-10"), topodatapb.TabletType_REPLICA}, - {testHost, testPort1, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet("", ""), topodatapb.TabletType_REPLICA}, - {testHost, testPort2, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet("", ""), topodatapb.TabletType_REPLICA}, - }}, - {"shard has more than one group", "vtgr repair: fail to refreshSQLGroup: group has split brain", 0, 0, []data{ // vtgr raises error - {testHost, testPort0, "group1", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, 
getMysql56GTIDSet(sid1, "1-9"), topodatapb.TabletType_REPLICA}, - {testHost, testPort1, "group2", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid1, "1-10"), topodatapb.TabletType_REPLICA}, - {testHost, testPort2, "group1", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid1, "1-9"), topodatapb.TabletType_REPLICA}, - }}, - {"shard has inconsistent gtids", "vtgr repair: found more than one failover candidates by GTID set for ks/0", 0, 0, []data{ // vtgr raises error - {testHost, testPort0, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet("264a8230-67d2-11eb-acdd-0a8d91f24125", "1-9"), topodatapb.TabletType_REPLICA}, - {testHost, testPort1, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid1, "1-10"), topodatapb.TabletType_REPLICA}, - {testHost, testPort2, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid1, "1-9"), topodatapb.TabletType_REPLICA}, - }}, - {"error on one unreachable mysql", "vtgr repair: fail to diagnose ShardHasInactiveGroup with 2 nodes expecting 3", 0, 0, []data{ - {"", 0, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid1, "1-11"), topodatapb.TabletType_REPLICA}, - {testHost, testPort1, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid1, "1-10"), topodatapb.TabletType_REPLICA}, - {testHost, testPort2, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, 
getMysql56GTIDSet(sid1, "1-9"), topodatapb.TabletType_REPLICA}, - }}, - {"error on one unreachable tablet", "vtgr repair: test_cell-0000017000 is unreachable", 0, 0, []data{ - {testHost, testPort0, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, false, getMysql56GTIDSet(sid1, "1-10"), topodatapb.TabletType_REPLICA}, - {testHost, testPort1, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid1, "1-9"), topodatapb.TabletType_REPLICA}, - {testHost, testPort2, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid1, "1-9"), topodatapb.TabletType_REPLICA}, - }}, - {"shard has active member", "", 0, 0, []data{ // vtgr sees an active node it should not try to bootstrap - {testHost, testPort0, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid1, "1-10"), topodatapb.TabletType_REPLICA}, - {testHost, testPort1, "group", []db.TestGroupState{ - {MemberHost: "host_2", MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - }, true, getMysql56GTIDSet(sid1, "1-9"), topodatapb.TabletType_REPLICA}, - {testHost, testPort2, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid1, "1-9"), topodatapb.TabletType_REPLICA}, - }}, - {"shard has active member but more than one group", "vtgr repair: fail to refreshSQLGroup: group has split brain", 0, 0, []data{ // split brain should overweight active member diagnose - {testHost, testPort0, "group1", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid1, "1-10"), topodatapb.TabletType_REPLICA}, - {testHost, 
testPort1, "group1", []db.TestGroupState{ - {MemberHost: "host_2", MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - }, true, getMysql56GTIDSet(sid1, "1-9"), topodatapb.TabletType_REPLICA}, - {testHost, testPort2, "group2", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid1, "1-9"), topodatapb.TabletType_REPLICA}, - }}, - {"error on two unreachable mysql", "vtgr repair: fail to diagnose ShardHasInactiveGroup with 1 nodes expecting 3", 0, 0, []data{ - {"", 0, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid1, "1-11"), topodatapb.TabletType_REPLICA}, - {"", 0, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid1, "1-10"), topodatapb.TabletType_REPLICA}, - {testHost, testPort2, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid1, "1-9"), topodatapb.TabletType_REPLICA}, - }}, - {"no error on two unreachable mysqls with allowUnhealthyNodeOnReboot", "", testPort2, 1, []data{ - {"", 0, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid1, "1-11"), topodatapb.TabletType_REPLICA}, - {"", 0, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid1, "1-10"), topodatapb.TabletType_REPLICA}, - {testHost, testPort2, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid1, "1-9"), topodatapb.TabletType_REPLICA}, - }}, - {"shard with fewer than configured members can still rebootstrap", "", testPort0, 0, []data{ - {testHost, 
testPort0, "group", []db.TestGroupState{ - {MemberHost: "", MemberPort: "NULL", MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid1, "1-10"), topodatapb.TabletType_REPLICA}, - }}, - } - tablets := make(map[string]*topo.TabletInfo) - for _, tt := range testcases { - t.Run(tt.name, func(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - ctx := context.Background() - ts := memorytopo.NewServer("test_cell") - defer ts.Close() - ts.CreateKeyspace(ctx, "ks", &topodatapb.Keyspace{}) - ts.CreateShard(ctx, "ks", "0") - tmc := NewMockGRTmcClient(ctrl) - dbAgent := db.NewMockAgent(ctrl) - expectedCalls := 0 - if tt.expectedCandidatePort != 0 { - expectedCalls = 1 - } - inputMap := make(map[int]testGroupInput) - pingable := make(map[string]bool) - var lock sync.Mutex - dbAgent. - EXPECT(). - // RepairShardHasNoGroup is fixed by calling RebootstrapGroupLocked - RebootstrapGroupLocked(&inst.InstanceKey{Hostname: testHost, Port: tt.expectedCandidatePort}, gomock.Any()). - DoAndReturn(func(target *inst.InstanceKey, name string) error { - if target.Hostname == "" || target.Port == 0 { - return errors.New("invalid mysql instance key") - } - input := inputMap[target.Port] - groupState := input.groupState - if len(groupState) == 1 && groupState[0].MemberState == "OFFLINE" { - groupState[0].MemberState = "ONLINE" - groupState[0].MemberRole = "PRIMARY" - groupState[0].MemberHost = target.Hostname - groupState[0].MemberPort = strconv.Itoa(target.Port) - input.groupState = groupState - } else { - for i, s := range groupState { - if s.MemberHost == target.Hostname { - s.MemberState = "ONLINE" - s.MemberRole = "PRIMARY" - groupState[i] = s - } - input.groupState = groupState - } - } - inputMap[target.Port] = input - if name != "group" { - return errors.New("unexpected group name") - } - return nil - }). 
- Times(expectedCalls) - for i, input := range tt.inputs { - tablet := buildTabletInfo(uint32(testPort0+i), input.mysqlhost, input.mysqlport, input.ttype, time.Now()) - testutil.AddTablet(ctx, t, ts, tablet.Tablet, nil) - if tablet.Type == topodatapb.TabletType_PRIMARY { - ts.UpdateShardFields(ctx, "ks", "0", func(si *topo.ShardInfo) error { - si.PrimaryAlias = tablet.Alias - return nil - }) - } - tablets[tablet.AliasString()] = tablet - inputMap[input.mysqlport] = testGroupInput{ - input.groupName, - false, - 0, - input.groupInput, - input.gtid, - } - pingable[tablet.Alias.String()] = input.pingable - dbAgent. - EXPECT(). - FetchGroupView(gomock.Eq(tablet.AliasString()), gomock.Any()). - DoAndReturn(func(alias string, target *inst.InstanceKey) (*db.GroupView, error) { - if target.Hostname == "" || target.Port == 0 { - return nil, errors.New("invalid mysql instance key") - } - s := inputMap[target.Port] - view := db.BuildGroupView(alias, s.groupName, target.Hostname, target.Port, s.readOnly, s.checkResult, s.groupState) - return view, nil - }). - AnyTimes() - dbAgent. - EXPECT(). - FetchApplierGTIDSet(gomock.Any()). - DoAndReturn(func(target *inst.InstanceKey) (mysql.GTIDSet, error) { - if target.Hostname == "" || target.Port == 0 { - return nil, errors.New("invalid mysql instance key") - } - return inputMap[target.Port].gtid, nil - }). - AnyTimes() - dbAgent. - EXPECT(). - StopGroupLocked(gomock.Any()). - DoAndReturn(func(target *inst.InstanceKey) error { - if target.Hostname == "" || target.Port == 0 { - return errors.New("invalid mysql instance key") - } - lock.Lock() - view := inputMap[target.Port] - view.groupState = []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(target.Port), MemberState: "OFFLINE", MemberRole: ""}, - } - inputMap[target.Port] = view - lock.Unlock() - return nil - }). - AnyTimes() - tmc. - EXPECT(). - Ping(gomock.Any(), gomock.Any()). 
- DoAndReturn(func(_ context.Context, t *topodatapb.Tablet) error { - if !pingable[t.Alias.String()] { - return errors.New("unreachable") - } - return nil - }). - AnyTimes() - } - cfg := &config.VTGRConfig{BootstrapGroupSize: repairGroupSize, MinNumReplica: 2, BackoffErrorWaitTimeSeconds: 1, BootstrapWaitTimeSeconds: 1} - shard := NewGRShard("ks", "0", nil, tmc, ts, dbAgent, cfg, testPort0, true) - if tt.rebootstrapSize != 0 { - shard.OverrideRebootstrapGroupSize(tt.rebootstrapSize) - } - _, err := shard.Repair(ctx, DiagnoseTypeShardHasInactiveGroup) - if tt.errorMsg == "" { - assert.NoError(t, err) - } else { - assert.Error(t, err, tt.errorMsg) - assert.True(t, strings.Contains(err.Error(), tt.errorMsg), err.Error()) - } - }) - } -} - -func TestRepairWrongPrimaryTablet(t *testing.T) { - type data struct { - mysqlport int - groupName string - groupInput []db.TestGroupState - ttype topodatapb.TabletType - } - - var testcases = []struct { - name string - errorMsg string - expectedCandidatePort int - shardPrimary string - inputs []data - }{ - {"fix no primary tablet in shard", "", testPort0, "", []data{ - {testPort0, "group", []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {testPort1, "group", []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {testPort2, "group", []db.TestGroupState{ - 
{MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {"fix wrong primary tablet", "", testPort0, "test_cell-0000017001", []data{ - {testPort0, "group", []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {testPort1, "group", []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {testPort2, "group", []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {"fix wrong primary tablet based on shard info", "", testPort0, "test_cell-0000017001", []data{ - {testPort0, "group", []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: 
strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {testPort1, "group", []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {testPort2, "group", []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {"fix shard if there is an unreachable secondary", "", testPort0, "test_cell-0000017001", []data{ - {testPort0, "group", []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "UNREACHABLE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {testPort1, "group", []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "UNREACHABLE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {testPort2, 
"group", []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "UNREACHABLE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {"diagnose as ShardHasInactiveGroup if quorum number of not online", "", 0, "test_cell-0000017001", []data{ - {testPort0, "group", []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "UNREACHABLE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "UNREACHABLE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {testPort1, "group", []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "UNREACHABLE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {testPort2, "group", []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "UNREACHABLE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {"tolerate failed nodes", "", testPort0, "test_cell-0000017001", []data{ - {testPort0, "group", []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "UNREACHABLE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ERROR", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {0, "group", []db.TestGroupState{}, topodatapb.TabletType_PRIMARY}, - {0, "group", []db.TestGroupState{}, topodatapb.TabletType_REPLICA}, - }}, - {"raise error if all 
nodes failed", "", 0, "", []data{ // diagnose as DiagnoseTypeShardNetworkPartition - {0, "group", []db.TestGroupState{}, topodatapb.TabletType_REPLICA}, - {0, "group", []db.TestGroupState{}, topodatapb.TabletType_PRIMARY}, - {0, "group", []db.TestGroupState{}, topodatapb.TabletType_REPLICA}, - }}, - } - for _, tt := range testcases { - t.Run(tt.name, func(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - ctx := context.Background() - ts := memorytopo.NewServer("test_cell") - defer ts.Close() - ts.CreateKeyspace(ctx, "ks", &topodatapb.Keyspace{}) - ts.CreateShard(ctx, "ks", "0") - tmc := NewMockGRTmcClient(ctrl) - dbAgent := db.NewMockAgent(ctrl) - tablets := make(map[string]*topo.TabletInfo) - tmc. - EXPECT(). - Ping(gomock.Any(), gomock.Any()). - Return(nil). - AnyTimes() - expectedCalls := 0 - if tt.expectedCandidatePort != 0 { - expectedCalls = 1 - } - var candidate *topo.TabletInfo - inputMap := make(map[string]testGroupInput) - for i, input := range tt.inputs { - tablet := buildTabletInfo(uint32(testPort0+i), testHost, input.mysqlport, input.ttype, time.Now()) - testutil.AddTablet(ctx, t, ts, tablet.Tablet, nil) - if tablet.AliasString() == tt.shardPrimary { - ts.UpdateShardFields(ctx, "ks", "0", func(si *topo.ShardInfo) error { - si.PrimaryAlias = tablet.Alias - return nil - }) - } - tablets[tablet.AliasString()] = tablet - inputMap[tablet.AliasString()] = testGroupInput{ - input.groupName, - false, - 0, - input.groupInput, - nil, - } - if expectedCalls > 0 && input.mysqlport == tt.expectedCandidatePort { - candidate = tablet - } - dbAgent. - EXPECT(). - FetchGroupView(gomock.Eq(tablet.AliasString()), gomock.Eq(&inst.InstanceKey{Hostname: testHost, Port: input.mysqlport})). 
- DoAndReturn(func(alias string, target *inst.InstanceKey) (*db.GroupView, error) { - if target.Hostname == "" || target.Port == 0 { - return nil, errors.New("invalid mysql instance key") - } - s := inputMap[alias] - view := db.BuildGroupView(alias, s.groupName, target.Hostname, target.Port, s.readOnly, s.checkResult, s.groupState) - return view, nil - }). - AnyTimes() - } - if candidate != nil { - tmc. - EXPECT(). - ChangeType(gomock.Any(), gomock.Any(), topodatapb.TabletType_PRIMARY). - Return(nil). - Times(expectedCalls) - } - cfg := &config.VTGRConfig{BootstrapGroupSize: repairGroupSize, MinNumReplica: 2, BackoffErrorWaitTimeSeconds: 1, BootstrapWaitTimeSeconds: 1} - shard := NewGRShard("ks", "0", nil, tmc, ts, dbAgent, cfg, testPort0, true) - _, err := shard.Repair(ctx, DiagnoseTypeWrongPrimaryTablet) - if tt.errorMsg == "" { - assert.NoError(t, err) - } else { - assert.Error(t, err) - assert.True(t, strings.Contains(err.Error(), tt.errorMsg), err.Error()) - } - }) - } -} - -func TestRepairUnconnectedReplica(t *testing.T) { - type data struct { - alias string - port int - groupName string - readOnly bool - groupInput []db.TestGroupState - ttype topodatapb.TabletType - } - var testcases = []struct { - name string - errorMsg string - expectedCandidatePort int - inputs []data - }{ - {"fix unconnected replica tablet", "", testPort2, []data{ - {alias0, testPort0, "group", false, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias1, testPort1, "group", true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - 
{alias2, testPort2, "", true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "OFFLINE", MemberRole: ""}, - }, topodatapb.TabletType_REPLICA}, - }}, - {"do nothing if shard has wrong primary tablet", "", 0, []data{ // this should be diagnosed as DiagnoseTypeWrongPrimaryTablet instead - {alias0, testPort0, "group", true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias1, testPort1, "group", false, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias2, testPort2, "", true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "OFFLINE", MemberRole: ""}, - }, topodatapb.TabletType_REPLICA}, - }}, - {"fix replica in ERROR state", "", testPort2, []data{ - {alias0, testPort0, "group", false, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias1, testPort1, "group", true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias2, testPort2, "group", true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ERROR", 
MemberRole: ""}, - }, topodatapb.TabletType_REPLICA}, - }}, - {"fix replica with two nodes in ERROR state", "", 0, []data{ // InsufficientGroupSize - {alias0, testPort0, "group", false, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias1, testPort1, "group", true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ERROR", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias2, testPort2, "group", true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ERROR", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - } - for _, tt := range testcases { - t.Run(tt.name, func(t *testing.T) { - rand.Seed(1) - ctrl := gomock.NewController(t) - defer ctrl.Finish() - ctx := context.Background() - ts := memorytopo.NewServer("test_cell") - defer ts.Close() - ts.CreateKeyspace(ctx, "ks", &topodatapb.Keyspace{}) - ts.CreateShard(ctx, "ks", "0") - tmc := NewMockGRTmcClient(ctrl) - dbAgent := db.NewMockAgent(ctrl) - tablets := make(map[string]*topo.TabletInfo) - tmc. - EXPECT(). - Ping(gomock.Any(), gomock.Any()). - Return(nil). - AnyTimes() - if tt.expectedCandidatePort != 0 { - dbAgent. - EXPECT(). - StopGroupLocked(gomock.Eq(&inst.InstanceKey{Hostname: testHost, Port: tt.expectedCandidatePort})). - Return(nil). - AnyTimes() - dbAgent. - EXPECT(). - JoinGroupLocked(gomock.Eq(&inst.InstanceKey{Hostname: testHost, Port: tt.expectedCandidatePort}), gomock.Any()). - Return(nil). 
- Times(1) - } - inputMap := make(map[string]testGroupInput) - for i, input := range tt.inputs { - tablet := buildTabletInfo(uint32(i), testHost, input.port, input.ttype, time.Now()) - testutil.AddTablet(ctx, t, ts, tablet.Tablet, nil) - if tablet.Type == topodatapb.TabletType_PRIMARY { - ts.UpdateShardFields(ctx, "ks", "0", func(si *topo.ShardInfo) error { - si.PrimaryAlias = tablet.Alias - return nil - }) - } - tablets[tablet.AliasString()] = tablet - inputMap[input.alias] = testGroupInput{ - input.groupName, - input.readOnly, - 0, - input.groupInput, - nil, - } - dbAgent. - EXPECT(). - FetchGroupView(gomock.Eq(tablet.AliasString()), gomock.Eq(&inst.InstanceKey{Hostname: testHost, Port: input.port})). - DoAndReturn(func(alias string, target *inst.InstanceKey) (*db.GroupView, error) { - if target.Hostname == "" || target.Port == 0 { - return nil, errors.New("invalid mysql instance key") - } - s := inputMap[alias] - view := db.BuildGroupView(alias, s.groupName, target.Hostname, target.Port, s.readOnly, s.checkResult, s.groupState) - return view, nil - }). 
- AnyTimes() - } - cfg := &config.VTGRConfig{BootstrapGroupSize: repairGroupSize, MinNumReplica: 2, BackoffErrorWaitTimeSeconds: 1, BootstrapWaitTimeSeconds: 1} - shard := NewGRShard("ks", "0", nil, tmc, ts, dbAgent, cfg, testPort0, true) - _, err := shard.Repair(ctx, DiagnoseTypeUnconnectedReplica) - if tt.errorMsg == "" { - assert.NoError(t, err) - } else { - assert.Error(t, err) - assert.True(t, strings.Contains(err.Error(), tt.errorMsg), err.Error()) - } - }) - } -} - -func TestRepairUnreachablePrimary(t *testing.T) { - type data struct { - port int - pingalbe bool - gtid mysql.GTIDSet - ttype topodatapb.TabletType - } - sid := "3e11fa47-71ca-11e1-9e33-c80aa9429562" - var testcases = []struct { - name string - errorMsg string - expectedCandidatePort int - inputs []data - }{ - {"primary is unreachable", "", testPort1, []data{ - {testPort0, false, getMysql56GTIDSet(sid, "1-11"), topodatapb.TabletType_PRIMARY}, - {testPort1, true, getMysql56GTIDSet(sid, "1-10"), topodatapb.TabletType_REPLICA}, - {testPort2, true, getMysql56GTIDSet(sid, "1-9"), topodatapb.TabletType_REPLICA}, - }}, - {"failover to reachable node when primary is unreachable", "", testPort2, []data{ - {testPort0, false, getMysql56GTIDSet(sid, "1-11"), topodatapb.TabletType_PRIMARY}, - {testPort1, false, getMysql56GTIDSet(sid, "1-10"), topodatapb.TabletType_REPLICA}, - {testPort2, true, getMysql56GTIDSet(sid, "1-9"), topodatapb.TabletType_REPLICA}, - }}, - {"do nothing if replica is unreachable", "", 0, []data{ - {testPort0, true, getMysql56GTIDSet(sid, "1-10"), topodatapb.TabletType_PRIMARY}, - {testPort1, false, getMysql56GTIDSet(sid, "1-10"), topodatapb.TabletType_REPLICA}, - {testPort2, false, getMysql56GTIDSet(sid, "1-9"), topodatapb.TabletType_REPLICA}, - }}, - {"raise error if gtid divergence", "vtgr repair: found more than one failover candidates by GTID set for ks/0", 0, []data{ - {testPort0, false, getMysql56GTIDSet(sid, "1-10"), topodatapb.TabletType_PRIMARY}, - {testPort1, true, 
getMysql56GTIDSet("264a8230-67d2-11eb-acdd-0a8d91f24125", "1-10"), topodatapb.TabletType_REPLICA}, - {testPort2, true, getMysql56GTIDSet(sid, "1-9"), topodatapb.TabletType_REPLICA}, - }}, - } - for _, tt := range testcases { - t.Run(tt.name, func(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - ctx := context.Background() - ts := memorytopo.NewServer("test_cell") - defer ts.Close() - ts.CreateKeyspace(ctx, "ks", &topodatapb.Keyspace{}) - ts.CreateShard(ctx, "ks", "0") - tmc := NewMockGRTmcClient(ctrl) - dbAgent := db.NewMockAgent(ctrl) - dbAgent. - EXPECT(). - FetchGroupView(gomock.Any(), gomock.Any()). - DoAndReturn(func(alias string, target *inst.InstanceKey) (*db.GroupView, error) { - return db.BuildGroupView(alias, "group", target.Hostname, target.Port, false, 0, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - }), nil - }). - AnyTimes() - expectedCalls := 0 - if tt.expectedCandidatePort != 0 { - expectedCalls = 1 - } - dbAgent. - EXPECT(). - Failover(&inst.InstanceKey{Hostname: testHost, Port: tt.expectedCandidatePort}). - Return(nil). - Times(expectedCalls) - tmc. - EXPECT(). - ChangeType(gomock.Any(), gomock.Any(), topodatapb.TabletType_PRIMARY). - Return(nil). 
- Times(expectedCalls) - status := make(map[int32]struct { - pingalbe bool - gtid mysql.GTIDSet - }) - for i, input := range tt.inputs { - tablet := buildTabletInfo(uint32(i), testHost, input.port, input.ttype, time.Now()) - testutil.AddTablet(ctx, t, ts, tablet.Tablet, nil) - if tablet.Type == topodatapb.TabletType_PRIMARY { - ts.UpdateShardFields(ctx, "ks", "0", func(si *topo.ShardInfo) error { - si.PrimaryAlias = tablet.Alias - return nil - }) - } - status[tablet.MysqlPort] = struct { - pingalbe bool - gtid mysql.GTIDSet - }{ - input.pingalbe, - input.gtid, - } - dbAgent. - EXPECT(). - FetchApplierGTIDSet(gomock.Eq(&inst.InstanceKey{Hostname: testHost, Port: input.port})). - DoAndReturn(func(target *inst.InstanceKey) (mysql.GTIDSet, error) { - if target.Hostname == "" || target.Port == 0 { - return nil, errors.New("invalid mysql instance key") - } - return status[int32(target.Port)].gtid, nil - }). - AnyTimes() - tmc. - EXPECT(). - Ping(gomock.Any(), gomock.Any()). - DoAndReturn(func(_ context.Context, t *topodatapb.Tablet) error { - if !status[t.MysqlPort].pingalbe { - return errors.New("unreachable") - } - return nil - }). 
- AnyTimes() - } - cfg := &config.VTGRConfig{BootstrapGroupSize: repairGroupSize, MinNumReplica: 2, BackoffErrorWaitTimeSeconds: 1, BootstrapWaitTimeSeconds: 1} - shard := NewGRShard("ks", "0", nil, tmc, ts, dbAgent, cfg, testPort0, true) - _, err := shard.Repair(ctx, DiagnoseTypeUnreachablePrimary) - if tt.errorMsg == "" { - assert.NoError(t, err) - } else { - assert.Error(t, err, tt.errorMsg) - assert.True(t, strings.Contains(err.Error(), tt.errorMsg)) - } - }) - } -} - -func TestRepairInsufficientGroupSize(t *testing.T) { - type data struct { - alias string - readOnly bool - groupInput []db.TestGroupState - ttype topodatapb.TabletType - } - var testcases = []struct { - name string - errorMsg string - expectedCandidatePort int - inputs []data - }{ - {"fix insufficient group expectedBootstrapSize", "", testPort0, []data{ - {alias0, false, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias1, true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias2, true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: 
"RECOVERING", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - } - for _, tt := range testcases { - t.Run(tt.name, func(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - ctx := context.Background() - ts := memorytopo.NewServer("test_cell") - defer ts.Close() - ts.CreateKeyspace(ctx, "ks", &topodatapb.Keyspace{}) - ts.CreateShard(ctx, "ks", "0") - tmc := NewMockGRTmcClient(ctrl) - dbAgent := db.NewMockAgent(ctrl) - tablets := make(map[string]*topo.TabletInfo) - tmc. - EXPECT(). - Ping(gomock.Any(), gomock.Any()). - Return(nil). - AnyTimes() - if tt.expectedCandidatePort != 0 { - dbAgent. - EXPECT(). - SetReadOnly(gomock.Eq(&inst.InstanceKey{Hostname: testHost, Port: tt.expectedCandidatePort}), true). - Return(nil). - Times(1) - } - inputMap := make(map[string]testGroupInput) - for i, input := range tt.inputs { - tablet := buildTabletInfo(uint32(i), testHost, testPort0+i, input.ttype, time.Now()) - testutil.AddTablet(ctx, t, ts, tablet.Tablet, nil) - if tablet.Type == topodatapb.TabletType_PRIMARY { - ts.UpdateShardFields(ctx, "ks", "0", func(si *topo.ShardInfo) error { - si.PrimaryAlias = tablet.Alias - return nil - }) - } - tablets[tablet.AliasString()] = tablet - inputMap[input.alias] = testGroupInput{ - "group", - input.readOnly, - 0, - input.groupInput, - nil, - } - dbAgent. - EXPECT(). - FetchGroupView(gomock.Any(), gomock.Any()). - DoAndReturn(func(alias string, target *inst.InstanceKey) (*db.GroupView, error) { - if target.Hostname == "" || target.Port == 0 { - return nil, errors.New("invalid mysql instance key") - } - s := inputMap[alias] - view := db.BuildGroupView(alias, s.groupName, target.Hostname, target.Port, s.readOnly, s.checkResult, s.groupState) - return view, nil - }). 
- AnyTimes() - } - cfg := &config.VTGRConfig{BootstrapGroupSize: repairGroupSize, MinNumReplica: 2, BackoffErrorWaitTimeSeconds: 1, BootstrapWaitTimeSeconds: 1} - shard := NewGRShard("ks", "0", nil, tmc, ts, dbAgent, cfg, testPort0, true) - _, err := shard.Repair(ctx, DiagnoseTypeInsufficientGroupSize) - if tt.errorMsg == "" { - assert.NoError(t, err) - } else { - assert.Error(t, err) - assert.True(t, strings.Contains(err.Error(), tt.errorMsg), err.Error()) - } - }) - } -} - -func TestRepairReadOnlyShard(t *testing.T) { - type data struct { - alias string - port int - readOnly bool - groupInput []db.TestGroupState - ttype topodatapb.TabletType - } - var testcases = []struct { - name string - errorMsg string - expectedCandidatePort int - inputs []data - }{ - {"fix readonly shard", "", testPort0, []data{ - {alias0, testPort0, true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias1, testPort1, true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias2, testPort2, true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: 
"RECOVERING", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - {"do nothing if primary is not read only", "", 0, []data{ - {alias0, testPort0, false, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_PRIMARY}, - {alias1, testPort1, true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - {alias2, testPort2, true, []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "ONLINE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - }, topodatapb.TabletType_REPLICA}, - }}, - } - for _, tt := range testcases { - t.Run(tt.name, func(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - ctx := context.Background() - ts := memorytopo.NewServer("test_cell") - defer ts.Close() - ts.CreateKeyspace(ctx, "ks", &topodatapb.Keyspace{}) - ts.CreateShard(ctx, "ks", "0") - tmc := NewMockGRTmcClient(ctrl) - dbAgent := db.NewMockAgent(ctrl) - tablets := make(map[string]*topo.TabletInfo) - tmc. - EXPECT(). - Ping(gomock.Any(), gomock.Any()). - Return(nil). - AnyTimes() - if tt.expectedCandidatePort != 0 { - dbAgent. - EXPECT(). 
- SetReadOnly(gomock.Eq(&inst.InstanceKey{Hostname: testHost, Port: tt.expectedCandidatePort}), false). - Return(nil). - Times(1) - } - inputMap := make(map[string]testGroupInput) - for i, input := range tt.inputs { - tablet := buildTabletInfo(uint32(i), testHost, input.port, input.ttype, time.Now()) - testutil.AddTablet(ctx, t, ts, tablet.Tablet, nil) - if tablet.Type == topodatapb.TabletType_PRIMARY { - ts.UpdateShardFields(ctx, "ks", "0", func(si *topo.ShardInfo) error { - si.PrimaryAlias = tablet.Alias - return nil - }) - } - tablets[tablet.AliasString()] = tablet - inputMap[input.alias] = testGroupInput{ - "group", - input.readOnly, - 0, - input.groupInput, - nil, - } - dbAgent. - EXPECT(). - FetchGroupView(gomock.Eq(tablet.AliasString()), gomock.Any()). - DoAndReturn(func(alias string, target *inst.InstanceKey) (*db.GroupView, error) { - if target.Hostname == "" || target.Port == 0 { - return nil, errors.New("invalid mysql instance key") - } - s := inputMap[alias] - view := db.BuildGroupView(alias, s.groupName, target.Hostname, target.Port, s.readOnly, s.checkResult, s.groupState) - return view, nil - }). 
- AnyTimes() - } - cfg := &config.VTGRConfig{BootstrapGroupSize: repairGroupSize, MinNumReplica: 2, BackoffErrorWaitTimeSeconds: 1, BootstrapWaitTimeSeconds: 1} - shard := NewGRShard("ks", "0", nil, tmc, ts, dbAgent, cfg, testPort0, true) - _, err := shard.Repair(ctx, DiagnoseTypeReadOnlyShard) - if tt.errorMsg == "" { - assert.NoError(t, err) - } else { - assert.Error(t, err) - assert.True(t, strings.Contains(err.Error(), tt.errorMsg), err.Error()) - } - }) - } -} - -func TestRepairBackoffError(t *testing.T) { - type data struct { - alias string - mysqlhost string - mysqlport int - groupName string - groupInput []db.TestGroupState - pingable bool - gtid mysql.GTIDSet - ttype topodatapb.TabletType - } - sid := "3e11fa47-71ca-11e1-9e33-c80aa9429562" - var testcases = []struct { - name string - errorMsg string - expectedCandidatePort int - diagnose DiagnoseType - inputs []data - }{ - {"shard has network partition", "", testPort0, DiagnoseTypeBackoffError, []data{ - {alias0, testHost, testPort0, "group", []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "UNREACHABLE", MemberRole: "PRIMARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "ONLINE", MemberRole: "SECONDARY"}, - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "UNREACHABLE", MemberRole: "SECONDARY"}, - }, true, getMysql56GTIDSet(sid, "1-10"), topodatapb.TabletType_REPLICA}, - {alias1, testHost, testPort1, "group", []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid, "1-9"), topodatapb.TabletType_REPLICA}, - {alias2, testHost, testPort2, "group", []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid, "1-9"), topodatapb.TabletType_REPLICA}, - }}, - {"shard bootstrap in progress", "", testPort0, 
DiagnoseTypeBootstrapBackoff, []data{ - {alias0, testHost, testPort0, "group", []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort0), MemberState: "RECOVERING", MemberRole: "SECONDARY"}, - }, true, getMysql56GTIDSet(sid, "1-10"), topodatapb.TabletType_REPLICA}, - {alias1, testHost, testPort1, "group", []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort1), MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid, "1-9"), topodatapb.TabletType_REPLICA}, - {alias2, testHost, testPort2, "group", []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(testPort2), MemberState: "OFFLINE", MemberRole: ""}, - }, true, getMysql56GTIDSet(sid, "1-9"), topodatapb.TabletType_REPLICA}, - }}, - } - tablets := make(map[string]*topo.TabletInfo) - for _, tt := range testcases { - t.Run(tt.name, func(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - ctx := context.Background() - ts := memorytopo.NewServer("test_cell") - defer ts.Close() - ts.CreateKeyspace(ctx, "ks", &topodatapb.Keyspace{}) - ts.CreateShard(ctx, "ks", "0") - tmc := NewMockGRTmcClient(ctrl) - dbAgent := db.NewMockAgent(ctrl) - expectedCalls := 0 - if tt.expectedCandidatePort != 0 { - expectedCalls = 1 - } - inputMap := make(map[int]testGroupInput) - pingable := make(map[string]bool) - var lock sync.Mutex - dbAgent. - EXPECT(). - RebootstrapGroupLocked(&inst.InstanceKey{Hostname: testHost, Port: tt.expectedCandidatePort}, "group"). 
- DoAndReturn(func(target *inst.InstanceKey, name string) error { - if target.Hostname == "" || target.Port == 0 { - return errors.New("invalid mysql instance key") - } - input := inputMap[target.Port] - groupState := input.groupState - if len(groupState) == 1 && groupState[0].MemberState == "OFFLINE" { - groupState[0].MemberState = "ONLINE" - groupState[0].MemberRole = "PRIMARY" - groupState[0].MemberHost = target.Hostname - groupState[0].MemberPort = strconv.Itoa(target.Port) - input.groupState = groupState - } else { - for i, s := range groupState { - if s.MemberHost == target.Hostname { - s.MemberState = "ONLINE" - s.MemberRole = "PRIMARY" - groupState[i] = s - } - input.groupState = groupState - } - } - inputMap[target.Port] = input - return nil - }). - Times(expectedCalls) - for i, input := range tt.inputs { - tablet := buildTabletInfo(uint32(i), input.mysqlhost, input.mysqlport, input.ttype, time.Now()) - testutil.AddTablet(ctx, t, ts, tablet.Tablet, nil) - if tablet.Type == topodatapb.TabletType_PRIMARY { - ts.UpdateShardFields(ctx, "ks", "0", func(si *topo.ShardInfo) error { - si.PrimaryAlias = tablet.Alias - return nil - }) - } - tablets[tablet.AliasString()] = tablet - inputMap[input.mysqlport] = testGroupInput{ - input.groupName, - false, - 0, - input.groupInput, - input.gtid, - } - pingable[input.alias] = input.pingable - dbAgent. - EXPECT(). - FetchGroupView(gomock.Eq(tablet.AliasString()), gomock.Any()). - DoAndReturn(func(alias string, target *inst.InstanceKey) (*db.GroupView, error) { - if target.Hostname == "" || target.Port == 0 { - return nil, errors.New("invalid mysql instance key") - } - s := inputMap[target.Port] - view := db.BuildGroupView(alias, s.groupName, target.Hostname, target.Port, s.readOnly, s.checkResult, s.groupState) - return view, nil - }). - AnyTimes() - dbAgent. - EXPECT(). - FetchApplierGTIDSet(gomock.Any()). 
- DoAndReturn(func(target *inst.InstanceKey) (mysql.GTIDSet, error) { - if target.Hostname == "" || target.Port == 0 { - return nil, errors.New("invalid mysql instance key") - } - return inputMap[target.Port].gtid, nil - }). - AnyTimes() - dbAgent. - EXPECT(). - StopGroupLocked(gomock.Any()). - DoAndReturn(func(target *inst.InstanceKey) error { - lock.Lock() - view := inputMap[target.Port] - view.groupState = []db.TestGroupState{ - {MemberHost: testHost, MemberPort: strconv.Itoa(target.Port), MemberState: "OFFLINE", MemberRole: ""}, - } - inputMap[target.Port] = view - lock.Unlock() - return nil - }). - AnyTimes() - tmc. - EXPECT(). - Ping(gomock.Any(), gomock.Any()). - DoAndReturn(func(_ context.Context, t *topodatapb.Tablet) error { - if !pingable[input.alias] { - return errors.New("unreachable") - } - return nil - }). - AnyTimes() - } - cfg := &config.VTGRConfig{BootstrapGroupSize: repairGroupSize, MinNumReplica: 2, BackoffErrorWaitTimeSeconds: 1, BootstrapWaitTimeSeconds: 1} - shard := NewGRShard("ks", "0", nil, tmc, ts, dbAgent, cfg, testPort0, true) - shard.lastDiagnoseResult = tt.diagnose - _, err := shard.Repair(ctx, tt.diagnose) - if tt.errorMsg == "" { - assert.NoError(t, err) - } else { - assert.Error(t, err, tt.errorMsg) - assert.True(t, strings.Contains(err.Error(), tt.errorMsg), err.Error()) - } - }) - } -} - -func getMysql56GTIDSet(sid, interval string) mysql.GTIDSet { - input := fmt.Sprintf("%s:%s", sid, interval) - pos, _ := mysql.ParsePosition(mysql.Mysql56FlavorID, input) - return pos.GTIDSet -} diff --git a/go/vt/vtgr/db/db.go b/go/vt/vtgr/db/db.go deleted file mode 100644 index f9a0ab2b478..00000000000 --- a/go/vt/vtgr/db/db.go +++ /dev/null @@ -1,381 +0,0 @@ -/* - Copyright 2014 Outbrain Inc. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -/* - This file has been copied over from VTOrc package -*/ - -package db - -import ( - "database/sql" - "fmt" - "strings" - "sync" - "time" - - "vitess.io/vitess/go/vt/external/golib/sqlutils" - "vitess.io/vitess/go/vt/log" - "vitess.io/vitess/go/vt/vtgr/config" -) - -var ( - EmptyArgs []any - Db DB = (*vtorcDB)(nil) -) - -var mysqlURI string -var dbMutex sync.Mutex - -type DB interface { - QueryOrchestrator(query string, argsArray []any, onRow func(sqlutils.RowMap) error) error -} - -type vtorcDB struct { -} - -var _ DB = (*vtorcDB)(nil) - -func (m *vtorcDB) QueryOrchestrator(query string, argsArray []any, onRow func(sqlutils.RowMap) error) error { - return QueryOrchestrator(query, argsArray, onRow) -} - -type DummySQLResult struct { -} - -func (dummyRes DummySQLResult) LastInsertId() (int64, error) { - return 0, nil -} - -func (dummyRes DummySQLResult) RowsAffected() (int64, error) { - return 1, nil -} - -func getMySQLURI() string { - dbMutex.Lock() - defer dbMutex.Unlock() - if mysqlURI != "" { - return mysqlURI - } - mysqlURI := fmt.Sprintf("%s:%s@tcp(%s:%d)/%s?timeout=%ds&readTimeout=%ds&rejectReadOnly=%t&interpolateParams=true", - config.Config.MySQLOrchestratorUser, - config.Config.MySQLOrchestratorPassword, - config.Config.MySQLOrchestratorHost, - config.Config.MySQLOrchestratorPort, - config.Config.MySQLOrchestratorDatabase, - config.Config.MySQLConnectTimeoutSeconds, - config.Config.MySQLOrchestratorReadTimeoutSeconds, - config.Config.MySQLOrchestratorRejectReadOnly, - ) - if config.Config.MySQLOrchestratorUseMutualTLS { - mysqlURI, _ = 
SetupMySQLOrchestratorTLS(mysqlURI) - } - return mysqlURI -} - -// OpenDiscovery returns a DB instance to access a topology instance. -// It has lower read timeout than OpenTopology and is intended to -// be used with low-latency discovery queries. -func OpenDiscovery(host string, port int) (*sql.DB, error) { - return openTopology(host, port, config.Config.MySQLDiscoveryReadTimeoutSeconds) -} - -// OpenTopology returns a DB instance to access a topology instance. -func OpenTopology(host string, port int) (*sql.DB, error) { - return openTopology(host, port, config.Config.MySQLTopologyReadTimeoutSeconds) -} - -func openTopology(host string, port int, readTimeout int) (db *sql.DB, err error) { - uri := fmt.Sprintf("%s:%s@tcp(%s:%d)/?timeout=%ds&readTimeout=%ds&interpolateParams=true", - config.Config.MySQLTopologyUser, - config.Config.MySQLTopologyPassword, - host, port, - config.Config.MySQLConnectTimeoutSeconds, - readTimeout, - ) - - if config.Config.MySQLTopologyUseMutualTLS || - (config.Config.MySQLTopologyUseMixedTLS && requiresTLS(host, port, uri)) { - if uri, err = SetupMySQLTopologyTLS(uri); err != nil { - return nil, err - } - } - if db, _, err = sqlutils.GetDB(uri); err != nil { - return nil, err - } - if config.Config.MySQLConnectionLifetimeSeconds > 0 { - db.SetConnMaxLifetime(time.Duration(config.Config.MySQLConnectionLifetimeSeconds) * time.Second) - } - db.SetMaxOpenConns(config.MySQLTopologyMaxPoolConnections) - db.SetMaxIdleConns(config.MySQLTopologyMaxPoolConnections) - return db, err -} - -func openOrchestratorMySQLGeneric() (db *sql.DB, fromCache bool, err error) { - uri := fmt.Sprintf("%s:%s@tcp(%s:%d)/?timeout=%ds&readTimeout=%ds&interpolateParams=true", - config.Config.MySQLOrchestratorUser, - config.Config.MySQLOrchestratorPassword, - config.Config.MySQLOrchestratorHost, - config.Config.MySQLOrchestratorPort, - config.Config.MySQLConnectTimeoutSeconds, - config.Config.MySQLOrchestratorReadTimeoutSeconds, - ) - if 
config.Config.MySQLOrchestratorUseMutualTLS { - uri, _ = SetupMySQLOrchestratorTLS(uri) - } - return sqlutils.GetDB(uri) -} - -func IsSQLite() bool { - return config.Config.IsSQLite() -} - -// OpenTopology returns the DB instance for the orchestrator backed database -func OpenOrchestrator() (db *sql.DB, err error) { - var fromCache bool - if IsSQLite() { - db, fromCache, err = sqlutils.GetSQLiteDB(config.Config.SQLite3DataFile) - if err == nil && !fromCache { - log.Infof("Connected to orchestrator backend: sqlite on %v", config.Config.SQLite3DataFile) - } - if db != nil { - db.SetMaxOpenConns(1) - db.SetMaxIdleConns(1) - } - } else { - if db, fromCache, err := openOrchestratorMySQLGeneric(); err != nil { - log.Errorf(err.Error()) - return db, err - } else if !fromCache { - // first time ever we talk to MySQL - query := fmt.Sprintf("create database if not exists %s", config.Config.MySQLOrchestratorDatabase) - if _, err := db.Exec(query); err != nil { - log.Errorf(err.Error()) - return db, err - } - } - db, fromCache, err = sqlutils.GetDB(getMySQLURI()) - if err == nil && !fromCache { - // do not show the password but do show what we connect to. 
- safeMySQLURI := fmt.Sprintf("%s:?@tcp(%s:%d)/%s?timeout=%ds", config.Config.MySQLOrchestratorUser, - config.Config.MySQLOrchestratorHost, config.Config.MySQLOrchestratorPort, config.Config.MySQLOrchestratorDatabase, config.Config.MySQLConnectTimeoutSeconds) - log.Infof("Connected to orchestrator backend: %v", safeMySQLURI) - if config.Config.MySQLOrchestratorMaxPoolConnections > 0 { - log.Infof("Orchestrator pool SetMaxOpenConns: %d", config.Config.MySQLOrchestratorMaxPoolConnections) - db.SetMaxOpenConns(config.Config.MySQLOrchestratorMaxPoolConnections) - } - if config.Config.MySQLConnectionLifetimeSeconds > 0 { - db.SetConnMaxLifetime(time.Duration(config.Config.MySQLConnectionLifetimeSeconds) * time.Second) - } - } - } - if err == nil && !fromCache { - if !config.Config.SkipOrchestratorDatabaseUpdate { - initOrchestratorDB(db) - } - // A low value here will trigger reconnects which could - // make the number of backend connections hit the tcp - // limit. That's bad. I could make this setting dynamic - // but then people need to know which value to use. For now - // allow up to 25% of MySQLOrchestratorMaxPoolConnections - // to be idle. That should provide a good number which - // does not keep the maximum number of connections open but - // at the same time does not trigger disconnections and - // reconnections too frequently. 
- maxIdleConns := int(config.Config.MySQLOrchestratorMaxPoolConnections * 25 / 100) - if maxIdleConns < 10 { - maxIdleConns = 10 - } - log.Infof("Connecting to backend %s:%d: maxConnections: %d, maxIdleConns: %d", - config.Config.MySQLOrchestratorHost, - config.Config.MySQLOrchestratorPort, - config.Config.MySQLOrchestratorMaxPoolConnections, - maxIdleConns) - db.SetMaxIdleConns(maxIdleConns) - } - return db, err -} - -func translateStatement(statement string) (string, error) { - if IsSQLite() { - statement = sqlutils.ToSqlite3Dialect(statement) - } - return statement, nil -} - -// versionIsDeployed checks if given version has already been deployed -func versionIsDeployed(db *sql.DB) (result bool, err error) { - query := ` - select - count(*) as is_deployed - from - orchestrator_db_deployments - where - deployed_version = ? - ` - err = db.QueryRow(query, config.RuntimeCLIFlags.ConfiguredVersion).Scan(&result) - // err means the table 'orchestrator_db_deployments' does not even exist, in which case we proceed - // to deploy. - // If there's another error to this, like DB gone bad, then we're about to find out anyway. - return result, err -} - -// registerOrchestratorDeployment updates the orchestrator_metadata table upon successful deployment -func registerOrchestratorDeployment(db *sql.DB) error { - query := ` - replace into orchestrator_db_deployments ( - deployed_version, deployed_timestamp - ) values ( - ?, NOW() - ) - ` - if _, err := execInternal(db, query, config.RuntimeCLIFlags.ConfiguredVersion); err != nil { - log.Fatalf("Unable to write to orchestrator_metadata: %+v", err) - } - log.Infof("Migrated database schema to version [%+v]", config.RuntimeCLIFlags.ConfiguredVersion) - return nil -} - -// deployStatements will issue given sql queries that are not already known to be deployed. -// This iterates both lists (to-run and already-deployed) and also verifies no contraditions. 
-func deployStatements(db *sql.DB, queries []string) error { - tx, err := db.Begin() - if err != nil { - log.Fatal(err.Error()) - } - // Ugly workaround ahead. - // Origin of this workaround is the existence of some "timestamp NOT NULL," column definitions, - // where in NO_ZERO_IN_DATE,NO_ZERO_DATE sql_mode are invalid (since default is implicitly "0") - // This means installation of orchestrator fails on such configured servers, and in particular on 5.7 - // where this setting is the dfault. - // For purpose of backwards compatability, what we do is force sql_mode to be more relaxed, create the schemas - // along with the "invalid" definition, and then go ahead and fix those definitions via following ALTER statements. - // My bad. - originalSQLMode := "" - if config.Config.IsMySQL() { - _ = tx.QueryRow(`select @@session.sql_mode`).Scan(&originalSQLMode) - if _, err := tx.Exec(`set @@session.sql_mode=REPLACE(@@session.sql_mode, 'NO_ZERO_DATE', '')`); err != nil { - log.Fatal(err.Error()) - } - if _, err := tx.Exec(`set @@session.sql_mode=REPLACE(@@session.sql_mode, 'NO_ZERO_IN_DATE', '')`); err != nil { - log.Fatal(err.Error()) - } - } - for _, query := range queries { - query, err := translateStatement(query) - if err != nil { - log.Fatalf("Cannot initiate orchestrator: %+v; query=%+v", err, query) - return err - } - if _, err := tx.Exec(query); err != nil { - if strings.Contains(err.Error(), "syntax error") { - log.Fatalf("Cannot initiate orchestrator: %+v; query=%+v", err, query) - return err - } - if !sqlutils.IsAlterTable(query) && !sqlutils.IsCreateIndex(query) && !sqlutils.IsDropIndex(query) { - log.Fatalf("Cannot initiate orchestrator: %+v; query=%+v", err, query) - return err - } - if !strings.Contains(err.Error(), "duplicate column name") && - !strings.Contains(err.Error(), "Duplicate column name") && - !strings.Contains(err.Error(), "check that column/key exists") && - !strings.Contains(err.Error(), "already exists") && - !strings.Contains(err.Error(), 
"Duplicate key name") { - log.Errorf("Error initiating orchestrator: %+v; query=%+v", err, query) - } - } - } - if config.Config.IsMySQL() { - if _, err := tx.Exec(`set session sql_mode=?`, originalSQLMode); err != nil { - log.Fatal(err.Error()) - } - } - if err := tx.Commit(); err != nil { - log.Fatal(err.Error()) - } - return nil -} - -// initOrchestratorDB attempts to create/upgrade the orchestrator backend database. It is created once in the -// application's lifetime. -func initOrchestratorDB(db *sql.DB) error { - log.Info("Initializing orchestrator") - - versionAlreadyDeployed, err := versionIsDeployed(db) - if versionAlreadyDeployed && config.RuntimeCLIFlags.ConfiguredVersion != "" && err == nil { - // Already deployed with this version - return nil - } - if config.Config.PanicIfDifferentDatabaseDeploy && config.RuntimeCLIFlags.ConfiguredVersion != "" && !versionAlreadyDeployed { - log.Fatalf("PanicIfDifferentDatabaseDeploy is set. Configured version %s is not the version found in the database", config.RuntimeCLIFlags.ConfiguredVersion) - } - log.Info("Migrating database schema") - deployStatements(db, generateSQLBase) - deployStatements(db, generateSQLPatches) - registerOrchestratorDeployment(db) - - if IsSQLite() { - ExecOrchestrator(`PRAGMA journal_mode = WAL`) - ExecOrchestrator(`PRAGMA synchronous = NORMAL`) - } - - return nil -} - -// execInternal -func execInternal(db *sql.DB, query string, args ...any) (sql.Result, error) { - var err error - query, err = translateStatement(query) - if err != nil { - return nil, err - } - res, err := sqlutils.ExecNoPrepare(db, query, args...) - return res, err -} - -// ExecOrchestrator will execute given query on the orchestrator backend database. 
-func ExecOrchestrator(query string, args ...any) (sql.Result, error) { - var err error - query, err = translateStatement(query) - if err != nil { - return nil, err - } - db, err := OpenOrchestrator() - if err != nil { - return nil, err - } - res, err := sqlutils.ExecNoPrepare(db, query, args...) - return res, err -} - -// QueryOrchestrator -func QueryOrchestrator(query string, argsArray []any, onRow func(sqlutils.RowMap) error) error { - query, err := translateStatement(query) - if err != nil { - log.Fatalf("Cannot query orchestrator: %+v; query=%+v", err, query) - return err - } - db, err := OpenOrchestrator() - if err != nil { - return err - } - - if err = sqlutils.QueryRowsMap(db, query, onRow, argsArray...); err != nil { - log.Warning(err.Error()) - } - - return err -} diff --git a/go/vt/vtgr/db/generate_base.go b/go/vt/vtgr/db/generate_base.go deleted file mode 100644 index d1923223e5d..00000000000 --- a/go/vt/vtgr/db/generate_base.go +++ /dev/null @@ -1,862 +0,0 @@ -/* - Copyright 2017 Shlomi Noach, GitHub Inc. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -/* - This file has been copied over from VTOrc package -*/ - -package db - -// generateSQLBase & generateSQLPatches are lists of SQL statements required to build the orchestrator backend -var generateSQLBase = []string{ - ` - CREATE TABLE IF NOT EXISTS database_instance ( - hostname varchar(128) CHARACTER SET ascii NOT NULL, - port smallint(5) unsigned NOT NULL, - last_checked timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - last_seen timestamp NULL DEFAULT NULL, - server_id int(10) unsigned NOT NULL, - version varchar(128) CHARACTER SET ascii NOT NULL, - binlog_format varchar(16) CHARACTER SET ascii NOT NULL, - log_bin tinyint(3) unsigned NOT NULL, - log_replica_updates tinyint(3) unsigned NOT NULL, - binary_log_file varchar(128) CHARACTER SET ascii NOT NULL, - binary_log_pos bigint(20) unsigned NOT NULL, - source_host varchar(128) CHARACTER SET ascii NOT NULL, - source_port smallint(5) unsigned NOT NULL, - replica_sql_running tinyint(3) unsigned NOT NULL, - replica_io_running tinyint(3) unsigned NOT NULL, - source_log_file varchar(128) CHARACTER SET ascii NOT NULL, - read_source_log_pos bigint(20) unsigned NOT NULL, - relay_source_log_file varchar(128) CHARACTER SET ascii NOT NULL, - exec_source_log_pos bigint(20) unsigned NOT NULL, - replication_lag_seconds bigint(20) unsigned DEFAULT NULL, - replica_lag_seconds bigint(20) unsigned DEFAULT NULL, - num_replica_hosts int(10) unsigned NOT NULL, - replica_hosts text CHARACTER SET ascii NOT NULL, - cluster_name varchar(128) CHARACTER SET ascii NOT NULL, - PRIMARY KEY (hostname,port) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX cluster_name_idx ON database_instance - `, - ` - CREATE INDEX cluster_name_idx_database_instance ON database_instance(cluster_name) - `, - ` - DROP INDEX last_checked_idx ON database_instance - `, - ` - CREATE INDEX last_checked_idx_database_instance ON database_instance(last_checked) - `, - ` - DROP INDEX last_seen_idx ON database_instance - `, - ` - CREATE INDEX 
last_seen_idx_database_instance ON database_instance(last_seen) - `, - ` - CREATE TABLE IF NOT EXISTS database_instance_maintenance ( - database_instance_maintenance_id int(10) unsigned NOT NULL AUTO_INCREMENT, - hostname varchar(128) NOT NULL, - port smallint(5) unsigned NOT NULL, - maintenance_active tinyint(4) DEFAULT NULL, - begin_timestamp timestamp NULL DEFAULT NULL, - end_timestamp timestamp NULL DEFAULT NULL, - owner varchar(128) CHARACTER SET utf8 NOT NULL, - reason text CHARACTER SET utf8 NOT NULL, - PRIMARY KEY (database_instance_maintenance_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX maintenance_uidx ON database_instance_maintenance - `, - ` - CREATE UNIQUE INDEX maintenance_uidx_database_instance_maintenance ON database_instance_maintenance (maintenance_active, hostname, port) - `, - ` - CREATE TABLE IF NOT EXISTS database_instance_long_running_queries ( - hostname varchar(128) NOT NULL, - port smallint(5) unsigned NOT NULL, - process_id bigint(20) NOT NULL, - process_started_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - process_user varchar(16) CHARACTER SET utf8 NOT NULL, - process_host varchar(128) CHARACTER SET utf8 NOT NULL, - process_db varchar(128) CHARACTER SET utf8 NOT NULL, - process_command varchar(16) CHARACTER SET utf8 NOT NULL, - process_time_seconds int(11) NOT NULL, - process_state varchar(128) CHARACTER SET utf8 NOT NULL, - process_info varchar(1024) CHARACTER SET utf8 NOT NULL, - PRIMARY KEY (hostname,port,process_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX process_started_at_idx ON database_instance_long_running_queries - `, - ` - CREATE INDEX process_started_at_idx_database_instance_long_running_queries ON database_instance_long_running_queries (process_started_at) - `, - ` - CREATE TABLE IF NOT EXISTS audit ( - audit_id bigint(20) unsigned NOT NULL AUTO_INCREMENT, - audit_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - audit_type varchar(128) CHARACTER SET ascii NOT NULL, - 
hostname varchar(128) CHARACTER SET ascii NOT NULL DEFAULT '', - port smallint(5) unsigned NOT NULL, - message text CHARACTER SET utf8 NOT NULL, - PRIMARY KEY (audit_id) - ) ENGINE=InnoDB DEFAULT CHARSET=latin1 - `, - ` - DROP INDEX audit_timestamp_idx ON audit - `, - ` - CREATE INDEX audit_timestamp_idx_audit ON audit (audit_timestamp) - `, - ` - DROP INDEX host_port_idx ON audit - `, - ` - CREATE INDEX host_port_idx_audit ON audit (hostname, port, audit_timestamp) - `, - ` - CREATE TABLE IF NOT EXISTS host_agent ( - hostname varchar(128) NOT NULL, - port smallint(5) unsigned NOT NULL, - token varchar(128) NOT NULL, - last_submitted timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - last_checked timestamp NULL DEFAULT NULL, - last_seen timestamp NULL DEFAULT NULL, - mysql_port smallint(5) unsigned DEFAULT NULL, - count_mysql_snapshots smallint(5) unsigned NOT NULL, - PRIMARY KEY (hostname) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX token_idx ON host_agent - `, - ` - CREATE INDEX token_idx_host_agent ON host_agent (token) - `, - ` - DROP INDEX last_submitted_idx ON host_agent - `, - ` - CREATE INDEX last_submitted_idx_host_agent ON host_agent (last_submitted) - `, - ` - DROP INDEX last_checked_idx ON host_agent - `, - ` - CREATE INDEX last_checked_idx_host_agent ON host_agent (last_checked) - `, - ` - DROP INDEX last_seen_idx ON host_agent - `, - ` - CREATE INDEX last_seen_idx_host_agent ON host_agent (last_seen) - `, - ` - CREATE TABLE IF NOT EXISTS agent_seed ( - agent_seed_id int(10) unsigned NOT NULL AUTO_INCREMENT, - target_hostname varchar(128) NOT NULL, - source_hostname varchar(128) NOT NULL, - start_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - end_timestamp timestamp NOT NULL DEFAULT '1971-01-01 00:00:00', - is_complete tinyint(3) unsigned NOT NULL DEFAULT '0', - is_successful tinyint(3) unsigned NOT NULL DEFAULT '0', - PRIMARY KEY (agent_seed_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX target_hostname_idx 
ON agent_seed - `, - ` - CREATE INDEX target_hostname_idx_agent_seed ON agent_seed (target_hostname,is_complete) - `, - ` - DROP INDEX source_hostname_idx ON agent_seed - `, - ` - CREATE INDEX source_hostname_idx_agent_seed ON agent_seed (source_hostname,is_complete) - `, - ` - DROP INDEX start_timestamp_idx ON agent_seed - `, - ` - CREATE INDEX start_timestamp_idx_agent_seed ON agent_seed (start_timestamp) - `, - ` - DROP INDEX is_complete_idx ON agent_seed - `, - ` - CREATE INDEX is_complete_idx_agent_seed ON agent_seed (is_complete,start_timestamp) - `, - ` - DROP INDEX is_successful_idx ON agent_seed - `, - ` - CREATE INDEX is_successful_idx_agent_seed ON agent_seed (is_successful, start_timestamp) - `, - ` - CREATE TABLE IF NOT EXISTS agent_seed_state ( - agent_seed_state_id int(10) unsigned NOT NULL AUTO_INCREMENT, - agent_seed_id int(10) unsigned NOT NULL, - state_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - state_action varchar(127) NOT NULL, - error_message varchar(255) NOT NULL, - PRIMARY KEY (agent_seed_state_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX agent_seed_idx ON agent_seed_state - `, - ` - CREATE INDEX agent_seed_idx_agent_seed_state ON agent_seed_state (agent_seed_id, state_timestamp) - `, - ` - CREATE TABLE IF NOT EXISTS host_attributes ( - hostname varchar(128) NOT NULL, - attribute_name varchar(128) NOT NULL, - attribute_value varchar(128) NOT NULL, - submit_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - expire_timestamp timestamp NULL DEFAULT NULL, - PRIMARY KEY (hostname,attribute_name) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX attribute_name_idx ON host_attributes - `, - ` - CREATE INDEX attribute_name_idx_host_attributes ON host_attributes (attribute_name) - `, - ` - DROP INDEX attribute_value_idx ON host_attributes - `, - ` - CREATE INDEX attribute_value_idx_host_attributes ON host_attributes (attribute_value) - `, - ` - DROP INDEX submit_timestamp_idx ON host_attributes - 
`, - ` - CREATE INDEX submit_timestamp_idx_host_attributes ON host_attributes (submit_timestamp) - `, - ` - DROP INDEX expire_timestamp_idx ON host_attributes - `, - ` - CREATE INDEX expire_timestamp_idx_host_attributes ON host_attributes (expire_timestamp) - `, - ` - CREATE TABLE IF NOT EXISTS hostname_resolve ( - hostname varchar(128) NOT NULL, - resolved_hostname varchar(128) NOT NULL, - resolved_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - PRIMARY KEY (hostname) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX resolved_timestamp_idx ON hostname_resolve - `, - ` - CREATE INDEX resolved_timestamp_idx_hostname_resolve ON hostname_resolve (resolved_timestamp) - `, - ` - CREATE TABLE IF NOT EXISTS active_node ( - anchor tinyint unsigned NOT NULL, - hostname varchar(128) CHARACTER SET ascii NOT NULL, - token varchar(128) NOT NULL, - last_seen_active timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - PRIMARY KEY (anchor) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - INSERT IGNORE INTO active_node (anchor, hostname, token, last_seen_active) - VALUES (1, '', '', NOW()) - `, - ` - CREATE TABLE IF NOT EXISTS node_health ( - hostname varchar(128) CHARACTER SET ascii NOT NULL, - token varchar(128) NOT NULL, - last_seen_active timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - PRIMARY KEY (hostname, token) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP VIEW IF EXISTS _whats_wrong - `, - ` - DROP VIEW IF EXISTS whats_wrong - `, - ` - DROP VIEW IF EXISTS whats_wrong_summary - `, - ` - CREATE TABLE IF NOT EXISTS topology_recovery ( - recovery_id bigint unsigned not null auto_increment, - hostname varchar(128) NOT NULL, - port smallint unsigned NOT NULL, - in_active_period tinyint unsigned NOT NULL DEFAULT 0, - start_active_period timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - end_active_period_unixtime int unsigned, - end_recovery timestamp NULL DEFAULT NULL, - processing_node_hostname varchar(128) CHARACTER SET ascii NOT NULL, - 
processcing_node_token varchar(128) NOT NULL, - successor_hostname varchar(128) DEFAULT NULL, - successor_port smallint unsigned DEFAULT NULL, - PRIMARY KEY (recovery_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX in_active_start_period_idx ON topology_recovery - `, - ` - CREATE INDEX in_active_start_period_idx_topology_recovery ON topology_recovery (in_active_period, start_active_period) - `, - ` - DROP INDEX start_active_period_idx ON topology_recovery - `, - ` - CREATE INDEX start_active_period_idx_topology_recovery ON topology_recovery (start_active_period) - `, - ` - DROP INDEX hostname_port_active_period_uidx ON topology_recovery - `, - ` - CREATE UNIQUE INDEX hostname_port_active_period_uidx_topology_recovery ON topology_recovery (hostname, port, in_active_period, end_active_period_unixtime) - `, - ` - CREATE TABLE IF NOT EXISTS hostname_unresolve ( - hostname varchar(128) NOT NULL, - unresolved_hostname varchar(128) NOT NULL, - PRIMARY KEY (hostname) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX unresolved_hostname_idx ON hostname_unresolve - `, - ` - CREATE INDEX unresolved_hostname_idx_hostname_unresolve ON hostname_unresolve (unresolved_hostname) - `, - ` - CREATE TABLE IF NOT EXISTS database_instance_pool ( - hostname varchar(128) CHARACTER SET ascii NOT NULL, - port smallint(5) unsigned NOT NULL, - pool varchar(128) NOT NULL, - PRIMARY KEY (hostname, port, pool) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX pool_idx ON database_instance_pool - `, - ` - CREATE INDEX pool_idx_database_instance_pool ON database_instance_pool (pool) - `, - ` - CREATE TABLE IF NOT EXISTS database_instance_topology_history ( - snapshot_unix_timestamp INT UNSIGNED NOT NULL, - hostname varchar(128) CHARACTER SET ascii NOT NULL, - port smallint(5) unsigned NOT NULL, - source_host varchar(128) CHARACTER SET ascii NOT NULL, - source_port smallint(5) unsigned NOT NULL, - cluster_name tinytext CHARACTER SET ascii NOT NULL, - 
PRIMARY KEY (snapshot_unix_timestamp, hostname, port) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX cluster_name_idx ON database_instance_topology_history - `, - ` - CREATE INDEX cluster_name_idx_database_instance_topology_history ON database_instance_topology_history (snapshot_unix_timestamp, cluster_name(128)) - `, - ` - CREATE TABLE IF NOT EXISTS candidate_database_instance ( - hostname varchar(128) CHARACTER SET ascii NOT NULL, - port smallint(5) unsigned NOT NULL, - last_suggested TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, - PRIMARY KEY (hostname, port) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX last_suggested_idx ON candidate_database_instance - `, - ` - CREATE INDEX last_suggested_idx_candidate_database_instance ON candidate_database_instance (last_suggested) - `, - ` - CREATE TABLE IF NOT EXISTS database_instance_downtime ( - hostname varchar(128) NOT NULL, - port smallint(5) unsigned NOT NULL, - downtime_active tinyint(4) DEFAULT NULL, - begin_timestamp timestamp DEFAULT CURRENT_TIMESTAMP, - end_timestamp timestamp NULL DEFAULT NULL, - owner varchar(128) CHARACTER SET utf8 NOT NULL, - reason text CHARACTER SET utf8 NOT NULL, - PRIMARY KEY (hostname, port) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - CREATE TABLE IF NOT EXISTS topology_failure_detection ( - detection_id bigint(20) unsigned NOT NULL AUTO_INCREMENT, - hostname varchar(128) NOT NULL, - port smallint unsigned NOT NULL, - in_active_period tinyint unsigned NOT NULL DEFAULT '0', - start_active_period timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - end_active_period_unixtime int unsigned NOT NULL, - processing_node_hostname varchar(128) NOT NULL, - processcing_node_token varchar(128) NOT NULL, - analysis varchar(128) NOT NULL, - cluster_name varchar(128) NOT NULL, - count_affected_replicas int unsigned NOT NULL, - replica_hosts text NOT NULL, - PRIMARY KEY (detection_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX 
hostname_port_active_period_uidx ON topology_failure_detection - `, - ` - DROP INDEX in_active_start_period_idx ON topology_failure_detection - `, - ` - CREATE INDEX in_active_start_period_idx_topology_failure_detection ON topology_failure_detection (in_active_period, start_active_period) - `, - ` - CREATE TABLE IF NOT EXISTS hostname_resolve_history ( - resolved_hostname varchar(128) NOT NULL, - hostname varchar(128) NOT NULL, - resolved_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - PRIMARY KEY (resolved_hostname) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX hostname ON hostname_resolve_history - `, - ` - CREATE INDEX hostname_idx_hostname_resolve_history ON hostname_resolve_history (hostname) - `, - ` - DROP INDEX resolved_timestamp_idx ON hostname_resolve_history - `, - ` - CREATE INDEX resolved_timestamp_idx_hostname_resolve_history ON hostname_resolve_history (resolved_timestamp) - `, - ` - CREATE TABLE IF NOT EXISTS hostname_unresolve_history ( - unresolved_hostname varchar(128) NOT NULL, - hostname varchar(128) NOT NULL, - last_registered TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, - PRIMARY KEY (unresolved_hostname) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX hostname ON hostname_unresolve_history - `, - ` - CREATE INDEX hostname_idx_hostname_unresolve_history ON hostname_unresolve_history (hostname) - `, - ` - DROP INDEX last_registered_idx ON hostname_unresolve_history - `, - ` - CREATE INDEX last_registered_idx_hostname_unresolve_history ON hostname_unresolve_history (last_registered) - `, - ` - CREATE TABLE IF NOT EXISTS cluster_domain_name ( - cluster_name varchar(128) CHARACTER SET ascii NOT NULL, - domain_name varchar(128) NOT NULL, - PRIMARY KEY (cluster_name) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX domain_name_idx ON cluster_domain_name - `, - ` - CREATE INDEX domain_name_idx_cluster_domain_name ON cluster_domain_name (domain_name(32)) - `, - ` - CREATE TABLE IF NOT EXISTS 
primary_position_equivalence ( - equivalence_id bigint unsigned not null auto_increment, - primary1_hostname varchar(128) CHARACTER SET ascii NOT NULL, - primary1_port smallint(5) unsigned NOT NULL, - primary1_binary_log_file varchar(128) CHARACTER SET ascii NOT NULL, - primary1_binary_log_pos bigint(20) unsigned NOT NULL, - primary2_hostname varchar(128) CHARACTER SET ascii NOT NULL, - primary2_port smallint(5) unsigned NOT NULL, - primary2_binary_log_file varchar(128) CHARACTER SET ascii NOT NULL, - primary2_binary_log_pos bigint(20) unsigned NOT NULL, - last_suggested TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, - PRIMARY KEY (equivalence_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX equivalence_uidx ON primary_position_equivalence - `, - ` - CREATE UNIQUE INDEX equivalence_uidx_primary_position_equivalence ON primary_position_equivalence (primary1_hostname, primary1_port, primary1_binary_log_file, primary1_binary_log_pos, primary2_hostname, primary2_port) - `, - ` - DROP INDEX primary2_idx ON primary_position_equivalence - `, - ` - CREATE INDEX primary2_idx_primary_position_equivalence ON primary_position_equivalence (primary2_hostname, primary2_port, primary2_binary_log_file, primary2_binary_log_pos) - `, - ` - DROP INDEX last_suggested_idx ON primary_position_equivalence - `, - ` - CREATE INDEX last_suggested_idx_primary_position_equivalence ON primary_position_equivalence (last_suggested) - `, - ` - CREATE TABLE IF NOT EXISTS async_request ( - request_id bigint unsigned NOT NULL AUTO_INCREMENT, - command varchar(128) charset ascii not null, - hostname varchar(128) NOT NULL, - port smallint(5) unsigned NOT NULL, - destination_hostname varchar(128) NOT NULL, - destination_port smallint(5) unsigned NOT NULL, - pattern text CHARACTER SET utf8 NOT NULL, - gtid_hint varchar(32) charset ascii not null, - begin_timestamp timestamp NULL DEFAULT NULL, - end_timestamp timestamp NULL DEFAULT NULL, - story text CHARACTER SET utf8 NOT NULL, - PRIMARY 
KEY (request_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX begin_timestamp_idx ON async_request - `, - ` - CREATE INDEX begin_timestamp_idx_async_request ON async_request (begin_timestamp) - `, - ` - DROP INDEX end_timestamp_idx ON async_request - `, - ` - CREATE INDEX end_timestamp_idx_async_request ON async_request (end_timestamp) - `, - ` - CREATE TABLE IF NOT EXISTS blocked_topology_recovery ( - hostname varchar(128) NOT NULL, - port smallint(5) unsigned NOT NULL, - cluster_name varchar(128) NOT NULL, - analysis varchar(128) NOT NULL, - last_blocked_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - blocking_recovery_id bigint unsigned, - PRIMARY KEY (hostname, port) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX cluster_blocked_idx ON blocked_topology_recovery - `, - ` - CREATE INDEX cluster_blocked_idx_blocked_topology_recovery ON blocked_topology_recovery (cluster_name, last_blocked_timestamp) - `, - ` - CREATE TABLE IF NOT EXISTS database_instance_last_analysis ( - hostname varchar(128) NOT NULL, - port smallint(5) unsigned NOT NULL, - analysis_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - analysis varchar(128) NOT NULL, - PRIMARY KEY (hostname, port) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX analysis_timestamp_idx ON database_instance_last_analysis - `, - ` - CREATE INDEX analysis_timestamp_idx_database_instance_last_analysis ON database_instance_last_analysis (analysis_timestamp) - `, - ` - CREATE TABLE IF NOT EXISTS database_instance_analysis_changelog ( - changelog_id bigint unsigned not null auto_increment, - hostname varchar(128) NOT NULL, - port smallint(5) unsigned NOT NULL, - analysis_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - analysis varchar(128) NOT NULL, - PRIMARY KEY (changelog_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX analysis_timestamp_idx ON database_instance_analysis_changelog - `, - ` - CREATE INDEX 
analysis_timestamp_idx_database_instance_analysis_changelog ON database_instance_analysis_changelog (analysis_timestamp) - `, - ` - CREATE TABLE IF NOT EXISTS node_health_history ( - history_id bigint unsigned not null auto_increment, - hostname varchar(128) CHARACTER SET ascii NOT NULL, - token varchar(128) NOT NULL, - first_seen_active timestamp NOT NULL, - extra_info varchar(128) CHARACTER SET utf8 NOT NULL, - PRIMARY KEY (history_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX first_seen_active_idx ON node_health_history - `, - ` - CREATE INDEX first_seen_active_idx_node_health_history ON node_health_history (first_seen_active) - `, - ` - DROP INDEX hostname_token_idx ON node_health_history - `, - ` - CREATE UNIQUE INDEX hostname_token_idx_node_health_history ON node_health_history (hostname, token) - `, - ` - CREATE TABLE IF NOT EXISTS database_instance_coordinates_history ( - history_id bigint unsigned not null auto_increment, - hostname varchar(128) NOT NULL, - port smallint(5) unsigned NOT NULL, - recorded_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - binary_log_file varchar(128) NOT NULL, - binary_log_pos bigint(20) unsigned NOT NULL, - relay_log_file varchar(128) NOT NULL, - relay_log_pos bigint(20) unsigned NOT NULL, - PRIMARY KEY (history_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX hostname_port_recorded_timestmp_idx ON database_instance_coordinates_history - `, - ` - CREATE INDEX hostname_port_recorded_idx_database_instance_coordinates_history ON database_instance_coordinates_history (hostname, port, recorded_timestamp) - `, - ` - DROP INDEX recorded_timestmp_idx ON database_instance_coordinates_history - `, - ` - CREATE INDEX recorded_timestmp_idx_database_instance_coordinates_history ON database_instance_coordinates_history (recorded_timestamp) - `, - ` - CREATE TABLE IF NOT EXISTS database_instance_binlog_files_history ( - history_id bigint unsigned not null auto_increment, - hostname varchar(128) 
NOT NULL, - port smallint(5) unsigned NOT NULL, - binary_log_file varchar(128) NOT NULL, - binary_log_pos bigint(20) unsigned NOT NULL, - first_seen timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - last_seen timestamp NOT NULL DEFAULT '1971-01-01 00:00:00', - PRIMARY KEY (history_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX hostname_port_file_idx ON database_instance_binlog_files_history - `, - ` - CREATE UNIQUE INDEX hostname_port_file_idx_database_instance_binlog_files_history ON database_instance_binlog_files_history (hostname, port, binary_log_file) - `, - ` - DROP INDEX last_seen_idx ON database_instance_binlog_files_history - `, - ` - CREATE INDEX last_seen_idx_database_instance_binlog_files_history ON database_instance_binlog_files_history (last_seen) - `, - ` - CREATE TABLE IF NOT EXISTS access_token ( - access_token_id bigint unsigned not null auto_increment, - public_token varchar(128) NOT NULL, - secret_token varchar(128) NOT NULL, - generated_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - generated_by varchar(128) CHARACTER SET utf8 NOT NULL, - is_acquired tinyint unsigned NOT NULL DEFAULT '0', - PRIMARY KEY (access_token_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX public_token_idx ON access_token - `, - ` - CREATE UNIQUE INDEX public_token_uidx_access_token ON access_token (public_token) - `, - ` - DROP INDEX generated_at_idx ON access_token - `, - ` - CREATE INDEX generated_at_idx_access_token ON access_token (generated_at) - `, - ` - CREATE TABLE IF NOT EXISTS database_instance_recent_relaylog_history ( - hostname varchar(128) NOT NULL, - port smallint(5) unsigned NOT NULL, - current_relay_log_file varchar(128) NOT NULL, - current_relay_log_pos bigint(20) unsigned NOT NULL, - current_seen timestamp NOT NULL DEFAULT '1971-01-01 00:00:00', - prev_relay_log_file varchar(128) NOT NULL, - prev_relay_log_pos bigint(20) unsigned NOT NULL, - prev_seen timestamp NOT NULL DEFAULT '1971-01-01 00:00:00', - PRIMARY 
KEY (hostname, port) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - DROP INDEX current_seen_idx ON database_instance_recent_relaylog_history - `, - ` - CREATE INDEX current_seen_idx_database_instance_recent_relaylog_history ON database_instance_recent_relaylog_history (current_seen) - `, - ` - CREATE TABLE IF NOT EXISTS orchestrator_metadata ( - anchor tinyint unsigned NOT NULL, - last_deployed_version varchar(128) CHARACTER SET ascii NOT NULL, - last_deployed_timestamp timestamp NOT NULL, - PRIMARY KEY (anchor) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - CREATE TABLE IF NOT EXISTS orchestrator_db_deployments ( - deployed_version varchar(128) CHARACTER SET ascii NOT NULL, - deployed_timestamp timestamp NOT NULL, - PRIMARY KEY (deployed_version) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - CREATE TABLE IF NOT EXISTS global_recovery_disable ( - disable_recovery tinyint unsigned NOT NULL COMMENT 'Insert 1 to disable recovery globally', - PRIMARY KEY (disable_recovery) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - CREATE TABLE IF NOT EXISTS topology_recovery_steps ( - recovery_step_id bigint unsigned not null auto_increment, - recovery_uid varchar(128) CHARACTER SET ascii NOT NULL, - audit_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - message text CHARACTER SET utf8 NOT NULL, - PRIMARY KEY (recovery_step_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - CREATE TABLE IF NOT EXISTS raft_store ( - store_id bigint unsigned not null auto_increment, - store_key varbinary(512) not null, - store_value blob not null, - PRIMARY KEY (store_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - CREATE INDEX store_key_idx_raft_store ON raft_store (store_key) - `, - ` - CREATE TABLE IF NOT EXISTS raft_log ( - log_index bigint unsigned not null auto_increment, - term bigint not null, - log_type int not null, - data blob not null, - PRIMARY KEY (log_index) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - CREATE TABLE IF NOT EXISTS raft_snapshot ( - 
snapshot_id bigint unsigned not null auto_increment, - snapshot_name varchar(128) CHARACTER SET utf8 NOT NULL, - snapshot_meta varchar(4096) CHARACTER SET utf8 NOT NULL, - PRIMARY KEY (snapshot_id) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - CREATE UNIQUE INDEX snapshot_name_uidx_raft_snapshot ON raft_snapshot (snapshot_name) - `, - ` - CREATE TABLE IF NOT EXISTS database_instance_peer_analysis ( - peer varchar(128) NOT NULL, - hostname varchar(128) NOT NULL, - port smallint(5) unsigned NOT NULL, - analysis_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - analysis varchar(128) NOT NULL, - PRIMARY KEY (peer, hostname, port) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - CREATE TABLE IF NOT EXISTS database_instance_tls ( - hostname varchar(128) CHARACTER SET ascii NOT NULL, - port smallint(5) unsigned NOT NULL, - required tinyint unsigned NOT NULL DEFAULT 0, - PRIMARY KEY (hostname,port) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - CREATE TABLE IF NOT EXISTS cluster_injected_pseudo_gtid ( - cluster_name varchar(128) NOT NULL, - time_injected timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - PRIMARY KEY (cluster_name) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - CREATE TABLE IF NOT EXISTS hostname_ips ( - hostname varchar(128) CHARACTER SET ascii NOT NULL, - ipv4 varchar(128) CHARACTER SET ascii NOT NULL, - ipv6 varchar(128) CHARACTER SET ascii NOT NULL, - last_updated timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - PRIMARY KEY (hostname) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - CREATE TABLE IF NOT EXISTS database_instance_tags ( - hostname varchar(128) CHARACTER SET ascii NOT NULL, - port smallint(5) unsigned NOT NULL, - tag_name varchar(128) CHARACTER SET utf8 NOT NULL, - tag_value varchar(128) CHARACTER SET utf8 NOT NULL, - last_updated timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - PRIMARY KEY (hostname, port, tag_name) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - CREATE INDEX tag_name_idx_database_instance_tags ON 
database_instance_tags (tag_name) - `, - ` - CREATE TABLE IF NOT EXISTS database_instance_stale_binlog_coordinates ( - hostname varchar(128) CHARACTER SET ascii NOT NULL, - port smallint(5) unsigned NOT NULL, - binary_log_file varchar(128) NOT NULL, - binary_log_pos bigint(20) unsigned NOT NULL, - first_seen timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - PRIMARY KEY (hostname, port) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - CREATE INDEX first_seen_idx_database_instance_stale_binlog_coordinates ON database_instance_stale_binlog_coordinates (first_seen) - `, - ` - CREATE TABLE IF NOT EXISTS vitess_tablet ( - hostname varchar(128) CHARACTER SET ascii NOT NULL, - port smallint(5) unsigned NOT NULL, - keyspace varchar(128) CHARACTER SET ascii NOT NULL, - shard varchar(128) CHARACTER SET ascii NOT NULL, - cell varchar(128) CHARACTER SET ascii NOT NULL, - tablet_type smallint(5) NOT NULL, - primary_timestamp timestamp NOT NULL, - info varchar(512) CHARACTER SET ascii NOT NULL, - PRIMARY KEY (hostname, port) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, - ` - CREATE INDEX cell_idx_vitess_tablet ON vitess_tablet (cell) - `, - ` - CREATE INDEX ks_idx_vitess_tablet ON vitess_tablet (keyspace, shard) - `, - ` - CREATE TABLE IF NOT EXISTS vitess_keyspace ( - keyspace varchar(128) CHARACTER SET ascii NOT NULL, - keyspace_type smallint(5) NOT NULL, - durability_policy varchar(512) CHARACTER SET ascii NOT NULL, - PRIMARY KEY (keyspace) - ) ENGINE=InnoDB DEFAULT CHARSET=ascii - `, -} diff --git a/go/vt/vtgr/db/generate_patches.go b/go/vt/vtgr/db/generate_patches.go deleted file mode 100644 index 3760b3e694a..00000000000 --- a/go/vt/vtgr/db/generate_patches.go +++ /dev/null @@ -1,583 +0,0 @@ -/* - Copyright 2017 Shlomi Noach, GitHub Inc. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -/* - This file has been copied over from VTOrc package -*/ - -package db - -// generateSQLPatches contains DDLs for patching schema to the latest version. -// Add new statements at the end of the list so they form a changelog. -var generateSQLPatches = []string{ - ` - ALTER TABLE - database_instance - ADD COLUMN read_only TINYINT UNSIGNED NOT NULL AFTER version - `, - ` - ALTER TABLE - database_instance - ADD COLUMN last_sql_error TEXT NOT NULL AFTER exec_source_log_pos - `, - ` - ALTER TABLE - database_instance - ADD COLUMN last_io_error TEXT NOT NULL AFTER last_sql_error - `, - ` - ALTER TABLE - database_instance - ADD COLUMN oracle_gtid TINYINT UNSIGNED NOT NULL AFTER replica_io_running - `, - ` - ALTER TABLE - database_instance - ADD COLUMN mariadb_gtid TINYINT UNSIGNED NOT NULL AFTER oracle_gtid - `, - ` - ALTER TABLE - database_instance - ADD COLUMN relay_log_file varchar(128) CHARACTER SET ascii NOT NULL AFTER exec_source_log_pos - `, - ` - ALTER TABLE - database_instance - ADD COLUMN relay_log_pos bigint unsigned NOT NULL AFTER relay_log_file - `, - ` - DROP INDEX source_host_port_idx ON database_instance - `, - ` - ALTER TABLE - database_instance - ADD INDEX source_host_port_idx_database_instance (source_host, source_port) - `, - ` - ALTER TABLE - database_instance - ADD COLUMN pseudo_gtid TINYINT UNSIGNED NOT NULL AFTER mariadb_gtid - `, - ` - ALTER TABLE - database_instance - ADD COLUMN replication_depth TINYINT UNSIGNED NOT NULL AFTER cluster_name - `, - ` - ALTER TABLE - database_instance - ADD COLUMN has_replication_filters TINYINT UNSIGNED NOT 
NULL AFTER replica_io_running - `, - ` - ALTER TABLE - database_instance - ADD COLUMN data_center varchar(32) CHARACTER SET ascii NOT NULL AFTER cluster_name - `, - ` - ALTER TABLE - database_instance - ADD COLUMN physical_environment varchar(32) CHARACTER SET ascii NOT NULL AFTER data_center - `, - ` - ALTER TABLE - database_instance_maintenance - ADD KEY active_timestamp_idx (maintenance_active, begin_timestamp) - `, - ` - ALTER TABLE - database_instance - ADD COLUMN is_co_primary TINYINT UNSIGNED NOT NULL AFTER replication_depth - `, - ` - ALTER TABLE - database_instance_maintenance - ADD KEY active_end_timestamp_idx (maintenance_active, end_timestamp) - `, - ` - ALTER TABLE - database_instance - ADD COLUMN sql_delay INT UNSIGNED NOT NULL AFTER replica_lag_seconds - `, - ` - ALTER TABLE - topology_recovery - ADD COLUMN analysis varchar(128) CHARACTER SET ascii NOT NULL - `, - ` - ALTER TABLE - topology_recovery - ADD COLUMN cluster_name varchar(128) CHARACTER SET ascii NOT NULL - `, - ` - ALTER TABLE - topology_recovery - ADD COLUMN count_affected_replicas int unsigned NOT NULL - `, - ` - ALTER TABLE - topology_recovery - ADD COLUMN replica_hosts text CHARACTER SET ascii NOT NULL - `, - ` - ALTER TABLE hostname_unresolve - ADD COLUMN last_registered TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP - `, - ` - ALTER TABLE hostname_unresolve - ADD KEY last_registered_idx (last_registered) - `, - ` - ALTER TABLE topology_recovery - ADD KEY cluster_name_in_active_idx (cluster_name, in_active_period) - `, - ` - ALTER TABLE topology_recovery - ADD KEY end_recovery_idx (end_recovery) - `, - ` - ALTER TABLE - database_instance - ADD COLUMN binlog_server TINYINT UNSIGNED NOT NULL AFTER version - `, - ` - ALTER TABLE cluster_domain_name - ADD COLUMN last_registered TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP - `, - ` - ALTER TABLE cluster_domain_name - ADD KEY last_registered_idx (last_registered) - `, - ` - ALTER TABLE - database_instance - ADD COLUMN supports_oracle_gtid 
TINYINT UNSIGNED NOT NULL AFTER oracle_gtid - `, - ` - ALTER TABLE - database_instance - ADD COLUMN executed_gtid_set text CHARACTER SET ascii NOT NULL AFTER oracle_gtid - `, - ` - ALTER TABLE - database_instance - ADD COLUMN server_uuid varchar(64) CHARACTER SET ascii NOT NULL AFTER server_id - `, - ` - ALTER TABLE - topology_recovery - ADD COLUMN is_successful TINYINT UNSIGNED NOT NULL DEFAULT 0 AFTER processcing_node_token - `, - ` - ALTER TABLE - topology_recovery - ADD COLUMN acknowledged TINYINT UNSIGNED NOT NULL DEFAULT 0 - `, - ` - ALTER TABLE - topology_recovery - ADD COLUMN acknowledged_by varchar(128) CHARACTER SET utf8 NOT NULL - `, - ` - ALTER TABLE - topology_recovery - ADD COLUMN acknowledge_comment text CHARACTER SET utf8 NOT NULL - `, - ` - ALTER TABLE - topology_recovery - ADD COLUMN participating_instances text CHARACTER SET ascii NOT NULL after replica_hosts - `, - ` - ALTER TABLE - topology_recovery - ADD COLUMN lost_replicas text CHARACTER SET ascii NOT NULL after participating_instances - `, - ` - ALTER TABLE - topology_recovery - ADD COLUMN all_errors text CHARACTER SET ascii NOT NULL after lost_replicas - `, - ` - ALTER TABLE audit - ADD COLUMN cluster_name varchar(128) CHARACTER SET ascii NOT NULL DEFAULT '' AFTER port - `, - ` - ALTER TABLE candidate_database_instance - ADD COLUMN priority TINYINT SIGNED NOT NULL DEFAULT 1 comment 'positive promote, nagative unpromotes' - `, - ` - ALTER TABLE - topology_recovery - ADD COLUMN acknowledged_at TIMESTAMP NULL after acknowledged - `, - ` - ALTER TABLE - topology_recovery - ADD KEY acknowledged_idx (acknowledged, acknowledged_at) - `, - ` - ALTER TABLE - blocked_topology_recovery - ADD KEY last_blocked_idx (last_blocked_timestamp) - `, - ` - ALTER TABLE candidate_database_instance - ADD COLUMN promotion_rule enum('must', 'prefer', 'neutral', 'prefer_not', 'must_not') NOT NULL DEFAULT 'neutral' - `, - ` - ALTER TABLE node_health /* sqlite3-skip */ - DROP PRIMARY KEY, - ADD PRIMARY KEY (hostname, 
token) - `, - ` - ALTER TABLE node_health - ADD COLUMN extra_info varchar(128) CHARACTER SET utf8 NOT NULL - `, - ` - ALTER TABLE agent_seed /* sqlite3-skip */ - MODIFY end_timestamp timestamp NOT NULL DEFAULT '1971-01-01 00:00:00' - `, - ` - ALTER TABLE active_node /* sqlite3-skip */ - MODIFY last_seen_active timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP - `, - - ` - ALTER TABLE node_health /* sqlite3-skip */ - MODIFY last_seen_active timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP - `, - ` - ALTER TABLE candidate_database_instance /* sqlite3-skip */ - MODIFY last_suggested timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP - `, - ` - ALTER TABLE primary_position_equivalence /* sqlite3-skip */ - MODIFY last_suggested timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP - `, - ` - ALTER TABLE - database_instance - ADD COLUMN last_attempted_check TIMESTAMP NOT NULL DEFAULT '1971-01-01 00:00:00' AFTER last_checked - `, - ` - ALTER TABLE - database_instance /* sqlite3-skip */ - MODIFY last_attempted_check TIMESTAMP NOT NULL DEFAULT '1971-01-01 00:00:00' - `, - ` - ALTER TABLE - database_instance_analysis_changelog - ADD KEY instance_timestamp_idx (hostname, port, analysis_timestamp) - `, - ` - ALTER TABLE - topology_recovery - ADD COLUMN last_detection_id bigint unsigned NOT NULL - `, - ` - ALTER TABLE - topology_recovery - ADD KEY last_detection_idx (last_detection_id) - `, - ` - ALTER TABLE node_health_history - ADD COLUMN command varchar(128) CHARACTER SET utf8 NOT NULL - `, - ` - ALTER TABLE node_health - ADD COLUMN command varchar(128) CHARACTER SET utf8 NOT NULL - `, - ` - ALTER TABLE database_instance_topology_history - ADD COLUMN version varchar(128) CHARACTER SET ascii NOT NULL - `, - ` - ALTER TABLE - database_instance - ADD COLUMN gtid_purged text CHARACTER SET ascii NOT NULL AFTER executed_gtid_set - `, - ` - ALTER TABLE - database_instance_coordinates_history - ADD COLUMN last_seen timestamp NOT NULL DEFAULT '1971-01-01 00:00:00' AFTER recorded_timestamp - `, - ` - ALTER 
TABLE - access_token - ADD COLUMN is_reentrant TINYINT UNSIGNED NOT NULL default 0 - `, - ` - ALTER TABLE - access_token - ADD COLUMN acquired_at timestamp NOT NULL DEFAULT '1971-01-01 00:00:00' - `, - ` - ALTER TABLE - database_instance_pool - ADD COLUMN registered_at timestamp NOT NULL DEFAULT '1971-01-01 00:00:00' - `, - ` - ALTER TABLE - database_instance - ADD COLUMN has_replication_credentials TINYINT UNSIGNED NOT NULL - `, - ` - ALTER TABLE - database_instance - ADD COLUMN allow_tls TINYINT UNSIGNED NOT NULL AFTER sql_delay - `, - ` - ALTER TABLE - database_instance - ADD COLUMN semi_sync_enforced TINYINT UNSIGNED NOT NULL AFTER physical_environment - `, - ` - ALTER TABLE - database_instance - ADD COLUMN instance_alias varchar(128) CHARACTER SET ascii NOT NULL AFTER physical_environment - `, - ` - ALTER TABLE - topology_recovery - ADD COLUMN successor_alias varchar(128) DEFAULT NULL - `, - ` - ALTER TABLE - database_instance /* sqlite3-skip */ - MODIFY cluster_name varchar(128) NOT NULL - `, - ` - ALTER TABLE - node_health - ADD INDEX last_seen_active_idx (last_seen_active) - `, - ` - ALTER TABLE - database_instance_maintenance - ADD COLUMN processing_node_hostname varchar(128) CHARACTER SET ascii NOT NULL - `, - ` - ALTER TABLE - database_instance_maintenance - ADD COLUMN processing_node_token varchar(128) NOT NULL - `, - ` - ALTER TABLE - database_instance_maintenance - ADD COLUMN explicitly_bounded TINYINT UNSIGNED NOT NULL - `, - ` - ALTER TABLE node_health_history - ADD COLUMN app_version varchar(64) CHARACTER SET ascii NOT NULL DEFAULT "" - `, - ` - ALTER TABLE node_health - ADD COLUMN app_version varchar(64) CHARACTER SET ascii NOT NULL DEFAULT "" - `, - ` - ALTER TABLE node_health_history /* sqlite3-skip */ - MODIFY app_version varchar(64) CHARACTER SET ascii NOT NULL DEFAULT "" - `, - ` - ALTER TABLE node_health /* sqlite3-skip */ - MODIFY app_version varchar(64) CHARACTER SET ascii NOT NULL DEFAULT "" - `, - ` - ALTER TABLE - database_instance - 
ADD COLUMN version_comment varchar(128) NOT NULL DEFAULT '' - `, - ` - ALTER TABLE active_node - ADD COLUMN first_seen_active timestamp NOT NULL DEFAULT '1971-01-01 00:00:00' - `, - ` - ALTER TABLE node_health - ADD COLUMN first_seen_active timestamp NOT NULL DEFAULT '1971-01-01 00:00:00' - `, - ` - ALTER TABLE database_instance - ADD COLUMN major_version varchar(16) CHARACTER SET ascii NOT NULL - `, - ` - ALTER TABLE - database_instance - ADD COLUMN binlog_row_image varchar(16) CHARACTER SET ascii NOT NULL - `, - ` - ALTER TABLE topology_recovery - ADD COLUMN uid varchar(128) CHARACTER SET ascii NOT NULL - `, - ` - CREATE INDEX uid_idx_topology_recovery ON topology_recovery(uid) - `, - ` - CREATE INDEX recovery_uid_idx_topology_recovery_steps ON topology_recovery_steps(recovery_uid) - `, - ` - ALTER TABLE - database_instance - ADD COLUMN last_discovery_latency bigint not null - `, - ` - CREATE INDEX end_timestamp_idx_database_instance_downtime ON database_instance_downtime(end_timestamp) - `, - ` - ALTER TABLE - topology_failure_detection - ADD COLUMN is_actionable tinyint not null default 0 - `, - ` - DROP INDEX hostname_port_active_period_uidx_topology_failure_detection ON topology_failure_detection - `, - ` - CREATE UNIQUE INDEX host_port_active_recoverable_uidx_topology_failure_detection ON topology_failure_detection (hostname, port, in_active_period, end_active_period_unixtime, is_actionable) - `, - ` - ALTER TABLE raft_snapshot - ADD COLUMN created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP - `, - ` - ALTER TABLE node_health - ADD COLUMN db_backend varchar(255) CHARACTER SET ascii NOT NULL DEFAULT "" - `, - ` - ALTER TABLE node_health - ADD COLUMN incrementing_indicator bigint not null default 0 - `, - ` - ALTER TABLE - database_instance - ADD COLUMN semi_sync_primary_enabled TINYINT UNSIGNED NOT NULL - `, - ` - ALTER TABLE - database_instance - ADD COLUMN semi_sync_replica_enabled TINYINT UNSIGNED NOT NULL - `, - ` - ALTER TABLE - database_instance - 
ADD COLUMN gtid_mode varchar(32) CHARACTER SET ascii NOT NULL - `, - ` - ALTER TABLE - database_instance - ADD COLUMN last_check_partial_success tinyint unsigned NOT NULL after last_attempted_check - `, - ` - ALTER TABLE - database_instance - ADD COLUMN source_uuid varchar(64) CHARACTER SET ascii NOT NULL AFTER oracle_gtid - `, - ` - ALTER TABLE - database_instance - ADD COLUMN gtid_errant text CHARACTER SET ascii NOT NULL AFTER gtid_purged - `, - ` - ALTER TABLE - database_instance - ADD COLUMN ancestry_uuid text CHARACTER SET ascii NOT NULL AFTER source_uuid - `, - ` - ALTER TABLE - database_instance - ADD COLUMN replication_sql_thread_state tinyint signed not null default 0 AFTER replica_io_running - `, - ` - ALTER TABLE - database_instance - ADD COLUMN replication_io_thread_state tinyint signed not null default 0 AFTER replication_sql_thread_state - `, - ` - ALTER TABLE - database_instance_tags /* sqlite3-skip */ - DROP PRIMARY KEY, - ADD PRIMARY KEY (hostname, port, tag_name) - `, - ` - ALTER TABLE - database_instance - ADD COLUMN region varchar(32) CHARACTER SET ascii NOT NULL AFTER data_center - `, - ` - ALTER TABLE - database_instance - ADD COLUMN semi_sync_primary_timeout INT UNSIGNED NOT NULL DEFAULT 0 AFTER semi_sync_primary_enabled - `, - ` - ALTER TABLE - database_instance - ADD COLUMN semi_sync_primary_wait_for_replica_count INT UNSIGNED NOT NULL DEFAULT 0 AFTER semi_sync_primary_timeout - `, - ` - ALTER TABLE - database_instance - ADD COLUMN semi_sync_primary_status TINYINT UNSIGNED NOT NULL DEFAULT 0 AFTER semi_sync_primary_wait_for_replica_count - `, - ` - ALTER TABLE - database_instance - ADD COLUMN semi_sync_replica_status TINYINT UNSIGNED NOT NULL DEFAULT 0 AFTER semi_sync_primary_status - `, - ` - ALTER TABLE - database_instance - ADD COLUMN semi_sync_primary_clients INT UNSIGNED NOT NULL DEFAULT 0 AFTER semi_sync_primary_status - `, - ` - ALTER TABLE /* sqlite3-skip */ - database_instance - MODIFY semi_sync_primary_timeout BIGINT UNSIGNED NOT 
NULL DEFAULT 0 - `, - // Fields related to Replication Group the instance belongs to - ` - ALTER TABLE - database_instance - ADD COLUMN replication_group_name VARCHAR(64) CHARACTER SET ascii NOT NULL DEFAULT '' AFTER gtid_mode - `, - ` - ALTER TABLE - database_instance - ADD COLUMN replication_group_is_single_primary_mode TINYINT UNSIGNED NOT NULL DEFAULT 1 AFTER replication_group_name - `, - ` - ALTER TABLE - database_instance - ADD COLUMN replication_group_member_state VARCHAR(16) CHARACTER SET ascii NOT NULL DEFAULT '' AFTER replication_group_is_single_primary_mode - `, - ` - ALTER TABLE - database_instance - ADD COLUMN replication_group_member_role VARCHAR(16) CHARACTER SET ascii NOT NULL DEFAULT '' AFTER replication_group_member_state - `, - ` - ALTER TABLE - database_instance - ADD COLUMN replication_group_members text CHARACTER SET ascii NOT NULL AFTER replication_group_member_role - `, - ` - ALTER TABLE - database_instance - ADD COLUMN replication_group_primary_host varchar(128) CHARACTER SET ascii NOT NULL DEFAULT '' AFTER replication_group_members - `, - ` - ALTER TABLE - database_instance - ADD COLUMN replication_group_primary_port smallint(5) unsigned NOT NULL DEFAULT 0 AFTER replication_group_primary_host - `, -} diff --git a/go/vt/vtgr/db/mock_mysql.go b/go/vt/vtgr/db/mock_mysql.go deleted file mode 100644 index a74d8359099..00000000000 --- a/go/vt/vtgr/db/mock_mysql.go +++ /dev/null @@ -1,191 +0,0 @@ -/* -Copyright 2021 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -*/ - -package db - -import ( - reflect "reflect" - "strconv" - - gomock "github.com/golang/mock/gomock" - - mysql "vitess.io/vitess/go/mysql" - inst "vitess.io/vitess/go/vt/vtgr/inst" -) - -// MockAgent is a mock of Agent interface -type MockAgent struct { - ctrl *gomock.Controller - recorder *MockAgentMockRecorder -} - -// MockAgentMockRecorder is the mock recorder for MockAgent -type MockAgentMockRecorder struct { - mock *MockAgent -} - -// NewMockAgent creates a new mock instance -func NewMockAgent(ctrl *gomock.Controller) *MockAgent { - mock := &MockAgent{ctrl: ctrl} - mock.recorder = &MockAgentMockRecorder{mock} - return mock -} - -// EXPECT returns an object that allows the caller to indicate expected use -func (m *MockAgent) EXPECT() *MockAgentMockRecorder { - return m.recorder -} - -// BootstrapGroupLocked mocks base method -func (m *MockAgent) BootstrapGroupLocked(instanceKey *inst.InstanceKey) error { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "BootstrapGroupLocked", instanceKey) - ret0, _ := ret[0].(error) - return ret0 -} - -// BootstrapGroupLocked indicates an expected call of BootstrapGroupLocked -func (mr *MockAgentMockRecorder) BootstrapGroupLocked(instanceKey any) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "BootstrapGroupLocked", reflect.TypeOf((*MockAgent)(nil).BootstrapGroupLocked), instanceKey) -} - -// RebootstrapGroupLocked mocks base method -func (m *MockAgent) RebootstrapGroupLocked(instanceKey *inst.InstanceKey, name string) error { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "RebootstrapGroupLocked", instanceKey, name) - ret0, _ := ret[0].(error) - return ret0 -} - -// RebootstrapGroupLocked indicates an expected call of RebootstrapGroupLocked -func (mr *MockAgentMockRecorder) RebootstrapGroupLocked(instanceKey, name any) *gomock.Call { - mr.mock.ctrl.T.Helper() - return 
mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RebootstrapGroupLocked", reflect.TypeOf((*MockAgent)(nil).RebootstrapGroupLocked), instanceKey, name) -} - -// StopGroupLocked mocks base method -func (m *MockAgent) StopGroupLocked(instanceKey *inst.InstanceKey) error { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "StopGroupLocked", instanceKey) - ret0, _ := ret[0].(error) - return ret0 -} - -// StopGroupLocked indicates an expected call of StopGroupLocked -func (mr *MockAgentMockRecorder) StopGroupLocked(instanceKey any) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "StopGroupLocked", reflect.TypeOf((*MockAgent)(nil).StopGroupLocked), instanceKey) -} - -// JoinGroupLocked mocks base method -func (m *MockAgent) JoinGroupLocked(instanceKey, primaryKey *inst.InstanceKey) error { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "JoinGroupLocked", instanceKey, primaryKey) - ret0, _ := ret[0].(error) - return ret0 -} - -// JoinGroupLocked indicates an expected call of JoinGroupLocked -func (mr *MockAgentMockRecorder) JoinGroupLocked(instanceKey, primaryKey any) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "JoinGroupLocked", reflect.TypeOf((*MockAgent)(nil).JoinGroupLocked), instanceKey, primaryKey) -} - -// SetReadOnly mocks base method -func (m *MockAgent) SetReadOnly(instanceKey *inst.InstanceKey, readOnly bool) error { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "SetReadOnly", instanceKey, readOnly) - ret0, _ := ret[0].(error) - return ret0 -} - -// SetReadOnly indicates an expected call of SetReadOnly -func (mr *MockAgentMockRecorder) SetReadOnly(instanceKey, readOnly any) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetReadOnly", reflect.TypeOf((*MockAgent)(nil).SetReadOnly), instanceKey, readOnly) -} - -// FetchApplierGTIDSet mocks base method -func (m *MockAgent) FetchApplierGTIDSet(instanceKey *inst.InstanceKey) 
(mysql.GTIDSet, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "FetchApplierGTIDSet", instanceKey) - ret0, _ := ret[0].(mysql.GTIDSet) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// FetchApplierGTIDSet indicates an expected call of FetchApplierGTIDSet -func (mr *MockAgentMockRecorder) FetchApplierGTIDSet(instanceKey any) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FetchApplierGTIDSet", reflect.TypeOf((*MockAgent)(nil).FetchApplierGTIDSet), instanceKey) -} - -// Failover mocks base method -func (m *MockAgent) Failover(instance *inst.InstanceKey) error { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "Failover", instance) - ret0, _ := ret[0].(error) - return ret0 -} - -// Failover indicates an expected call of Failover -func (mr *MockAgentMockRecorder) Failover(instance any) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Failover", reflect.TypeOf((*MockAgent)(nil).Failover), instance) -} - -// FetchGroupView mocks base method -func (m *MockAgent) FetchGroupView(alias string, instanceKey *inst.InstanceKey) (*GroupView, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "FetchGroupView", alias, instanceKey) - ret0, _ := ret[0].(*GroupView) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// FetchGroupView indicates an expected call of FetchGroupView -func (mr *MockAgentMockRecorder) FetchGroupView(alias, instanceKey any) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FetchGroupView", reflect.TypeOf((*MockAgent)(nil).FetchGroupView), alias, instanceKey) -} - -// TestGroupState mocks a row from mysql -type TestGroupState struct { - MemberHost, MemberPort, MemberState, MemberRole string -} - -// BuildGroupView builds gruop view from input -func BuildGroupView(alias, groupName, host string, port int, readOnly bool, stalenessResult int, inputs []TestGroupState) *GroupView { - view := 
NewGroupView(alias, host, port) - view.GroupName = groupName - // group_name, member_host, member_port, member_state, member_role, is_local - for _, row := range inputs { - memberPort, _ := strconv.Atoi(row.MemberPort) - member := NewGroupMember( - row.MemberState, - row.MemberRole, - row.MemberHost, - memberPort, - false) - if host == row.MemberHost && port == memberPort { - member.ReadOnly = readOnly - } - view.UnresolvedMembers = append(view.UnresolvedMembers, member) - view.HeartbeatStaleness = stalenessResult - } - return view -} diff --git a/go/vt/vtgr/db/mysql.go b/go/vt/vtgr/db/mysql.go deleted file mode 100644 index 8c3787c9187..00000000000 --- a/go/vt/vtgr/db/mysql.go +++ /dev/null @@ -1,590 +0,0 @@ -/* -Copyright 2021 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package db - -import ( - "errors" - "fmt" - "math" - "strconv" - "strings" - - gouuid "github.com/google/uuid" - "github.com/spf13/pflag" - - "vitess.io/vitess/go/vt/external/golib/sqlutils" - - "vitess.io/vitess/go/mysql" - "vitess.io/vitess/go/vt/log" - "vitess.io/vitess/go/vt/servenv" - "vitess.io/vitess/go/vt/vtgr/config" - "vitess.io/vitess/go/vt/vtgr/inst" -) - -var ( - configFilePath string - dbFlavor = "MySQL56" - mysqlGroupPort = 33061 - enableHeartbeatCheck bool - - // ErrGroupSplitBrain is the error when mysql group is split-brain - ErrGroupSplitBrain = errors.New("group has split brain") - // ErrGroupBackoffError is either the transient error or network partition from the group - ErrGroupBackoffError = errors.New("group backoff error") - // ErrGroupOngoingBootstrap is the error when a bootstrap is in progress - ErrGroupOngoingBootstrap = errors.New("group ongoing bootstrap") - // ErrGroupInactive is the error when mysql group is inactive unexpectedly - ErrGroupInactive = errors.New("group is inactive") - // ErrInvalidInstance is the error when the instance key has empty hostname - ErrInvalidInstance = errors.New("invalid mysql instance key") -) - -func init() { - servenv.OnParseFor("vtgr", func(fs *pflag.FlagSet) { - fs.StringVar(&configFilePath, "db_config", "", "Full path to db config file that will be used by VTGR.") - fs.StringVar(&dbFlavor, "db_flavor", "MySQL56", "MySQL flavor override.") - fs.IntVar(&mysqlGroupPort, "gr_port", 33061, "Port to bootstrap a MySQL group.") - fs.BoolVar(&enableHeartbeatCheck, "enable_heartbeat_check", false, "Enable heartbeat checking, set together with --group_heartbeat_threshold.") - }) -} - -// Agent is used by vtgr to interact with Mysql -type Agent interface { - // BootstrapGroupLocked bootstraps a mysql group - // the caller should grab a lock before - BootstrapGroupLocked(instanceKey *inst.InstanceKey) error - - // RebootstrapGroupLocked rebootstrap a group with an existing name - 
RebootstrapGroupLocked(instanceKey *inst.InstanceKey, name string) error - - // StopGroupLocked stops a mysql group - StopGroupLocked(instanceKey *inst.InstanceKey) error - - // JoinGroupLocked puts an instance into a mysql group based on primary instance - // the caller should grab a lock before - JoinGroupLocked(instanceKey *inst.InstanceKey, primaryKey *inst.InstanceKey) error - - // SetReadOnly set super_read_only variable - // https://dev.mysql.com/doc/refman/8.0/en/server-system-variables.html#sysvar_super_read_only - SetReadOnly(instanceKey *inst.InstanceKey, readOnly bool) error - - // FetchApplierGTIDSet fetches the GTID set from group_replication_applier channel - FetchApplierGTIDSet(instanceKey *inst.InstanceKey) (mysql.GTIDSet, error) - - // Failover move the mysql primary to the node defined by memberUUID - Failover(instance *inst.InstanceKey) error - - // FetchGroupView fetches group related information - FetchGroupView(alias string, instanceKey *inst.InstanceKey) (*GroupView, error) -} - -// MemberState is member state -type MemberState int - -// MemberRole is member role -type MemberRole int - -const ( - UNKNOWNSTATE MemberState = iota - OFFLINE - UNREACHABLE - RECOVERING - ONLINE - ERROR -) - -const ( - UNKNOWNROLE MemberRole = iota - SECONDARY - PRIMARY -) - -// GroupMember represents a ROW we get from performance_schema -type GroupMember struct { - HostName string - Port int - Role MemberRole - State MemberState - ReadOnly bool -} - -// GroupView is an instance's view for the group -type GroupView struct { - TabletAlias string - MySQLHost string - MySQLPort int - GroupName string - HeartbeatStaleness int - UnresolvedMembers []*GroupMember -} - -// SQLAgentImpl implements Agent -type SQLAgentImpl struct { - config *config.Configuration - dbFlavor string - enableHeartbeat bool -} - -// NewGroupView creates a new GroupView -func NewGroupView(alias, host string, port int) *GroupView { - return &GroupView{TabletAlias: alias, MySQLHost: host, 
MySQLPort: port} -} - -// NewGroupMember creates a new GroupMember -func NewGroupMember(state, role, host string, port int, readonly bool) *GroupMember { - return &GroupMember{ - State: toMemberState(state), - Role: toMemberRole(role), - HostName: host, - Port: port, - ReadOnly: readonly, - } -} - -// NewVTGRSqlAgent creates a SQLAgentImpl -func NewVTGRSqlAgent() *SQLAgentImpl { - var conf *config.Configuration - if (configFilePath) != "" { - log.Infof("use config from %v", configFilePath) - conf = config.ForceRead(configFilePath) - } else { - log.Warningf("use default config") - conf = config.Config - } - agent := &SQLAgentImpl{ - config: conf, - dbFlavor: dbFlavor, - enableHeartbeat: enableHeartbeatCheck, - } - return agent -} - -// BootstrapGroupLocked implements Agent interface -func (agent *SQLAgentImpl) BootstrapGroupLocked(instanceKey *inst.InstanceKey) error { - if instanceKey == nil { - return errors.New("nil instance key for bootstrap") - } - // Before bootstrap a group, double check locally there is really nothing running locally - uuid, state, err := agent.getGroupNameAndMemberState(instanceKey) - if err != nil { - return err - } - if state != "" && state != inst.GroupReplicationMemberStateOffline { - return fmt.Errorf("%v not OFFLINE mode %v [group_name=%v]", instanceKey.Hostname, state, uuid) - } - // If there is a group name stored locally, we should try to reuse it - // for port, we will override with a new one - if uuid == "" { - uuid = gouuid.New().String() - log.Infof("Try to bootstrap with a new uuid") - } - log.Infof("Bootstrap group on %v with %v", instanceKey.Hostname, uuid) - return agent.bootstrapInternal(instanceKey, uuid) -} - -func (agent *SQLAgentImpl) RebootstrapGroupLocked(instanceKey *inst.InstanceKey, name string) error { - log.Infof("Rebootstrapping group on %v with %v", instanceKey.Hostname, name) - return agent.bootstrapInternal(instanceKey, name) -} - -func (agent *SQLAgentImpl) bootstrapInternal(instanceKey *inst.InstanceKey, 
uuid string) error { - // Use persist to set group_replication_group_name - // so that the instance will persist the name after restart - cmds := []string{ - "set global offline_mode=0", - fmt.Sprintf("set @@persist.group_replication_group_name=\"%s\"", uuid), - fmt.Sprintf("set global group_replication_local_address=\"%s:%d\"", instanceKey.Hostname, mysqlGroupPort), - fmt.Sprintf("set global group_replication_group_seeds=\"%s:%d\"", instanceKey.Hostname, mysqlGroupPort), - "set global group_replication_bootstrap_group=ON", - fmt.Sprintf("start group_replication user='%s', password='%s'", agent.config.MySQLReplicaUser, agent.config.MySQLReplicaPassword), - "set global group_replication_bootstrap_group=OFF", - } - for _, cmd := range cmds { - if err := execInstanceWithTopo(instanceKey, cmd); err != nil { - log.Errorf("Failed to execute: %v: %v", cmd, err) - return err - } - } - return nil -} - -// StopGroupLocked implements Agent interface -func (agent *SQLAgentImpl) StopGroupLocked(instanceKey *inst.InstanceKey) error { - cmd := "stop group_replication" - return execInstanceWithTopo(instanceKey, cmd) -} - -// SetReadOnly implements Agent interface -func (agent *SQLAgentImpl) SetReadOnly(instanceKey *inst.InstanceKey, readOnly bool) error { - // Setting super_read_only ON implicitly forces read_only ON - // Setting read_only OFF implicitly forces super_read_only OFF - // https://www.perconaicom/blog/2016/09/27/using-the-super_read_only-system-variable/ - if readOnly { - return execInstance(instanceKey, "set @@global.super_read_only=1") - } - return execInstance(instanceKey, "set @@global.read_only=0") -} - -// JoinGroupLocked implements Agent interface -// Note: caller should grab the lock before calling this -func (agent *SQLAgentImpl) JoinGroupLocked(instanceKey *inst.InstanceKey, primaryInstanceKey *inst.InstanceKey) error { - var numExistingMembers int - var uuid string - query := `select count(*) as count, @@group_replication_group_name as group_name - from 
performance_schema.replication_group_members where member_state='ONLINE'` - err := fetchInstance(primaryInstanceKey, query, func(m sqlutils.RowMap) error { - numExistingMembers = m.GetInt("count") - uuid = m.GetString("group_name") - return nil - }) - if err != nil { - return err - } - if numExistingMembers == 0 { - return fmt.Errorf("there is no group members found on %v:%v", primaryInstanceKey.Hostname, primaryInstanceKey.Port) - } - // The queries above are executed on the primary instance - // now let's do one more check with local information to make sure it's OK to join the primary - localGroup, state, err := agent.getGroupNameAndMemberState(instanceKey) - if err != nil { - return err - } - if localGroup != "" && localGroup != uuid { - return fmt.Errorf("%v has a different group name (%v) than primary %v (%v)", instanceKey.Hostname, localGroup, primaryInstanceKey.Hostname, uuid) - } - if state == inst.GroupReplicationMemberStateOnline || state == inst.GroupReplicationMemberStateRecovering { - return fmt.Errorf("%v [%v] is alredy in a group %v", instanceKey.Hostname, state, localGroup) - } - var primaryGrPort int - query = `select @@group_replication_local_address as address` - err = fetchInstance(primaryInstanceKey, query, func(m sqlutils.RowMap) error { - address := m.GetString("address") - arr := strings.Split(address, ":") - primaryGrPort, err = strconv.Atoi(arr[1]) - if err != nil { - log.Errorf("Failed to parse primary GR port: %v", err) - return err - } - return nil - }) - if primaryGrPort == 0 { - return fmt.Errorf("cannot find group replication port on %v", primaryInstanceKey.Hostname) - } - // Now it's safe to join the group - cmds := []string{ - "set global offline_mode=0", - fmt.Sprintf("set @@persist.group_replication_group_name=\"%s\"", uuid), - fmt.Sprintf("set global group_replication_group_seeds=\"%s:%d\"", primaryInstanceKey.Hostname, primaryGrPort), - fmt.Sprintf("set global group_replication_local_address=\"%s:%d\"", instanceKey.Hostname, 
mysqlGroupPort), - fmt.Sprintf("start group_replication user='%s', password='%s'", agent.config.MySQLReplicaUser, agent.config.MySQLReplicaPassword), - } - for _, cmd := range cmds { - if err := execInstanceWithTopo(instanceKey, cmd); err != nil { - return err - } - } - return nil -} - -// Failover implements Agent interface -func (agent *SQLAgentImpl) Failover(instance *inst.InstanceKey) error { - var memberUUID string - query := `select member_id - from performance_schema.replication_group_members - where member_host=convert(@@hostname using ascii) and member_port=@@port and member_state='ONLINE'` - err := fetchInstance(instance, query, func(m sqlutils.RowMap) error { - memberUUID = m.GetString("member_id") - if memberUUID == "" { - return fmt.Errorf("unable to find member_id on %v", instance.Hostname) - } - return nil - }) - if err != nil { - return err - } - cmd := fmt.Sprintf(`select group_replication_set_as_primary('%s')`, memberUUID) - if err := execInstance(instance, cmd); err != nil { - return err - } - return nil -} - -// heartbeatCheck returns heartbeat check freshness result -func (agent *SQLAgentImpl) heartbeatCheck(instanceKey *inst.InstanceKey) (int, error) { - query := `select timestampdiff(SECOND, from_unixtime(truncate(ts * 0.000000001, 0)), NOW()) as diff from _vt.heartbeat;` - var result int - err := fetchInstance(instanceKey, query, func(m sqlutils.RowMap) error { - result = m.GetInt("diff") - return nil - }) - return result, err -} - -// FetchGroupView implements Agent interface -func (agent *SQLAgentImpl) FetchGroupView(alias string, instanceKey *inst.InstanceKey) (*GroupView, error) { - view := NewGroupView(alias, instanceKey.Hostname, instanceKey.Port) - var groupName string - var isReadOnly bool - query := `select - @@group_replication_group_name as group_name, - @@super_read_only as read_only, - member_host, member_port, member_state, member_role - from performance_schema.replication_group_members` - err := fetchInstance(instanceKey, 
query, func(m sqlutils.RowMap) error { - if groupName == "" { - groupName = m.GetString("group_name") - } - host := m.GetString("member_host") - port := m.GetInt("member_port") - isReadOnly = m.GetBool("read_only") - unresolvedMember := NewGroupMember( - m.GetString("member_state"), - m.GetString("member_role"), - host, - port, - false) - // readOnly is used to re-enable write after we set primary to read_only to protect the shard when there is - // less than desired number of nodes - // the default value is false because if the node is reachable and read_only, it will get override by the OR op - // if the host is unreachable, we don't need to trigger the protection for it therefore assume the it's writable - if host == instanceKey.Hostname && port == instanceKey.Port && isReadOnly { - unresolvedMember.ReadOnly = true - } - view.UnresolvedMembers = append(view.UnresolvedMembers, unresolvedMember) - return nil - }) - view.GroupName = groupName - if err != nil { - return nil, err - } - view.HeartbeatStaleness = math.MaxInt32 - if agent.enableHeartbeat { - heartbeatStaleness, err := agent.heartbeatCheck(instanceKey) - if err != nil { - // We can run into Error 1146: Table '_vt.heartbeat' doesn't exist on new provisioned shard: - // vtgr is checking heartbeat table - // -> heartbeat table is waiting primary tablet - // -> primary tablet needs vtgr. 
- // - // Therefore if we run into error, HeartbeatStaleness will - // remain to be max int32, which is 2147483647 sec - log.Errorf("Failed to check heartbeatCheck: %v", err) - } else { - view.HeartbeatStaleness = heartbeatStaleness - } - } - return view, nil -} - -// GetPrimaryView returns the view of primary member -func (view *GroupView) GetPrimaryView() (string, int, bool) { - for _, member := range view.UnresolvedMembers { - if member.Role == PRIMARY { - return member.HostName, member.Port, member.State == ONLINE - } - } - return "", 0, false -} - -func (agent *SQLAgentImpl) getGroupNameAndMemberState(instanceKey *inst.InstanceKey) (string, string, error) { - // If there is an instance that is unreachable but we still have quorum, GR will remove it from - // the replication_group_members and Failover if it is the primary node - // If the state becomes UNREACHABLE it indicates there is a network partition inside the group - // https://dev.mysql.com/doc/refman/8.0/en/group-replication-network-partitioning.html - // And then eventually if the node does not recover, the group will transit into ERROR state - // VTGR cannot handle this case, therefore we raise error here - var name, state string - query := `select @@group_replication_group_name as group_name` - err := fetchInstance(instanceKey, query, func(m sqlutils.RowMap) error { - name = m.GetString("group_name") - return nil - }) - if err != nil { - return "", "", err - } - query = `select member_state - from performance_schema.replication_group_members - where member_host=convert(@@hostname using ascii) and member_port=@@port` - err = fetchInstance(instanceKey, query, func(m sqlutils.RowMap) error { - state = m.GetString("member_state") - if state == "" { - state = inst.GroupReplicationMemberStateOffline - } - return nil - }) - if err != nil { - return "", "", err - } - return name, state, nil -} - -// FetchApplierGTIDSet implements Agent interface -func (agent *SQLAgentImpl) FetchApplierGTIDSet(instanceKey 
*inst.InstanceKey) (mysql.GTIDSet, error) { - var gtidSet string - // TODO: should we also take group_replication_recovery as well? - query := `select gtid_subtract(concat(received_transaction_set, ',', @@global.gtid_executed), '') as gtid_set - from performance_schema.replication_connection_status - where channel_name='group_replication_applier'` - err := fetchInstance(instanceKey, query, func(m sqlutils.RowMap) error { - // If the instance has no committed transaction, gtidSet will be empty string - gtidSet = m.GetString("gtid_set") - return nil - }) - if err != nil { - return nil, err - } - pos, err := mysql.ParsePosition(agent.dbFlavor, gtidSet) - if err != nil { - return nil, err - } - return pos.GTIDSet, nil -} - -// execInstance executes a given query on the given MySQL discovery instance -func execInstance(instanceKey *inst.InstanceKey, query string, args ...any) error { - if err := verifyInstance(instanceKey); err != nil { - return err - } - sqlDb, err := OpenDiscovery(instanceKey.Hostname, instanceKey.Port) - if err != nil { - log.Errorf("error exec %v: %v", query, err) - return err - } - _, err = sqlutils.ExecNoPrepare(sqlDb, query, args...) - return err -} - -// execInstanceWithTopo executes a given query on the given MySQL topology instance -func execInstanceWithTopo(instanceKey *inst.InstanceKey, query string, args ...any) error { - if err := verifyInstance(instanceKey); err != nil { - return err - } - sqlDb, err := OpenTopology(instanceKey.Hostname, instanceKey.Port) - if err != nil { - log.Errorf("error exec %v: %v", query, err) - return err - } - _, err = sqlutils.ExecNoPrepare(sqlDb, query, args...) 
- return err -} - -// fetchInstance fetches result from mysql -func fetchInstance(instanceKey *inst.InstanceKey, query string, onRow func(sqlutils.RowMap) error) error { - if err := verifyInstance(instanceKey); err != nil { - return err - } - sqlDb, err := OpenDiscovery(instanceKey.Hostname, instanceKey.Port) - if err != nil { - return err - } - return sqlutils.QueryRowsMap(sqlDb, query, onRow) -} - -// The hostname and port can be empty if a tablet crashed and did not populate them in -// the topo server. We treat them as if the host is unreachable when we calculate the -// quorum for the shard. -func verifyInstance(instanceKey *inst.InstanceKey) error { - if instanceKey.Hostname == "" || instanceKey.Port == 0 { - return ErrInvalidInstance - } - return nil -} - -// CreateInstanceKey returns an InstanceKey based on group member input -// When the group is init for the first time, the hostname and port are not set, e.g., -// +---------------------------+-----------+-------------+-------------+--------------+-------------+ -// | CHANNEL_NAME | MEMBER_ID | MEMBER_HOST | MEMBER_PORT | MEMBER_STATE | MEMBER_ROLE | -// +---------------------------+-----------+-------------+-------------+--------------+-------------+ -// | group_replication_applier | | | NULL | OFFLINE | | -// +---------------------------+-----------+-------------+-------------+--------------+-------------+ -// therefore we substitute with view's local hostname and port -func (view *GroupView) CreateInstanceKey(member *GroupMember) inst.InstanceKey { - if member.HostName == "" && member.Port == 0 { - return inst.InstanceKey{ - Hostname: view.MySQLHost, - Port: view.MySQLPort, - } - } - return inst.InstanceKey{ - Hostname: member.HostName, - Port: member.Port, - } -} - -// ToString make string for group view -func (view *GroupView) ToString() string { - var sb strings.Builder - sb.WriteString(fmt.Sprintf("group_name:%v\n", view.GroupName)) - for _, m := range view.UnresolvedMembers { - 
sb.WriteString(fmt.Sprintf("host:%v:%v | role:%v | state:%v\n", m.HostName, m.Port, m.Role, m.State)) - } - return sb.String() -} - -func (state MemberState) String() string { - switch state { - case ONLINE: - return inst.GroupReplicationMemberStateOnline - case ERROR: - return inst.GroupReplicationMemberStateError - case RECOVERING: - return inst.GroupReplicationMemberStateRecovering - case OFFLINE: - return inst.GroupReplicationMemberStateOffline - case UNREACHABLE: - return inst.GroupReplicationMemberStateUnreachable - } - return "UNKNOWN" -} - -func toMemberState(state string) MemberState { - switch state { - case inst.GroupReplicationMemberStateOnline: - return ONLINE - case inst.GroupReplicationMemberStateError: - return ERROR - case inst.GroupReplicationMemberStateRecovering: - return RECOVERING - case inst.GroupReplicationMemberStateOffline: - return OFFLINE - case inst.GroupReplicationMemberStateUnreachable: - return UNREACHABLE - default: - return UNKNOWNSTATE - } -} - -func (role MemberRole) String() string { - switch role { - case PRIMARY: - return inst.GroupReplicationMemberRolePrimary - case SECONDARY: - return inst.GroupReplicationMemberRoleSecondary - } - return "UNKNOWN" -} - -func toMemberRole(role string) MemberRole { - switch role { - case inst.GroupReplicationMemberRolePrimary: - return PRIMARY - case inst.GroupReplicationMemberRoleSecondary: - return SECONDARY - default: - return UNKNOWNROLE - } -} diff --git a/go/vt/vtgr/db/tls.go b/go/vt/vtgr/db/tls.go deleted file mode 100644 index 514e3d49df3..00000000000 --- a/go/vt/vtgr/db/tls.go +++ /dev/null @@ -1,152 +0,0 @@ -/* - Copyright 2014 Outbrain Inc. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -/* - This file has been copied over from VTOrc package -*/ - -package db - -import ( - "fmt" - "strings" - "time" - - "vitess.io/vitess/go/vt/external/golib/sqlutils" - "vitess.io/vitess/go/vt/log" - - "github.com/go-sql-driver/mysql" - "github.com/patrickmn/go-cache" - "github.com/rcrowley/go-metrics" - - "vitess.io/vitess/go/vt/vtgr/config" - "vitess.io/vitess/go/vt/vtgr/ssl" -) - -const Error3159 = "Error 3159:" -const Error1045 = "Access denied for user" - -// Track if a TLS has already been configured for topology -var topologyTLSConfigured = false - -// Track if a TLS has already been configured for Orchestrator -var orchestratorTLSConfigured = false - -var requireTLSCache *cache.Cache = cache.New(time.Duration(config.Config.TLSCacheTTLFactor*config.Config.InstancePollSeconds)*time.Second, time.Second) - -var readInstanceTLSCounter = metrics.NewCounter() -var writeInstanceTLSCounter = metrics.NewCounter() -var readInstanceTLSCacheCounter = metrics.NewCounter() -var writeInstanceTLSCacheCounter = metrics.NewCounter() - -func init() { - metrics.Register("instance_tls.read", readInstanceTLSCounter) - metrics.Register("instance_tls.write", writeInstanceTLSCounter) - metrics.Register("instance_tls.read_cache", readInstanceTLSCacheCounter) - metrics.Register("instance_tls.write_cache", writeInstanceTLSCacheCounter) -} - -func requiresTLS(host string, port int, uri string) bool { - cacheKey := fmt.Sprintf("%s:%d", host, port) - - if value, found := requireTLSCache.Get(cacheKey); found { - readInstanceTLSCacheCounter.Inc(1) - return value.(bool) - } - - 
required := false - db, _, _ := sqlutils.GetDB(uri) - if err := db.Ping(); err != nil && (strings.Contains(err.Error(), Error3159) || strings.Contains(err.Error(), Error1045)) { - required = true - } - - query := ` - insert into - database_instance_tls ( - hostname, port, required - ) values ( - ?, ?, ? - ) - on duplicate key update - required=values(required) - ` - if _, err := ExecOrchestrator(query, host, port, required); err != nil { - log.Error(err) - } - writeInstanceTLSCounter.Inc(1) - - requireTLSCache.Set(cacheKey, required, cache.DefaultExpiration) - writeInstanceTLSCacheCounter.Inc(1) - - return required -} - -// SetupMySQLTopologyTLS creates a TLS configuration from the config supplied CA, Certificate, and Private key. -// Register the TLS config with the mysql drivers as the "topology" config -// Modify the supplied URI to call the TLS config -func SetupMySQLTopologyTLS(uri string) (string, error) { - if !topologyTLSConfigured { - tlsConfig, err := ssl.NewTLSConfig(config.Config.MySQLTopologySSLCAFile, !config.Config.MySQLTopologySSLSkipVerify, config.Config.MySQLTopologyTLSMinVersionNumber()) - if err != nil { - log.Errorf("Can't create TLS configuration for Topology connection %s: %s", uri, err) - return "", err - } - tlsConfig.InsecureSkipVerify = config.Config.MySQLTopologySSLSkipVerify - - if (config.Config.MySQLTopologyUseMutualTLS && !config.Config.MySQLTopologySSLSkipVerify) && - config.Config.MySQLTopologySSLCertFile != "" && - config.Config.MySQLTopologySSLPrivateKeyFile != "" { - if err = ssl.AppendKeyPair(tlsConfig, config.Config.MySQLTopologySSLCertFile, config.Config.MySQLTopologySSLPrivateKeyFile); err != nil { - log.Errorf("Can't setup TLS key pairs for %s: %s", uri, err) - return "", err - } - } - if err = mysql.RegisterTLSConfig("topology", tlsConfig); err != nil { - log.Errorf("Can't register mysql TLS config for topology: %s", err) - return "", err - } - topologyTLSConfigured = true - } - return fmt.Sprintf("%s&tls=topology", uri), 
nil -} - -// SetupMySQLOrchestratorTLS creates a TLS configuration from the config supplied CA, Certificate, and Private key. -// Register the TLS config with the mysql drivers as the "orchestrator" config -// Modify the supplied URI to call the TLS config -func SetupMySQLOrchestratorTLS(uri string) (string, error) { - if !orchestratorTLSConfigured { - tlsConfig, err := ssl.NewTLSConfig(config.Config.MySQLOrchestratorSSLCAFile, !config.Config.MySQLOrchestratorSSLSkipVerify, config.Config.MySQLOrchestratorTLSMinVersionNumber()) - if err != nil { - log.Fatalf("Can't create TLS configuration for Orchestrator connection %s: %s", uri, err) - return "", err - } - tlsConfig.InsecureSkipVerify = config.Config.MySQLOrchestratorSSLSkipVerify - if (!config.Config.MySQLOrchestratorSSLSkipVerify) && - config.Config.MySQLOrchestratorSSLCertFile != "" && - config.Config.MySQLOrchestratorSSLPrivateKeyFile != "" { - if err = ssl.AppendKeyPair(tlsConfig, config.Config.MySQLOrchestratorSSLCertFile, config.Config.MySQLOrchestratorSSLPrivateKeyFile); err != nil { - log.Fatalf("Can't setup TLS key pairs for %s: %s", uri, err) - return "", err - } - } - if err = mysql.RegisterTLSConfig("orchestrator", tlsConfig); err != nil { - log.Fatalf("Can't register mysql TLS config for orchestrator: %s", err) - return "", err - } - orchestratorTLSConfigured = true - } - return fmt.Sprintf("%s&tls=orchestrator", uri), nil -} diff --git a/go/vt/vtgr/inst/instance_key.go b/go/vt/vtgr/inst/instance_key.go deleted file mode 100644 index cd3039537b3..00000000000 --- a/go/vt/vtgr/inst/instance_key.go +++ /dev/null @@ -1,125 +0,0 @@ -/* - Copyright 2015 Shlomi Noach, courtesy Booking.com - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -/* - This file has been copied over from VTOrc package -*/ - -package inst - -import ( - "fmt" - "regexp" - "strings" -) - -// InstanceKey is an instance indicator, identifued by hostname and port -type InstanceKey struct { - Hostname string - Port int -} - -var ( - ipv4Regexp = regexp.MustCompile(`^([0-9]+)[.]([0-9]+)[.]([0-9]+)[.]([0-9]+)$`) -) - -const detachHint = "//" - -// Constant strings for Group Replication information -// See https://dev.mysql.com/doc/refman/8.0/en/replication-group-members-table.html for additional information. -const ( - // Group member roles - GroupReplicationMemberRolePrimary = "PRIMARY" - GroupReplicationMemberRoleSecondary = "SECONDARY" - // Group member states - GroupReplicationMemberStateOnline = "ONLINE" - GroupReplicationMemberStateRecovering = "RECOVERING" - GroupReplicationMemberStateUnreachable = "UNREACHABLE" - GroupReplicationMemberStateOffline = "OFFLINE" - GroupReplicationMemberStateError = "ERROR" -) - -// Equals tests equality between this key and another key -func (instanceKey *InstanceKey) Equals(other *InstanceKey) bool { - if other == nil { - return false - } - return instanceKey.Hostname == other.Hostname && instanceKey.Port == other.Port -} - -// SmallerThan returns true if this key is dictionary-smaller than another. -// This is used for consistent sorting/ordering; there's nothing magical about it. 
-func (instanceKey *InstanceKey) SmallerThan(other *InstanceKey) bool { - if instanceKey.Hostname < other.Hostname { - return true - } - if instanceKey.Hostname == other.Hostname && instanceKey.Port < other.Port { - return true - } - return false -} - -// IsDetached returns 'true' when this hostname is logically "detached" -func (instanceKey *InstanceKey) IsDetached() bool { - return strings.HasPrefix(instanceKey.Hostname, detachHint) -} - -// IsValid uses simple heuristics to see whether this key represents an actual instance -func (instanceKey *InstanceKey) IsValid() bool { - if instanceKey.Hostname == "_" { - return false - } - if instanceKey.IsDetached() { - return false - } - return len(instanceKey.Hostname) > 0 && instanceKey.Port > 0 -} - -// DetachedKey returns an instance key whose hostname is detahced: invalid, but recoverable -func (instanceKey *InstanceKey) DetachedKey() *InstanceKey { - if instanceKey.IsDetached() { - return instanceKey - } - return &InstanceKey{Hostname: fmt.Sprintf("%s%s", detachHint, instanceKey.Hostname), Port: instanceKey.Port} -} - -// ReattachedKey returns an instance key whose hostname is detahced: invalid, but recoverable -func (instanceKey *InstanceKey) ReattachedKey() *InstanceKey { - if !instanceKey.IsDetached() { - return instanceKey - } - return &InstanceKey{Hostname: instanceKey.Hostname[len(detachHint):], Port: instanceKey.Port} -} - -// StringCode returns an official string representation of this key -func (instanceKey *InstanceKey) StringCode() string { - return fmt.Sprintf("%s:%d", instanceKey.Hostname, instanceKey.Port) -} - -// DisplayString returns a user-friendly string representation of this key -func (instanceKey *InstanceKey) DisplayString() string { - return instanceKey.StringCode() -} - -// String returns a user-friendly string representation of this key -func (instanceKey InstanceKey) String() string { - return instanceKey.StringCode() -} - -// IsValid uses simple heuristics to see whether this key 
represents an actual instance -func (instanceKey *InstanceKey) IsIPv4() bool { - return ipv4Regexp.MatchString(instanceKey.Hostname) -} diff --git a/go/vt/vtgr/inst/instance_key_test.go b/go/vt/vtgr/inst/instance_key_test.go deleted file mode 100644 index e3e016e474c..00000000000 --- a/go/vt/vtgr/inst/instance_key_test.go +++ /dev/null @@ -1,67 +0,0 @@ -/* - Copyright 2014 Outbrain Inc. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -/* - This file has been copied over from VTOrc package -*/ - -package inst - -import ( - "testing" - - "github.com/stretchr/testify/require" - - "vitess.io/vitess/go/vt/vtgr/config" -) - -func init() { - config.Config.HostnameResolveMethod = "none" -} - -var key1 = InstanceKey{Hostname: "host1", Port: 3306} - -func TestInstanceKeyEquals(t *testing.T) { - i1 := InstanceKey{ - Hostname: "sql00.db", - Port: 3306, - } - i2 := InstanceKey{ - Hostname: "sql00.db", - Port: 3306, - } - - require.Equal(t, i1, i2) - - i2.Port = 3307 - require.NotEqual(t, i1, i2) -} - -func TestInstanceKeyDetach(t *testing.T) { - require.False(t, key1.IsDetached()) - detached1 := key1.DetachedKey() - require.True(t, detached1.IsDetached()) - detached2 := key1.DetachedKey() - require.True(t, detached2.IsDetached()) - require.True(t, detached1.Equals(detached2)) - - reattached1 := detached1.ReattachedKey() - require.False(t, reattached1.IsDetached()) - require.True(t, reattached1.Equals(&key1)) - reattached2 := reattached1.ReattachedKey() - require.False(t, 
reattached2.IsDetached()) - require.True(t, reattached1.Equals(reattached2)) -} diff --git a/go/vt/vtgr/log/log.go b/go/vt/vtgr/log/log.go deleted file mode 100644 index 4133bbb39a1..00000000000 --- a/go/vt/vtgr/log/log.go +++ /dev/null @@ -1,53 +0,0 @@ -package log - -import ( - "fmt" - - "vitess.io/vitess/go/vt/log" -) - -// Logger is a wrapper that prefix loglines with keyspace/shard -type Logger struct { - prefix string -} - -// NewVTGRLogger creates a new logger -func NewVTGRLogger(keyspace, shard string) *Logger { - return &Logger{ - prefix: fmt.Sprintf("%s/%s", keyspace, shard), - } -} - -// Info formats arguments like fmt.Print -func (logger *Logger) Info(msg string) { - log.InfoDepth(1, logger.annotate(msg)) -} - -// Infof formats arguments like fmt.Printf. -func (logger *Logger) Infof(format string, args ...any) { - log.InfoDepth(1, logger.annotate(fmt.Sprintf(format, args...))) -} - -// Warning formats arguments like fmt.Print -func (logger *Logger) Warning(msg string) { - log.WarningDepth(1, logger.annotate(msg)) -} - -// Warningf formats arguments like fmt.Printf. -func (logger *Logger) Warningf(format string, args ...any) { - log.WarningDepth(1, logger.annotate(fmt.Sprintf(format, args...))) -} - -// Error formats arguments like fmt.Print -func (logger *Logger) Error(msg string) { - log.ErrorDepth(1, logger.annotate(msg)) -} - -// Errorf formats arguments like fmt.Printf. 
-func (logger *Logger) Errorf(format string, args ...any) { - log.ErrorDepth(1, logger.annotate(fmt.Sprintf(format, args...))) -} - -func (logger *Logger) annotate(input string) string { - return fmt.Sprintf("shard=%s %s", logger.prefix, input) -} diff --git a/go/vt/vtgr/log/log_test.go b/go/vt/vtgr/log/log_test.go deleted file mode 100644 index fd4ede386e9..00000000000 --- a/go/vt/vtgr/log/log_test.go +++ /dev/null @@ -1,16 +0,0 @@ -package log - -import ( - "fmt" - "testing" - - "github.com/stretchr/testify/assert" -) - -func TestVTGRLogger(t *testing.T) { - logger := NewVTGRLogger("ks", "0") - s1 := logger.annotate("abc") - assert.Equal(t, "shard=ks/0 abc", s1) - s2 := fmt.Sprintf(logger.annotate("abc %s"), "def") - assert.Equal(t, "shard=ks/0 abc def", s2) -} diff --git a/go/vt/vtgr/plugin_consultopo.go b/go/vt/vtgr/plugin_consultopo.go deleted file mode 100644 index 3786fd59c26..00000000000 --- a/go/vt/vtgr/plugin_consultopo.go +++ /dev/null @@ -1,23 +0,0 @@ -/* -Copyright 2021 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreedto in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package vtgr - -// This plugin imports consultopo to register the consul implementation of TopoServer. - -import ( - _ "vitess.io/vitess/go/vt/topo/consultopo" -) diff --git a/go/vt/vtgr/plugin_etcd2topo.go b/go/vt/vtgr/plugin_etcd2topo.go deleted file mode 100644 index 0f9c385f69b..00000000000 --- a/go/vt/vtgr/plugin_etcd2topo.go +++ /dev/null @@ -1,23 +0,0 @@ -/* -Copyright 2021 The Vitess Authors. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package vtgr - -// This plugin imports etcd2topo to register the etcd2 implementation of TopoServer. - -import ( - _ "vitess.io/vitess/go/vt/topo/etcd2topo" -) diff --git a/go/vt/vtgr/plugin_grpctmclient.go b/go/vt/vtgr/plugin_grpctmclient.go deleted file mode 100644 index 529c560c207..00000000000 --- a/go/vt/vtgr/plugin_grpctmclient.go +++ /dev/null @@ -1,23 +0,0 @@ -/* -Copyright 2021 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package vtgr - -// Imports and register the gRPC tabletmanager client - -import ( - _ "vitess.io/vitess/go/vt/vttablet/grpctmclient" -) diff --git a/go/vt/vtgr/plugin_zk2topo.go b/go/vt/vtgr/plugin_zk2topo.go deleted file mode 100644 index f524fd0e21a..00000000000 --- a/go/vt/vtgr/plugin_zk2topo.go +++ /dev/null @@ -1,23 +0,0 @@ -/* -Copyright 2021 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreedto in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package vtgr - -// Imports and register the zk2 TopologyServer - -import ( - _ "vitess.io/vitess/go/vt/topo/zk2topo" -) diff --git a/go/vt/vtgr/ssl/ssl.go b/go/vt/vtgr/ssl/ssl.go deleted file mode 100644 index 9ca18c4c807..00000000000 --- a/go/vt/vtgr/ssl/ssl.go +++ /dev/null @@ -1,62 +0,0 @@ -package ssl - -import ( - "crypto/tls" - "crypto/x509" - "errors" - "os" - - "vitess.io/vitess/go/vt/log" -) - -/* - This file has been copied over from VTOrc package -*/ - -// NewTLSConfig returns an initialized TLS configuration suitable for client -// authentication. If caFile is non-empty, it will be loaded. -func NewTLSConfig(caFile string, verifyCert bool, minVersion uint16) (*tls.Config, error) { - var c tls.Config - - // Set to TLS 1.2 as a minimum. This is overridden for mysql communication - c.MinVersion = minVersion - - if verifyCert { - log.Info("verifyCert requested, client certificates will be verified") - c.ClientAuth = tls.VerifyClientCertIfGiven - } - caPool, err := ReadCAFile(caFile) - if err != nil { - return &c, err - } - c.ClientCAs = caPool - return &c, nil -} - -// Returns CA certificate. If caFile is non-empty, it will be loaded. 
-func ReadCAFile(caFile string) (*x509.CertPool, error) { - var caCertPool *x509.CertPool - if caFile != "" { - data, err := os.ReadFile(caFile) - if err != nil { - return nil, err - } - caCertPool = x509.NewCertPool() - if !caCertPool.AppendCertsFromPEM(data) { - return nil, errors.New("No certificates parsed") - } - log.Infof("Read in CA file: %v", caFile) - } - return caCertPool, nil -} - -// AppendKeyPair loads the given TLS key pair and appends it to -// tlsConfig.Certificates. -func AppendKeyPair(tlsConfig *tls.Config, certFile string, keyFile string) error { - cert, err := tls.LoadX509KeyPair(certFile, keyFile) - if err != nil { - return err - } - tlsConfig.Certificates = append(tlsConfig.Certificates, cert) - return nil -} diff --git a/go/vt/vtgr/ssl/ssl_test.go b/go/vt/vtgr/ssl/ssl_test.go deleted file mode 100644 index e9ab4a84d74..00000000000 --- a/go/vt/vtgr/ssl/ssl_test.go +++ /dev/null @@ -1,123 +0,0 @@ -package ssl_test - -import ( - "crypto/tls" - "os" - "syscall" - "testing" - - "vitess.io/vitess/go/vt/vtgr/ssl" -) - -/* - This file has been copied over from VTOrc package -*/ - -// TODO: Build a fake CA and make sure it loads up -func TestNewTLSConfig(t *testing.T) { - fakeCA := writeFakeFile(pemCertificate) - defer syscall.Unlink(fakeCA) - - conf, err := ssl.NewTLSConfig(fakeCA, true, tls.VersionTLS13) - if err != nil { - t.Errorf("Could not create new TLS config: %s", err) - } - if conf.ClientAuth != tls.VerifyClientCertIfGiven { - t.Errorf("Client certificate verification was not enabled") - } - if conf.ClientCAs == nil { - t.Errorf("ClientCA empty even though cert provided") - } - if conf.MinVersion != tls.VersionTLS13 { - t.Errorf("incorrect tls min version set") - } - - conf, err = ssl.NewTLSConfig("", false, tls.VersionTLS12) - if err != nil { - t.Errorf("Could not create new TLS config: %s", err) - } - if conf.ClientAuth == tls.VerifyClientCertIfGiven { - t.Errorf("Client certificate verification was enabled unexpectedly") - } - if 
conf.ClientCAs != nil { - t.Errorf("Filling in ClientCA somehow without a cert") - } - if conf.MinVersion != tls.VersionTLS12 { - t.Errorf("incorrect tls min version set") - } -} - -func TestAppendKeyPair(t *testing.T) { - c, err := ssl.NewTLSConfig("", false, tls.VersionTLS12) - if err != nil { - t.Fatal(err) - } - pemCertFile := writeFakeFile(pemCertificate) - defer syscall.Unlink(pemCertFile) - pemPKFile := writeFakeFile(pemPrivateKey) - defer syscall.Unlink(pemPKFile) - - if err := ssl.AppendKeyPair(c, pemCertFile, pemPKFile); err != nil { - t.Errorf("Failed to append certificate and key to tls config: %s", err) - } -} - -func writeFakeFile(content string) string { - f, err := os.CreateTemp("", "ssl_test") - if err != nil { - return "" - } - os.WriteFile(f.Name(), []byte(content), 0644) - return f.Name() -} - -const pemCertificate = `-----BEGIN CERTIFICATE----- -MIIDtTCCAp2gAwIBAgIJAOxKC7FsJelrMA0GCSqGSIb3DQEBBQUAMEUxCzAJBgNV -BAYTAkFVMRMwEQYDVQQIEwpTb21lLVN0YXRlMSEwHwYDVQQKExhJbnRlcm5ldCBX -aWRnaXRzIFB0eSBMdGQwHhcNMTcwODEwMTQ0MjM3WhcNMTgwODEwMTQ0MjM3WjBF -MQswCQYDVQQGEwJBVTETMBEGA1UECBMKU29tZS1TdGF0ZTEhMB8GA1UEChMYSW50 -ZXJuZXQgV2lkZ2l0cyBQdHkgTHRkMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIB -CgKCAQEA12vHV3gYy5zd1lujA7prEhCSkAszE6E37mViWhLQ63CuedZfyYaTAHQK -HYDZi4K1MNAySUfZRMcICSSsxlRIz6mzXrFsowaJgwx4cbMDIvXE03KstuXoTYJh -+xmXB+5yEVEtIyP2DvPqfCmwCZb3k94Y/VY1nAQDxIxciXrAxT9zT1oYd0YWr2yp -J2mgsfnY4c3zg7W5WgvOTmYz7Ey7GJjpUjGdayx+P1CilKzSWH1xZuVQFNLSHvcH -WXkEoCMVc0tW5mO5eEO1aNHo9MSjPF386l1rq+pz5OwjqCEZq2b1YxesyLnbF+8+ -iYGfYmFaDLFwG7zVDwialuI4TzIIOQIDAQABo4GnMIGkMB0GA1UdDgQWBBQ1ubGx -Yvn3wN5VXyoR0lOD7ARzVTB1BgNVHSMEbjBsgBQ1ubGxYvn3wN5VXyoR0lOD7ARz -VaFJpEcwRTELMAkGA1UEBhMCQVUxEzARBgNVBAgTClNvbWUtU3RhdGUxITAfBgNV -BAoTGEludGVybmV0IFdpZGdpdHMgUHR5IEx0ZIIJAOxKC7FsJelrMAwGA1UdEwQF -MAMBAf8wDQYJKoZIhvcNAQEFBQADggEBALmm4Zw/4jLKDJciUGUYOcr5Xe9TP/Cs -afH7IWvaFUDfV3W6yAm9jgNfIy9aDLpuu2CdEb+0qL2hdmGLV7IM3y62Ve0UTdGV -BGsm1zMmIguew2wGbAwGr5LmIcUseatVUKAAAfDrBNwotEAdM8kmGekUZfOM+J9D 
-FoNQ62C0buRHGugtu6zWAcZNOe6CI7HdhaAdxZlgn8y7dfJQMacoK0NcWeUVQwii -6D4mgaqUGM2O+WcquD1vEMuBPYVcKhi43019E0+6LI5QB6w80bARY8K7tkTdRD7U -y1/C7iIqyuBVL45OdSabb37TfGlHZIPIwLaGw3i4Mr0+F0jQT8rZtTQ= ------END CERTIFICATE-----` - -const pemPrivateKey = `-----BEGIN RSA PRIVATE KEY----- -MIIEpAIBAAKCAQEA12vHV3gYy5zd1lujA7prEhCSkAszE6E37mViWhLQ63CuedZf -yYaTAHQKHYDZi4K1MNAySUfZRMcICSSsxlRIz6mzXrFsowaJgwx4cbMDIvXE03Ks -tuXoTYJh+xmXB+5yEVEtIyP2DvPqfCmwCZb3k94Y/VY1nAQDxIxciXrAxT9zT1oY -d0YWr2ypJ2mgsfnY4c3zg7W5WgvOTmYz7Ey7GJjpUjGdayx+P1CilKzSWH1xZuVQ -FNLSHvcHWXkEoCMVc0tW5mO5eEO1aNHo9MSjPF386l1rq+pz5OwjqCEZq2b1Yxes -yLnbF+8+iYGfYmFaDLFwG7zVDwialuI4TzIIOQIDAQABAoIBAHLf4pleTbqmmBWr -IC7oxhgIBmAR2Nbq7eyO2/e0ePxURnZqPwI0ZUekmZBKGbgvp3e0TlyNl+r5R+u4 -RvosD/fNQv2IF6qH3eSoTcIz98Q40xD+4eNWjp5mnOFOMB/mo6VgaHWIw7oNkElN -4bX7b2LG2QSfaE8eRPQW9XHKp+mGhYFbxgPYxUmlIXuYZF61hVwxysDA6DP3LOi8 -yUL6E64x6NqN9xtg/VoN+f6N0MOvsr4yb5+uvni1LVRFI7tNqIN4Y6P6trgKfnRR -EpZeAUu8scqyxE4NeqnnjK/wBuXxaeh3e9mN1V2SzT629c1InmmQasZ5slcCJQB+ -38cswgECgYEA+esaLKwHXT4+sOqMYemi7TrhxtNC2f5OAGUiSRVmTnum2gl4wOB+ -h5oLZAuG5nBEIoqbMEbI35vfuHqIe390IJtPdQlz4TGDsPufYj/gnnBBFy/c8f+n -f/CdRDRYrpnpKGwvUntLRB2pFbe2hlqqq+4YUqiHauJMOCJnPbOo1lECgYEA3KnF -VOXyY0fKD45G7ttfAcpw8ZI2gY99sCRwtBQGsbO61bvw5sl/3j7AmYosz+n6f7hb -uHmitIuPv4z3r1yfVysh80tTGIM3wDkpr3fLYRxpVOZU4hgxMQV9yyaSA/Hfqn48 -vIK/NC4bERqpofNNdrIqNaGWkd87ZycvpRfa0WkCgYBztbVVr4RtWG9gLAg5IRot -KhD0pEWUdpiYuDpqifznI3r6Al6lNot+rwTNGkUoFhyFvZTigjNozFuFpz3fqAAV -RLNCJdFAF1O4spd1vst5r9GDMcbjSJG9u6KkvHO+y0XXUFeMoccUT4NEqd1ZUUsp -9T/PrXWdOA9AAjW4rKDkMQKBgQC9R4NVR8mbD8Frhoeh69qbFqO7E8hdalBN/3QN -hAAZ/imNnSEPVliwsvNSwQufbPzLAcDrhKrkY7JyhOERM0oa44zDvSESLbxszpvL -P97c9hoEEW9OYaIQgr1cvUES0S8ieBZxPVX11HazPUO0/5a68ijyyCD4D5xM53gf -DU9NwQKBgQCmVthQi65xcc4mgCIwXtBZWXeaPv5x0dLEXIC5EoN6eXLK9iW//7cE -hhawtJtl+J6laB+TkEGQsyhc4v85WcywdisyR7LR7CUqFYJMKeE/VtTVKnYbfq54 -rHoQS9YotByBwPtRx0V93gkc+KWBOGmSBBxKj7lrBkYkcWAiRfpJjg== ------END RSA PRIVATE KEY-----` diff --git a/go/vt/vtgr/vtgr.go 
b/go/vt/vtgr/vtgr.go deleted file mode 100644 index 80a5f99fad9..00000000000 --- a/go/vt/vtgr/vtgr.go +++ /dev/null @@ -1,233 +0,0 @@ -/* -Copyright 2021 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package vtgr - -import ( - "context" - "errors" - "os" - "os/signal" - "strings" - "sync" - "sync/atomic" - "syscall" - "time" - - "github.com/spf13/pflag" - - "vitess.io/vitess/go/vt/concurrency" - "vitess.io/vitess/go/vt/log" - "vitess.io/vitess/go/vt/servenv" - "vitess.io/vitess/go/vt/topo" - "vitess.io/vitess/go/vt/vtgr/config" - "vitess.io/vitess/go/vt/vtgr/controller" - "vitess.io/vitess/go/vt/vtgr/db" - "vitess.io/vitess/go/vt/vttablet/tmclient" -) - -var ( - refreshInterval = 10 * time.Second - scanInterval = 3 * time.Second - scanAndRepairTimeout = 3 * time.Second - vtgrConfigFile string - - localDbPort int -) - -func init() { - servenv.OnParseFor("vtgr", func(fs *pflag.FlagSet) { - fs.DurationVar(&refreshInterval, "refresh_interval", 10*time.Second, "Refresh interval to load tablets.") - fs.DurationVar(&scanInterval, "scan_interval", 3*time.Second, "Scan interval to diagnose and repair.") - fs.DurationVar(&scanAndRepairTimeout, "scan_repair_timeout", 3*time.Second, "Time to wait for a Diagnose and repair operation.") - fs.StringVar(&vtgrConfigFile, "vtgr_config", "", "Config file for vtgr.") - fs.IntVar(&localDbPort, "db_port", 0, "Local mysql port, set this to enable local fast check.") - }) -} - -// VTGR is the interface to manage the component to set 
up group replication with Vitess. -// The main goal of it is to reconcile MySQL group and the Vitess topology. -// Caller should use OpenTabletDiscovery to create the VTGR instance. -type VTGR struct { - // Shards are all the shards that a VTGR is monitoring. - // Caller can choose to iterate the shards to scan and repair for more granular control (e.g., stats report) - // instead of calling ScanAndRepair() directly. - Shards []*controller.GRShard - topo controller.GRTopo - tmc tmclient.TabletManagerClient - ctx context.Context - - stopped atomic.Bool -} - -func newVTGR(ctx context.Context, ts controller.GRTopo, tmc tmclient.TabletManagerClient) *VTGR { - return &VTGR{ - topo: ts, - tmc: tmc, - ctx: ctx, - } -} - -// OpenTabletDiscovery calls OpenTabletDiscoveryWithAcitve and set the shard to be active -// it opens connection with topo server -// and triggers the first round of controller based on specified cells and keyspace/shards. -func OpenTabletDiscovery(ctx context.Context, cellsToWatch, clustersToWatch []string) *VTGR { - return OpenTabletDiscoveryWithAcitve(ctx, cellsToWatch, clustersToWatch, true) -} - -// OpenTabletDiscoveryWithAcitve opens connection with topo server -// and triggers the first round of controller based on parameter -func OpenTabletDiscoveryWithAcitve(ctx context.Context, cellsToWatch, clustersToWatch []string, active bool) *VTGR { - if vtgrConfigFile == "" { - log.Fatal("vtgr_config is required") - } - config, err := config.ReadVTGRConfig(vtgrConfigFile) - if err != nil { - log.Fatalf("Cannot load vtgr config file: %v", err) - } - vtgr := newVTGR( - ctx, - topo.Open(), - tmclient.NewTabletManagerClient(), - ) - var shards []*controller.GRShard - ctx, cancel := context.WithTimeout(vtgr.ctx, topo.RemoteOperationTimeout) - defer cancel() - for _, ks := range clustersToWatch { - if strings.Contains(ks, "/") { - // This is a keyspace/shard specification - input := strings.Split(ks, "/") - shards = append(shards, 
controller.NewGRShard(input[0], input[1], cellsToWatch, vtgr.tmc, vtgr.topo, db.NewVTGRSqlAgent(), config, localDbPort, active)) - } else { - // Assume this is a keyspace and find all shards in keyspace - shardNames, err := vtgr.topo.GetShardNames(ctx, ks) - if err != nil { - // Log the error and continue - log.Errorf("Error fetching shards for keyspace %v: %v", ks, err) - continue - } - if len(shardNames) == 0 { - log.Errorf("Topo has no shards for ks: %v", ks) - continue - } - for _, s := range shardNames { - shards = append(shards, controller.NewGRShard(ks, s, cellsToWatch, vtgr.tmc, vtgr.topo, db.NewVTGRSqlAgent(), config, localDbPort, active)) - } - } - } - vtgr.handleSignal(os.Exit) - vtgr.Shards = shards - log.Infof("Monitoring shards size %v", len(vtgr.Shards)) - // Force refresh all tablet here to populate data for vtgr - var wg sync.WaitGroup - for _, shard := range vtgr.Shards { - wg.Add(1) - go func(shard *controller.GRShard) { - defer wg.Done() - shard.UpdateTabletsInShardWithLock(ctx) - }(shard) - } - wg.Wait() - log.Info("Ready to start VTGR") - return vtgr -} - -// RefreshCluster get the latest tablets from topo server -func (vtgr *VTGR) RefreshCluster() { - for _, shard := range vtgr.Shards { - go func(shard *controller.GRShard) { - ticker := time.Tick(refreshInterval) - for range ticker { - ctx, cancel := context.WithTimeout(vtgr.ctx, refreshInterval) - shard.UpdateTabletsInShardWithLock(ctx) - cancel() - } - }(shard) - } -} - -// ScanAndRepair starts the scanAndFix routine -func (vtgr *VTGR) ScanAndRepair() { - for _, shard := range vtgr.Shards { - go func(shard *controller.GRShard) { - ticker := time.Tick(scanInterval) - for range ticker { - func() { - ctx, cancel := context.WithTimeout(vtgr.ctx, scanAndRepairTimeout) - defer cancel() - if !vtgr.stopped.Load() { - log.Infof("Start scan and repair %v/%v", shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard) - shard.ScanAndRepairShard(ctx) - log.Infof("Finished scan and repair %v/%v", 
shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard) - } - }() - } - }(shard) - } -} - -// Diagnose exposes the endpoint to diagnose a particular shard -func (vtgr *VTGR) Diagnose(ctx context.Context, shard *controller.GRShard) (controller.DiagnoseType, error) { - return shard.Diagnose(ctx) -} - -// Repair exposes the endpoint to repair a particular shard -func (vtgr *VTGR) Repair(ctx context.Context, shard *controller.GRShard, diagnose controller.DiagnoseType) (controller.RepairResultCode, error) { - if vtgr.stopped.Load() { - return controller.Fail, errors.New("VTGR is stopped") - } - return shard.Repair(ctx, diagnose) -} - -// GetCurrentShardStatuses is used when we want to know what VTGR observes -// it contains information about a list of instances and primary tablet -func (vtgr *VTGR) GetCurrentShardStatuses() []controller.ShardStatus { - var result []controller.ShardStatus - for _, shard := range vtgr.Shards { - status := shard.GetCurrentShardStatuses() - result = append(result, status) - } - return result -} - -// OverrideRebootstrapGroupSize forces an override the group size used in safety check for rebootstrap -func (vtgr *VTGR) OverrideRebootstrapGroupSize(groupSize int) error { - errorRecord := concurrency.AllErrorRecorder{} - for _, shard := range vtgr.Shards { - err := shard.OverrideRebootstrapGroupSize(groupSize) - if err != nil { - errorRecord.RecordError(err) - } - } - return errorRecord.Error() -} - -func (vtgr *VTGR) handleSignal(action func(int)) { - sigChan := make(chan os.Signal, 1) - signal.Notify(sigChan, syscall.SIGHUP) - go func() { - // block until the signal is received - <-sigChan - log.Infof("Handling SIGHUP") - // Set stopped to true so that following repair call won't do anything - // For the ongoing repairs, checkShardLocked will abort if needed - vtgr.stopped.Store(true) - for _, shard := range vtgr.Shards { - shard.UnlockShard() - } - action(1) - }() -} diff --git a/go/vt/vtgr/vtgr_test.go b/go/vt/vtgr/vtgr_test.go deleted 
file mode 100644 index 3632e88427c..00000000000 --- a/go/vt/vtgr/vtgr_test.go +++ /dev/null @@ -1,55 +0,0 @@ -package vtgr - -import ( - "context" - "sync/atomic" - "syscall" - "testing" - "time" - - "github.com/stretchr/testify/assert" - - "vitess.io/vitess/go/vt/topo/memorytopo" - "vitess.io/vitess/go/vt/vtgr/config" - "vitess.io/vitess/go/vt/vtgr/controller" - "vitess.io/vitess/go/vt/vtgr/db" - "vitess.io/vitess/go/vt/vttablet/tmclient" - - topodatapb "vitess.io/vitess/go/vt/proto/topodata" -) - -func TestSighupHandle(t *testing.T) { - ctx := context.Background() - ts := memorytopo.NewServer("cell1") - defer ts.Close() - ts.CreateKeyspace(ctx, "ks", &topodatapb.Keyspace{}) - ts.CreateShard(ctx, "ks", "0") - vtgr := newVTGR( - ctx, - ts, - tmclient.NewTabletManagerClient(), - ) - var shards []*controller.GRShard - config := &config.VTGRConfig{ - DisableReadOnlyProtection: false, - BootstrapGroupSize: 5, - MinNumReplica: 3, - BackoffErrorWaitTimeSeconds: 10, - BootstrapWaitTimeSeconds: 10 * 60, - } - shards = append(shards, controller.NewGRShard("ks", "0", nil, vtgr.tmc, vtgr.topo, db.NewVTGRSqlAgent(), config, localDbPort, true)) - vtgr.Shards = shards - shard := vtgr.Shards[0] - shard.LockShard(ctx, "test") - var res atomic.Bool - vtgr.handleSignal(func(i int) { - res.Store(true) - }) - assert.NotNil(t, shard.GetUnlock()) - assert.False(t, vtgr.stopped.Load()) - syscall.Kill(syscall.Getpid(), syscall.SIGHUP) - time.Sleep(100 * time.Millisecond) - assert.True(t, res.Load()) - assert.Nil(t, shard.GetUnlock()) - assert.True(t, vtgr.stopped.Load()) -} diff --git a/go/vt/vttablet/grpctmclient/client.go b/go/vt/vttablet/grpctmclient/client.go index 1899c82179c..8ff66212674 100644 --- a/go/vt/vttablet/grpctmclient/client.go +++ b/go/vt/vttablet/grpctmclient/client.go @@ -69,7 +69,6 @@ var _binaries = []string{ // binaries that require the flags in this package "vtctl", "vtctld", "vtctldclient", - "vtgr", "vtorc", "vttablet", "vttestserver", diff --git 
a/go/vt/vttablet/tmclient/rpc_client_api.go b/go/vt/vttablet/tmclient/rpc_client_api.go index 710d8df64d7..ca86d4c9fa0 100644 --- a/go/vt/vttablet/tmclient/rpc_client_api.go +++ b/go/vt/vttablet/tmclient/rpc_client_api.go @@ -51,7 +51,6 @@ func init() { "vtctl", "vtctld", "vtctldclient", - "vtgr", "vtorc", "vttablet", "vttestserver",