From 911d384221834989edd1f9835248aea1a6db2135 Mon Sep 17 00:00:00 2001 From: Hoshea Date: Mon, 9 Dec 2024 15:00:26 +0800 Subject: [PATCH] Fix the pod may be not recreate when config is changed (#172) --- apis/core/v1alpha1/common_types.go | 9 ++ pkg/controllers/pd/tasks/cm.go | 26 ++---- pkg/controllers/pd/tasks/cm_test.go | 22 +++-- pkg/controllers/pd/tasks/ctx.go | 8 +- pkg/controllers/pd/tasks/pod.go | 16 ++-- pkg/controllers/tidb/tasks/cm.go | 25 ++---- pkg/controllers/tidb/tasks/cm_test.go | 2 + pkg/controllers/tidb/tasks/ctx.go | 8 +- pkg/controllers/tidb/tasks/pod.go | 13 +-- pkg/controllers/tiflash/tasks/cm.go | 25 ++---- pkg/controllers/tiflash/tasks/ctx.go | 8 +- pkg/controllers/tiflash/tasks/pod.go | 12 ++- pkg/controllers/tikv/tasks/cm.go | 25 ++---- pkg/controllers/tikv/tasks/ctx.go | 8 +- pkg/controllers/tikv/tasks/pod.go | 14 +-- pkg/utils/toml/toml.go | 20 +++++ pkg/utils/toml/toml_test.go | 120 ++++++++++++++++++++++++++ 17 files changed, 246 insertions(+), 115 deletions(-) diff --git a/apis/core/v1alpha1/common_types.go b/apis/core/v1alpha1/common_types.go index dc9ef361d8..8b0b318fc6 100644 --- a/apis/core/v1alpha1/common_types.go +++ b/apis/core/v1alpha1/common_types.go @@ -51,6 +51,15 @@ const ( LabelKeyPodSpecHash = LabelKeyPrefix + "pod-spec-hash" LabelKeyInstanceRevisionHash = LabelKeyPrefix + "instance-revision-hash" + + // LabelKeyConfigHash is the hash of the user-specified config (i.e., `.Spec.Config`), + // which will be used to determine whether the config has changed. + // Since the tidb operator will overlay the user-specified config with some operator-managed fields, + // if we hash the overlayed config, with the evolving TiDB Operator, the hash may change, + // potentially triggering an unexpected rolling update. + // Instead, we choose to hash the user-specified config, + // and the worst case is that users expect a reboot but it doesn't happen. 
+ LabelKeyConfigHash = LabelKeyPrefix + "config-hash" ) const ( diff --git a/pkg/controllers/pd/tasks/cm.go b/pkg/controllers/pd/tasks/cm.go index 58147446c6..36034bac26 100644 --- a/pkg/controllers/pd/tasks/cm.go +++ b/pkg/controllers/pd/tasks/cm.go @@ -63,35 +63,25 @@ func (t *TaskConfigMap) Sync(ctx task.Context[ReconcileContext]) task.Result { return task.Fail().With("pd config cannot be encoded: %v", err) } - expected := newConfigMap(rtx.PD, data) - var current corev1.ConfigMap - err = t.Client.Get(ctx, client.ObjectKey{ - Name: expected.Name, - Namespace: expected.Namespace, - }, ¤t) - if client.IgnoreNotFound(err) != nil { - return task.Fail().With("can't get cm of pd: %v", err) + rtx.ConfigHash, err = toml.GenerateHash(rtx.PD.Spec.Config) + if err != nil { + return task.Fail().With("failed to generate hash for `pd.spec.config`: %v", err) } - - // Current config exists and is different from expected config - if err == nil && !maputil.AreEqual(current.Data, expected.Data) { - rtx.ConfigChanged = true + expected := newConfigMap(rtx.PD, data, rtx.ConfigHash) + if err := t.Client.Apply(rtx, expected); err != nil { + return task.Fail().With("can't create/update the cm of pd: %v", err) } - - if e := t.Client.Apply(rtx, expected); e != nil { - return task.Fail().With("can't create cm of pd: %v", e) - } - return task.Complete().With("cm is synced") } -func newConfigMap(pd *v1alpha1.PD, data []byte) *corev1.ConfigMap { +func newConfigMap(pd *v1alpha1.PD, data []byte, hash string) *corev1.ConfigMap { return &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ Name: ConfigMapName(pd.Name), Namespace: pd.Namespace, Labels: maputil.Merge(pd.Labels, map[string]string{ v1alpha1.LabelKeyInstance: pd.Name, + v1alpha1.LabelKeyConfigHash: hash, }), OwnerReferences: []metav1.OwnerReference{ *metav1.NewControllerRef(pd, v1alpha1.SchemeGroupVersion.WithKind("PD")), diff --git a/pkg/controllers/pd/tasks/cm_test.go b/pkg/controllers/pd/tasks/cm_test.go index 971f3ccb0e..3b536637e0 100644 --- a/pkg/controllers/pd/tasks/cm_test.go +++ b/pkg/controllers/pd/tasks/cm_test.go @@ -69,7 +69,8 @@ func TestConfigMap(t *testing.T) { } obj.Spec.Cluster.Name = "test-cluster" obj.Spec.Subdomain = "subdomain" - + obj.Spec.Config = v1alpha1.ConfigFile(`k1 = 'v1' +k2 = 'v2'`) return obj }, ), @@ -78,8 +79,9 @@ func TestConfigMap(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "test-pd", Labels: map[string]string{ - "aaa": "bbb", - v1alpha1.LabelKeyInstance: "test-pd", + "aaa": "bbb", + v1alpha1.LabelKeyInstance: "test-pd", + v1alpha1.LabelKeyConfigHash: "6744f8f9c8", }, OwnerReferences: []metav1.OwnerReference{ { @@ -99,6 +101,8 @@ client-urls = 'http://[::]:2379' initial-cluster = 'test-pd=http://test-pd.subdomain.default:2380' initial-cluster-state = 'new' initial-cluster-token = 'test-cluster' +k1 = 'v1' +k2 = 'v2' name = 'test-pd' peer-urls = 'http://[::]:2380' `, @@ -129,8 +133,8 @@ peer-urls = 'http://[::]:2380' } obj.Spec.Cluster.Name = "test-cluster" obj.Spec.Subdomain = "subdomain" - obj.Spec.Config = v1alpha1.ConfigFile("zzz = 'zzz'") - + obj.Spec.Config = v1alpha1.ConfigFile(`k2 = 'v2' +k1 = 'v1'`) return obj }, ), @@ -139,8 +143,9 @@ peer-urls = 'http://[::]:2380' ObjectMeta: metav1.ObjectMeta{ Name: "test-pd", Labels: map[string]string{ - "aaa": "bbb", - v1alpha1.LabelKeyInstance: "test-pd", + "aaa": "bbb", + v1alpha1.LabelKeyInstance: "test-pd", + v1alpha1.LabelKeyConfigHash: "6744f8f9c8", }, OwnerReferences: []metav1.OwnerReference{ { @@ -160,9 +165,10 @@ client-urls = 'http://[::]:2379' initial-cluster = 
'test-pd=http://test-pd.subdomain.default:2380' initial-cluster-state = 'new' initial-cluster-token = 'test-cluster' +k1 = 'v1' +k2 = 'v2' name = 'test-pd' peer-urls = 'http://[::]:2380' -zzz = 'zzz' `, }, }, diff --git a/pkg/controllers/pd/tasks/ctx.go b/pkg/controllers/pd/tasks/ctx.go index 5cdc80848b..9e27ee486c 100644 --- a/pkg/controllers/pd/tasks/ctx.go +++ b/pkg/controllers/pd/tasks/ctx.go @@ -49,10 +49,10 @@ type ReconcileContext struct { Cluster *v1alpha1.Cluster Pod *corev1.Pod - // ConfigChanged indicates whether the config of tidb has been changed. - // Since the config map's name is the same as the tidb's name, - // we use this variable to decide whether to recreate pods. - ConfigChanged bool + // ConfigHash stores the hash of **user-specified** config (i.e.`.Spec.Config`), + // which will be used to determine whether the config has changed. + // This ensures that our config overlay logic will not restart the tidb cluster unexpectedly. + ConfigHash string // Pod cannot be updated when call DELETE API, so we have to set this field to indicate // the underlay pod has been deleting diff --git a/pkg/controllers/pd/tasks/pod.go b/pkg/controllers/pd/tasks/pod.go index 65e55821b9..59e6de8f09 100644 --- a/pkg/controllers/pd/tasks/pod.go +++ b/pkg/controllers/pd/tasks/pod.go @@ -74,22 +74,21 @@ func TaskPodSuspend(c client.Client) task.Task[ReconcileContext] { func (t *TaskPod) Sync(ctx task.Context[ReconcileContext]) task.Result { rtx := ctx.Self() - expected := t.newPod(rtx.Cluster, rtx.PDGroup, rtx.PD) + expected := t.newPod(rtx.Cluster, rtx.PDGroup, rtx.PD, rtx.ConfigHash) if rtx.Pod == nil { if err := t.Client.Apply(rtx, expected); err != nil { - return task.Fail().With("can't apply pod of pd: %v", err) + return task.Fail().With("can't create pod of pd: %v", err) } - rtx.Pod = expected - return task.Complete().With("pod is synced") } res := k8s.ComparePods(rtx.Pod, expected) + curHash, expectHash := rtx.Pod.Labels[v1alpha1.LabelKeyConfigHash], expected.Labels[v1alpha1.LabelKeyConfigHash] + configChanged := curHash != expectHash + t.Logger.Info("compare pod", "result", res, "configChanged", configChanged, "currentConfigHash", curHash, "expectConfigHash", expectHash) - t.Logger.Info("compare pod", "result", res, "configChanged", rtx.ConfigChanged) - - if res == k8s.CompareResultRecreate || (rtx.ConfigChanged && rtx.PDGroup.Spec.ConfigUpdateStrategy == v1alpha1.ConfigUpdateStrategyRollingUpdate) { + if res == k8s.CompareResultRecreate || (configChanged && rtx.PDGroup.Spec.ConfigUpdateStrategy == v1alpha1.ConfigUpdateStrategyRollingUpdate) { // NOTE: both rtx.Healthy and rtx.Pod are not always newest // So pre delete check may also be skipped in some cases, for example, // the PD is just started. 
@@ -145,7 +144,7 @@ func (t *TaskPod) PreDeleteCheck(ctx context.Context, pdc pdm.PDClient, pd *v1al return false, nil } -func (t *TaskPod) newPod(cluster *v1alpha1.Cluster, pdg *v1alpha1.PDGroup, pd *v1alpha1.PD) *corev1.Pod { +func (t *TaskPod) newPod(cluster *v1alpha1.Cluster, pdg *v1alpha1.PDGroup, pd *v1alpha1.PD, configHash string) *corev1.Pod { vols := []corev1.Volume{ { Name: v1alpha1.VolumeNameConfig, @@ -227,6 +226,7 @@ func (t *TaskPod) newPod(cluster *v1alpha1.Cluster, pdg *v1alpha1.PDGroup, pd *v Name: pd.Name, Labels: maputil.Merge(pd.Labels, map[string]string{ v1alpha1.LabelKeyInstance: pd.Name, + v1alpha1.LabelKeyConfigHash: configHash, }), Annotations: anno, OwnerReferences: []metav1.OwnerReference{ diff --git a/pkg/controllers/tidb/tasks/cm.go b/pkg/controllers/tidb/tasks/cm.go index 007e4db447..a714f34835 100644 --- a/pkg/controllers/tidb/tasks/cm.go +++ b/pkg/controllers/tidb/tasks/cm.go @@ -61,34 +61,25 @@ func (t *TaskConfigMap) Sync(ctx task.Context[ReconcileContext]) task.Result { return task.Fail().With("tidb config cannot be encoded: %v", err) } - expected := newConfigMap(rtx.TiDB, data) - var current corev1.ConfigMap - err = t.Client.Get(ctx, client.ObjectKey{ - Name: expected.Name, - Namespace: expected.Namespace, - }, ¤t) - if client.IgnoreNotFound(err) != nil { - return task.Fail().With("can't get cm of tidb: %v", err) - } - - // Current config exists and is different from expected config - if err == nil && !maputil.AreEqual(current.Data, expected.Data) { - rtx.ConfigChanged = true + rtx.ConfigHash, err = toml.GenerateHash(rtx.TiDB.Spec.Config) + if err != nil { + return task.Fail().With("failed to generate hash for `tidb.spec.config`: %v", err) } - + expected := newConfigMap(rtx.TiDB, data, rtx.ConfigHash) if e := t.Client.Apply(rtx, expected); e != nil { - return task.Fail().With("can't create cm of tidb: %v", e) + return task.Fail().With("can't create/update cm of tidb: %v", e) } return task.Complete().With("cm is synced") } -func newConfigMap(tidb *v1alpha1.TiDB, data []byte) *corev1.ConfigMap { +func newConfigMap(tidb *v1alpha1.TiDB, data []byte, hash string) *corev1.ConfigMap { return &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ Name: ConfigMapName(tidb.Name), Namespace: tidb.Namespace, Labels: maputil.Merge(tidb.Labels, map[string]string{ - v1alpha1.LabelKeyInstance: tidb.Name, + v1alpha1.LabelKeyInstance: tidb.Name, + v1alpha1.LabelKeyConfigHash: hash, }), OwnerReferences: []metav1.OwnerReference{ *metav1.NewControllerRef(tidb, v1alpha1.SchemeGroupVersion.WithKind("TiDB")), diff --git a/pkg/controllers/tidb/tasks/cm_test.go b/pkg/controllers/tidb/tasks/cm_test.go index e896522068..968890c549 100644 --- a/pkg/controllers/tidb/tasks/cm_test.go +++ b/pkg/controllers/tidb/tasks/cm_test.go @@ -111,6 +111,7 @@ func TestConfigMap(t *testing.T) { "aaa": "bbb", v1alpha1.LabelKeyInstance: "test-tidb", v1alpha1.LabelKeyInstanceRevisionHash: "foo", + v1alpha1.LabelKeyConfigHash: "7d6fc488b7", }, OwnerReferences: []metav1.OwnerReference{ { @@ -168,6 +169,7 @@ graceful-wait-before-shutdown = 60`), "aaa": "bbb", v1alpha1.LabelKeyInstance: "test-tidb", v1alpha1.LabelKeyInstanceRevisionHash: "foo", + v1alpha1.LabelKeyConfigHash: "7bd44dcc66", }, OwnerReferences: []metav1.OwnerReference{ { diff --git a/pkg/controllers/tidb/tasks/ctx.go b/pkg/controllers/tidb/tasks/ctx.go index dc34ebc03e..9fe4731950 100644 --- a/pkg/controllers/tidb/tasks/ctx.go +++ b/pkg/controllers/tidb/tasks/ctx.go @@ -50,10 +50,10 @@ type ReconcileContext struct { GracefulWaitTimeInSeconds 
int64 - // ConfigChanged indicates whether the config of tidb has been changed. - // Since the config map's name is the same as the tidb's name, - // we use this variable to decide whether to recreate pods. - ConfigChanged bool + // ConfigHash stores the hash of **user-specified** config (i.e.`.Spec.Config`), + // which will be used to determine whether the config has changed. + // This ensures that our config overlay logic will not restart the tidb cluster unexpectedly. + ConfigHash string // Pod cannot be updated when call DELETE API, so we have to set this field to indicate // the underlay pod has been deleting diff --git a/pkg/controllers/tidb/tasks/pod.go b/pkg/controllers/tidb/tasks/pod.go index 48026d3a0c..72d5c737a7 100644 --- a/pkg/controllers/tidb/tasks/pod.go +++ b/pkg/controllers/tidb/tasks/pod.go @@ -78,8 +78,7 @@ func (t *TaskPod) Name() string { func (t *TaskPod) Sync(ctx task.Context[ReconcileContext]) task.Result { rtx := ctx.Self() - expected := t.newPod(rtx.Cluster, rtx.TiDBGroup, rtx.TiDB, rtx.GracefulWaitTimeInSeconds) - + expected := t.newPod(rtx.Cluster, rtx.TiDBGroup, rtx.TiDB, rtx.GracefulWaitTimeInSeconds, rtx.ConfigHash) if rtx.Pod == nil { if err := t.Client.Apply(rtx, expected); err != nil { return task.Fail().With("can't create pod of tidb: %v", err) @@ -90,8 +89,11 @@ func (t *TaskPod) Sync(ctx task.Context[ReconcileContext]) task.Result { } res := k8s.ComparePods(rtx.Pod, expected) - t.Logger.Info("compare pod", "result", res, "configChanged", rtx.ConfigChanged) - if res == k8s.CompareResultRecreate || (rtx.ConfigChanged && rtx.TiDBGroup.Spec.ConfigUpdateStrategy == v1alpha1.ConfigUpdateStrategyRollingUpdate) { + curHash, expectHash := rtx.Pod.Labels[v1alpha1.LabelKeyConfigHash], expected.Labels[v1alpha1.LabelKeyConfigHash] + configChanged := curHash != expectHash + t.Logger.Info("compare pod", "result", res, "configChanged", configChanged, "currentConfigHash", curHash, "expectConfigHash", expectHash) + + if res == k8s.CompareResultRecreate || (configChanged && rtx.TiDBGroup.Spec.ConfigUpdateStrategy == v1alpha1.ConfigUpdateStrategyRollingUpdate) { t.Logger.Info("will recreate the pod") if err := t.Client.Delete(rtx, rtx.Pod); err != nil { return task.Fail().With("can't delete pod of tidb: %v", err) @@ -111,7 +113,7 @@ func (t *TaskPod) Sync(ctx task.Context[ReconcileContext]) task.Result { return task.Complete().With("pod is synced") } -func (t *TaskPod) newPod(cluster *v1alpha1.Cluster, dbg *v1alpha1.TiDBGroup, tidb *v1alpha1.TiDB, gracePeriod int64) *corev1.Pod { +func (t *TaskPod) newPod(cluster *v1alpha1.Cluster, dbg *v1alpha1.TiDBGroup, tidb *v1alpha1.TiDB, gracePeriod int64, configHash string) *corev1.Pod { vols := []corev1.Volume{ { Name: v1alpha1.VolumeNameConfig, @@ -238,6 +240,7 @@ func (t *TaskPod) newPod(cluster *v1alpha1.Cluster, dbg *v1alpha1.TiDBGroup, tid Name: tidb.Name, Labels: maputil.Merge(tidb.Labels, map[string]string{ v1alpha1.LabelKeyInstance: tidb.Name, + v1alpha1.LabelKeyConfigHash: configHash, }), Annotations: maputil.Copy(tidb.GetAnnotations()), OwnerReferences: []metav1.OwnerReference{ diff --git a/pkg/controllers/tiflash/tasks/cm.go b/pkg/controllers/tiflash/tasks/cm.go index 92f2cfb7cd..73c297393e 100644 --- a/pkg/controllers/tiflash/tasks/cm.go +++ b/pkg/controllers/tiflash/tasks/cm.go @@ -72,34 +72,25 @@ func (t *TaskConfigMap) Sync(ctx task.Context[ReconcileContext]) task.Result { return task.Fail().With("tiflash proxy config cannot be encoded: %v", err) } - expected := newConfigMap(rtx.TiFlash, flashData, proxyData) - var 
current corev1.ConfigMap - err = t.Client.Get(ctx, client.ObjectKey{ - Name: expected.Name, - Namespace: expected.Namespace, - }, ¤t) - if client.IgnoreNotFound(err) != nil { - return task.Fail().With("can't get cm of tiflash: %v", err) - } - - // Current config exists and is different from expected config - if err == nil && !maputil.AreEqual(current.Data, expected.Data) { - rtx.ConfigChanged = true + rtx.ConfigHash, err = toml.GenerateHash(rtx.TiFlash.Spec.Config) + if err != nil { + return task.Fail().With("failed to generate hash for `tiflash.spec.config`: %v", err) } - + expected := newConfigMap(rtx.TiFlash, flashData, proxyData, rtx.ConfigHash) if e := t.Client.Apply(rtx, expected); e != nil { - return task.Fail().With("can't create cm of tiflash: %v", e) + return task.Fail().With("can't create/update cm of tiflash: %v", e) } return task.Complete().With("cm is synced") } -func newConfigMap(tiflash *v1alpha1.TiFlash, flashData, proxyData []byte) *corev1.ConfigMap { +func newConfigMap(tiflash *v1alpha1.TiFlash, flashData, proxyData []byte, hash string) *corev1.ConfigMap { return &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ Name: ConfigMapName(tiflash.Name), Namespace: tiflash.Namespace, Labels: maputil.Merge(tiflash.Labels, map[string]string{ - v1alpha1.LabelKeyInstance: tiflash.Name, + v1alpha1.LabelKeyInstance: tiflash.Name, + v1alpha1.LabelKeyConfigHash: hash, }), OwnerReferences: []metav1.OwnerReference{ *metav1.NewControllerRef(tiflash, v1alpha1.SchemeGroupVersion.WithKind("TiFlash")), diff --git a/pkg/controllers/tiflash/tasks/ctx.go b/pkg/controllers/tiflash/tasks/ctx.go index aa993b509c..9e7ba91f6a 100644 --- a/pkg/controllers/tiflash/tasks/ctx.go +++ b/pkg/controllers/tiflash/tasks/ctx.go @@ -51,10 +51,10 @@ type ReconcileContext struct { TiFlashGroup *v1alpha1.TiFlashGroup Pod *corev1.Pod - // ConfigChanged indicates whether the config of TiFlash has been changed. - // Since the config map's name is the same as the TiFlash's name, - // we use this variable to decide whether to recreate pods. - ConfigChanged bool + // ConfigHash stores the hash of **user-specified** config (i.e.`.Spec.Config`), + // which will be used to determine whether the config has changed. + // This ensures that our config overlay logic will not restart the tidb cluster unexpectedly. 
+ ConfigHash string // Pod cannot be updated when call DELETE API, so we have to set this field to indicate // the underlay pod has been deleting diff --git a/pkg/controllers/tiflash/tasks/pod.go b/pkg/controllers/tiflash/tasks/pod.go index c8c690404d..aae958e65e 100644 --- a/pkg/controllers/tiflash/tasks/pod.go +++ b/pkg/controllers/tiflash/tasks/pod.go @@ -65,7 +65,7 @@ func (t *TaskPod) Name() string { func (t *TaskPod) Sync(ctx task.Context[ReconcileContext]) task.Result { rtx := ctx.Self() - expected := t.newPod(rtx.Cluster, rtx.TiFlashGroup, rtx.TiFlash) + expected := t.newPod(rtx.Cluster, rtx.TiFlashGroup, rtx.TiFlash, rtx.ConfigHash) if rtx.Pod == nil { if err := t.Client.Apply(rtx, expected); err != nil { return task.Fail().With("can't apply pod of tiflash: %v", err) @@ -76,8 +76,11 @@ func (t *TaskPod) Sync(ctx task.Context[ReconcileContext]) task.Result { } res := k8s.ComparePods(rtx.Pod, expected) - t.Logger.Info("compare pod", "result", res, "configChanged", rtx.ConfigChanged) - if res == k8s.CompareResultRecreate || (rtx.ConfigChanged && rtx.TiFlashGroup.Spec.ConfigUpdateStrategy == v1alpha1.ConfigUpdateStrategyRollingUpdate) { + curHash, expectHash := rtx.Pod.Labels[v1alpha1.LabelKeyConfigHash], expected.Labels[v1alpha1.LabelKeyConfigHash] + configChanged := curHash != expectHash + t.Logger.Info("compare pod", "result", res, "configChanged", configChanged, "currentConfigHash", curHash, "expectConfigHash", expectHash) + + if res == k8s.CompareResultRecreate || (configChanged && rtx.TiFlashGroup.Spec.ConfigUpdateStrategy == v1alpha1.ConfigUpdateStrategyRollingUpdate) { t.Logger.Info("will recreate the pod") if err := t.Client.Delete(rtx, rtx.Pod); err != nil { return task.Fail().With("can't delete pod of tiflash: %v", err) @@ -96,7 +99,7 @@ func (t *TaskPod) Sync(ctx task.Context[ReconcileContext]) task.Result { return task.Complete().With("pod is synced") } -func (t *TaskPod) newPod(cluster *v1alpha1.Cluster, flashg *v1alpha1.TiFlashGroup, tiflash *v1alpha1.TiFlash) *corev1.Pod { +func (t *TaskPod) newPod(cluster *v1alpha1.Cluster, flashg *v1alpha1.TiFlashGroup, tiflash *v1alpha1.TiFlash, configHash string) *corev1.Pod { vols := []corev1.Volume{ { Name: v1alpha1.VolumeNameConfig, @@ -165,6 +168,7 @@ func (t *TaskPod) newPod(cluster *v1alpha1.Cluster, flashg *v1alpha1.TiFlashGrou Name: tiflash.Name, Labels: maputil.Merge(tiflash.Labels, map[string]string{ v1alpha1.LabelKeyInstance: tiflash.Name, + v1alpha1.LabelKeyConfigHash: configHash, }), Annotations: maputil.Copy(tiflash.GetAnnotations()), OwnerReferences: []metav1.OwnerReference{ diff --git a/pkg/controllers/tikv/tasks/cm.go b/pkg/controllers/tikv/tasks/cm.go index 192d64af25..57652ced75 100644 --- a/pkg/controllers/tikv/tasks/cm.go +++ b/pkg/controllers/tikv/tasks/cm.go @@ -60,34 +60,25 @@ func (t *TaskConfigMap) Sync(ctx task.Context[ReconcileContext]) task.Result { return task.Fail().With("tikv config cannot be encoded: %v", err) } - expected := newConfigMap(rtx.TiKV, data) - var current corev1.ConfigMap - err = t.Client.Get(ctx, client.ObjectKey{ - Name: expected.Name, - Namespace: expected.Namespace, - }, ¤t) - if client.IgnoreNotFound(err) != nil { - return task.Fail().With("can't get cm of pd: %v", err) - } - - // Current config exists and is different from expected config - if err == nil && !maputil.AreEqual(current.Data, expected.Data) { - rtx.ConfigChanged = true + rtx.ConfigHash, err = toml.GenerateHash(rtx.TiKV.Spec.Config) + if err != nil { + return task.Fail().With("failed to generate hash for 
`tikv.spec.config`: %v", err) } - + expected := newConfigMap(rtx.TiKV, data, rtx.ConfigHash) if e := t.Client.Apply(rtx, expected); e != nil { - return task.Fail().With("can't create cm of pd: %v", e) + return task.Fail().With("can't create/update cm of tikv: %v", e) } return task.Complete().With("cm is synced") } -func newConfigMap(tikv *v1alpha1.TiKV, data []byte) *corev1.ConfigMap { +func newConfigMap(tikv *v1alpha1.TiKV, data []byte, hash string) *corev1.ConfigMap { return &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ Name: ConfigMapName(tikv.Name), Namespace: tikv.Namespace, Labels: maputil.Merge(tikv.Labels, map[string]string{ - v1alpha1.LabelKeyInstance: tikv.Name, + v1alpha1.LabelKeyInstance: tikv.Name, + v1alpha1.LabelKeyConfigHash: hash, }), OwnerReferences: []metav1.OwnerReference{ *metav1.NewControllerRef(tikv, v1alpha1.SchemeGroupVersion.WithKind("TiKV")), diff --git a/pkg/controllers/tikv/tasks/ctx.go b/pkg/controllers/tikv/tasks/ctx.go index 281fe1c779..93baa845f6 100644 --- a/pkg/controllers/tikv/tasks/ctx.go +++ b/pkg/controllers/tikv/tasks/ctx.go @@ -53,10 +53,10 @@ type ReconcileContext struct { Store *pdv1.Store - // ConfigChanged indicates whether the config of tidb has been changed. - // Since the config map's name is the same as the tidb's name, - // we use this variable to decide whether to recreate pods. - ConfigChanged bool + // ConfigHash stores the hash of **user-specified** config (i.e.`.Spec.Config`), + // which will be used to determine whether the config has changed. + // This ensures that our config overlay logic will not restart the tidb cluster unexpectedly. + ConfigHash string // Pod cannot be updated when call DELETE API, so we have to set this field to indicate // the underlay pod has been deleting diff --git a/pkg/controllers/tikv/tasks/pod.go b/pkg/controllers/tikv/tasks/pod.go index 17de6fd021..85cede3ec1 100644 --- a/pkg/controllers/tikv/tasks/pod.go +++ b/pkg/controllers/tikv/tasks/pod.go @@ -72,7 +72,7 @@ func (t *TaskPod) Name() string { func (t *TaskPod) Sync(ctx task.Context[ReconcileContext]) task.Result { rtx := ctx.Self() - expected := t.newPod(rtx.Cluster, rtx.TiKVGroup, rtx.TiKV) + expected := t.newPod(rtx.Cluster, rtx.TiKVGroup, rtx.TiKV, rtx.ConfigHash) if rtx.Pod == nil { if err := t.Client.Apply(rtx, expected); err != nil { return task.Fail().With("can't apply pod of tikv: %v", err) @@ -106,8 +106,11 @@ func (t *TaskPod) Sync(ctx task.Context[ReconcileContext]) task.Result { } res := k8s.ComparePods(rtx.Pod, expected) - t.Logger.Info("compare pod", "result", res, "configChanged", rtx.ConfigChanged) - if res == k8s.CompareResultRecreate || (rtx.ConfigChanged && rtx.TiKVGroup.Spec.ConfigUpdateStrategy == v1alpha1.ConfigUpdateStrategyRollingUpdate) { + curHash, expectHash := rtx.Pod.Labels[v1alpha1.LabelKeyConfigHash], expected.Labels[v1alpha1.LabelKeyConfigHash] + configChanged := curHash != expectHash + t.Logger.Info("compare pod", "result", res, "configChanged", configChanged, "currentConfigHash", curHash, "expectConfigHash", expectHash) + + if res == k8s.CompareResultRecreate || (configChanged && rtx.TiKVGroup.Spec.ConfigUpdateStrategy == v1alpha1.ConfigUpdateStrategyRollingUpdate) { t.Logger.Info("will recreate the pod") if err := t.Client.Delete(rtx, rtx.Pod); err != nil { return task.Fail().With("can't delete pod of tikv: %v", err) @@ -128,7 +131,7 @@ func (t *TaskPod) Sync(ctx task.Context[ReconcileContext]) task.Result { return task.Complete().With("pod is synced") } -func (t *TaskPod) newPod(cluster *v1alpha1.Cluster, kvg 
*v1alpha1.TiKVGroup, tikv *v1alpha1.TiKV) *corev1.Pod { +func (t *TaskPod) newPod(cluster *v1alpha1.Cluster, kvg *v1alpha1.TiKVGroup, tikv *v1alpha1.TiKV, configHash string) *corev1.Pod { vols := []corev1.Volume{ { Name: v1alpha1.VolumeNameConfig, @@ -221,7 +224,8 @@ func (t *TaskPod) newPod(cluster *v1alpha1.Cluster, kvg *v1alpha1.TiKVGroup, tik Namespace: tikv.Namespace, Name: tikv.Name, Labels: maputil.Merge(tikv.Labels, map[string]string{ - v1alpha1.LabelKeyInstance: tikv.Name, + v1alpha1.LabelKeyInstance: tikv.Name, + v1alpha1.LabelKeyConfigHash: configHash, }), Annotations: maputil.Copy(tikv.GetAnnotations()), OwnerReferences: []metav1.OwnerReference{ diff --git a/pkg/utils/toml/toml.go b/pkg/utils/toml/toml.go index 34c70ba0b9..f798fac3fa 100644 --- a/pkg/utils/toml/toml.go +++ b/pkg/utils/toml/toml.go @@ -17,10 +17,15 @@ package toml import ( "bytes" "fmt" + "hash/fnv" "reflect" "github.com/mitchellh/mapstructure" "github.com/pelletier/go-toml/v2" + "k8s.io/apimachinery/pkg/util/rand" + + "github.com/pingcap/tidb-operator/apis/core/v1alpha1" + hashutil "github.com/pingcap/tidb-operator/third_party/kubernetes/pkg/util/hash" ) type Decoder[T any, PT *T] interface { @@ -149,3 +154,18 @@ func getField(src reflect.Value, dst any) (any, error) { return src.Interface(), nil } } + + +// GenerateHash takes a TOML string as input, unmarshals it into a map, +// and generates a hash of the resulting configuration. The hash is then +// encoded into a safe string format and returned. +// If the order of keys in the TOML string is different, the hash will be the same. +func GenerateHash(tomlStr v1alpha1.ConfigFile) (string, error) { + var config map[string]any + if err := toml.NewDecoder(bytes.NewReader([]byte(tomlStr))).Decode(&config); err != nil { + return "", fmt.Errorf("failed to unmarshal toml string %s: %v", tomlStr, err) + } + hasher := fnv.New32a() + hashutil.DeepHashObject(hasher, config) + return rand.SafeEncodeString(fmt.Sprint(hasher.Sum32())), nil +} diff --git a/pkg/utils/toml/toml_test.go b/pkg/utils/toml/toml_test.go index 758565590a..529a727754 100644 --- a/pkg/utils/toml/toml_test.go +++ b/pkg/utils/toml/toml_test.go @@ -19,6 +19,8 @@ import ( "testing" "github.com/stretchr/testify/assert" + + "github.com/pingcap/tidb-operator/apis/core/v1alpha1" ) type TestType struct { @@ -117,3 +119,121 @@ unknown = 'xxx' }) } } + +func TestGenerateHash(t *testing.T) { + tests := []struct { + name string + tomlStr v1alpha1.ConfigFile + semanticallyEquivalentStr v1alpha1.ConfigFile + wantHash string + wantError bool + }{ + { + name: "Valid TOML string", + tomlStr: v1alpha1.ConfigFile(`foo = 'bar' +[log] +k1 = 'v1' +k2 = 'v2'`), + semanticallyEquivalentStr: v1alpha1.ConfigFile(`foo = 'bar' +[log] +k2 = 'v2' +k1 = 'v1'`), + wantHash: "5dbbcf4574", + wantError: false, + }, + { + name: "Different config value", + tomlStr: v1alpha1.ConfigFile(`foo = 'foo' +[log] +k2 = 'v2' +k1 = 'v1'`), + wantHash: "f5bc46cb9", + wantError: false, + }, + { + name: "multiple sections with blank line", + tomlStr: v1alpha1.ConfigFile(`[a] +k1 = 'v1' +[b] +k2 = 'v2'`), + semanticallyEquivalentStr: v1alpha1.ConfigFile(`[a] +k1 = 'v1' +[b] + +k2 = 'v2'`), + wantHash: "79598d5977", + wantError: false, + }, + { + name: "Empty TOML string", + tomlStr: v1alpha1.ConfigFile(``), + wantHash: "7d6fc488b7", + wantError: false, + }, + { + name: "Invalid TOML string", + tomlStr: v1alpha1.ConfigFile(`key1 = "value1" + key2 = value2`), // Missing quotes around value2 + wantHash: "", + wantError: true, + }, + { + name: "Nested 
tables", + tomlStr: v1alpha1.ConfigFile(`[parent] +child1 = "value1" +child2 = "value2" +[parent.child] +grandchild1 = "value3" +grandchild2 = "value4"`), + semanticallyEquivalentStr: v1alpha1.ConfigFile(`[parent] +child2 = "value2" +child1 = "value1" +[parent.child] +grandchild2 = "value4" +grandchild1 = "value3"`), + wantHash: "7bf645ccb4", + wantError: false, + }, + { + name: "Array of tables", + tomlStr: v1alpha1.ConfigFile(`[[products]] +name = "Hammer" +sku = 738594937 + +[[products]] +name = "Nail" +sku = 284758393 + +color = "gray"`), + semanticallyEquivalentStr: v1alpha1.ConfigFile(`[[products]] +sku = 738594937 +name = "Hammer" + +[[products]] +sku = 284758393 +name = "Nail" + +color = "gray"`), + wantHash: "7549cf87f4", + wantError: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gotHash, err := GenerateHash(tt.tomlStr) + if tt.wantError { + assert.Error(t, err) + } else { + assert.NoError(t, err) + assert.Equal(t, tt.wantHash, gotHash) + + if len(string(tt.semanticallyEquivalentStr)) > 0 { + reorderedHash, err := GenerateHash(tt.semanticallyEquivalentStr) + assert.NoError(t, err) + assert.Equal(t, tt.wantHash, reorderedHash) + } + } + }) + } +}