-
Notifications
You must be signed in to change notification settings - Fork 501
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
failover: emit events when pd failover #1466
Changes from 6 commits
4414b36
0bd5ff3
d7a215d
7a2091b
c8ed4a0
304ed5b
24aad0a
7c6fd11
bb412ef
5f63d61
2d413cd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -76,11 +76,11 @@ func NewController( | |
tikvFailoverPeriod time.Duration, | ||
tidbFailoverPeriod time.Duration, | ||
) *Controller { | ||
eventBroadcaster := record.NewBroadcaster() | ||
eventBroadcaster.StartLogging(glog.Infof) | ||
eventBroadcaster := record.NewBroadcasterWithCorrelatorOptions(record.CorrelatorOptions{QPS: 10}) | ||
eventBroadcaster.StartLogging(glog.V(4).Infof) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. events are often the most important messages that end users care, I'd prefer to have it on in logs by default, how about There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. reasonable |
||
eventBroadcaster.StartRecordingToSink(&eventv1.EventSinkImpl{ | ||
Interface: eventv1.New(kubeCli.CoreV1().RESTClient()).Events("")}) | ||
recorder := eventBroadcaster.NewRecorder(v1alpha1.Scheme, corev1.EventSource{Component: "tidbcluster"}) | ||
recorder := eventBroadcaster.NewRecorder(v1alpha1.Scheme, corev1.EventSource{Component: "tidb-controller-manager"}) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. like k8s does |
||
|
||
tcInformer := informerFactory.Pingcap().V1alpha1().TidbClusters() | ||
setInformer := kubeInformerFactory.Apps().V1().StatefulSets() | ||
|
@@ -108,7 +108,7 @@ func NewController( | |
typedControl := controller.NewTypedControl(controller.NewRealGenericControl(genericCli, recorder)) | ||
pdScaler := mm.NewPDScaler(pdControl, pvcInformer.Lister(), pvcControl) | ||
tikvScaler := mm.NewTiKVScaler(pdControl, pvcInformer.Lister(), pvcControl, podInformer.Lister()) | ||
pdFailover := mm.NewPDFailover(cli, pdControl, pdFailoverPeriod, podInformer.Lister(), podControl, pvcInformer.Lister(), pvcControl, pvInformer.Lister()) | ||
pdFailover := mm.NewPDFailover(cli, pdControl, pdFailoverPeriod, podInformer.Lister(), podControl, pvcInformer.Lister(), pvcControl, pvInformer.Lister(), recorder) | ||
tikvFailover := mm.NewTiKVFailover(tikvFailoverPeriod) | ||
tidbFailover := mm.NewTiDBFailover(tidbFailoverPeriod) | ||
pdUpgrader := mm.NewPDUpgrader(pdControl, podControl, podInformer.Lister()) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,9 +23,11 @@ import ( | |
"github.com/pingcap/tidb-operator/pkg/controller" | ||
"github.com/pingcap/tidb-operator/pkg/pdapi" | ||
"github.com/pingcap/tidb-operator/pkg/util" | ||
apiv1 "k8s.io/api/core/v1" | ||
"k8s.io/apimachinery/pkg/api/errors" | ||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
corelisters "k8s.io/client-go/listers/core/v1" | ||
"k8s.io/client-go/tools/record" | ||
glog "k8s.io/klog" | ||
) | ||
|
||
|
@@ -39,6 +41,7 @@ type pdFailover struct { | |
pvcLister corelisters.PersistentVolumeClaimLister | ||
pvcControl controller.PVCControlInterface | ||
pvLister corelisters.PersistentVolumeLister | ||
recorder record.EventRecorder | ||
} | ||
|
||
// NewPDFailover returns a pd Failover | ||
|
@@ -49,7 +52,8 @@ func NewPDFailover(cli versioned.Interface, | |
podControl controller.PodControlInterface, | ||
pvcLister corelisters.PersistentVolumeClaimLister, | ||
pvcControl controller.PVCControlInterface, | ||
pvLister corelisters.PersistentVolumeLister) Failover { | ||
pvLister corelisters.PersistentVolumeLister, | ||
recorder record.EventRecorder) Failover { | ||
return &pdFailover{ | ||
cli, | ||
pdControl, | ||
|
@@ -58,7 +62,8 @@ func NewPDFailover(cli versioned.Interface, | |
podControl, | ||
pvcLister, | ||
pvcControl, | ||
pvLister} | ||
pvLister, | ||
recorder} | ||
} | ||
|
||
func (pf *pdFailover) Failover(tc *v1alpha1.TidbCluster) error { | ||
|
@@ -73,9 +78,12 @@ func (pf *pdFailover) Failover(tc *v1alpha1.TidbCluster) error { | |
} | ||
|
||
healthCount := 0 | ||
for _, pdMember := range tc.Status.PD.Members { | ||
for podName, pdMember := range tc.Status.PD.Members { | ||
if pdMember.Health { | ||
healthCount++ | ||
} else { | ||
pf.recorder.Eventf(tc, apiv1.EventTypeWarning, "PDMemberUnhealthy", | ||
"%s(%s) is unhealthy", podName, pdMember.ID) | ||
Comment on lines
+85
to
+86
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IMHO, this is better to consider as a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The is already a Emit it as an event is good for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, seems like it is already in the describe result?
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. emit an event is more user-friendly, it is a progress of failover. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. or change There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the progress of failover makes sense, it is okay for me to emit events based on the the status of PD when we cannot actually capture the the "PD turning from healthy to unhealthy" event. The naming issue is just trivial, I think current name is ok There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is it possible to report the event on state change? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. good suggestion, we can do this in the |
||
} | ||
} | ||
inQuorum := healthCount > len(tc.Status.PD.Members)/2 | ||
|
@@ -132,6 +140,9 @@ func (pf *pdFailover) tryToMarkAPeerAsFailure(tc *v1alpha1.TidbCluster) error { | |
return err | ||
} | ||
|
||
pf.recorder.Eventf(tc, apiv1.EventTypeWarning, "PDMemberMarkedAsFailure", | ||
"%s(%s) marked as a failure member", podName, pdMember.ID) | ||
|
||
tc.Status.PD.FailureMembers[podName] = v1alpha1.PDFailureMember{ | ||
PodName: podName, | ||
MemberID: pdMember.ID, | ||
|
@@ -173,6 +184,8 @@ func (pf *pdFailover) tryToDeleteAFailureMember(tc *v1alpha1.TidbCluster) error | |
return err | ||
} | ||
glog.Infof("pd failover: delete member: %d successfully", memberID) | ||
pf.recorder.Eventf(tc, apiv1.EventTypeWarning, "PDMemberDeleted", | ||
"%s(%d) deleted from cluster", failurePodName, memberID) | ||
|
||
// The order of old PVC deleting and the new Pod creating is not guaranteed by Kubernetes. | ||
// If new Pod is created before old PVC deleted, new Pod will reuse old PVC. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
the default
QPS
is too low:1/ 300
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
10 events per object per seconds seems too large, it might generate a lot of load on the API server or unable to do other CRUD requests because of exceeding the total throttling limit. Most burstable tokens (25) are consumed by unnecessary events. We can increase the burst and QPS together but not too much at the first.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, 10 is too large.
I will set them to proper values after removing those events.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
what's the current rate of events?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
rate of events
, does it mean the default value ofQPS
andBurstSize
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
create an issue about setting the rate to proper values?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#1494
an issue opened