fix restored cluster failed pod #461

Merged (1 commit) on Mar 1, 2024
1 change: 1 addition & 0 deletions charts/nebula-operator/crds/nebulaclusters.yaml
@@ -494,6 +494,7 @@ spec:
type: object
type: array
heartbeatInterval:
+ default: 60
format: int32
type: integer
image:
1 change: 1 addition & 0 deletions charts/nebula-operator/templates/scheduler-deployment.yaml
@@ -29,6 +29,7 @@ spec:
- --leader-elect
- --leader-elect-resource-name={{ .Values.scheduler.schedulerName }}
- --leader-elect-resource-namespace={{ template "nebula-operator.namespace" . }}
+ - --pod-max-in-unschedulable-pods-duration={{ .Values.scheduler.podMaxInUnschedulablePodsDuration }}
- --v={{ .Values.scheduler.verbosity }}
{{- if or .Values.kubernetesClusterDomain .Values.scheduler.env }}
env:
1 change: 1 addition & 0 deletions charts/nebula-operator/values.yaml
@@ -81,6 +81,7 @@ scheduler:
cpu: 100m
memory: 100Mi
verbosity: 0
+ podMaxInUnschedulablePodsDuration: 10s
plugins:
enabled: ["NodeZone"]
disabled: [] # only in-tree plugins need to be defined here
1 change: 1 addition & 0 deletions config/crd/bases/apps.nebula-graph.io_nebulaclusters.yaml
@@ -494,6 +494,7 @@ spec:
type: object
type: array
heartbeatInterval:
+ default: 60
format: int32
type: integer
image:
2 changes: 1 addition & 1 deletion config/samples/nebularestore-gs.yaml
@@ -1,7 +1,7 @@
apiVersion: v1
kind: Secret
metadata:
- name: gs-secret
+ name: gcp-secret
type: Opaque
data:
credentials: <GOOGLE_APPLICATION_CREDENTIALS_JSON>
4 changes: 2 additions & 2 deletions config/samples/nebularestore-s3.yaml
@@ -1,7 +1,7 @@
apiVersion: v1
kind: Secret
metadata:
- name: aws-s3-secret
+ name: aws-secret
type: Opaque
data:
access_key: <ACCESS_KEY>
@@ -20,4 +20,4 @@ spec:
region: "us-west-2"
bucket: "nebula-br-test"
endpoint: "https://s3.us-west-2.amazonaws.com"
- secretName: "aws-s3-secret"
+ secretName: "aws-secret"
6 changes: 3 additions & 3 deletions config/samples/restore-pod.yaml
@@ -1,7 +1,7 @@
apiVersion: v1
kind: Secret
metadata:
- name: aws-s3-secret
+ name: aws-secret
type: Opaque
data:
access_key: <ACCESS_KEY>
@@ -15,7 +15,7 @@ spec:
imagePullSecrets:
- name: nebula-image
containers:
- - image: reg.vesoft-inc.com/cloud-dev/br-ent:v3.5.1
+ - image: reg.vesoft-inc.com/cloud-dev/br-ent:v3.7.0
imagePullPolicy: Always
name: restore
command:
@@ -24,7 +24,7 @@ spec:
- 'exec /usr/local/bin/br-ent restore full
--mode=k8s --cluster=nebula
--namespace default
- --secret=aws-s3-secret
+ --secret=aws-secret
--name BACKUP_2023_02_10_09_57_17
--storage s3://BUCKET
--s3.region=REGION
5 changes: 4 additions & 1 deletion doc/user/br_guide.md
@@ -25,7 +25,7 @@ The fields in the table is optional.
| Parameter | Description | Default |
|:---------------------|:--------------------------------------------------------------------------|:---------|
| `image` | backup container image without tag, and use `version` as tag | `` |
- | `nebula.version` | backup image tag | `` |
+ | `version` | backup image tag | `` |
| `imagePullPolicy` | backup image pull policy | `Always` |
| `imagePullSecrets` | The secret to use for pulling the images | `[]` |
| `env` | backup container environment variables | `[]` |
@@ -76,6 +76,8 @@ spec:
config:
# The name of the backup/restore nebula cluster
clusterName: nebula
+ # Concurrency is used to control the number of concurrent file uploads during data backup.
+ concurrency: 15
gs:
# Location in which the gs bucket is located.
location: "us-central1"
@@ -125,6 +127,7 @@ spec:
cleanBackupData: true
config:
clusterName: nebula
+ concurrency: 15
gs:
location: "us-central1"
bucket: "nebula-test"
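The new `concurrency` field documented above caps how many files the backup uploads to object storage at once. As a rough illustration of what such a cap means (a sketch only, not the br tool's actual upload code; `uploadAll` and its arguments are hypothetical names):

```go
package example

import (
	"fmt"
	"sync"
)

// uploadAll runs upload() for every file, keeping at most `concurrency`
// uploads in flight at any time, mirroring what a setting like
// `concurrency: 15` implies.
func uploadAll(files []string, concurrency int, upload func(string) error) error {
	sem := make(chan struct{}, concurrency) // counting semaphore
	var (
		wg       sync.WaitGroup
		mu       sync.Mutex
		firstErr error
	)
	for _, f := range files {
		wg.Add(1)
		sem <- struct{}{} // blocks once `concurrency` uploads are already running
		go func(f string) {
			defer wg.Done()
			defer func() { <-sem }()
			if err := upload(f); err != nil {
				mu.Lock()
				if firstErr == nil {
					firstErr = fmt.Errorf("upload %s: %w", f, err)
				}
				mu.Unlock()
			}
		}(f)
	}
	wg.Wait()
	return firstErr
}
```

Raising the value typically increases parallel uploads and, with them, the memory and network pressure on the backup job.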
4 changes: 4 additions & 0 deletions pkg/controller/nebulabackup/nebula_backup_control.go
@@ -132,6 +132,10 @@ func (c *defaultBackupControl) UpdateNebulaBackup(backup *v1alpha1.NebulaBackup)
}

func (c *defaultBackupControl) addFinalizer(backup *v1alpha1.NebulaBackup) error {
+ if !backup.CleanBackupData() && kube.HasFinalizer(backup, finalizer) {
+ return kube.UpdateFinalizer(context.TODO(), c.client, backup, kube.RemoveFinalizerOpType, finalizer)
+ }

if needToAddFinalizer(backup) {
if err := kube.UpdateFinalizer(context.TODO(), c.client, backup, kube.AddFinalizerOpType, finalizer); err != nil {
return fmt.Errorf("add backup [%s/%s] finalizer failed, err: %v", backup.Namespace, backup.Name, err)
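The `addFinalizer` change above drops a previously added finalizer when `cleanBackupData` is disabled, so a NebulaBackup that no longer needs cleanup is not blocked from deletion. A minimal sketch of the same decision, written against controller-runtime's controllerutil helpers rather than the operator's own kube package (the finalizer name below is hypothetical):

```go
package example

import (
	"context"

	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
)

// backupFinalizer is a hypothetical finalizer name used only for this sketch.
const backupFinalizer = "apps.nebula-graph.io/backup-data"

// reconcileFinalizer keeps the finalizer consistent with whether backup data
// must be cleaned up when the object is deleted.
func reconcileFinalizer(ctx context.Context, c client.Client, obj client.Object, cleanBackupData bool) error {
	switch {
	case !cleanBackupData && controllerutil.ContainsFinalizer(obj, backupFinalizer):
		// No cleanup is wanted, so a stale finalizer would only block deletion.
		controllerutil.RemoveFinalizer(obj, backupFinalizer)
	case cleanBackupData && obj.GetDeletionTimestamp().IsZero() &&
		!controllerutil.ContainsFinalizer(obj, backupFinalizer):
		// Cleanup is wanted and the object is live: add the finalizer.
		controllerutil.AddFinalizer(obj, backupFinalizer)
	default:
		return nil
	}
	return c.Update(ctx, obj)
}
```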
16 changes: 8 additions & 8 deletions pkg/controller/nebularestore/nebula_restore_control.go
@@ -17,6 +17,8 @@ limitations under the License.
package nebularestore

import (
"fmt"

corev1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"

@@ -77,23 +79,21 @@ func (c *defaultRestoreControl) UpdateNebulaRestore(nr *v1alpha1.NebulaRestore)
}
for _, pod := range pods {
if pod.Status.Phase == corev1.PodFailed {
klog.Infof("NebulaCluster [%s/%s] has failed pod %s.", ns, name, pod.Name)
terminatedReason := getPodTerminateReason(pod)
if err := c.clientSet.NebulaRestore().UpdateNebulaRestoreStatus(nr, &v1alpha1.RestoreCondition{
Type: v1alpha1.RestoreFailed,
- Status: corev1.ConditionTrue,
+ Status: corev1.ConditionUnknown,
Reason: "PodFailed",
- Message: getPodTerminateReason(pod),
+ Message: terminatedReason,
}, &kube.RestoreUpdateStatus{
ConditionType: v1alpha1.RestoreFailed,
}); err != nil {
klog.Errorf("Fail to update the condition of NebulaRestore [%s/%s], %v", ns, name, err)
}
- if nr.Spec.AutoRemoveFailed {
- if err := c.deleteRestoredCluster(ns, nr.Status.ClusterName); err != nil {
- klog.Errorf("Fail to delete NebulaCluster [%s/%s], %v", ns, nr.Status.ClusterName, err)
- }
+ if terminatedReason != "" {
+ klog.Errorf("restored cluster [%s/%s] has failed pod %s, terminated reason: %s", ns, name, pod.Name, terminatedReason)
+ return fmt.Errorf("restored cluster has failed pod: %s", pod.Name)
}
- return nil
}
}
}
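Behaviorally, the controller no longer auto-deletes the restored cluster when a pod fails; it records the RestoreFailed condition and, when a container terminated with a non-zero exit code, returns an error. A minimal sketch, assuming the returned error reaches a controller-runtime Reconcile call (the reconciler and helper names below are hypothetical stand-ins, not the operator's actual types):

```go
package example

import (
	"context"

	ctrl "sigs.k8s.io/controller-runtime"
)

type restoreReconciler struct{}

// updateNebulaRestore stands in for the control flow in the diff above: it now
// returns a non-nil error when the restored cluster has a pod that terminated
// with a non-zero exit code.
func (r *restoreReconciler) updateNebulaRestore(ctx context.Context, req ctrl.Request) error {
	// ... condition update and failed-pod check elided ...
	return nil
}

func (r *restoreReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	if err := r.updateNebulaRestore(ctx, req); err != nil {
		// Returning the error lets controller-runtime log it and requeue the
		// NebulaRestore with backoff, instead of silently removing the cluster.
		return ctrl.Result{}, err
	}
	return ctrl.Result{}, nil
}
```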
8 changes: 4 additions & 4 deletions pkg/controller/nebularestore/nebula_restore_manager.go
@@ -812,13 +812,13 @@ func (rm *restoreManager) getRestoredName(nr *v1alpha1.NebulaRestore) (string, e

func getPodTerminateReason(pod corev1.Pod) string {
for _, cs := range pod.Status.InitContainerStatuses {
- if cs.State.Terminated != nil {
- return cs.State.Terminated.String()
+ if cs.State.Terminated != nil && cs.State.Terminated.ExitCode != 0 {
+ return fmt.Sprintf("container %s terminated: %s", cs.Name, cs.State.Terminated.String())
}
}
for _, cs := range pod.Status.ContainerStatuses {
- if cs.State.Terminated != nil {
- return cs.State.Terminated.String()
+ if cs.State.Terminated != nil && cs.State.Terminated.ExitCode != 0 {
+ return fmt.Sprintf("container %s terminated: %s", cs.Name, cs.State.Terminated.String())
}
}
return ""
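The added `ExitCode != 0` guard means a container that terminated cleanly (for example, a completed init container) no longer yields a failure reason. A self-contained sketch of the same filter using the corev1 types from the diff; `terminateReason` is a local copy for illustration, not the operator's function:

```go
package example

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

// terminateReason reports the first container that terminated with a non-zero
// exit code; containers that exited 0 are ignored rather than treated as failed.
func terminateReason(pod corev1.Pod) string {
	for _, cs := range pod.Status.InitContainerStatuses {
		if t := cs.State.Terminated; t != nil && t.ExitCode != 0 {
			return fmt.Sprintf("container %s terminated: %s", cs.Name, t.String())
		}
	}
	for _, cs := range pod.Status.ContainerStatuses {
		if t := cs.State.Terminated; t != nil && t.ExitCode != 0 {
			return fmt.Sprintf("container %s terminated: %s", cs.Name, t.String())
		}
	}
	return ""
}

func Example() {
	pod := corev1.Pod{
		Status: corev1.PodStatus{
			// A completed init container (exit 0) is not a failure.
			InitContainerStatuses: []corev1.ContainerStatus{{
				Name:  "init-agent",
				State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{ExitCode: 0}},
			}},
			// The restore container failed with exit code 1.
			ContainerStatuses: []corev1.ContainerStatus{{
				Name:  "restore",
				State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{ExitCode: 1, Reason: "Error"}},
			}},
		},
	}
	fmt.Println(terminateReason(pod)) // only the restore container is reported
}
```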