From 9b00dcc32e187560ce36e3f45f8926c7f91f5b48 Mon Sep 17 00:00:00 2001 From: Johnu George Date: Mon, 3 Sep 2018 13:30:48 +0530 Subject: [PATCH 1/2] Compatible with upstream changes --- Gopkg.lock | 4 ++-- pkg/controller.v2/pytorch/controller.go | 5 +++-- pkg/controller.v2/pytorch/controller_test.go | 2 +- pkg/controller.v2/pytorch/pod.go | 2 +- .../pkg/controller.v2/jobcontroller/jobcontroller.go | 10 +++------- 5 files changed, 10 insertions(+), 13 deletions(-) diff --git a/Gopkg.lock b/Gopkg.lock index ebb0dcd67..142f8497b 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -227,7 +227,7 @@ [[projects]] branch = "master" - digest = "1:ea96765fc263e77a6a3df99fae6b2c90f2ea66374cc12ba65245920f37cff6d4" + digest = "1:00720e3e8a1266a155dec3deb18d24c032351a43bd82954962ae3ac94f38c42b" name = "github.com/kubeflow/tf-operator" packages = [ "pkg/control", @@ -238,7 +238,7 @@ "pkg/version", ] pruneopts = "NUT" - revision = "52f860977b8386c3a7c53aab6d6dbea141da61e0" + revision = "ed91105e7aa2e6f00c10c70ec99d369cc0254076" [[projects]] branch = "master" diff --git a/pkg/controller.v2/pytorch/controller.go b/pkg/controller.v2/pytorch/controller.go index 53264094a..327699ff7 100644 --- a/pkg/controller.v2/pytorch/controller.go +++ b/pkg/controller.v2/pytorch/controller.go @@ -297,7 +297,8 @@ func (pc *PyTorchController) syncPyTorchJob(key string) (bool, error) { jobNeedsSync := pc.satisfiedExpectations(job) if pc.Config.EnableGangScheduling { - _, err := pc.SyncPdb(job) + minAvailableReplicas := getTotalReplicas(job) + _, err := pc.SyncPdb(job, minAvailableReplicas) if err != nil { logger.Warnf("Sync pdb %v: %v", job.Name, err) } @@ -318,7 +319,7 @@ func (pc *PyTorchController) syncPyTorchJob(key string) (bool, error) { return true, err } -func (pc *PyTorchController) GetTotalReplicas(obj metav1.Object) int32 { +func getTotalReplicas(obj metav1.Object) int32 { job := obj.(*v1alpha2.PyTorchJob) jobReplicas := int32(0) for _, r := range job.Spec.PyTorchReplicaSpecs { diff --git a/pkg/controller.v2/pytorch/controller_test.go b/pkg/controller.v2/pytorch/controller_test.go index 5470d591a..9835f241c 100644 --- a/pkg/controller.v2/pytorch/controller_test.go +++ b/pkg/controller.v2/pytorch/controller_test.go @@ -434,7 +434,7 @@ func TestSyncPdb(t *testing.T) { }, } for _, c := range testCases { - pdb, _ := ctr.SyncPdb(c.job) + pdb, _ := ctr.SyncPdb(c.job, getTotalReplicas(c.job)) if pdb == nil && c.expectPdb != nil { t.Errorf("Got nil, want %v", c.expectPdb.Spec) } diff --git a/pkg/controller.v2/pytorch/pod.go b/pkg/controller.v2/pytorch/pod.go index f04e5d203..fda6dad91 100644 --- a/pkg/controller.v2/pytorch/pod.go +++ b/pkg/controller.v2/pytorch/pod.go @@ -145,7 +145,7 @@ func (pc *PyTorchController) createNewPod(job *v1alpha2.PyTorchJob, rtype v1alph labels[replicaIndexLabel] = index podTemplate := spec.Template.DeepCopy() - totalReplicas := pc.GetTotalReplicas(job) + totalReplicas := getTotalReplicas(job) // Set name for the template. podTemplate.Name = jobcontroller.GenGeneralName(job.Name, rt, index) diff --git a/vendor/github.com/kubeflow/tf-operator/pkg/controller.v2/jobcontroller/jobcontroller.go b/vendor/github.com/kubeflow/tf-operator/pkg/controller.v2/jobcontroller/jobcontroller.go index 7455b763b..2af9cfc0c 100644 --- a/vendor/github.com/kubeflow/tf-operator/pkg/controller.v2/jobcontroller/jobcontroller.go +++ b/vendor/github.com/kubeflow/tf-operator/pkg/controller.v2/jobcontroller/jobcontroller.go @@ -52,9 +52,6 @@ type ControllerInterface interface { // Returns the Replica Index(value) in the labels of the job GetReplicaIndexLabelKey() string - // Returns total replicas for a job. This is used for gang scheduling - GetTotalReplicas(obj metav1.Object) int32 - // Returns the Job from Infomer Cache GetJobFromInformerCache(namespace, name string) (metav1.Object, error) @@ -201,11 +198,10 @@ func (jc *JobController) GenLabels(jobName string) map[string]string { } // SyncPdb will create a PDB for gang scheduling by kube-arbitrator. -func (jc *JobController) SyncPdb(job metav1.Object) (*v1beta1.PodDisruptionBudget, error) { +func (jc *JobController) SyncPdb(job metav1.Object, minAvailableReplicas int32) (*v1beta1.PodDisruptionBudget, error) { labelJobName := jc.Controller.GetJobNameLabelKey() - totalJobReplicas := jc.Controller.GetTotalReplicas(job) // Non-distributed training is not required gang scheduling - if totalJobReplicas < 2 { + if minAvailableReplicas < 2 { return nil, nil } @@ -219,7 +215,7 @@ func (jc *JobController) SyncPdb(job metav1.Object) (*v1beta1.PodDisruptionBudge } // Create pdb for gang scheduling by kube-arbitrator - minAvailable := intstr.FromInt(int(totalJobReplicas)) + minAvailable := intstr.FromInt(int(minAvailableReplicas)) createPdb := &v1beta1.PodDisruptionBudget{ ObjectMeta: metav1.ObjectMeta{ Name: job.GetName(), From c966f9e84cc4b83b6f6f4363b33a484813fa85e8 Mon Sep 17 00:00:00 2001 From: Johnu George Date: Sat, 31 Aug 2019 16:55:59 +0530 Subject: [PATCH 2/2] Format error logs --- pkg/common/config/config.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/common/config/config.go b/pkg/common/config/config.go index 5e138fb2e..39f099da0 100644 --- a/pkg/common/config/config.go +++ b/pkg/common/config/config.go @@ -15,8 +15,9 @@ var initContainerTemplate = ` func init() { bytes, err := ioutil.ReadFile("/etc/config/initContainer.yaml") if err != nil { - log.Warningf("error while read initContainerTemplate, use default. error: %s", err) + log.Info("Using default init container template") } else { + log.Info("Using init container template from /etc/config/initContainer.yaml") initContainerTemplate = string(bytes) } }