Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions cmd/kar-controllers/app/options/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ type ServerOption struct {
QuotaRestURL string
HealthProbeListenAddr string
DispatchResourceReservationTimeout int64
ExternalDispatch bool // if true, will use external plugin to dispatch workloads
}

// NewServerOption creates a new CMServer with a default config.
Expand Down Expand Up @@ -83,6 +84,8 @@ func (s *ServerOption) AddFlags(fs *flag.FlagSet) {
fs.IntVar(&s.SecurePort, "secure-port", 6443, "The port on which to serve secured, authenticated access for metrics.")
fs.StringVar(&s.HealthProbeListenAddr, "healthProbeListenAddr", ":8081", "Listen address for health probes. Defaults to ':8081'")
fs.Int64Var(&s.DispatchResourceReservationTimeout, "dispatchResourceReservationTimeout", s.DispatchResourceReservationTimeout, "Resource reservation timeout for pods to be created once AppWrapper is dispatched, in millisecond. Defaults to '300000', 5 minutes")
fs.BoolVar(&s.ExternalDispatch,"externalDispatch", s.ExternalDispatch,"Use external workload dispatch plugin. Default is false.")

flag.Parse()
klog.V(4).Infof("[AddFlags] Controller configuration: %#v", s)
}
Expand Down Expand Up @@ -147,6 +150,12 @@ func (s *ServerOption) loadDefaultsFromEnvVars() {
s.DispatchResourceReservationTimeout = to
}
}
externalDispatch, envVarExists := os.LookupEnv("EXTERNAL_DISPATCH")
s.ExternalDispatch = false
if envVarExists && strings.EqualFold(externalDispatch, "true") {
s.ExternalDispatch = true
}

}

func (s *ServerOption) CheckOptionOrDie() {
Expand Down
1 change: 1 addition & 0 deletions deployment/mcad-controller/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ metadata:
data:
QUOTA_ENABLED: {{ .Values.configMap.quotaEnabled }}
DISPATCHER_MODE: {{ .Values.configMap.dispatcherMode }}
EXTERNAL_DISPATCH: {{.Values.configMap.externalDispatch }}
{{ if .Values.configMap.agentConfigs }}DISPATCHER_AGENT_CONFIGS: {{ .Values.configMap.agentConfigs }}{{ end }}
PREEMPTION: {{ .Values.configMap.preemptionEnabled }}
{{ if .Values.configMap.quotaRestUrl }}QUOTA_REST_URL: {{ .Values.configMap.quotaRestUrl }}{{ end }}
Expand Down
1 change: 1 addition & 0 deletions deployment/mcad-controller/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ configMap:
quotaEnabled: '"false"'
multiCluster: false
dispatcherMode: '"false"'
externalDispatch: '"false"'
preemptionEnabled: '"false"'
agentConfigs: ""
quotaRestUrl: ""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ kind: ClusterServiceVersion
metadata:
annotations:
alm-examples: >-
[{"apiVersion":"mcad.ibm.com/v1beta1","kind":"MCADHelmConfig","metadata":{"name":"example-mcadhelmconfig"},"spec":{"configMap":{"agentConfigs":null,"dispatcherMode":"false","name":null},"deploymentName":"xqueuejob-controller","image":{"pullPolicy":"Always","repository":"darroyo/mcad-controller","tag":"v1.29.0"},"imagePullSecret":{"name":null,"password":"dummyvalue","registry":"registry.stage1.ng.bluemix.net","username":"iamapikey"},"loglevel":4,"namespace":"kube-system","nodeSelector":{"hostname":null},"replicaCount":1,"resources":{"limits":{"cpu":"2000m","memory":"2048Mi"},"requests":{"cpu":"2000m","memory":"2048Mi"}},"serviceAccount":"xqueuejob-controller","volumes":{"hostPath":null}}}]
[{"apiVersion":"mcad.ibm.com/v1beta1","kind":"MCADHelmConfig","metadata":{"name":"example-mcadhelmconfig"},"spec":{"configMap":{"agentConfigs":null,"dispatcherMode":"false","externalDispatch":"false","name":null},"deploymentName":"xqueuejob-controller","image":{"pullPolicy":"Always","repository":"darroyo/mcad-controller","tag":"v1.29.0"},"imagePullSecret":{"name":null,"password":"dummyvalue","registry":"registry.stage1.ng.bluemix.net","username":"iamapikey"},"loglevel":4,"namespace":"kube-system","nodeSelector":{"hostname":null},"replicaCount":1,"resources":{"limits":{"cpu":"2000m","memory":"2048Mi"},"requests":{"cpu":"2000m","memory":"2048Mi"}},"serviceAccount":"xqueuejob-controller","volumes":{"hostPath":null}}}]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This operator is deprecated but no need to undo.

capabilities: Basic Install
description: A Kubernetes Native Holistic Lifecycle Resource Manager for Applications
name: mcad-operator.v0.1.9
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ spec:
configMap:
agentConfigs: null
dispatcherMode: "false"
externalDispatch: "false"
name: null
deploymentName: xqueuejob-controller
image:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ kind: ClusterServiceVersion
metadata:
annotations:
alm-examples: >-
[{"apiVersion":"mcad.ibm.com/v1beta1","kind":"MCADHelmConfig","metadata":{"name":"example-mcadhelmconfig"},"spec":{"configMap":{"agentConfigs":null,"dispatcherMode":"false","name":null},"deploymentName":"xqueuejob-controller","image":{"pullPolicy":"Always","repository":"darroyo/mcad-controller","tag":"v1.29.0"},"imagePullSecret":{"name":null,"password":"dummyvalue","registry":"registry.stage1.ng.bluemix.net","username":"iamapikey"},"loglevel":4,"namespace":"kube-system","nodeSelector":{"hostname":null},"replicaCount":1,"resources":{"limits":{"cpu":"2000m","memory":"2048Mi"},"requests":{"cpu":"2000m","memory":"2048Mi"}},"serviceAccount":"xqueuejob-controller","volumes":{"hostPath":null}}}]
[{"apiVersion":"mcad.ibm.com/v1beta1","kind":"MCADHelmConfig","metadata":{"name":"example-mcadhelmconfig"},"spec":{"configMap":{"agentConfigs":null,"dispatcherMode":"false","externalDispatch":"false","name":null},"deploymentName":"xqueuejob-controller","image":{"pullPolicy":"Always","repository":"darroyo/mcad-controller","tag":"v1.29.0"},"imagePullSecret":{"name":null,"password":"dummyvalue","registry":"registry.stage1.ng.bluemix.net","username":"iamapikey"},"loglevel":4,"namespace":"kube-system","nodeSelector":{"hostname":null},"replicaCount":1,"resources":{"limits":{"cpu":"2000m","memory":"2048Mi"},"requests":{"cpu":"2000m","memory":"2048Mi"}},"serviceAccount":"xqueuejob-controller","volumes":{"hostPath":null}}}]
capabilities: Basic Install
description: A Kubernetes Native Holistic Lifecycle Resource Manager for Applications
name: mcad-operator.v0.1.9
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,6 @@ metadata:
namespace: kube-system
data:
DISPATCHER_MODE: {{ .Values.configMap.dispatcherMode }}
EXTERNAL_DISPATCH: {{.Values.configMap.externalDispatch }}
DISPATCHER_AGENT_CONFIGS: {{ .Values.configMap.agentConfigs }}
#{{ end }}
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ configMap:
name:
multiCluster: false
dispatcherMode: "false"
externalDispatch: "false"
agentConfigs:

volumes:
Expand Down
1 change: 1 addition & 0 deletions doc/deploy/deployment.md
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ The following table lists the configurable parameters of the helm chart and thei
| ----------------------- | ------------------------------------ | ------------- | ------------------------------------------------ |
| `configMap.agentConfigs` | *For Every Agent Cluster separated by commas(,):* Name of *agent* config file _:_ Set the dispatching mode for the _*Agent Cluster*_. Note: Only the dispatching mode `uncordon`, which indicates that the _MCAD_ controller is allowed to dispatch jobs to the _*Agent Cluster*_, is supported. | <_No default for agent config file_>:`uncordon` | `agent101config:uncordon,agent110config:uncordon` |
| `configMap.dispatcherMode` | Whether the _MCAD_ Controller should be launched in Dispatcher mode or not | `false` | `true` |
| `configMap.externalDispatch` | Whether the _MCAD_ Controller should use an external plugin to dispatch workloads | `false` | `true` |
| `configMap.name` | Name of the Kubernetes *ConfigMap* resource to configure the _MCAD_ Controller | | `mcad-deployer` |
| `deploymentName` | Name of _MCAD_ Controller Deployment Object | `mcad-controller` | `my-mcad-controller` |
| `image.pullPolicy` | Policy that dictates when the specified image is pulled | `Always` | `Never` |
Expand Down
47 changes: 39 additions & 8 deletions pkg/controller/queuejob/queuejob_controller_ex.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
"fmt"
"math"
"math/rand"
"path"
"reflect"
"runtime/debug"
"sort"
Expand Down Expand Up @@ -1011,6 +1012,31 @@ func (qjm *XController) getAggregatedAvailableResourcesPriority(unallocatedClust
}

func (qjm *XController) chooseAgent(qj *arbv1.AppWrapper) string {

if qjm.serverOption.ExternalDispatch {
clusters := qj.Spec.SchedSpec.ClusterScheduling.Clusters
var agentId = ""
apath := path.Dir(qjm.agentList[0])
var agentIdList = make([]string, len(clusters))
clustersProvided := false // assume clusters not provided
for _, clusterRef := range clusters {
if clusterRef.Name != "" {
clustersProvided = true
agentIdList = append(agentIdList, apath+"/"+clusterRef.Name )
}
}
// Target clusters were not defined by the submitter of the workload. Just pick a target
// from a known list of clusters provided in serverOption.AgentConfigs
if !clustersProvided {
agentId = qjm.agentList[rand.Int()%len(qjm.agentList)]
klog.V(1).Infof("ClusterId %s is chosen randomly from a list provided by mcad\n", agentId)
} else {
// choose target clusterId at random
agentId = agentIdList[rand.Int()%len(agentIdList)]
klog.V(1).Infof("ClusterId %s is chosen randomly from a list provided in Spec.SchedSpec.ClusterScheduling.Clusters: %s\n", agentId, agentIdList)
}
return agentId;
}

qjAggrResources := qjm.GetAggregatedResources(qj)
klog.V(2).Infof("[chooseAgent] Aggregated Resources of XQJ %s: %v\n", qj.Name, qjAggrResources)
Expand Down Expand Up @@ -1922,7 +1948,6 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool
defer func() {
klog.V(10).Infof("[worker-manageQJ] Ending %s manageQJ time=%s &qj=%p Version=%s Status=%+v", qj.Name, time.Now().Sub(startTime), qj, qj.ResourceVersion, qj.Status)
}()

if !cc.isDispatcher { // Agent Mode

if qj.DeletionTimestamp != nil {
Expand Down Expand Up @@ -2215,18 +2240,21 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool
current_time := time.Now()
klog.V(10).Infof("[worker-manageQJ] XQJ %s has Overhead Before Dispatching: %s", qj.Name, current_time.Sub(qj.CreationTimestamp.Time))
klog.V(10).Infof("[TTime] %s, %s: WorkerBeforeDispatch", qj.Name, time.Now().Sub(qj.CreationTimestamp.Time))
}

}
queuejobKey, _ := GetQueueJobKey(qj)
// agentId:=cc.dispatchMap[queuejobKey]
// if agentId!=nil {
if agentId, ok := cc.dispatchMap[queuejobKey]; ok {
klog.V(10).Infof("[Dispatcher Controller] Dispatched AppWrapper %s to Agent ID: %s.", qj.Name, agentId)
cc.agentMap[agentId].CreateJob(qj)
if cc.serverOption.ExternalDispatch {
values := strings.Split(agentId,"/")
klog.V(10).Infof("[Dispatcher Controller] Dispatching AppWrapper %s to Agent ID: %s Through External Dispatcher.", qj.Name, values[len(values)-1])
qj.Status.TargetClusterName = values[len(values)-1] //agentId
} else {
cc.agentMap[agentId].CreateJob(qj)
}
qj.Status.IsDispatched = true
} else {
klog.Errorf("[Dispatcher Controller] AppWrapper %s not found in dispatcher mapping.", qj.Name)
}
}
if klog.V(10).Enabled() {
current_time := time.Now()
klog.V(10).Infof("[Dispatcher Controller] XQJ %s has Overhead After Dispatching: %s", qj.Name, current_time.Sub(qj.CreationTimestamp.Time))
Expand Down Expand Up @@ -2277,7 +2305,10 @@ func (cc *XController) Cleanup(appwrapper *arbv1.AppWrapper) error {
if appwrapper.Status.IsDispatched {
queuejobKey, _ := GetQueueJobKey(appwrapper)
if obj, ok := cc.dispatchMap[queuejobKey]; ok {
cc.agentMap[obj].DeleteJob(appwrapper)
if !cc.serverOption.ExternalDispatch {
cc.agentMap[obj].DeleteJob(appwrapper)
}
delete(cc.dispatchMap,queuejobKey)
}
appwrapper.Status.IsDispatched = false
}
Expand Down