Skip to content

Commit

Permalink
Robustness fixes (#287)
Browse files Browse the repository at this point in the history
* small update to docs

* robustness fixes

* unload all models on agent start

* remove load models on startup
  • Loading branch information
ukclivecox authored Jun 13, 2022
1 parent 06ac908 commit 23b7b9a
Show file tree
Hide file tree
Showing 14 changed files with 114 additions and 117 deletions.
2 changes: 1 addition & 1 deletion k8s/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ DATAFLOW_IMG ?= ${DOCKERHUB_USERNAME}/seldon-dataflow-engine:${CUSTOM_IMAGE_TAG}
ENVOY_IMG ?= ${DOCKERHUB_USERNAME}/seldon-envoy:${CUSTOM_IMAGE_TAG}

MLSERVER_IMG ?= seldonio/mlserver:1.0.1
TRITON_IMG ?= nvcr.io/nvidia/tritonserver:21.12-py3
TRITON_IMG ?= nvcr.io/nvidia/tritonserver:22.05-py3

.PHONY: create
create: create-yaml create-helm-charts
Expand Down
2 changes: 1 addition & 1 deletion k8s/helm-charts/seldon-core-v2-setup/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ serverConfig:
pullPolicy: IfNotPresent
registry: nvcr.io
repository: nvidia/tritonserver
tag: 21.12-py3
tag: 22.05-py3
serverCapabilities: "triton,dali,fil,onnx,openvino,python,pytorch,tensorflow,tensorrt"
overcommitPercentage: "10"
modelVolumeStorage: 1Gi
Expand Down
2 changes: 1 addition & 1 deletion k8s/yaml/seldon-v2-components.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1153,7 +1153,7 @@ spec:
value: "9500"
- name: SERVER_MODELS_DIR
value: /mnt/agent/models
image: nvcr.io/nvidia/tritonserver:21.12-py3
image: nvcr.io/nvidia/tritonserver:22.05-py3
imagePullPolicy: IfNotPresent
livenessProbe:
httpGet:
Expand Down
2 changes: 1 addition & 1 deletion operator/config/serverconfigs/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,4 @@ images:
newTag: latest
- name: triton
newName: nvcr.io/nvidia/tritonserver
newTag: 21.12-py3
newTag: 22.05-py3
8 changes: 4 additions & 4 deletions operator/pkg/cli/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -655,11 +655,11 @@ func (sc *SchedulerClient) ListPipelines() error {
}

writer := tabwriter.NewWriter(os.Stdout, 0, 8, 1, '\t', tabwriter.AlignRight)
_, err = fmt.Fprintln(writer, "pipeline\tstate")
_, err = fmt.Fprintln(writer, "pipeline\tstate\treason")
if err != nil {
return err
}
_, err = fmt.Fprintln(writer, "--------\t-----")
_, err = fmt.Fprintln(writer, "--------\t-----\t------")
if err != nil {
return err
}
Expand All @@ -673,8 +673,8 @@ func (sc *SchedulerClient) ListPipelines() error {
}

}

_, err = fmt.Fprintf(writer, "%s\t%s\n", res.PipelineName, res.Versions[len(res.Versions)-1].State.Status.String())
pv := res.Versions[len(res.Versions)-1]
_, err = fmt.Fprintf(writer, "%s\t%s\t%s\n", res.PipelineName, pv.State.Status.String(), pv.State.Reason)
if err != nil {
return err
}
Expand Down
18 changes: 9 additions & 9 deletions scheduler/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ ENVOY_IMG ?= ${DOCKERHUB_USERNAME}/seldon-envoy:${CUSTOM_IMAGE_TAG}
# Grafana image only used for Docker compose not k8s
GRAFANA_IMG ?= ${DOCKERHUB_USERNAME}/seldon-grafana:${CUSTOM_IMAGE_TAG}
MLSERVER_IMG ?= seldonio/mlserver:1.1.0.dev3
TRITON_IMG ?= nvcr.io/nvidia/tritonserver:21.12-py3
TRITON_IMG ?= nvcr.io/nvidia/tritonserver:22.05-py3
KIND_NAME=ansible

GO_LDFLAGS := -s -w $(patsubst %,-X %, $(GO_BUILD_VARS))
Expand Down Expand Up @@ -165,31 +165,31 @@ docker-push-all: docker-push-agent docker-push-envoy docker-push-rclone docker-p
#####################################

.PHONY: kind-image-install-scheduler
kind-image-install-scheduler: docker-build-scheduler
kind-image-install-scheduler:
kind load -v 3 docker-image ${SCHEDULER_IMG} --name ${KIND_NAME}

.PHONY: kind-image-install-agent
kind-image-install-agent: docker-build-agent
kind-image-install-agent:
kind load -v 3 docker-image ${AGENT_IMG} --name ${KIND_NAME}

.PHONY: kind-image-install-envoy
kind-image-install-envoy: docker-build-envoy
kind-image-install-envoy:
kind load -v 3 docker-image ${ENVOY_IMG} --name ${KIND_NAME}

.PHONY: kind-image-install-rclone
kind-image-install-rclone: docker-build-rclone
kind-image-install-rclone:
kind load -v 3 docker-image ${RCLONE_IMG} --name ${KIND_NAME}

.PHONY: kind-image-install-modelgateway
kind-image-install-modelgateway: docker-build-modelgateway
kind-image-install-modelgateway:
kind load -v 3 docker-image ${MODELGATEWAY_IMG} --name ${KIND_NAME}

.PHONY: kind-image-install-pipelinegateway
kind-image-install-pipelinegateway: docker-build-pipelinegateway
kind-image-install-pipelinegateway:
kind load -v 3 docker-image ${PIPELINEGATEWAY_IMG} --name ${KIND_NAME}

.PHONY: kind-image-install-dataflow
kind-image-install-dataflow: docker-build-dataflow
kind-image-install-dataflow:
kind load -v 3 docker-image ${DATAFLOW_IMG} --name ${KIND_NAME}

.PHONY: kind-image-install-all
Expand All @@ -216,7 +216,7 @@ DOCKER_COMPOSE_COMMON_IMAGES = \
SERVER_TRITON_IMAGE_AND_TAG=${TRITON_IMG} \
GRAFANA_IMAGE_AND_TAG=${GRAFANA_IMG}

DOCKER_COMPOSE_TRITON_LOG_LEVEL ?= 0
DOCKER_COMPOSE_TRITON_LOG_LEVEL ?= 5

DOCKER_COMPOSE_USE_EMPTY_VOLUMES ?= true

Expand Down
2 changes: 1 addition & 1 deletion scheduler/env.all
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ RCLONE_IMAGE_AND_TAG=seldonio/seldon-rclone:latest
MODELGATEWAY_IMAGE_AND_TAG=seldonio/seldon-modelgateway:latest
PIPELINEGATEWAY_IMAGE_AND_TAG=seldonio/seldon-pipelinegateway:latest
SERVER_MLSERVER_IMAGE_AND_TAG=seldonio/mlserver:1.0.1
SERVER_TRITON_IMAGE_AND_TAG=nvcr.io/nvidia/tritonserver:21.12-py3
SERVER_TRITON_IMAGE_AND_TAG=nvcr.io/nvidia/tritonserver:22.05-py3
SCHEDULER_IMAGE_AND_TAG=seldonio/seldon-scheduler:latest
KAFKA_IMAGE_AND_TAG=quay.io/strimzi/kafka:0.28.0-kafka-3.1.0
GRAFANA_IMAGE_AND_TAG=seldonio/seldon-grafana:latest
Expand Down
28 changes: 27 additions & 1 deletion scheduler/pkg/agent/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,12 @@ func (c *Client) WaitReady() error {
return err
}

// Unload any existing models on server to ensure we start in a clean state
err = c.UnloadAllModels()
if err != nil {
return err
}

// http reverse proxy
if err := startSubService(c.rpHTTP, logger); err != nil {
return err
Expand All @@ -207,6 +213,26 @@ func (c *Client) WaitReady() error {
return nil
}

func (c *Client) UnloadAllModels() error {
logger := c.logger.WithField("func", "UnloadAllModels")
models, err := c.stateManager.v2Client.GetModels()
if err != nil {
return err
}
for _, model := range models {
logger.Infof("Unloading existing model %s", model)
v2Err := c.stateManager.v2Client.UnloadModel(model)
if v2Err != nil {
return v2Err.err
}
err := c.ModelRepository.RemoveModelVersion(model)
if err != nil {
return err
}
}
return nil
}

func startSubService(service ClientServiceInterface, logger *log.Entry) error {
// debug service
logger.Infof("Starting and waiting for %s", service.Name())
Expand Down Expand Up @@ -388,7 +414,7 @@ func (c *Client) UnloadModel(request *agent.ModelOperationMessage) error {

logger.Infof("Unload model %s:%d", modelName, modelVersion)

_, err := c.ModelRepository.RemoveModelVersion(modelWithVersion, pinnedModelVersion)
err := c.ModelRepository.RemoveModelVersion(modelWithVersion)
if err != nil {
c.sendModelEventError(modelName, modelVersion, agent.ModelEventMessage_UNLOAD_FAILED, err)
return err
Expand Down
4 changes: 2 additions & 2 deletions scheduler/pkg/agent/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ type FakeModelRepository struct {
err error
}

func (f FakeModelRepository) RemoveModelVersion(modelName string, version uint32) (int, error) {
return 0, nil
func (f FakeModelRepository) RemoveModelVersion(modelName string) error {
return nil
}

func (f FakeModelRepository) DownloadModelVersion(modelName string, version uint32, artifactVersion *uint32, srcUri string, config []byte) (*string, error) {
Expand Down
34 changes: 5 additions & 29 deletions scheduler/pkg/agent/repository/model_repository.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ type ModelRepositoryHandler interface {

type ModelRepository interface {
DownloadModelVersion(modelName string, version uint32, artifactVersion *uint32, srcUri string, config []byte) (*string, error)
RemoveModelVersion(modelName string, version uint32) (int, error)
RemoveModelVersion(modelName string) error
Ready() error
}

Expand Down Expand Up @@ -96,37 +96,13 @@ func (r *V2ModelRepository) DownloadModelVersion(modelName string, version uint3
}

// Remove version folder and return number of remaining versions calculated as found model-settings files
func (r *V2ModelRepository) RemoveModelVersion(modelName string, version uint32) (int, error) {
logger := r.logger.WithField("func", "RemoveModelVersion")
versionStr := fmt.Sprintf("%d", version)
versionPath := filepath.Join(r.repoPath, modelName, versionStr)
logger.Debugf("Removing version path %s", versionPath)
err := os.RemoveAll(versionPath)
if err != nil {
return 0, err
}
func (r *V2ModelRepository) RemoveModelVersion(modelName string) error {
modelPath := filepath.Join(r.repoPath, modelName)
var found int
err = filepath.Walk(modelPath, func(path string, info os.FileInfo, err error) error {
if err != nil {
return err
}
if !info.IsDir() && filepath.Base(path) == "model-settings.json" {
found++
}
return nil
})
err := os.RemoveAll(modelPath)
if err != nil {
return 0, err
}
if found == 0 { // Remove model folder as no versions remain
err := os.RemoveAll(modelPath)
if err != nil {
return 0, err
}
return err
}
logger.Debugf("Found %d versions in %s", found, modelPath)
return found, nil
return nil
}

func (r *V2ModelRepository) Ready() error {
Expand Down
58 changes: 8 additions & 50 deletions scheduler/pkg/agent/repository/model_repository_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -226,11 +226,9 @@ func TestRemoveModelVersion(t *testing.T) {
g := NewGomegaWithT(t)

type test struct {
name string
folders map[string]*mlserver.ModelSettings
modelName string
versionToDelete uint32
expectedVersions int
name string
folders map[string]*mlserver.ModelSettings
modelName string
}

tests := []test{
Expand All @@ -245,43 +243,6 @@ func TestRemoveModelVersion(t *testing.T) {
},
},
},
versionToDelete: 1,
expectedVersions: 0,
},
{
name: "DeleteVersionNotExisting",
folders: map[string]*mlserver.ModelSettings{
"1": {
Name: "iris",
Implementation: "mlserver_sklearn.SKLearnModel",
Parameters: &mlserver.ModelParameters{
Version: "1",
},
},
},
versionToDelete: 2,
expectedVersions: 1,
},
{
name: "DeleteOneVersion",
folders: map[string]*mlserver.ModelSettings{
"1": {
Name: "iris",
Implementation: "mlserver_sklearn.SKLearnModel",
Parameters: &mlserver.ModelParameters{
Version: "1",
},
},
"2": {
Name: "iris",
Implementation: "mlserver_sklearn.SKLearnModel",
Parameters: &mlserver.ModelParameters{
Version: "2",
},
},
},
versionToDelete: 1,
expectedVersions: 1,
},
}

Expand All @@ -301,15 +262,12 @@ func TestRemoveModelVersion(t *testing.T) {
logger := log.New()
logger.SetLevel(log.DebugLevel)
mr := NewModelRepository(logger, nil, path, nil)
found, err := mr.RemoveModelVersion(test.modelName, test.versionToDelete)
err := mr.RemoveModelVersion(test.modelName)
g.Expect(err).To(BeNil())
g.Expect(found).To(Equal(test.expectedVersions))
if found == 0 {
modelPath := filepath.Join(path, test.modelName)
_, err := os.Stat(modelPath)
g.Expect(err).ToNot(BeNil())
g.Expect(errors.Is(err, os.ErrNotExist)).To(BeTrue())
}
modelPath := filepath.Join(path, test.modelName)
_, err = os.Stat(modelPath)
g.Expect(err).ToNot(BeNil())
g.Expect(errors.Is(err, os.ErrNotExist)).To(BeTrue())
})
}
}
24 changes: 24 additions & 0 deletions scheduler/pkg/agent/v2.go
Original file line number Diff line number Diff line change
Expand Up @@ -284,3 +284,27 @@ func (v *V2Client) readyGrpc() error {
_, err := v.grpcClient.ServerReady(ctx, req)
return err
}

func (v *V2Client) GetModels() ([]string, error) {
if v.isGrpc {
return v.getModelsGrpc()
} else {
v.logger.Warnf("Http GetModels not available returning empty list")
return []string{}, nil
}
}

func (v *V2Client) getModelsGrpc() ([]string, error) {
var models []string
ctx := context.Background()
req := &v2.RepositoryIndexRequest{}

res, err := v.grpcClient.RepositoryIndex(ctx, req)
if err != nil {
return nil, err
}
for _, modelRes := range res.Models {
models = append(models, modelRes.Name)
}
return models, nil
}
Loading

0 comments on commit 23b7b9a

Please sign in to comment.