Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: implement MlxResetFW to reset the FW on VF changes #733

Merged
merged 2 commits into from
Aug 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion Dockerfile.sriov-network-config-daemon
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ RUN make _build-sriov-network-config-daemon BIN_PATH=build/_output/cmd

FROM quay.io/centos/centos:stream9
ARG MSTFLINT=mstflint
RUN ARCH_DEP_PKGS=$(if [ "$(uname -m)" != "s390x" ]; then echo -n ${MSTFLINT} ; fi) && yum -y install hwdata $ARCH_DEP_PKGS && yum clean all
# We have to ensure that pciutils is installed. This package is needed for mstfwreset to succeed.
# xref pkg/vendors/mellanox/mellanox.go#L150
RUN ARCH_DEP_PKGS=$(if [ "$(uname -m)" != "s390x" ]; then echo -n ${MSTFLINT} ; fi) && yum -y install hwdata pciutils $ARCH_DEP_PKGS && yum clean all
tobiasgiese marked this conversation as resolved.
Show resolved Hide resolved
LABEL io.k8s.display-name="sriov-network-config-daemon" \
io.k8s.description="This is a daemon that manage and config sriov network devices in Kubernetes cluster"
COPY --from=builder /go/src/github.com/k8snetworkplumbingwg/sriov-network-operator/build/_output/cmd/sriov-network-config-daemon /usr/bin/
Expand Down
15 changes: 15 additions & 0 deletions cmd/sriov-network-config-daemon/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (

"github.com/spf13/cobra"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/rest"
Expand All @@ -41,6 +42,7 @@ import (
snclientset "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/client/clientset/versioned"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/daemon"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/featuregate"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper"
snolog "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/log"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms"
Expand Down Expand Up @@ -276,6 +278,18 @@ func runStartCmd(cmd *cobra.Command, args []string) error {
}
go nodeWriter.Run(stopCh, refreshCh, syncCh)

// Init feature gates once to prevent race conditions.
defaultConfig := &sriovnetworkv1.SriovOperatorConfig{}
err = kClient.Get(context.Background(), types.NamespacedName{Namespace: vars.Namespace, Name: consts.DefaultConfigName}, defaultConfig)
if err != nil {
log.Log.Error(err, "Failed to get default SriovOperatorConfig object")
return err
}
featureGates := featuregate.New()
featureGates.Init(defaultConfig.Spec.FeatureGates)
vars.MlxPluginFwReset = featureGates.IsEnabled(consts.MellanoxFirmwareResetFeatureGate)
log.Log.Info("Enabled featureGates", "featureGates", featureGates.String())

setupLog.V(0).Info("Starting SriovNetworkConfigDaemon")
err = daemon.New(
kClient,
Expand All @@ -288,6 +302,7 @@ func runStartCmd(cmd *cobra.Command, args []string) error {
syncCh,
refreshCh,
eventRecorder,
featureGates,
startOpts.disabledPlugins,
).Run(stopCh, exitCh)
if err != nil {
Expand Down
3 changes: 3 additions & 0 deletions pkg/consts/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,9 @@ const (
// ManageSoftwareBridgesFeatureGate: enables management of software bridges by the operator
ManageSoftwareBridgesFeatureGate = "manageSoftwareBridges"

// MellanoxFirmwareResetFeatureGate: enables the firmware reset via mstfwreset before a reboot
MellanoxFirmwareResetFeatureGate = "mellanoxFirmwareReset"
tobiasgiese marked this conversation as resolved.
Show resolved Hide resolved

// The path to the file on the host filesystem that contains the IB GUID distribution for IB VFs
InfinibandGUIDConfigFilePath = SriovConfBasePath + "/infiniband/guids"
)
Expand Down
14 changes: 14 additions & 0 deletions pkg/daemon/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"fmt"
"math/rand"
"os/exec"
"reflect"
"sync"
"time"

Expand All @@ -23,6 +24,7 @@ import (
snclientset "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/client/clientset/versioned"
sninformer "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/client/informers/externalversions"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/featuregate"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper"
snolog "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/log"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms"
Expand Down Expand Up @@ -82,6 +84,8 @@ type Daemon struct {
workqueue workqueue.RateLimitingInterface

eventRecorder *EventRecorder

featureGate featuregate.FeatureGate
}

func New(
Expand All @@ -95,6 +99,7 @@ func New(
syncCh <-chan struct{},
refreshCh chan<- Message,
er *EventRecorder,
featureGates featuregate.FeatureGate,
disabledPlugins []string,
) *Daemon {
return &Daemon{
Expand All @@ -113,6 +118,7 @@ func New(
&workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(updateDelay), 1)},
workqueue.NewItemExponentialFailureRateLimiter(1*time.Second, maxUpdateBackoff)), "SriovNetworkNodeState"),
eventRecorder: er,
featureGate: featureGates,
disabledPlugins: disabledPlugins,
}
}
Expand Down Expand Up @@ -286,6 +292,7 @@ func (dn *Daemon) operatorConfigAddHandler(obj interface{}) {
}

func (dn *Daemon) operatorConfigChangeHandler(old, new interface{}) {
oldCfg := old.(*sriovnetworkv1.SriovOperatorConfig)
newCfg := new.(*sriovnetworkv1.SriovOperatorConfig)
if newCfg.Namespace != vars.Namespace || newCfg.Name != consts.DefaultConfigName {
log.Log.V(2).Info("unsupported SriovOperatorConfig", "namespace", newCfg.Namespace, "name", newCfg.Name)
Expand All @@ -299,6 +306,13 @@ func (dn *Daemon) operatorConfigChangeHandler(old, new interface{}) {
dn.disableDrain = newDisableDrain
log.Log.Info("Set Disable Drain", "value", dn.disableDrain)
}

if !reflect.DeepEqual(oldCfg.Spec.FeatureGates, newCfg.Spec.FeatureGates) {
dn.featureGate.Init(newCfg.Spec.FeatureGates)
log.Log.Info("Updated featureGates", "featureGates", dn.featureGate.String())
}

vars.MlxPluginFwReset = dn.featureGate.IsEnabled(consts.MellanoxFirmwareResetFeatureGate)
}

func (dn *Daemon) nodeStateSyncHandler() error {
Expand Down
9 changes: 6 additions & 3 deletions pkg/daemon/daemon_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,18 @@ import (
"sigs.k8s.io/controller-runtime/pkg/log/zap"

sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
mock_platforms "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms/mock"
"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/fakefilesystem"

snclient "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/client/clientset/versioned"
snclientset "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/client/clientset/versioned/fake"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/featuregate"
mock_helper "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper/mock"
mock_platforms "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms/mock"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms/openshift"
plugin "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/plugins"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/plugins/fake"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/plugins/generic"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars"
"github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/fakefilesystem"
)

func TestConfigDaemon(t *testing.T) {
Expand Down Expand Up @@ -151,6 +151,8 @@ var _ = Describe("Config Daemon", func() {
vendorHelper.EXPECT().PrepareNMUdevRule([]string{"0x1014", "0x154c"}).Return(nil).AnyTimes()
vendorHelper.EXPECT().PrepareVFRepUdevRule().Return(nil).AnyTimes()

featureGates := featuregate.New()

sut = New(
kClient,
snclient,
Expand All @@ -162,6 +164,7 @@ var _ = Describe("Config Daemon", func() {
syncCh,
refreshCh,
er,
featureGates,
nil,
)

Expand Down
14 changes: 14 additions & 0 deletions pkg/helper/mock/mock_helper.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 14 additions & 1 deletion pkg/plugins/mellanox/mellanox_plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper"
plugin "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/plugins"
"github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars"
mlx "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vendors/mellanox"
)

Expand All @@ -20,6 +21,7 @@ type MellanoxPlugin struct {
helpers helper.HostHelpersInterface
}

var pciAddressesToReset []string
var attributesToChange map[string]mlx.MlxNic
var mellanoxNicsStatus map[string]map[string]sriovnetworkv1.InterfaceExt
var mellanoxNicsSpec map[string]sriovnetworkv1.Interface
Expand Down Expand Up @@ -52,6 +54,7 @@ func (p *MellanoxPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeS
needDrain = false
needReboot = false
err = nil
pciAddressesToReset = []string{}
attributesToChange = map[string]mlx.MlxNic{}
mellanoxNicsStatus = map[string]map[string]sriovnetworkv1.InterfaceExt{}
mellanoxNicsSpec = map[string]sriovnetworkv1.Interface{}
Expand Down Expand Up @@ -132,6 +135,10 @@ func (p *MellanoxPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeS
if needReboot || changeWithoutReboot {
attributesToChange[ifaceSpec.PciAddress] = *attrs
}

if needReboot {
pciAddressesToReset = append(pciAddressesToReset, ifaceSpec.PciAddress)
}
}

// Set total VFs to 0 for mellanox interfaces with no spec
Expand Down Expand Up @@ -202,7 +209,13 @@ func (p *MellanoxPlugin) Apply() error {
return nil
}
log.Log.Info("mellanox plugin Apply()")
return p.helpers.MlxConfigFW(attributesToChange)
if err := p.helpers.MlxConfigFW(attributesToChange); err != nil {
return err
}
if vars.MlxPluginFwReset {
tobiasgiese marked this conversation as resolved.
Show resolved Hide resolved
return p.helpers.MlxResetFW(pciAddressesToReset)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

return an error here can be problematic if we have multiple PCIAddress and one them them failed (for example the primary one)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I understand you correctly your idea would be to introduce a errs variable to append the errors and return it at the end of the function, right?

}
return nil
}

// nicHasExternallyManagedPFs returns true if one of the ports(interface) of the NIC is marked as externally managed
Expand Down
3 changes: 3 additions & 0 deletions pkg/vars/vars.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ var (
// ManageSoftwareBridges global variable which reflects state of manageSoftwareBridges feature
ManageSoftwareBridges = false

// MlxPluginFwReset global variable enables mstfwreset before rebooting a node on VF changes
MlxPluginFwReset = false

// FilesystemRoot used by test to mock interactions with filesystem
FilesystemRoot = ""

Expand Down
18 changes: 18 additions & 0 deletions pkg/vendors/mellanox/mellanox.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"strconv"
"strings"

kerrors "k8s.io/apimachinery/pkg/util/errors"
"sigs.k8s.io/controller-runtime/pkg/log"

sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
Expand Down Expand Up @@ -60,6 +61,7 @@ type MellanoxInterface interface {
GetMlxNicFwData(pciAddress string) (current, next *MlxNic, err error)

MlxConfigFW(attributesToChange map[string]MlxNic) error
MlxResetFW(pciAddresses []string) error
}

type mellanoxHelper struct {
Expand Down Expand Up @@ -141,6 +143,22 @@ func (m *mellanoxHelper) GetMellanoxBlueFieldMode(PciAddress string) (BlueFieldM
return -1, fmt.Errorf("MellanoxBlueFieldMode(): unknown device status for %s", PciAddress)
}

func (m *mellanoxHelper) MlxResetFW(pciAddresses []string) error {
log.Log.Info("mellanox-plugin resetFW()")
var errs []error
for _, pciAddress := range pciAddresses {
tobiasgiese marked this conversation as resolved.
Show resolved Hide resolved
cmdArgs := []string{"-d", pciAddress, "--skip_driver", "-l", "3", "-y", "reset"}
log.Log.Info("mellanox-plugin: resetFW()", "cmd-args", cmdArgs)
// We have to ensure that pciutils is installed into the container image Dockerfile.sriov-network-config-daemon
_, stderr, err := m.utils.RunCommand("mstfwreset", cmdArgs...)
if err != nil {
log.Log.Error(err, "mellanox-plugin resetFW(): failed", "stderr", stderr)
errs = append(errs, err)
}
}
return kerrors.NewAggregate(errs)
}

func (m *mellanoxHelper) MlxConfigFW(attributesToChange map[string]MlxNic) error {
log.Log.Info("mellanox-plugin configFW()")
for pciAddr, fwArgs := range attributesToChange {
Expand Down
14 changes: 14 additions & 0 deletions pkg/vendors/mellanox/mock/mock_mellanox.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading