Skip to content

Commit

Permalink
Fix #232: Ensure storaged does not crash on boot
Browse files Browse the repository at this point in the history
Rather than returning an error while initializing the storage module,
which ultimately crashes the module, maintain separate lists of
(presumably) faulty devices and storagepools. This allows storaged to
finish intializing with all the known working devices and storagpools in
the system. Also expose these lists over the zbus interface, so other
modules have the ability to check if there is faulty hardware and if so,
take action (e.g. notify farmer).
  • Loading branch information
LeeSmet committed Dec 13, 2019
1 parent 4a44114 commit 8922713
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 12 deletions.
24 changes: 24 additions & 0 deletions pkg/storage.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,26 @@ func (e ErrNotEnoughSpace) Error() string {
// i.e. SSD or HDD
type DeviceType string

type (
// BrokenDevice is a disk which is somehow not fully functional. Storage keeps
// track of disks which have failed at some point, so they are not used, and
// to be able to later report this to other daemons.
BrokenDevice struct {
// Path to allow identification of the disk
Path string
// Err returned which lead to the disk being marked as faulty
Err error
}

// BrokenPool contains info about a malfunctioning storage pool
BrokenPool struct {
// Label of the broken pool
Label string
// Err returned by the action which let to the pool being marked as broken
Err error
}
)

// Known device types
const (
SSDDevice DeviceType = "SSD"
Expand Down Expand Up @@ -110,4 +130,8 @@ type StorageModule interface {

// Total gives the total amount of storage available for a device type
Total(kind DeviceType) (uint64, error)
// BrokenPools lists the broken storage pools that have been detected
BrokenPools() []BrokenPool
// BrokenDevices lists the broken devices that have been detected
BrokenDevices() []BrokenDevice
}
45 changes: 33 additions & 12 deletions pkg/storage/storage.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,10 @@ var (
)

type storageModule struct {
volumes []filesystem.Pool
devices filesystem.DeviceManager
volumes []filesystem.Pool
brokenPools []pkg.BrokenPool
devices filesystem.DeviceManager
brokenDevices []pkg.BrokenDevice
}

// New create a new storage module service
Expand All @@ -45,8 +47,10 @@ func New() (pkg.StorageModule, error) {
}

s := &storageModule{
volumes: []filesystem.Pool{},
devices: m,
volumes: []filesystem.Pool{},
brokenPools: []pkg.BrokenPool{},
devices: m,
brokenDevices: []pkg.BrokenDevice{},
}

// go for a simple linear setup right now
Expand Down Expand Up @@ -88,6 +92,16 @@ func (s *storageModule) Total(kind pkg.DeviceType) (uint64, error) {
return total, nil
}

// BrokenPools lists the broken storage pools that have been detected
func (s *storageModule) BrokenPools() []pkg.BrokenPool {
return s.brokenPools
}

// BrokenDevices lists the broken devices that have been detected
func (s *storageModule) BrokenDevices() []pkg.BrokenDevice {
return s.brokenDevices
}

/**
initialize, must be called at least onetime each boot.
What Initialize will do is the following:
Expand Down Expand Up @@ -120,11 +134,13 @@ func (s *storageModule) initialize(policy pkg.StoragePolicy) error {
}
_, err = volume.Mount()
if err != nil {
return err
s.brokenPools = append(s.brokenPools, pkg.BrokenPool{Label: volume.Name(), Err: err})
log.Warn().Msgf("Failed to mount volume %v", volume.Name())
continue
}
log.Debug().Msgf("Mounted volume %s", volume.Name())
s.volumes = append(s.volumes, volume)
}
s.volumes = append(s.volumes, existingPools...)

// list disks
log.Info().Msgf("Finding free disks")
Expand Down Expand Up @@ -191,7 +207,14 @@ func (s *storageModule) initialize(policy pkg.StoragePolicy) error {

pool, err := fs.Create(ctx, uuid.New().String(), policy.Raid, poolDevices...)
if err != nil {
return err
// Failure to create a filesystem -> disk is dead. It is possible
// that multiple devides are used to create a single pool, and only
// one devide is actuall broken. We should probably expand on
// this once we start to use storagepools spanning multiple disks.
for _, dev := range poolDevices {
s.brokenDevices = append(s.brokenDevices, pkg.BrokenDevice{Path: dev.Path, Err: err})
}
continue
}

newPools = append(newPools, pool)
Expand All @@ -205,18 +228,17 @@ func (s *storageModule) initialize(policy pkg.StoragePolicy) error {
if _, mounted := newPools[idx].Mounted(); !mounted {
log.Debug().Msgf("Mounting volume %s", newPools[idx].Name())
if _, err = newPools[idx].Mount(); err != nil {
return err
s.brokenPools = append(s.brokenPools, pkg.BrokenPool{Label: newPools[idx].Name(), Err: err})
continue
}
s.volumes = append(s.volumes, newPools[idx])
}
}

s.volumes = append(s.volumes, newPools...)

return s.ensureCache()
}

func (s *storageModule) Maintenance() error {

for _, pool := range s.volumes {
log.Info().
Str("pool", pool.Name()).
Expand Down Expand Up @@ -318,7 +340,6 @@ func (s *storageModule) ensureCache() error {
fs, err := s.createSubvol(cacheSize, cacheLabel, pkg.SSDDevice)
if errors.Is(err, pkg.ErrNotEnoughSpace{}) {
// No space on SSD (probably no SSD in the node at all), try HDD
err = nil
fs, err = s.createSubvol(cacheSize, cacheLabel, pkg.HDDDevice)
}
if err != nil {
Expand Down
24 changes: 24 additions & 0 deletions pkg/stubs/storage_stub.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,30 @@ func (s *StorageModuleStub) Allocate(arg0 pkg.DeviceType, arg1 uint64, arg2 pkg.
return
}

func (s *StorageModuleStub) BrokenDevices() (ret0 []pkg.BrokenDevice) {
args := []interface{}{}
result, err := s.client.Request(s.module, s.object, "BrokenDevices", args...)
if err != nil {
panic(err)
}
if err := result.Unmarshal(0, &ret0); err != nil {
panic(err)
}
return
}

func (s *StorageModuleStub) BrokenPools() (ret0 []pkg.BrokenPool) {
args := []interface{}{}
result, err := s.client.Request(s.module, s.object, "BrokenPools", args...)
if err != nil {
panic(err)
}
if err := result.Unmarshal(0, &ret0); err != nil {
panic(err)
}
return
}

func (s *StorageModuleStub) Claim(arg0 string, arg1 uint64) (ret0 error) {
args := []interface{}{arg0, arg1}
result, err := s.client.Request(s.module, s.object, "Claim", args...)
Expand Down

0 comments on commit 8922713

Please sign in to comment.