Skip to content

Commit

Permalink
fix: report internally service as unhealthy if not running
Browse files Browse the repository at this point in the history
Otherwise the internal code might assume that the service is still
running and healthy, never issuing a health change event.

Fixes siderolabs#9271

Signed-off-by: Andrey Smirnov <andrey.smirnov@siderolabs.com>
(cherry picked from commit 07b9179)
  • Loading branch information
smira committed Sep 25, 2024
1 parent 60b7839 commit f722c64
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 11 deletions.
4 changes: 4 additions & 0 deletions internal/app/machined/pkg/system/service_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,10 @@ func (svcrunner *ServiceRunner) run(ctx context.Context, runnr runner.Runner) er
go func() {
errCh <- runnr.Run(func(s events.ServiceState, msg string, args ...interface{}) {
svcrunner.UpdateState(ctx, s, msg, args...)

if s != events.StateRunning {
svcrunner.healthState.Update(false, "service not running")
}
})
}()

Expand Down
15 changes: 6 additions & 9 deletions internal/pkg/ntp/ntp.go
Original file line number Diff line number Diff line change
Expand Up @@ -392,18 +392,15 @@ func (syncer *Syncer) queryNTP(server string) (*Measurement, error) {
)

validationError := resp.Validate()
if validationError != nil {
return nil, validationError
}

measurement := &Measurement{
return &Measurement{
ClockOffset: resp.ClockOffset,
Leap: resp.Leap,
Spike: false,
}

if validationError == nil {
measurement.Spike = syncer.isSpike(resp)
}

return measurement, validationError
Spike: syncer.isSpike(resp),
}, nil
}

// log2i returns 0 for v == 0 and v == 1.
Expand Down
82 changes: 80 additions & 2 deletions internal/pkg/ntp/ntp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ type NTPSuite struct {
systemClock time.Time
clockAdjustments []time.Duration

failingServer int
spikyServer int
failingServer int
spikyServer int
kissOfDeathServer int
}

func TestNTPSuite(t *testing.T) {
Expand All @@ -49,6 +50,8 @@ func (suite *NTPSuite) SetupTest() {
suite.systemClock = time.Now().UTC()
suite.clockAdjustments = nil
suite.failingServer = 0
suite.spikyServer = 0
suite.kissOfDeathServer = 0
}

func (suite *NTPSuite) getSystemClock() time.Time {
Expand All @@ -73,6 +76,7 @@ func (suite *NTPSuite) adjustSystemClock(val *unix.Timex) (status timex.State, e
return
}

//nolint:gocyclo
func (suite *NTPSuite) fakeQuery(host string) (resp *beevikntp.Response, err error) {
switch host {
case "127.0.0.1": // error
Expand Down Expand Up @@ -161,6 +165,26 @@ func (suite *NTPSuite) fakeQuery(host string) (resp *beevikntp.Response, err err
suite.Require().NoError(resp.Validate())

return resp, nil
case "127.0.0.8": // kiss of death alternating
suite.kissOfDeathServer++

if suite.kissOfDeathServer%2 == 1 {
return &beevikntp.Response{ // kiss of death
Stratum: 0,
Time: suite.systemClock,
ReferenceTime: suite.systemClock,
ClockOffset: 2 * time.Millisecond,
RTT: time.Millisecond / 2,
}, nil
} else {
return &beevikntp.Response{ // normal response
Stratum: 1,
Time: suite.systemClock,
ReferenceTime: suite.systemClock,
ClockOffset: time.Millisecond,
RTT: time.Millisecond / 2,
}, nil
}
default:
return nil, fmt.Errorf("unknown host %q", host)
}
Expand Down Expand Up @@ -243,6 +267,60 @@ func (suite *NTPSuite) TestSyncContinuous() {
wg.Wait()
}

//nolint:dupl
func (suite *NTPSuite) TestSyncKissOfDeath() {
syncer := ntp.NewSyncer(zaptest.NewLogger(suite.T()).With(zap.String("controller", "ntp")), []string{"127.0.0.8"})

syncer.AdjustTime = suite.adjustSystemClock
syncer.CurrentTime = suite.getSystemClock
syncer.NTPQuery = suite.fakeQuery

syncer.MinPoll = time.Second
syncer.MaxPoll = time.Second

ctx, cancel := context.WithCancel(context.Background())
defer cancel()

var wg sync.WaitGroup

wg.Add(1)

go func() {
defer wg.Done()

syncer.Run(ctx)
}()

select {
case <-syncer.Synced():
case <-time.After(10 * time.Second):
suite.Assert().Fail("time sync timeout")
}

suite.Assert().NoError(
retry.Constant(10*time.Second, retry.WithUnits(100*time.Millisecond)).Retry(func() error {
suite.clockLock.Lock()
defer suite.clockLock.Unlock()

if len(suite.clockAdjustments) < 2 {
return retry.ExpectedErrorf("not enough syncs")
}

for _, adj := range suite.clockAdjustments {
// kiss of death syncs should be ignored
suite.Assert().Equal(time.Millisecond, adj)
}

return nil
}),
)

cancel()

wg.Wait()
}

//nolint:dupl
func (suite *NTPSuite) TestSyncWithSpikes() {
syncer := ntp.NewSyncer(logging.Wrap(log.Writer()).With(zap.String("controller", "ntp")), []string{"127.0.0.7"})

Expand Down

0 comments on commit f722c64

Please sign in to comment.