diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index d63daae50dc..ae8f81eb15b 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -45,6 +45,7 @@ https://github.com/elastic/beats/compare/v8.2.0\...main[Check the HEAD diff] - Support for multiline zookeeper logs {issue}2496[2496] - Allow `clock_nanosleep` in the default seccomp profiles for amd64 and 386. Newer versions of glibc (e.g. 2.31) require it. {issue}33792[33792] - Disable lockfile when running under elastic-agent. {pull}33988[33988] +- Fix lockfile logic, retry locking {pull}34194[34194] - Add checks to ensure reloading of units if the configuration actually changed. {pull}34346[34346] *Auditbeat* diff --git a/libbeat/cmd/instance/locks/lock.go b/libbeat/cmd/instance/locks/lock.go index e7bee0fedda..c520b6e9543 100644 --- a/libbeat/cmd/instance/locks/lock.go +++ b/libbeat/cmd/instance/locks/lock.go @@ -18,11 +18,7 @@ package locks import ( - "encoding/json" - "errors" "fmt" - "os" - "runtime" "time" "github.com/gofrs/flock" @@ -30,20 +26,14 @@ import ( "github.com/elastic/beats/v7/libbeat/beat" "github.com/elastic/elastic-agent-libs/logp" "github.com/elastic/elastic-agent-libs/paths" - metricproc "github.com/elastic/elastic-agent-system-metrics/metric/system/process" ) +// Locker is a retrying file locker type Locker struct { - fileLock *flock.Flock - logger *logp.Logger - beatName string - filePath string - beatStart time.Time -} - -type pidfile struct { - PID int `json:"pid"` - WriteTime time.Time `json:"write_time"` + fileLock *flock.Flock + retryCount int + retrySleep time.Duration + logger *logp.Logger } var ( @@ -51,24 +41,21 @@ var ( // unsuccessful because another Beat instance already has the lock on the same // data path. ErrAlreadyLocked = fmt.Errorf("data path already locked by another beat. Please make sure that multiple beats are not sharing the same data path (path.data)") - - // ErrLockfileEmpty is returned by readExistingPidfile() when an existing pidfile is found, but the file is empty. - ErrLockfileEmpty = fmt.Errorf("lockfile is empty") ) -// a little wrapper for the gitpid function to make testing easier. -var pidFetch = os.Getpid - -// New returns a new pid-aware file locker -// all logic, including checking for existing locks, is performed lazily +// New returns a new file locker func New(beatInfo beat.Info) *Locker { + return NewWithRetry(beatInfo, 4, time.Millisecond*400) +} + +// NewWithRetry returns a new file locker with the given settings +func NewWithRetry(beatInfo beat.Info, retryCount int, retrySleep time.Duration) *Locker { lockfilePath := paths.Resolve(paths.Data, beatInfo.Beat+".lock") return &Locker{ - fileLock: flock.New(lockfilePath), - logger: logp.L(), - beatName: beatInfo.Beat, - filePath: lockfilePath, - beatStart: beatInfo.StartTime, + fileLock: flock.New(lockfilePath), + retryCount: retryCount, + retrySleep: retrySleep, + logger: logp.L(), } } @@ -76,187 +63,21 @@ func New(beatInfo beat.Info) *Locker { // Beat instance. If another Beats instance already has a lock on the same data path // an ErrAlreadyLocked error is returned. func (lock *Locker) Lock() error { - new := pidfile{PID: pidFetch(), WriteTime: time.Now()} - encoded, err := json.Marshal(&new) - if err != nil { - return fmt.Errorf("error encoding json for pidfile: %w", err) - } - - // The combination of O_CREATE and O_EXCL will ensure we return an error if we don't - // manage to create the file - fh, openErr := os.OpenFile(lock.filePath, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0o600) - // Don't trust different OSes to report the errors we expect, just try to recover regardless - if openErr != nil { - err = lock.handleFailedCreate() - if err != nil { - return fmt.Errorf("cannot obtain lockfile: %w", err) - } - // If something fails here, it's probably unrecoverable - fh, err = os.OpenFile(lock.filePath, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0o600) + for i := 0; i < lock.retryCount; i++ { + // note that TryLock doesn't set an os.O_EXCL flag, + // which means that we could be locking a file we didn't create. + // This makes it easy to recover from a failed shutdown or panic, + // as the OS will clean up the lock and we'll re-lock the same file. + // However, can create odd races if you're not careful, since you don't know if you're locking "your" file. + gotLock, err := lock.fileLock.TryLock() if err != nil { - return fmt.Errorf("cannot re-obtain lockfile %s: %w", lock.filePath, err) - } - } - - // a Process can't write to its own locked file on all platforms, write first - _, err = fh.Write(encoded) - if err != nil { - return fmt.Errorf("error writing pidfile to %s: %w", lock.filePath, err) - } - - // Exclusive lock - isLocked, err := lock.fileLock.TryLock() - if err != nil { - return fmt.Errorf("unable to lock data path: %w", err) - } - // case: lock could not be obtained. - if !isLocked { - // if we're here, things are probably unrecoverable, as we've previously checked for a lockfile. Exit. - return fmt.Errorf("%s: %w", lock.filePath, ErrAlreadyLocked) - } - - return nil -} - -// Unlock attempts to release the lock on a data path previously acquired via Lock(). -func (lock *Locker) Unlock() error { - err := lock.fileLock.Unlock() - if err != nil { - return fmt.Errorf("unable to unlock data path: %w", err) - } - - err = os.Remove(lock.fileLock.Path()) - if err != nil { - return fmt.Errorf("unable to unlock data path file %s: %w", lock.fileLock.Path(), err) - } - return nil -} - -// ******* private helpers - -// handleFailedCreate will attempt to recover from a failed lock operation in a pid-aware way. -// The point of this is to deal with instances where an improper beat shutdown left us with -// a pre-existing pidfile for a beat process that no longer exists. -func (lock *Locker) handleFailedCreate() error { - // First, try to lock the file as a check to see what state we're in. - // If there's a pre-existing lock that's in effect, we probably can't recover - // Not all OSes will fail on this. - _, err := lock.fileLock.TryLock() - // Case: the file already locked, and in use by another process. - if err != nil { - if runtime.GOOS == "windows" { - // on windows, locks from dead PIDs will be auto-released, but it might take the OS a while. - // However, the time it takes for the operating system to unlock these locks depends upon available system resources. - time.Sleep(time.Second) - _, err := lock.fileLock.TryLock() - if err != nil { - return fmt.Errorf("The lockfile %s is locked after a retry, another beat is probably running", lock.fileLock) - } - } else { - return fmt.Errorf("The lockfile %s is already locked by another beat", lock.fileLock) - } - } - - // if we're here, we've locked the file - // unlock so we can continue - err = lock.fileLock.Unlock() - if err != nil { - return fmt.Errorf("error unlocking a previously found file %s after a temporary lock", lock.filePath) - } - - // read in whatever existing lockfile caused us to fail - pf, err := lock.readExistingPidfile() - // Case: two beats start up simultaneously, there's a chance we could "see" the pidfile before the other process writes to it - // or, the other beat died before it could write the pidfile. - // Sleep, read again. If we still don't have anything, assume the other PID is dead, continue. - if errors.Is(err, ErrLockfileEmpty) { - lock.logger.Debugf("Found other pidfile, but no data. Retrying.") - time.Sleep(time.Millisecond * 500) - pf, err = lock.readExistingPidfile() - if errors.Is(err, ErrLockfileEmpty) { - lock.logger.Debugf("No PID found in other lockfile, continuing") - return lock.recoverLockfile() - } else if err != nil { - return fmt.Errorf("error re-reading existing lockfile: %w", err) + return fmt.Errorf("unable to try a lock of the data path: %w", err) } - } else if err != nil { - return fmt.Errorf("error reading existing lockfile: %w", err) - } - - // Case: the lockfile is locked, but by us. Probably a coding error, - // and probably hard to do - if pf.PID == os.Getpid() { - // the lockfile was written before the beat started, meaning we restarted and somehow got the same pid - // in which case, continue - if lock.beatStart.Before(pf.WriteTime) { - return fmt.Errorf("lockfile for beat has been locked twice by the same PID, potential bug.") + if gotLock { + return nil } - lock.logger.Debugf("Beat has started with the same PID, continuing") - return lock.recoverLockfile() - } - - // Check to see if the PID found in the pidfile exists. - existsState, err := findMatchingPID(pf.PID) - // Case: we have a lockfile, but the pid from the pidfile no longer exists - // this was presumably due to the dirty shutdown. - // Try to reset the lockfile and continue. - if errors.Is(err, metricproc.ProcNotExist) { - lock.logger.Debugf("%s shut down without removing previous lockfile, continuing", lock.beatName) - return lock.recoverLockfile() - } else if err != nil { - return fmt.Errorf("error looking up status for pid %d: %w", pf.PID, err) - } else { - // Case: the PID exists, but it's attached to a zombie process - // In this case...we should be okay to restart? - if existsState == metricproc.Zombie { - lock.logger.Debugf("%s shut down without removing previous lockfile and is currently in a zombie state, continuing", lock.beatName) - return lock.recoverLockfile() - } - // Case: we've gotten a lock file for another process that's already running - // This is the "base" lockfile case, which is two beats running from the same directory - // This is where we'll catch this particular case on Linux, due to Linux's advisory-style locks. - return fmt.Errorf("cannot start, data directory belongs to process with pid %d", pf.PID) - } -} - -// recoverLockfile attempts to remove the lockfile and continue running -// This should only be called after we're sure it's safe to ignore a pre-existing lockfile -// This will reset the internal lockfile handler when it's successful. -func (lock *Locker) recoverLockfile() error { - // File remove may or not work, depending on os-specific details with lockfiles - err := os.Remove(lock.fileLock.Path()) - if err != nil { - if runtime.GOOS == "windows" { - // retry on windows, the OS can take time to clean up - time.Sleep(time.Second) - err = os.Remove(lock.fileLock.Path()) - if err != nil { - return fmt.Errorf("tried twice to remove lockfile %s on windows: %w", - lock.fileLock.Path(), err) - } - } else { - return fmt.Errorf("lockfile %s cannot be removed: %w", lock.fileLock.Path(), err) - } - - } - lock.fileLock = flock.New(lock.filePath) - return nil -} - -// readExistingPidfile will read the contents of an existing pidfile -// Will return ErrLockfileEmpty if no data is found in the lockfile -func (lock *Locker) readExistingPidfile() (pidfile, error) { - rawPidfile, err := os.ReadFile(lock.filePath) - if err != nil { - return pidfile{}, fmt.Errorf("error reading pidfile from path %s", lock.filePath) - } - if len(rawPidfile) == 0 { - return pidfile{}, ErrLockfileEmpty - } - foundPidFile := pidfile{} - err = json.Unmarshal(rawPidfile, &foundPidFile) - if err != nil { - return pidfile{}, fmt.Errorf("error reading JSON from pid file %s: %w", lock.filePath, err) + lock.logger.Debugf("Could not obtain lock for file %s, retrying %d times", lock.fileLock.Path(), (lock.retryCount - i)) + time.Sleep(lock.retrySleep) } - return foundPidFile, nil + return fmt.Errorf("%s: %w", lock.fileLock.Path(), ErrAlreadyLocked) } diff --git a/libbeat/cmd/instance/locks/process_lookup_stub.go b/libbeat/cmd/instance/locks/lock_other.go similarity index 55% rename from libbeat/cmd/instance/locks/process_lookup_stub.go rename to libbeat/cmd/instance/locks/lock_other.go index cd151ebad14..40e76c6e945 100644 --- a/libbeat/cmd/instance/locks/process_lookup_stub.go +++ b/libbeat/cmd/instance/locks/lock_other.go @@ -15,17 +15,28 @@ // specific language governing permissions and limitations // under the License. -//go:build (!darwin || !cgo) && !freebsd && !linux && !windows && !aix +//go:build !windows +// +build !windows package locks import ( "fmt" - "runtime" - - "github.com/elastic/elastic-agent-system-metrics/metric/system/process" + "os" ) -func findMatchingPID(pid int) (process.PidState, error) { - return process.Dead, fmt.Errorf("findMatchingPID not supported on platform: %s", runtime.GOOS) +// Unlock attempts to release the lock on a data path previously acquired via Lock(). This will first remove the file, then unlock the file handle. +func (lock *Locker) Unlock() error { + // Unlock will remove the file while we still have the lock, so we reduce the odds of another beat swooping in to start between the Unlock() and Remove() operation. + err := os.Remove(lock.fileLock.Path()) + if err != nil { + lock.logger.Warnf("could not remove lockfile at %s: %s", lock.fileLock.Path(), err) + } + + err = lock.fileLock.Unlock() + if err != nil { + return fmt.Errorf("unable to unlock data path: %w", err) + } + + return nil } diff --git a/libbeat/cmd/instance/locks/lock_test.go b/libbeat/cmd/instance/locks/lock_test.go index 10c63f48567..8e112a1b3f7 100644 --- a/libbeat/cmd/instance/locks/lock_test.go +++ b/libbeat/cmd/instance/locks/lock_test.go @@ -21,9 +21,7 @@ import ( "fmt" "os" "testing" - "time" - "github.com/gofrs/uuid" "github.com/stretchr/testify/require" "github.com/elastic/beats/v7/libbeat/beat" @@ -60,96 +58,74 @@ func TestMain(m *testing.M) { os.Exit(exit) } -func TestLockWithDeadPid(t *testing.T) { - // create old lockfile - pidFetch = fakeDeadPid - testBeat := beat.Info{Beat: mustNewUUID(t), StartTime: time.Now()} - locker := New(testBeat) - err := locker.Lock() - require.NoError(t, err) +func TestLocker(t *testing.T) { + // Setup two beats with same name and data path + const beatName = "testbeat-testlocker" - // create new locker - pidFetch = os.Getpid - newLocker := New(testBeat) - err = newLocker.Lock() - require.NoError(t, err) -} + b1 := beat.Info{} + b1.Beat = beatName -func TestLockWithTwoBeats(t *testing.T) { - testBeat := beat.Info{Beat: mustNewUUID(t), StartTime: time.Now()} - // emulate two beats trying to run from the same data path - locker := New(testBeat) - // use the parent process as another random beat - pidFetch = os.Getppid - err := locker.Lock() - require.NoError(t, err) + b2 := beat.Info{} + b2.Beat = beatName - // create new locker for this beat - pidFetch = os.Getpid - newLocker := New(testBeat) - err = newLocker.Lock() - require.Error(t, err) - t.Logf("Got desired error: %s", err) -} - -func TestDoubleLock(t *testing.T) { - testBeat := beat.Info{Beat: mustNewUUID(t), StartTime: time.Now()} - locker := New(testBeat) - err := locker.Lock() + // Try to get a lock for the first beat. Expect it to succeed. + bl1 := New(b1) + err := bl1.Lock() require.NoError(t, err) - newLocker := New(testBeat) - err = newLocker.Lock() + // Try to get a lock for the second beat. Expect it to fail because the + // first beat already has the lock. + bl2 := New(b2) + err = bl2.Lock() require.Error(t, err) - t.Logf("Got desired error: %s", err) + } func TestUnlock(t *testing.T) { - testBeat := beat.Info{Beat: mustNewUUID(t), StartTime: time.Now()} - locker := New(testBeat) - err := locker.Lock() - require.NoError(t, err) + const beatName = "testbeat-testunlock" - err = locker.Unlock() - require.NoError(t, err) -} + b1 := beat.Info{} + b1.Beat = beatName -func TestRestartWithSamePID(t *testing.T) { - // create old lockfile - testBeatName := mustNewUUID(t) - testBeat := beat.Info{Beat: testBeatName, StartTime: time.Now().Add(-time.Second * 20)} - locker := New(testBeat) - err := locker.Lock() - require.NoError(t, err) - // create new lockfile with the same PID but a newer time - // create old lockfile - testNewBeat := beat.Info{Name: testBeatName, StartTime: time.Now()} - lockerNew := New(testNewBeat) - err = lockerNew.Lock() + b2 := beat.Info{} + b2.Beat = beatName + bl2 := New(b2) + + // Try to get a lock for the first beat. Expect it to succeed. + bl1 := New(b1) + err := bl1.Lock() require.NoError(t, err) -} -func TestEmptyLockfile(t *testing.T) { - testBeat := beat.Info{Beat: mustNewUUID(t), StartTime: time.Now().Add(-time.Second * 1)} - deadLock := New(testBeat) - // Create an empty lockfile - // Might happen in cases where a beat shut down at *just* the right time. - fh, err := os.Create(deadLock.filePath) + // now unlock + err = bl1.Unlock() require.NoError(t, err) - fh.Close() - newBeat := New(testBeat) - err = newBeat.Lock() + // try with other lockfile + err = bl2.Lock() require.NoError(t, err) } -func mustNewUUID(t *testing.T) string { - uuid, err := uuid.NewV4() +func TestUnlockWithRemainingFile(t *testing.T) { + const beatName = "testbeat-testunlockwithfile" + + b1 := beat.Info{} + b1.Beat = beatName + + b2 := beat.Info{} + b2.Beat = beatName + bl2 := New(b2) + + // Try to get a lock for the first beat. Expect it to succeed. + bl1 := New(b1) + err := bl1.Lock() require.NoError(t, err) - return uuid.String() -} -func fakeDeadPid() int { - return 99999 + // unlock the underlying FD, so we don't remove the file + err = bl1.fileLock.Unlock() + require.NoError(t, err) + + // now lock new handle with the same file + err = bl2.Lock() + require.NoError(t, err) } diff --git a/libbeat/cmd/instance/locks/process_lookup_cgo.go b/libbeat/cmd/instance/locks/lock_windows.go similarity index 50% rename from libbeat/cmd/instance/locks/process_lookup_cgo.go rename to libbeat/cmd/instance/locks/lock_windows.go index f0f2b253cca..f3ed63ebce3 100644 --- a/libbeat/cmd/instance/locks/process_lookup_cgo.go +++ b/libbeat/cmd/instance/locks/lock_windows.go @@ -15,16 +15,28 @@ // specific language governing permissions and limitations // under the License. -//go:build (darwin && cgo) || freebsd || linux || windows || aix - package locks import ( - "github.com/elastic/elastic-agent-system-metrics/metric/system/process" - "github.com/elastic/elastic-agent-system-metrics/metric/system/resolve" + "fmt" + "os" ) -// findMatchingPID is a small wrapper to deal with cgo compat issues in libbeat's CI -func findMatchingPID(pid int) (process.PidState, error) { - return process.GetPIDState(resolve.NewTestResolver("/"), pid) +// Unlock attempts to release the lock on a data path previously acquired via Lock(). This will unlock the file before it removes it. +func (lock *Locker) Unlock() error { + // Removing a file that's locked seems to be an unsupported or undefined, and will often fail on Windows. + // Reverse the order of operations, and unlock first, then remove. + // This will slightly increase the odds of a race on Windows if we're in a tight restart loop, + // as another beat can swoop in and lock the file before this beat removes it. + err := lock.fileLock.Unlock() + if err != nil { + return fmt.Errorf("unable to unlock data path: %w", err) + } + + err = os.Remove(lock.fileLock.Path()) + if err != nil { + lock.logger.Warnf("could not remove lockfile at %s: %s", lock.fileLock.Path(), err) + } + + return nil }