From 0c01e35b5f733b13845431549f637e7898dd1058 Mon Sep 17 00:00:00 2001 From: Dan Jaglowski Date: Wed, 12 Jul 2023 09:09:55 -0400 Subject: [PATCH 1/2] [fileconsumer] Deduplicate fingerprints less aggressively When finding files, we would check for files with duplicate fingerprints and deduplicate them if a.StartsWith(b) or b.StartsWith(a). This StartsWith logic is useful for recognizing files that have new content since the previous poll interval. However, when deduplicating the results of the poll interval, the only case that we need to consider is copy/truncate rotation. In this case, a file may have been copied but the original has not yet been truncated. At that moment we should expect to find two files with exactly the same fingerprint. Therefore, we do not need to check StartsWith cases. --- .chloggen/strict-dedup.yaml | 20 +++++++++++++++++++ pkg/stanza/fileconsumer/file.go | 3 +-- .../internal/fingerprint/fingerprint.go | 18 +++++++++++++++++ 3 files changed, 39 insertions(+), 2 deletions(-) create mode 100755 .chloggen/strict-dedup.yaml diff --git a/.chloggen/strict-dedup.yaml b/.chloggen/strict-dedup.yaml new file mode 100755 index 000000000000..8fe27170acc3 --- /dev/null +++ b/.chloggen/strict-dedup.yaml @@ -0,0 +1,20 @@ +# Use this changelog template to create an entry for release notes. +# If your change doesn't affect end users, such as a test fix or a tooling change, +# you should instead start your pull request title with [chore] or use the "Skip Changelog" label. + +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: bug_fix + +# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver) +component: filelogreceiver + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Fix issue where files were deduplicated unnecessarily + +# Mandatory: One or more tracking issues related to the change. 
You can use the PR number here if no issue exists. +issues: [24235] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: diff --git a/pkg/stanza/fileconsumer/file.go b/pkg/stanza/fileconsumer/file.go index b40140cdaf0c..f6328af9d7b3 100644 --- a/pkg/stanza/fileconsumer/file.go +++ b/pkg/stanza/fileconsumer/file.go @@ -224,8 +224,7 @@ func (m *Manager) makeFingerprint(path string) (*fingerprint.Fingerprint, *os.Fi func (m *Manager) checkDuplicates(fp *fingerprint.Fingerprint) bool { for i := 0; i < len(m.currentFps); i++ { - fp2 := m.currentFps[i] - if fp.StartsWith(fp2) || fp2.StartsWith(fp) { + if fp.Equal(m.currentFps[i]) { return true } } diff --git a/pkg/stanza/fileconsumer/internal/fingerprint/fingerprint.go b/pkg/stanza/fileconsumer/internal/fingerprint/fingerprint.go index 54ea2281b902..fb23296afae9 100644 --- a/pkg/stanza/fileconsumer/internal/fingerprint/fingerprint.go +++ b/pkg/stanza/fileconsumer/internal/fingerprint/fingerprint.go @@ -46,6 +46,24 @@ func (f Fingerprint) Copy() *Fingerprint { } } +// Equal returns true if the fingerprints have the same FirstBytes, +// false otherwise. This does not compare other aspects of the fingerprints +// because the primary purpose of a fingerprint is to convey a unique +// identity, and only the FirstBytes field contributes to this goal. 
+func (f Fingerprint) Equal(other *Fingerprint) bool { + l0 := len(other.FirstBytes) + l1 := len(f.FirstBytes) + if l0 != l1 { + return false + } + for i := 0; i < l0; i++ { + if other.FirstBytes[i] != f.FirstBytes[i] { + return false + } + } + return true +} + // StartsWith returns true if the fingerprints are the same // or if the new fingerprint starts with the old one // This is important functionality for tracking new files, From 7a0afedef001757789d2314c2c065bdfeb6a8533 Mon Sep 17 00:00:00 2001 From: Dan Jaglowski Date: Fri, 14 Jul 2023 09:14:29 -0400 Subject: [PATCH 2/2] Test fingerprint.Equal --- .../internal/fingerprint/fingerprint_test.go | 29 +++++++++++++++++-- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/pkg/stanza/fileconsumer/internal/fingerprint/fingerprint_test.go b/pkg/stanza/fileconsumer/internal/fingerprint/fingerprint_test.go index 09716442dacb..ff3532456ccd 100644 --- a/pkg/stanza/fileconsumer/internal/fingerprint/fingerprint_test.go +++ b/pkg/stanza/fileconsumer/internal/fingerprint/fingerprint_test.go @@ -166,8 +166,32 @@ func TestFingerprintCopy(t *testing.T) { } } -func TestFingerprintStartsWith(t *testing.T) { +func TestEqual(t *testing.T) { + empty := &Fingerprint{FirstBytes: []byte("")} + empty2 := &Fingerprint{FirstBytes: []byte("")} + hello := &Fingerprint{FirstBytes: []byte("hello")} + hello2 := &Fingerprint{FirstBytes: []byte("hello")} + world := &Fingerprint{FirstBytes: []byte("world")} + world2 := &Fingerprint{FirstBytes: []byte("world")} + helloworld := &Fingerprint{FirstBytes: []byte("helloworld")} + helloworld2 := &Fingerprint{FirstBytes: []byte("helloworld")} + + require.True(t, empty.Equal(empty2)) + require.True(t, hello.Equal(hello2)) + require.True(t, world.Equal(world2)) + require.True(t, helloworld.Equal(helloworld2)) + require.False(t, hello.Equal(empty)) + require.False(t, empty.Equal(hello)) + + require.False(t, hello.Equal(world)) + require.False(t, world.Equal(hello)) + + require.False(t, 
hello.Equal(helloworld)) + require.False(t, helloworld.Equal(hello)) +} + +func TestStartsWith(t *testing.T) { empty := &Fingerprint{FirstBytes: []byte("")} hello := &Fingerprint{FirstBytes: []byte("hello")} world := &Fingerprint{FirstBytes: []byte("world")} @@ -183,7 +207,6 @@ func TestFingerprintStartsWith(t *testing.T) { require.True(t, helloworld.StartsWith(hello)) require.True(t, helloworld.StartsWith(helloworld)) require.False(t, helloworld.StartsWith(world)) - } // Generates a file filled with many random bytes, then @@ -193,7 +216,7 @@ func TestFingerprintStartsWith(t *testing.T) { // The static file can be thought of as the present state of // the file, while each iteration of the growing file represents // a possible state of the same file at a previous time. -func TestFingerprintStartsWith_FromFile(t *testing.T) { +func TestStartsWith_FromFile(t *testing.T) { r := rand.New(rand.NewSource(112358)) fingerprintSize := 10 fileLength := 12 * fingerprintSize