diff --git a/CHANGELOG.md b/CHANGELOG.md index 14ca27bd9bc..8e18f5bd807 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - Fix inconsistent request body closing in `go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp`. (#5954) - Fix invalid exemplar keys in `go.opentelemetry.io/otel/exporters/prometheus`. (#5995) - Fix attribute value truncation in `go.opentelemetry.io/otel/sdk/trace`. (#5997) +- Fix attribute value truncation in `go.opentelemetry.io/otel/sdk/log`. (#6032) diff --git a/sdk/log/record.go b/sdk/log/record.go index 155e4cad2b6..f04e5b28f95 100644 --- a/sdk/log/record.go +++ b/sdk/log/record.go @@ -406,7 +406,7 @@ func (r *Record) applyValueLimits(val log.Value) log.Value { case log.KindString: s := val.AsString() if len(s) > r.attributeValueLengthLimit { - val = log.StringValue(truncate(s, r.attributeValueLengthLimit)) + val = log.StringValue(truncate(r.attributeValueLengthLimit, s)) } case log.KindSlice: sl := val.AsSlice() @@ -427,40 +427,78 @@ func (r *Record) applyValueLimits(val log.Value) log.Value { return val } -// truncate returns a copy of str truncated to have a length of at most n -// characters. If the length of str is less than n, str itself is returned. +// truncate returns a truncated version of s such that it contains less than +// the limit number of characters. Truncation is applied by returning the limit +// number of valid characters contained in s. // -// The truncate of str ensures that no valid UTF-8 code point is split. The -// copy returned will be less than n if a characters straddles the length -// limit. +// If limit is negative, it returns the original string. // -// No truncation is performed if n is less than zero. -func truncate(str string, n int) string { - if n < 0 { - return str +// UTF-8 is supported. When truncating, all invalid characters are dropped +// before applying truncation. +// +// If s already contains less than the limit number of bytes, it is returned +// unchanged. No invalid characters are removed. +func truncate(limit int, s string) string { + // This prioritize performance in the following order based on the most + // common expected use-cases. + // + // - Short values less than the default limit (128). + // - Strings with valid encodings that exceed the limit. + // - No limit. + // - Strings with invalid encodings that exceed the limit. + if limit < 0 || len(s) <= limit { + return s } - // cut returns a copy of the s truncated to not exceed a length of n. If - // invalid UTF-8 is encountered, s is returned with false. Otherwise, the - // truncated copy will be returned with true. - cut := func(s string) (string, bool) { - var i int - for i = 0; i < n; { - r, size := utf8.DecodeRuneInString(s[i:]) - if r == utf8.RuneError { - return s, false + // Optimistically, assume all valid UTF-8. + var b strings.Builder + count := 0 + for i, c := range s { + if c != utf8.RuneError { + count++ + if count > limit { + return s[:i] } - if i+size > n { - break - } - i += size + continue + } + + _, size := utf8.DecodeRuneInString(s[i:]) + if size == 1 { + // Invalid encoding. + b.Grow(len(s) - 1) + _, _ = b.WriteString(s[:i]) + s = s[i:] + break } - return s[:i], true } - cp, ok := cut(str) - if !ok { - cp, _ = cut(strings.ToValidUTF8(str, "")) + // Fast-path, no invalid input. + if b.Cap() == 0 { + return s } - return cp + + // Truncate while validating UTF-8. + for i := 0; i < len(s) && count < limit; { + c := s[i] + if c < utf8.RuneSelf { + // Optimization for single byte runes (common case). + _ = b.WriteByte(c) + i++ + count++ + continue + } + + _, size := utf8.DecodeRuneInString(s[i:]) + if size == 1 { + // We checked for all 1-byte runes above, this is a RuneError. + i++ + continue + } + + _, _ = b.WriteString(s[i : i+size]) + i += size + count++ + } + + return b.String() } diff --git a/sdk/log/record_test.go b/sdk/log/record_test.go index 5a16267b8b3..1d1d579d060 100644 --- a/sdk/log/record_test.go +++ b/sdk/log/record_test.go @@ -570,74 +570,147 @@ func assertKV(t *testing.T, r Record, kv log.KeyValue) { } func TestTruncate(t *testing.T) { - testcases := []struct { - input, want string - limit int + type group struct { + limit int + input string + expected string + } + + tests := []struct { + name string + groups []group }{ + // Edge case: limit is negative, no truncation should occur { - input: "value", - want: "value", - limit: -1, - }, - { - input: "value", - want: "", - limit: 0, - }, - { - input: "value", - want: "v", - limit: 1, - }, - { - input: "value", - want: "va", - limit: 2, + name: "NoTruncation", + groups: []group{ + {-1, "No truncation!", "No truncation!"}, + }, }, + + // Edge case: string is already shorter than the limit, no truncation + // should occur { - input: "value", - want: "val", - limit: 3, + name: "ShortText", + groups: []group{ + {10, "Short text", "Short text"}, + {15, "Short text", "Short text"}, + {100, "Short text", "Short text"}, + }, }, + + // Edge case: truncation happens with ASCII characters only { - input: "value", - want: "valu", - limit: 4, + name: "ASCIIOnly", + groups: []group{ + {1, "Hello World!", "H"}, + {5, "Hello World!", "Hello"}, + {12, "Hello World!", "Hello World!"}, + }, }, + + // Truncation including multi-byte characters (UTF-8) { - input: "value", - want: "value", - limit: 5, + name: "ValidUTF-8", + groups: []group{ + {7, "Hello, 世界", "Hello, "}, + {8, "Hello, 世界", "Hello, 世"}, + {2, "こんにちは", "こん"}, + {3, "こんにちは", "こんに"}, + {5, "こんにちは", "こんにちは"}, + {12, "こんにちは", "こんにちは"}, + }, }, + + // Truncation with invalid UTF-8 characters { - input: "value", - want: "value", - limit: 6, + name: "InvalidUTF-8", + groups: []group{ + {11, "Invalid\x80text", "Invalidtext"}, + // Do not modify invalid text if equal to limit. + {11, "Valid text\x80", "Valid text\x80"}, + // Do not modify invalid text if under limit. + {15, "Valid text\x80", "Valid text\x80"}, + {5, "Hello\x80World", "Hello"}, + {11, "Hello\x80World\x80!", "HelloWorld!"}, + {15, "Hello\x80World\x80Test", "HelloWorldTest"}, + {15, "Hello\x80\x80\x80World\x80Test", "HelloWorldTest"}, + {15, "\x80\x80\x80Hello\x80\x80\x80World\x80Test\x80\x80", "HelloWorldTest"}, + }, }, + + // Truncation with mixed validn and invalid UTF-8 characters { - input: "€€€€", // 3 bytes each - want: "€€€", - limit: 10, + name: "MixedUTF-8", + groups: []group{ + {6, "€"[0:2] + "hello€€", "hello€"}, + {6, "€" + "€"[0:2] + "hello", "€hello"}, + {11, "Valid text\x80📜", "Valid text📜"}, + {11, "Valid text📜\x80", "Valid text📜"}, + {14, "😊 Hello\x80World🌍🚀", "😊 HelloWorld🌍🚀"}, + {14, "😊\x80 Hello\x80World🌍🚀", "😊 HelloWorld🌍🚀"}, + {14, "😊\x80 Hello\x80World🌍\x80🚀", "😊 HelloWorld🌍🚀"}, + {14, "😊\x80 Hello\x80World🌍\x80🚀\x80", "😊 HelloWorld🌍🚀"}, + {14, "\x80😊\x80 Hello\x80World🌍\x80🚀\x80", "😊 HelloWorld🌍🚀"}, + }, }, + + // Edge case: empty string, should return empty string { - input: "€"[0:2] + "hello€€", // corrupted first rune, then over limit - want: "hello€", - limit: 10, + name: "Empty", + groups: []group{ + {5, "", ""}, + }, }, + + // Edge case: limit is 0, should return an empty string { - input: "€"[0:2] + "hello", // corrupted first rune, then not over limit - want: "hello", - limit: 10, + name: "Zero", + groups: []group{ + {0, "Some text", ""}, + {0, "", ""}, + }, }, } - for _, tc := range testcases { - name := fmt.Sprintf("%s/%d", tc.input, tc.limit) - t.Run(name, func(t *testing.T) { - t.Log(tc.input, len(tc.input), tc.limit) - assert.Equal(t, tc.want, truncate(tc.input, tc.limit)) - }) + for _, tt := range tests { + for _, g := range tt.groups { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + got := truncate(g.limit, g.input) + assert.Equalf( + t, g.expected, got, + "input: %q([]rune%v))\ngot: %q([]rune%v)\nwant %q([]rune%v)", + g.input, []rune(g.input), + got, []rune(got), + g.expected, []rune(g.expected), + ) + }) + } + } +} + +func BenchmarkTruncate(b *testing.B) { + run := func(limit int, input string) func(b *testing.B) { + return func(b *testing.B) { + b.ReportAllocs() + b.RunParallel(func(pb *testing.PB) { + var out string + for pb.Next() { + out = truncate(limit, input) + } + _ = out + }) + } } + b.Run("Unlimited", run(-1, "hello 😊 world 🌍🚀")) + b.Run("Zero", run(0, "Some text")) + b.Run("Short", run(10, "Short Text")) + b.Run("ASCII", run(5, "Hello, World!")) + b.Run("ValidUTF-8", run(10, "hello 😊 world 🌍🚀")) + b.Run("InvalidUTF-8", run(6, "€"[0:2]+"hello€€")) + b.Run("MixedUTF-8", run(14, "\x80😊\x80 Hello\x80World🌍\x80🚀\x80")) } func BenchmarkWalkAttributes(b *testing.B) {