-
Notifications
You must be signed in to change notification settings - Fork 1.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Shorten UTD marker file #9387
Merged
JanKrivanek
merged 8 commits into
dotnet:main
from
JanKrivanek:proto/shorten-utd-marker
Nov 30, 2023
Merged
Shorten UTD marker file #9387
Changes from all commits
Commits
Show all changes
8 commits
Select commit
Hold shift + click to select a range
1772571
Shorten UTD marker file
JanKrivanek 4a4544c
Improve the StableStringHash intrinsic function and use it in the cop…
JanKrivanek 8fe5c17
Unify FNV implementations
JanKrivanek 97ceb6a
Merge branch 'dotnet:main' into proto/shorten-utd-marker
JanKrivanek e02ba1c
Apply PR suggestions
JanKrivanek dee22d1
Move FNV hashing to stringtools
JanKrivanek 0add561
Fix leftovers
JanKrivanek c58eb90
Add comments
JanKrivanek File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
|
||
using System.Runtime.InteropServices; | ||
using System; | ||
|
||
namespace Microsoft.NET.StringTools | ||
{ | ||
/// <summary> | ||
/// Fowler/Noll/Vo hashing. | ||
/// </summary> | ||
public static class FowlerNollVo1aHash | ||
{ | ||
// Fowler/Noll/Vo hashing. | ||
// http://www.isthe.com/chongo/tech/comp/fnv/ | ||
// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function#FNV-1a_hash | ||
// http://www.isthe.com/chongo/src/fnv/hash_32a.c | ||
|
||
// 32 bit FNV prime and offset basis for FNV-1a. | ||
private const uint fnvPrimeA32Bit = 16777619; | ||
private const uint fnvOffsetBasisA32Bit = 2166136261; | ||
|
||
// 64 bit FNV prime and offset basis for FNV-1a. | ||
private const long fnvPrimeA64Bit = 1099511628211; | ||
private const long fnvOffsetBasisA64Bit = unchecked((long)14695981039346656037); | ||
|
||
/// <summary> | ||
/// Computes 32 bit Fowler/Noll/Vo-1a hash of a string (regardless of encoding). | ||
/// </summary> | ||
/// <param name="text">String to be hashed.</param> | ||
/// <returns>32 bit signed hash</returns> | ||
public static int ComputeHash32(string text) | ||
{ | ||
uint hash = fnvOffsetBasisA32Bit; | ||
|
||
#if NET35 | ||
unchecked | ||
{ | ||
for (int i = 0; i < text.Length; i++) | ||
{ | ||
char ch = text[i]; | ||
byte b = (byte)ch; | ||
hash ^= b; | ||
hash *= fnvPrimeA32Bit; | ||
|
||
b = (byte)(ch >> 8); | ||
hash ^= b; | ||
hash *= fnvPrimeA32Bit; | ||
} | ||
} | ||
#else | ||
ReadOnlySpan<byte> span = MemoryMarshal.Cast<char, byte>(text.AsSpan()); | ||
foreach (byte b in span) | ||
{ | ||
hash = unchecked((hash ^ b) * fnvPrimeA32Bit); | ||
} | ||
#endif | ||
|
||
return unchecked((int)hash); | ||
} | ||
|
||
/// <summary> | ||
/// Computes 64 bit Fowler/Noll/Vo-1a hash optimized for ASCII strings. | ||
/// The hashing algorithm considers only the first 8 bits of each character. | ||
/// Analysis: https://github.com/KirillOsenkov/MSBuildStructuredLog/wiki/String-Hashing#faster-fnv-1a | ||
/// </summary> | ||
/// <param name="text">String to be hashed.</param> | ||
/// <returns>64 bit unsigned hash</returns> | ||
public static long ComputeHash64Fast(string text) | ||
{ | ||
long hash = fnvOffsetBasisA64Bit; | ||
|
||
unchecked | ||
{ | ||
for (int i = 0; i < text.Length; i++) | ||
{ | ||
char ch = text[i]; | ||
|
||
hash = (hash ^ ch) * fnvPrimeA64Bit; | ||
} | ||
} | ||
|
||
return hash; | ||
} | ||
|
||
/// <summary> | ||
/// Computes 64 bit Fowler/Noll/Vo-1a hash of a string (regardless of encoding). | ||
/// </summary> | ||
/// <param name="text">String to be hashed.</param> | ||
/// <returns>64 bit unsigned hash</returns> | ||
public static long ComputeHash64(string text) | ||
{ | ||
long hash = fnvOffsetBasisA64Bit; | ||
|
||
#if NET35 | ||
unchecked | ||
{ | ||
for (int i = 0; i < text.Length; i++) | ||
{ | ||
char ch = text[i]; | ||
byte b = (byte)ch; | ||
hash ^= b; | ||
hash *= fnvPrimeA64Bit; | ||
|
||
b = (byte)(ch >> 8); | ||
hash ^= b; | ||
hash *= fnvPrimeA64Bit; | ||
} | ||
} | ||
#else | ||
ReadOnlySpan<byte> span = MemoryMarshal.Cast<char, byte>(text.AsSpan()); | ||
foreach (byte b in span) | ||
{ | ||
hash = unchecked((hash ^ b) * fnvPrimeA64Bit); | ||
} | ||
#endif | ||
|
||
return hash; | ||
} | ||
|
||
/// <summary> | ||
/// Combines two 64 bit hashes generated by <see cref="FowlerNollVo1aHash"/> class into one. | ||
/// </summary> | ||
/// <param name="left">First hash value to be combined.</param> | ||
/// <param name="right">Second hash value to be combined.</param> | ||
/// <returns></returns> | ||
public static long Combine64(long left, long right) | ||
{ | ||
unchecked | ||
{ | ||
return (left ^ right) * fnvPrimeA64Bit; | ||
} | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does MSBuild support big-endian platforms? It looks like this would not be stable between big-endian and little-endian.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@KalleOlaviNiemitalo - Thank you for bringing this up!
We're not declaring what can be the expectations of stableness of the hashing between different systems (we actually do not declare anything ... yet :-)) - but it'd certainly be nice to have it more universal.
@uweigand - would you be willing to chime in and share your thoughts on the following?
Basically - since the
MemoryMarshal.Cast<char, byte>
changes how we fetch data from memory to processor - it'll behave differently between Endinan-ness representations. But how anout the version which manually splits each char to bytes:since all this happens in processor after the data are fetched from memory - it should fetch same results regardless of endinan-ness - is that right?
FYI @ladipro (and thanks for all the offline consultation)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes that would work the same in either endianness.
You could also check BitConverter.IsLittleEndian and use MemoryMarshal.Cast on little-endian platforms but fall back to the slower NET35-compatible implementation on big-endian platforms.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
…although I have not measured whether the NET35-compatible implementation actually is slower. In both implementations, each operation on
hash
depends on the previous one, so it doesn't look like there is much that the CPU could do in parallel. And the strings probably are pretty short anyway.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It seems to me the code above implements three methods that are (potentially) different. There's the
NET35
method inComputeHash64
(which operates on single bytes at a time, splitting the 2-byte char in an endian-independent way), the!NET35
method inComputeHash64
(which also operates on single bytes at a time, but splits the 2-byte char in an endian-dependent way), and then there'sComputeHash64Fast
(which operates on 2-byte chars at a time, which appears to give a different result from either of the two other methods).In general, I agree that it would be preferable to use a method that is stable across native endianness. Unless the
NET35
method is really slower (and I also suspect it actually isn't), and performance matters at this place, it might be best to just use that method unconditionally?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Re ComputeHash64Fast, this comment seems to be false:
msbuild/src/StringTools/FowlerNollVo1aHash.cs
Line 64 in e1652ea
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you for all the detailed input! I'll remove the Spans version.
The Spans version is indeed not superior (it has slight to no advantage for large strings, but performs worse for a very short strings). It's nice that it doesn't need unchecked code and is more brief - but given the fact that it can perform worse and that the other version anyways need to be maintained - it's an easy choice.
I'll as well fix the description for the
ComputeHash64Fast