From 71c17f025d65383b2c95d926c92f8810f5a762f5 Mon Sep 17 00:00:00 2001 From: Timothee Cour Date: Wed, 17 Jul 2019 13:20:40 -0700 Subject: [PATCH 1/4] fix #11764: make sets 1000 times faster for pointer, int64, int etc --- lib/pure/hashes.nim | 54 +++++++++++++++++++-------------------------- 1 file changed, 23 insertions(+), 31 deletions(-) diff --git a/lib/pure/hashes.nim b/lib/pure/hashes.nim index 8104470e32043..989944efbabfd 100644 --- a/lib/pure/hashes.nim +++ b/lib/pure/hashes.nim @@ -56,6 +56,26 @@ type const IntSize = sizeof(int) +proc preferStringHash*(T: typedesc): bool = + ## whether hashing is more efficient using `hash($x)` + # when string hashing is more efficient, see #11764 + # exported so user defined hash can use this too + T.sizeof >= 4 + +proc hash*(x: string): Hash + +proc hash*[T: SomeNumber | Ordinal | char](x: T): Hash {.inline.} = + ## Efficient hashing of numbers, ordinals (eg enum), char. + # fix #11764 + when preferStringHash(T): + when T is SomeFloat: + # needed otherwise finite precision can cause hash duplicate + let x = cast[BiggestInt](x) + hash($x) + else: + # more efficient for small types + ord(x) + proc `!&`*(h: Hash, val: int): Hash {.inline.} = ## Mixes a hash value `h` with `val` to produce a new hash value. ## @@ -110,7 +130,9 @@ proc hash*(x: pointer): Hash {.inline.} = } """ else: - result = cast[Hash](cast[uint](x) shr 3) # skip the alignment + # bugfix #11764: s/cast[Hash]/hash/ + result = hash(cast[uint](x) shr 3) # skip the alignment + # CHECKME: why? isn't that responsability of caller if he needs this behavior? when not defined(booting): proc hash*[T: proc](x: T): Hash {.inline.} = @@ -120,36 +142,6 @@ when not defined(booting): else: result = hash(pointer(x)) -proc hash*(x: int): Hash {.inline.} = - ## Efficient hashing of integers. - result = x - -proc hash*(x: int64): Hash {.inline.} = - ## Efficient hashing of `int64` integers. - result = toU32(x) - -proc hash*(x: uint): Hash {.inline.} = - ## Efficient hashing of unsigned integers. - result = cast[int](x) - -proc hash*(x: uint64): Hash {.inline.} = - ## Efficient hashing of `uint64` integers. - result = toU32(cast[int](x)) - -proc hash*(x: char): Hash {.inline.} = - ## Efficient hashing of characters. - result = ord(x) - -proc hash*[T: Ordinal](x: T): Hash {.inline.} = - ## Efficient hashing of other ordinal types (e.g. enums). - result = ord(x) - -proc hash*(x: float): Hash {.inline.} = - ## Efficient hashing of floats. - var y = x + 1.0 - result = cast[ptr Hash](addr(y))[] - - # Forward declarations before methods that hash containers. This allows # containers to contain other containers proc hash*[A](x: openArray[A]): Hash From 0302ad4a641a07d1491cb22faa4494bd7d407d76 Mon Sep 17 00:00:00 2001 From: Timothee Cour Date: Wed, 17 Jul 2019 16:39:48 -0700 Subject: [PATCH 2/4] fix nim script test --- lib/pure/hashes.nim | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/pure/hashes.nim b/lib/pure/hashes.nim index 989944efbabfd..f513e726298c3 100644 --- a/lib/pure/hashes.nim +++ b/lib/pure/hashes.nim @@ -130,8 +130,9 @@ proc hash*(x: pointer): Hash {.inline.} = } """ else: - # bugfix #11764: s/cast[Hash]/hash/ - result = hash(cast[uint](x) shr 3) # skip the alignment + # 2 bug fixes: s/cast[Hash]()/hash()/ (#11764); and also s/uint/BiggestInt/ + # note that we can't use unsigned because nimscript doesn't have `$`(uint) + result = hash(cast[ByteAddress](x) shr 3) # skip the alignment # CHECKME: why? isn't that responsability of caller if he needs this behavior? when not defined(booting): From 93b46c43d0be5deeea30d097939a10bab014195f Mon Sep 17 00:00:00 2001 From: Timothee Cour Date: Wed, 17 Jul 2019 21:36:09 -0700 Subject: [PATCH 3/4] avoid $x string allocation; fix another bug that could lead to collisions for small flaots --- lib/pure/hashes.nim | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/lib/pure/hashes.nim b/lib/pure/hashes.nim index f513e726298c3..f88ec45643450 100644 --- a/lib/pure/hashes.nim +++ b/lib/pure/hashes.nim @@ -66,12 +66,20 @@ proc hash*(x: string): Hash proc hash*[T: SomeNumber | Ordinal | char](x: T): Hash {.inline.} = ## Efficient hashing of numbers, ordinals (eg enum), char. - # fix #11764 - when preferStringHash(T): + when preferStringHash(T): # fix #11764 when T is SomeFloat: - # needed otherwise finite precision can cause hash duplicate - let x = cast[BiggestInt](x) - hash($x) + # 0.0 vs -0.0 should map to same hash to avoid weird behavior. + # the only non nan value that can cause clash is 0 according to + # https://stackoverflow.com/questions/31087915/are-there-denormalized-floats-that-evaluate-to-the-same-value-apart-from-0-0 + # bugfix: the previous code was using `x = x + 1.0` (presumably for + # handling negative 0), however this doesn't work well for small inputs + # because `x+1.0` can become 0 with floating point accuracy, which + # leads to hash collisions. + # Note: this hit this bug: #11775: + # `let x = if x == 0.0: 0.0 else: x` + var x = x + if x == 0: x = 0 + hashData(cast[pointer](unsafeAddr x), T.sizeof) else: # more efficient for small types ord(x) @@ -99,6 +107,9 @@ proc `!$`*(h: Hash): Hash {.inline.} = proc hashData*(data: pointer, size: int): Hash = ## Hashes an array of bytes of size `size`. + # this should probably be merged/refactored with + # `proc hash*[A](aBuf: openArray[A], sPos, ePos: int): Hash =` to avoid + # using 2 different algorithms var h: Hash = 0 when defined(js): var p: cstring From a4eb96d2cacd9c1c43f8311e173a6d97d8eb4193 Mon Sep 17 00:00:00 2001 From: Timothee Cour Date: Thu, 18 Jul 2019 02:06:37 -0700 Subject: [PATCH 4/4] fix tests using hashBiggestIntVM vm callback --- compiler/vmops.nim | 4 ++ lib/pure/hashes.nim | 66 ++++++++++++++++------------ tests/collections/ttables.nim | 7 +-- tests/collections/ttablesthreads.nim | 2 +- tests/vm/tcompiletimetable.nim | 26 +++++++++++ 5 files changed, 73 insertions(+), 32 deletions(-) diff --git a/compiler/vmops.nim b/compiler/vmops.nim index 3bd931b3930f1..f917700427a6f 100644 --- a/compiler/vmops.nim +++ b/compiler/vmops.nim @@ -16,6 +16,7 @@ from math import sqrt, ln, log10, log2, exp, round, arccos, arcsin, from os import getEnv, existsEnv, dirExists, fileExists, putEnv, walkDir, getAppFilename from md5 import getMD5 from sighashes import symBodyDigest +from std/hashes import hashBiggestInt template mathop(op) {.dirty.} = registerCallback(c, "stdlib.math." & astToStr(op), `op Wrapper`) @@ -144,3 +145,6 @@ proc registerAdditionalOps*(c: PCtx) = let n = getNode(a, 0) if n.kind != nkSym: raise newException(ValueError, "node is not a symbol") setResult(a, $symBodyDigest(c.graph, n.sym)) + + registerCallback c, "stdlib.hashes.hashBiggestIntVM", proc (a: VmArgs) {.nimcall.} = + a.setResult hashBiggestInt(getInt(a, 0)) diff --git a/lib/pure/hashes.nim b/lib/pure/hashes.nim index f88ec45643450..548296a38d17b 100644 --- a/lib/pure/hashes.nim +++ b/lib/pure/hashes.nim @@ -56,38 +56,11 @@ type const IntSize = sizeof(int) -proc preferStringHash*(T: typedesc): bool = - ## whether hashing is more efficient using `hash($x)` - # when string hashing is more efficient, see #11764 - # exported so user defined hash can use this too - T.sizeof >= 4 - -proc hash*(x: string): Hash - -proc hash*[T: SomeNumber | Ordinal | char](x: T): Hash {.inline.} = - ## Efficient hashing of numbers, ordinals (eg enum), char. - when preferStringHash(T): # fix #11764 - when T is SomeFloat: - # 0.0 vs -0.0 should map to same hash to avoid weird behavior. - # the only non nan value that can cause clash is 0 according to - # https://stackoverflow.com/questions/31087915/are-there-denormalized-floats-that-evaluate-to-the-same-value-apart-from-0-0 - # bugfix: the previous code was using `x = x + 1.0` (presumably for - # handling negative 0), however this doesn't work well for small inputs - # because `x+1.0` can become 0 with floating point accuracy, which - # leads to hash collisions. - # Note: this hit this bug: #11775: - # `let x = if x == 0.0: 0.0 else: x` - var x = x - if x == 0: x = 0 - hashData(cast[pointer](unsafeAddr x), T.sizeof) - else: - # more efficient for small types - ord(x) - proc `!&`*(h: Hash, val: int): Hash {.inline.} = ## Mixes a hash value `h` with `val` to produce a new hash value. ## ## This is only needed if you need to implement a hash proc for a new datatype. + ## Uses Jenkins hash: https://en.wikipedia.org/wiki/Jenkins_hash_function let h = cast[uint](h) let val = cast[uint](val) var res = h + val @@ -124,6 +97,43 @@ proc hashData*(data: pointer, size: int): Hash = dec(s) result = !$h +proc hashBiggestIntVM(x: BiggestInt): Hash = discard # in vmops + +proc hashBiggestInt*(x: BiggestInt): Hash {.inline.} = + ## for internal use; user code should prefer `hash` overloads + when nimvm: hashBiggestIntVM(x) + else: hashData(cast[pointer](unsafeAddr x), type(x).sizeof) + +proc hash*[T: SomeNumber | Ordinal | char](x: T): Hash {.inline.} = + ## Efficient hashing of numbers, ordinals (eg enum), char. + when T.sizeof >= 4: + # fix #11764: `ord(x)`, `toU32(x)` or similar are up to 4X faster to compute + # compared to jenkins `hashData` but result in very poor hashes, leading to + # collisions; this can lead to several order magnitude (eg 1e3) slowdowns + # e.g. when used in hash tables, so we prefer to use slower to compute good + # hashes here. Murmur3 would improve speed of hash computation. + when T is SomeFloat: + # 0.0 vs -0.0 should map to same hash to avoid weird behavior. + # the only non nan value that can cause clash is 0 according to + # https://stackoverflow.com/questions/31087915/are-there-denormalized-floats-that-evaluate-to-the-same-value-apart-from-0-0 + # bugfix: the previous code was using `x = x + 1.0` (presumably for + # handling negative 0), however this leads to collisions for small x due + # to FP finite precision. + let x: BiggestInt = + if x == 0: 0.BiggestInt + else: + when sizeof(BiggestInt) == sizeof(T): + cast[BiggestInt](x) + else: # for nimvm + cast[int32](x).BiggestInt + else: + let x = x.BiggestInt + hashBiggestInt(x) + else: + # empirically better for small types, the collision risk is limited anyway + # due to cardinality of at most 2^16=65536 + ord(x) + when defined(js): var objectID = 0 diff --git a/tests/collections/ttables.nim b/tests/collections/ttables.nim index 0a5a013679aaf..66db5f79af323 100644 --- a/tests/collections/ttables.nim +++ b/tests/collections/ttables.nim @@ -165,9 +165,10 @@ block tableconstr: block ttables2: proc TestHashIntInt() = var tab = initTable[int,int]() - for i in 1..1_000_000: + const n = 1_000_000 # bottleneck: 50 seconds on OSX in debug mode + for i in 1..n: tab[i] = i - for i in 1..1_000_000: + for i in 1..n: var x = tab[i] if x != i : echo "not found ", i @@ -233,7 +234,7 @@ block tablesref: for y in 0..1: assert t[(x,y)] == $x & $y assert($t == - "{(x: 0, y: 1): \"01\", (x: 0, y: 0): \"00\", (x: 1, y: 0): \"10\", (x: 1, y: 1): \"11\"}") + """{(x: 0, y: 1): "01", (x: 1, y: 0): "10", (x: 0, y: 0): "00", (x: 1, y: 1): "11"}""") block tableTest2: var t = newTable[string, float]() diff --git a/tests/collections/ttablesthreads.nim b/tests/collections/ttablesthreads.nim index 7fe4c79b13474..a17627fec185e 100644 --- a/tests/collections/ttablesthreads.nim +++ b/tests/collections/ttablesthreads.nim @@ -48,7 +48,7 @@ block tableTest1: for y in 0..1: assert t[(x,y)] == $x & $y assert($t == - "{(x: 0, y: 1): \"01\", (x: 0, y: 0): \"00\", (x: 1, y: 0): \"10\", (x: 1, y: 1): \"11\"}") + """{(x: 0, y: 0): "00", (x: 1, y: 0): "10", (x: 0, y: 1): "01", (x: 1, y: 1): "11"}""") block tableTest2: var t = initTable[string, float]() diff --git a/tests/vm/tcompiletimetable.nim b/tests/vm/tcompiletimetable.nim index e78c06536cc7d..11fbc58db2b52 100644 --- a/tests/vm/tcompiletimetable.nim +++ b/tests/vm/tcompiletimetable.nim @@ -47,3 +47,29 @@ addStuff("Hey"): echo "Hey" addStuff("Hi"): echo "Hi" dump() +import std/hashes +block: + # check CT vs RT produces same results for Table + template callFun(T) = + block: + proc fun(): string = + var t: Table[T, string] + let n = 10 + for i in 0..