diff --git a/compiler/vmops.nim b/compiler/vmops.nim index 8cb2a055234d3..7067fad966449 100644 --- a/compiler/vmops.nim +++ b/compiler/vmops.nim @@ -18,7 +18,7 @@ from md5 import getMD5 from sighashes import symBodyDigest from times import cpuTime -from hashes import hash +from hashes import hash, hashUInt64, hashUInt32 template mathop(op) {.dirty.} = registerCallback(c, "stdlib.math." & astToStr(op), `op Wrapper`) @@ -236,6 +236,11 @@ proc registerAdditionalOps*(c: PCtx) = registerCallback c, "stdlib.hashes.hashVmImplByte", hashVmImplByte registerCallback c, "stdlib.hashes.hashVmImplChar", hashVmImplByte + registerCallback c, "stdlib.hashes.hashUInt64", proc (a: VmArgs) {.nimcall.} = + a.setResult hashUInt64(cast[uint64](getInt(a, 0))) + registerCallback c, "stdlib.hashes.hashUInt32", proc (a: VmArgs) {.nimcall.} = + a.setResult hashUInt32(cast[uint32](getInt(a, 0))) + if optBenchmarkVM in c.config.globalOptions: wrap0(cpuTime, timesop) else: diff --git a/lib/pure/collections/sets.nim b/lib/pure/collections/sets.nim index caa25dbb31e25..b87df885298ee 100644 --- a/lib/pure/collections/sets.nim +++ b/lib/pure/collections/sets.nim @@ -572,11 +572,7 @@ proc map*[A, B](data: HashSet[A], op: proc (x: A): B {.closure.}): HashSet[B] = for item in items(data): result.incl(op(item)) proc hash*[A](s: HashSet[A]): Hash = - ## Hashing of HashSet. - for h in 0 .. high(s.data): - if isFilledAndValid(s.data[h].hcode): - result = result xor s.data[h].hcode - result = !$result + hashUnordered(s) proc `$`*[A](s: HashSet[A]): string = ## Converts the set `s` to a string, mostly for logging and printing purposes. diff --git a/lib/pure/hashes.nim b/lib/pure/hashes.nim index d799fb0620c10..afefd642a7e4c 100644 --- a/lib/pure/hashes.nim +++ b/lib/pure/hashes.nim @@ -49,8 +49,58 @@ type ## always have a size of a power of two and can use the ``and`` ## operator instead of ``mod`` for truncation of the hash value. 
+type UHash = uint + +proc hashUInt64*(x: uint64): Hash {.inline.} = + ## for internal use; user code should prefer `hash` overloads + when nimvm: # in vmops + doAssert false + else: + # would be orders of magnitude worse, see thashes_perf toHighOrderBits + # hashData(cast[pointer](unsafeAddr x), type(x).sizeof) + + # would be a bit worse, see thashes_perf toInt64 + # type ByteArr = array[int64.sizeof, uint8] + # result = murmurHash(cast[ptr ByteArr](unsafeAddr x)[]) + + # inspired by https://gist.github.com/badboy/6267743#64-bit-mix-functions + var x = x + x = (not x) + (x shl 21) # x = (x shl 21) - x - 1; + x = x xor (x shr 24) + x = (x + (x shl 3)) + (x shl 8) # x * 265 + x = x xor (x shr 14) + x = (x + (x shl 2)) + (x shl 4) # x * 21 + x = x xor (x shr 28) + x = x + (x shl 31) + result = cast[Hash](x) + +proc hashUInt32*(x: uint32): Hash {.inline.} = + ## for internal use; user code should prefer `hash` overloads + # calling `hashUInt64(x)` would perform 1.7X slower, see thashes_perf toInt32 + when nimvm: # in vmops + doAssert false + else: + # inspired by https://gist.github.com/badboy/6267743 + var x = x xor ((x shr 20) xor (x shr 12)) + result = cast[Hash](x xor (x shr 7) xor (x shr 4)) + +when defined(js): + proc hash*(x: string): Hash {.noSideEffect.} + +proc nonlinearHash*(x: Hash): Hash = + when defined(js): + when nimvm: + # this could also be `hashUInt64(cast[uint64](x))` on a 32 bit machine, + # but we can't query for int.sizeof since that's hardcoded for nim js + hashUInt64(cast[uint64](x)) + else: hash($x.float) # workaround + else: + when sizeof(Hash) == sizeof(uint64): hashUInt64(cast[uint64](x)) + else: hashUInt32(cast[uint32](x)) + proc `!&`*(h: Hash, val: int): Hash {.inline.} = ## Mixes a hash value `h` with `val` to produce a new hash value. + ## Uses Jenkins hash: https://en.wikipedia.org/wiki/Jenkins_hash_function ## ## This is only needed if you need to implement a hash proc for a new datatype. 
let h = cast[uint](h) @@ -72,6 +122,8 @@ proc `!$`*(h: Hash): Hash {.inline.} = proc hashData*(data: pointer, size: int): Hash = ## Hashes an array of bytes of size `size`. + # should probably reuse `proc hash*[A](aBuf: openArray[A], sPos, ePos: int): Hash` + # which uses the better murmurhash algorithm. var h: Hash = 0 when defined(js): var p: cstring @@ -407,6 +459,64 @@ proc hash*[A](x: set[A]): Hash = result = result !& hash(it) result = !$result +template hashUnordered*(iter: untyped): Hash = + ## Hashing of unordered elements. + runnableExamples: + doAssert hashUnordered(@[10, 20]) == hashUnordered(@[20, 10]) + doAssert hashUnordered(@[10, 20]) != hashUnordered(@[11, 19]) + doAssert hashUnordered(@[10, 10]) != hashUnordered(@[11, 11]) + static: doAssert hashUnordered(@[10, 10]) != hashUnordered(@[11, 11]) + var x: seq[int] + discard hashUnordered(x) # 0 elements works + discard hashUnordered(items(x)) # iterator works + # Example use case: for HashSets, the result must be order-independent because 2 + # HashSets with different `data.len` but same elements (say after insertions + # and deletions) must hash to the same result. + # To combine individual hashes `hi = hash(si)`, we must either sort the `hi` + # (best hash properties to avoid collisions but requires allocations + sorting) + # or combine them with a commutative and associative operator; + # we also want to avoid trivial cases of bad collisions, ruling out obvious + # combiners, eg: + # `xor`: `xor(hi, hj)` is 0 if hi == hj + # `+`: trivial collisions eg @[10,20] vs @[10+1, 20-1] + # `*`: trivial collisions eg @[10,20] vs @[10 div 2, 20*2], and 0 if any input + # is 0. + # So we combine with `+` but via `nonlinearHash(hash(ai))` to mitigate such + # collisions. As a final refinement, we also add non-linear mixing with + # `len(iter)`. 
+ # + # Note: see also https://crypto.stackexchange.com/questions/54544/how-to-to-calculate-the-hash-of-an-unordered-set + # A more robust but more complex / expensive approach for this problem is + # studied here: http://people.csail.mit.edu/devadas/pubs/mhashes.pdf + when false: + # sort based approach; requires `from std/algorithm import sort` + var s2: seq[Hash] + for h in s: s2.add hash(h) + s2.sort + for h in s2: result = result !& hash(h) + result = !$result + + mixin hash + var ret: UHash # prevent checked arithmetic + var count = 0 + for ai in iter: + ret += cast[UHash](nonlinearHash(hash(ai))) + count.inc + var result = cast[Hash](ret) !& count # extra non-linear mixing with num elements + result = !$ result + result + +template hashOrdered*(iter: untyped): Hash = + ## Hashing of ordered elements. See also `hashUnordered` + mixin hash + var count = 0 + var result: Hash + for ai in iter: + result = result !& hash(ai) + count.inc + result = result !& count # extra non-linear mixing with num elements + result = !$ result + result when isMainModule: block empty: diff --git a/lib/pure/json.nim b/lib/pure/json.nim index beb3f7049bbea..fdef48a516869 100644 --- a/lib/pure/json.nim +++ b/lib/pure/json.nim @@ -424,15 +424,14 @@ proc `==`*(a, b: JsonNode): bool = if b.fields[key] != val: return false result = true -proc hash*(n: OrderedTable[string, JsonNode]): Hash {.noSideEffect.} - proc hash*(n: JsonNode): Hash = ## Compute the hash for a JSON node case n.kind of JArray: result = hash(n.elems) of JObject: - result = hash(n.fields) + # hash must be order insensitive so that it's consistent with `==` + result = hashUnordered(pairs(n.fields)) of JInt: result = hash(n.num) of JFloat: @@ -444,10 +443,10 @@ proc hash*(n: JsonNode): Hash = of JNull: result = Hash(0) -proc hash*(n: OrderedTable[string, JsonNode]): Hash = - for key, val in n: - result = result xor (hash(key) !& hash(val)) - result = !$result +proc hash*(n: OrderedTable[string, JsonNode]): Hash {.deprecated: 
"use hashes.hashUnordered".} = + # not used anymore by `hash(JsonNode)` since json fields are order insensitive + # for fields, whereas OrderedTable should be order sensitive + hashOrdered(pairs(n)) proc len*(n: JsonNode): int = ## If `n` is a `JArray`, it returns the number of elements. @@ -1488,3 +1487,17 @@ when isMainModule: doAssert not isRefSkipDistinct(MyObject) doAssert isRefSkipDistinct(MyDistinct) doAssert isRefSkipDistinct(MyOtherDistinct) + + + block: # hash + let x1 = %*{ + "a1": 1, + "a2": 2, + } + let x2 = %*{ + "a2": 2, + "a1": 1, + } + doAssert hash(x1) == hash(x2) + doAssert x1 == x2 + doAssert $x1 != $x2 diff --git a/tests/sets/tsets_various.nim b/tests/sets/tsets_various.nim index c27d8e124caae..df8572a7b974b 100644 --- a/tests/sets/tsets_various.nim +++ b/tests/sets/tsets_various.nim @@ -254,3 +254,35 @@ block: # test correctness after a number of inserts/deletes testDel(): (var t: HashSet[int]) testDel(): (var t: OrderedSet[int]) + +block: # hash(HashSet) + block: # robustness to tombstones + var a: HashSet[int] + a.incl 10 + a.excl 10 + a.incl 11 + + var a2: HashSet[int] + a2.incl 11 + doAssert a == a2 + doAssert hash(a) == hash(a2) + + block: # robustness to deletions, which can affect ordering + var a: HashSet[float] + var vals: seq[float] + for i in 0..<10: + let ai = i.float * 0.7 + vals.add ai + a.incl ai + var a2: HashSet[float] + for ai in a: a2.incl ai + let n = 1000 + for i in 0..