Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix hash(HashSet) which was incorrect; fix hash(OrderedTable[string, JsonNode]) which was bad #13649

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion compiler/vmops.nim
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ from md5 import getMD5
from sighashes import symBodyDigest
from times import cpuTime

from hashes import hash
from hashes import hash, hashUInt64, hashUInt32

template mathop(op) {.dirty.} =
registerCallback(c, "stdlib.math." & astToStr(op), `op Wrapper`)
Expand Down Expand Up @@ -236,6 +236,11 @@ proc registerAdditionalOps*(c: PCtx) =
registerCallback c, "stdlib.hashes.hashVmImplByte", hashVmImplByte
registerCallback c, "stdlib.hashes.hashVmImplChar", hashVmImplByte

registerCallback c, "stdlib.hashes.hashUInt64", proc (a: VmArgs) {.nimcall.} =
a.setResult hashUInt64(cast[uint64](getInt(a, 0)))
registerCallback c, "stdlib.hashes.hashUInt32", proc (a: VmArgs) {.nimcall.} =
a.setResult hashUInt32(cast[uint32](getInt(a, 0)))

if optBenchmarkVM in c.config.globalOptions:
wrap0(cpuTime, timesop)
else:
Expand Down
6 changes: 1 addition & 5 deletions lib/pure/collections/sets.nim
Original file line number Diff line number Diff line change
Expand Up @@ -572,11 +572,7 @@ proc map*[A, B](data: HashSet[A], op: proc (x: A): B {.closure.}): HashSet[B] =
for item in items(data): result.incl(op(item))

proc hash*[A](s: HashSet[A]): Hash =
## Hashing of HashSet.
for h in 0 .. high(s.data):
if isFilledAndValid(s.data[h].hcode):
result = result xor s.data[h].hcode
result = !$result
hashUnordered(s)

proc `$`*[A](s: HashSet[A]): string =
## Converts the set `s` to a string, mostly for logging and printing purposes.
Expand Down
110 changes: 110 additions & 0 deletions lib/pure/hashes.nim
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,58 @@ type
## always have a size of a power of two and can use the ``and``
## operator instead of ``mod`` for truncation of the hash value.

type UHash = uint

proc hashUInt64*(x: uint64): Hash {.inline.} =
## for internal use; user code should prefer `hash` overloads
when nimvm: # in vmops
doAssert false
else:
# would be orders of magnitude worse, see thashes_perf toHighOrderBits
# hashData(cast[pointer](unsafeAddr x), type(x).sizeof)

# would a bit worse, see thashes_perf toInt64
# type ByteArr = array[int64.sizeof, uint8]
# result = murmurHash(cast[ptr ByteArr](unsafeAddr x)[])

# inspired from https://gist.github.com/badboy/6267743#64-bit-mix-functions
var x = x
x = (not x) + (x shl 21) # x = (x shl 21) - x - 1;
x = x xor (x shr 24)
x = (x + (x shl 3)) + (x shl 8) # x * 265
x = x xor (x shr 14)
x = (x + (x shl 2)) + (x shl 4) # x * 21
x = x xor (x shr 28)
x = x + (x shl 31)
result = cast[Hash](x)

proc hashUInt32*(x: uint32): Hash {.inline.} =
## for internal use; user code should prefer `hash` overloads
# calling `hashUInt64(x)` would perform 1.7X slower, see thashes_perf toInt32
when nimvm: # in vmops
doAssert false
else:
# inspired from https://gist.github.com/badboy/6267743
var x = x xor ((x shr 20) xor (x shr 12))
result = cast[Hash](x xor (x shr 7) xor (x shr 4))

when defined(js):
proc hash*(x: string): Hash {.noSideEffect.}

proc nonlinearHash*(x: Hash): Hash =
when defined(js):
when nimvm:
# this could also be `hashUInt64(cast[uint64](x))` on a 32 bit machine,
# but we can't query for int.sizeof since that's hardcoded for nim js
hashUInt64(cast[uint64](x))
else: hash($x.float) # workaround
else:
when sizeof(Hash) == sizeof(uint64): hashUInt64(cast[uint64](x))
else: hashUInt32(cast[uint32](x))

proc `!&`*(h: Hash, val: int): Hash {.inline.} =
## Mixes a hash value `h` with `val` to produce a new hash value.
## Uses Jenkins hash: https://en.wikipedia.org/wiki/Jenkins_hash_function
##
## This is only needed if you need to implement a hash proc for a new datatype.
let h = cast[uint](h)
Expand All @@ -72,6 +122,8 @@ proc `!$`*(h: Hash): Hash {.inline.} =

proc hashData*(data: pointer, size: int): Hash =
## Hashes an array of bytes of size `size`.
# should probably reuse `proc hash*[A](aBuf: openArray[A], sPos, ePos: int): Hash`
# which uses better murmurhash algorithm.
var h: Hash = 0
when defined(js):
var p: cstring
Expand Down Expand Up @@ -407,6 +459,64 @@ proc hash*[A](x: set[A]): Hash =
result = result !& hash(it)
result = !$result

template hashUnordered*(iter: untyped): Hash =
## Hashing of unordered elements.
runnableExamples:
doAssert hashUnordered(@[10, 20]) == hashUnordered(@[20, 10])
doAssert hashUnordered(@[10, 20]) != hashUnordered(@[11, 19])
doAssert hashUnordered(@[10, 10]) != hashUnordered(@[11, 11])
static: doAssert hashUnordered(@[10, 10]) != hashUnordered(@[11, 11])
var x: seq[int]
discard hashUnordered(x) # 0 elements works
discard hashUnordered(items(x)) # iterator works
# Example use case: for HashSet's, the result must be order-independant because 2
# HashSet's with different `data.len` but same elements (say after insertions
# and deletions) must hash to the same result.
# To combine individual hashes `hi = hash(si)`, we must either sort the `hi`
# (best hash properties to avoid collisions but requires allocations + sorting)
# or combine them with a commutative and associative operator;
# we also want to avoid trivial cases of bad collisions, ruling out obvious
# combiners, eg:
# `xor`: `xor(hi, hj)` is 0 if hi == hj
# `+`: trivial collisions eg @[10,20] vs @[10+1, 20-1]
# `*`: trivial collisions eg @[10,20] vs @[10 div 2, 20*2], and 0 if any input
# is 0.
# So we combine with `+` but via `nonlinearHash(hash(ai))` to mitigate such
# collisions. As a final refinement, we also add non-linear mixing with
# `len(iter)`.
#
# Note: see also https://crypto.stackexchange.com/questions/54544/how-to-to-calculate-the-hash-of-an-unordered-set
# A more robust but more complex / expensive approach for this problem is
# studied here: http://people.csail.mit.edu/devadas/pubs/mhashes.pdf
when false:
# sort based approach; requires `from std/algorithm import sort`
var s2: seq[Hash]
for h in s: s2.add hash(h)
s2.sort
for h in s2: result = result !& hash(h)
result = !$result

mixin hash
var ret: UHash # prevent checked arithmetics
var count = 0
for ai in iter:
ret += cast[UHash](nonlinearHash(hash(ai)))
count.inc
var result = cast[Hash](ret) !& count # extra non-linear mixing with num elements
result = !$ result
result

template hashOrdered*(iter: untyped): Hash =
## Hashing of ordered elements. See also `hashUnordered`
mixin hash
var count = 0
var result: Hash
for ai in iter:
result = result !& hash(ai)
count.inc
result = result !& count # extra non-linear mixing with num elements
result = !$ result
result

when isMainModule:
block empty:
Expand Down
27 changes: 20 additions & 7 deletions lib/pure/json.nim
Original file line number Diff line number Diff line change
Expand Up @@ -424,15 +424,14 @@ proc `==`*(a, b: JsonNode): bool =
if b.fields[key] != val: return false
result = true

proc hash*(n: OrderedTable[string, JsonNode]): Hash {.noSideEffect.}

proc hash*(n: JsonNode): Hash =
## Compute the hash for a JSON node
case n.kind
of JArray:
result = hash(n.elems)
of JObject:
result = hash(n.fields)
# hash must order insensitive so that it's consistent with `==`
result = hashUnordered(pairs(n.fields))
of JInt:
result = hash(n.num)
of JFloat:
Expand All @@ -444,10 +443,10 @@ proc hash*(n: JsonNode): Hash =
of JNull:
result = Hash(0)

proc hash*(n: OrderedTable[string, JsonNode]): Hash =
for key, val in n:
result = result xor (hash(key) !& hash(val))
result = !$result
proc hash*(n: OrderedTable[string, JsonNode]): Hash {.deprecated: "use hashes.hashUnordered".} =
# not used anymore by `hash(JsonNode)` since json fields are order insensitive
# for fields, whereas OrderedTable should be order sensitive
hashOrdered(pairs(n))

proc len*(n: JsonNode): int =
## If `n` is a `JArray`, it returns the number of elements.
Expand Down Expand Up @@ -1488,3 +1487,17 @@ when isMainModule:
doAssert not isRefSkipDistinct(MyObject)
doAssert isRefSkipDistinct(MyDistinct)
doAssert isRefSkipDistinct(MyOtherDistinct)


block: # hash
let x1 = %*{
"a1": 1,
"a2": 2,
}
let x2 = %*{
"a2": 2,
"a1": 1,
}
doAssert hash(x1) == hash(x2)
doAssert x1 == x2
doAssert $x1 != $x2
32 changes: 32 additions & 0 deletions tests/sets/tsets_various.nim
Original file line number Diff line number Diff line change
Expand Up @@ -254,3 +254,35 @@ block: # test correctness after a number of inserts/deletes

testDel(): (var t: HashSet[int])
testDel(): (var t: OrderedSet[int])

block: # hash(HashSet)
block: # robustness to tombstones
var a: HashSet[int]
a.incl 10
a.excl 10
a.incl 11

var a2: HashSet[int]
a2.incl 11
doAssert a == a2
doAssert hash(a) == hash(a2)

block: # robustness to deletions, which can affect ordering
var a: HashSet[float]
var vals: seq[float]
for i in 0..<10:
let ai = i.float * 0.7
vals.add ai
a.incl ai
var a2: HashSet[float]
for ai in a: a2.incl ai
let n = 1000
for i in 0..<n:
let ai = i.float * 0.31
a2.incl ai
for i in 0..<n:
let ai = i.float * 0.31
if ai notin vals:
a2.excl ai
doAssert a == a2
doAssert hash(a) == hash(a2)