From 71c17f025d65383b2c95d926c92f8810f5a762f5 Mon Sep 17 00:00:00 2001
From: Timothee Cour <timothee.cour2@gmail.com>
Date: Wed, 17 Jul 2019 13:20:40 -0700
Subject: [PATCH 1/4] fix #11764: make sets 1000 times faster for pointer,
 int64, int etc

---
 lib/pure/hashes.nim | 54 +++++++++++++++++++--------------------------
 1 file changed, 23 insertions(+), 31 deletions(-)

diff --git a/lib/pure/hashes.nim b/lib/pure/hashes.nim
index 8104470e32043..989944efbabfd 100644
--- a/lib/pure/hashes.nim
+++ b/lib/pure/hashes.nim
@@ -56,6 +56,26 @@ type
 const
   IntSize = sizeof(int)
 
+proc preferStringHash*(T: typedesc): bool =
+  ## whether hashing is more efficient using `hash($x)`
+  # when string hashing is more efficient, see #11764
+  # exported so user defined hash can use this too
+  T.sizeof >= 4
+
+proc hash*(x: string): Hash
+
+proc hash*[T: SomeNumber | Ordinal | char](x: T): Hash {.inline.} =
+  ## Efficient hashing of numbers, ordinals (eg enum), char.
+  # fix #11764
+  when preferStringHash(T):
+    when T is SomeFloat:
+      # needed otherwise finite precision can cause hash duplicate
+      let x = cast[BiggestInt](x)
+    hash($x)
+  else:
+    # more efficient for small types
+    ord(x)
+
 proc `!&`*(h: Hash, val: int): Hash {.inline.} =
   ## Mixes a hash value `h` with `val` to produce a new hash value.
   ##
@@ -110,7 +130,9 @@ proc hash*(x: pointer): Hash {.inline.} =
       }
     """
   else:
-    result = cast[Hash](cast[uint](x) shr 3) # skip the alignment
+    # bugfix #11764: s/cast[Hash]/hash/
+    result = hash(cast[uint](x) shr 3) # skip the alignment
+      # CHECKME: why? isn't that responsability of caller if he needs this behavior?
 
 when not defined(booting):
   proc hash*[T: proc](x: T): Hash {.inline.} =
@@ -120,36 +142,6 @@ when not defined(booting):
     else:
       result = hash(pointer(x))
 
-proc hash*(x: int): Hash {.inline.} =
-  ## Efficient hashing of integers.
-  result = x
-
-proc hash*(x: int64): Hash {.inline.} =
-  ## Efficient hashing of `int64` integers.
-  result = toU32(x)
-
-proc hash*(x: uint): Hash {.inline.} =
-  ## Efficient hashing of unsigned integers.
-  result = cast[int](x)
-
-proc hash*(x: uint64): Hash {.inline.} =
-  ## Efficient hashing of `uint64` integers.
-  result = toU32(cast[int](x))
-
-proc hash*(x: char): Hash {.inline.} =
-  ## Efficient hashing of characters.
-  result = ord(x)
-
-proc hash*[T: Ordinal](x: T): Hash {.inline.} =
-  ## Efficient hashing of other ordinal types (e.g. enums).
-  result = ord(x)
-
-proc hash*(x: float): Hash {.inline.} =
-  ## Efficient hashing of floats.
-  var y = x + 1.0
-  result = cast[ptr Hash](addr(y))[]
-
-
 # Forward declarations before methods that hash containers. This allows
 # containers to contain other containers
 proc hash*[A](x: openArray[A]): Hash

From 0302ad4a641a07d1491cb22faa4494bd7d407d76 Mon Sep 17 00:00:00 2001
From: Timothee Cour <timothee.cour2@gmail.com>
Date: Wed, 17 Jul 2019 16:39:48 -0700
Subject: [PATCH 2/4] fix nim script test

---
 lib/pure/hashes.nim | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/pure/hashes.nim b/lib/pure/hashes.nim
index 989944efbabfd..f513e726298c3 100644
--- a/lib/pure/hashes.nim
+++ b/lib/pure/hashes.nim
@@ -130,8 +130,9 @@ proc hash*(x: pointer): Hash {.inline.} =
       }
     """
   else:
-    # bugfix #11764: s/cast[Hash]/hash/
-    result = hash(cast[uint](x) shr 3) # skip the alignment
+    # 2 bug fixes: s/cast[Hash]()/hash()/ (#11764); and also s/uint/BiggestInt/
+    # note that we can't use unsigned because nimscript doesn't have `$`(uint)
+    result = hash(cast[ByteAddress](x) shr 3) # skip the alignment
       # CHECKME: why? isn't that responsability of caller if he needs this behavior?
 
 when not defined(booting):

From 93b46c43d0be5deeea30d097939a10bab014195f Mon Sep 17 00:00:00 2001
From: Timothee Cour <timothee.cour2@gmail.com>
Date: Wed, 17 Jul 2019 21:36:09 -0700
Subject: [PATCH 3/4] avoid $x string allocation; fix another bug that could
 lead to collisions for small flaots

---
 lib/pure/hashes.nim | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/lib/pure/hashes.nim b/lib/pure/hashes.nim
index f513e726298c3..f88ec45643450 100644
--- a/lib/pure/hashes.nim
+++ b/lib/pure/hashes.nim
@@ -66,12 +66,20 @@ proc hash*(x: string): Hash
 
 proc hash*[T: SomeNumber | Ordinal | char](x: T): Hash {.inline.} =
   ## Efficient hashing of numbers, ordinals (eg enum), char.
-  # fix #11764
-  when preferStringHash(T):
+  when preferStringHash(T): # fix #11764
     when T is SomeFloat:
-      # needed otherwise finite precision can cause hash duplicate
-      let x = cast[BiggestInt](x)
-    hash($x)
+      # 0.0 vs -0.0 should map to same hash to avoid weird behavior.
+      # the only non nan value that can cause clash is 0 according to
+      # https://stackoverflow.com/questions/31087915/are-there-denormalized-floats-that-evaluate-to-the-same-value-apart-from-0-0
+      # bugfix: the previous code was using `x = x + 1.0` (presumably for
+      # handling negative 0), however this doesn't work well for small inputs
+      # because `x+1.0` can become 0 with floating point accuracy, which
+      # leads to hash collisions.
+      # Note: this hit this bug: #11775:
+      # `let x = if x == 0.0: 0.0 else: x`
+      var x = x
+      if x == 0: x = 0
+    hashData(cast[pointer](unsafeAddr x), T.sizeof)
   else:
     # more efficient for small types
     ord(x)
@@ -99,6 +107,9 @@ proc `!$`*(h: Hash): Hash {.inline.} =
 
 proc hashData*(data: pointer, size: int): Hash =
   ## Hashes an array of bytes of size `size`.
+  # this should probably be merged/refactored with
+  # `proc hash*[A](aBuf: openArray[A], sPos, ePos: int): Hash =` to avoid
+  # using 2 different algorithms
   var h: Hash = 0
   when defined(js):
     var p: cstring

From a4eb96d2cacd9c1c43f8311e173a6d97d8eb4193 Mon Sep 17 00:00:00 2001
From: Timothee Cour <timothee.cour2@gmail.com>
Date: Thu, 18 Jul 2019 02:06:37 -0700
Subject: [PATCH 4/4] fix tests using hashBiggestIntVM vm callback

---
 compiler/vmops.nim                   |  4 ++
 lib/pure/hashes.nim                  | 66 ++++++++++++++++------------
 tests/collections/ttables.nim        |  7 +--
 tests/collections/ttablesthreads.nim |  2 +-
 tests/vm/tcompiletimetable.nim       | 26 +++++++++++
 5 files changed, 73 insertions(+), 32 deletions(-)

diff --git a/compiler/vmops.nim b/compiler/vmops.nim
index 3bd931b3930f1..f917700427a6f 100644
--- a/compiler/vmops.nim
+++ b/compiler/vmops.nim
@@ -16,6 +16,7 @@ from math import sqrt, ln, log10, log2, exp, round, arccos, arcsin,
 from os import getEnv, existsEnv, dirExists, fileExists, putEnv, walkDir, getAppFilename
 from md5 import getMD5
 from sighashes import symBodyDigest
+from std/hashes import hashBiggestInt
 
 template mathop(op) {.dirty.} =
   registerCallback(c, "stdlib.math." & astToStr(op), `op Wrapper`)
@@ -144,3 +145,6 @@ proc registerAdditionalOps*(c: PCtx) =
     let n = getNode(a, 0)
     if n.kind != nkSym: raise newException(ValueError, "node is not a symbol")
     setResult(a, $symBodyDigest(c.graph, n.sym))
+
+  registerCallback c, "stdlib.hashes.hashBiggestIntVM", proc (a: VmArgs) {.nimcall.} =
+    a.setResult hashBiggestInt(getInt(a, 0))
diff --git a/lib/pure/hashes.nim b/lib/pure/hashes.nim
index f88ec45643450..548296a38d17b 100644
--- a/lib/pure/hashes.nim
+++ b/lib/pure/hashes.nim
@@ -56,38 +56,11 @@ type
 const
   IntSize = sizeof(int)
 
-proc preferStringHash*(T: typedesc): bool =
-  ## whether hashing is more efficient using `hash($x)`
-  # when string hashing is more efficient, see #11764
-  # exported so user defined hash can use this too
-  T.sizeof >= 4
-
-proc hash*(x: string): Hash
-
-proc hash*[T: SomeNumber | Ordinal | char](x: T): Hash {.inline.} =
-  ## Efficient hashing of numbers, ordinals (eg enum), char.
-  when preferStringHash(T): # fix #11764
-    when T is SomeFloat:
-      # 0.0 vs -0.0 should map to same hash to avoid weird behavior.
-      # the only non nan value that can cause clash is 0 according to
-      # https://stackoverflow.com/questions/31087915/are-there-denormalized-floats-that-evaluate-to-the-same-value-apart-from-0-0
-      # bugfix: the previous code was using `x = x + 1.0` (presumably for
-      # handling negative 0), however this doesn't work well for small inputs
-      # because `x+1.0` can become 0 with floating point accuracy, which
-      # leads to hash collisions.
-      # Note: this hit this bug: #11775:
-      # `let x = if x == 0.0: 0.0 else: x`
-      var x = x
-      if x == 0: x = 0
-    hashData(cast[pointer](unsafeAddr x), T.sizeof)
-  else:
-    # more efficient for small types
-    ord(x)
-
 proc `!&`*(h: Hash, val: int): Hash {.inline.} =
   ## Mixes a hash value `h` with `val` to produce a new hash value.
   ##
   ## This is only needed if you need to implement a hash proc for a new datatype.
+  ## Uses Jenkins hash: https://en.wikipedia.org/wiki/Jenkins_hash_function
   let h = cast[uint](h)
   let val = cast[uint](val)
   var res = h + val
@@ -124,6 +97,43 @@ proc hashData*(data: pointer, size: int): Hash =
     dec(s)
   result = !$h
 
+proc hashBiggestIntVM(x: BiggestInt): Hash = discard # in vmops
+
+proc hashBiggestInt*(x: BiggestInt): Hash {.inline.} =
+  ## for internal use; user code should prefer `hash` overloads
+  when nimvm: hashBiggestIntVM(x)
+  else: hashData(cast[pointer](unsafeAddr x), type(x).sizeof)
+
+proc hash*[T: SomeNumber | Ordinal | char](x: T): Hash {.inline.} =
+  ## Efficient hashing of numbers, ordinals (eg enum), char.
+  when T.sizeof >= 4:
+    # fix #11764: `ord(x)`, `toU32(x)` or similar are up to 4X faster to compute
+    # compared to jenkins `hashData` but result in very poor hashes, leading to
+    # collisions; this can lead to several order magnitude (eg 1e3) slowdowns
+    # e.g. when used in hash tables, so we prefer to use slower to compute good
+    # hashes here. Murmur3 would improve speed of hash computation.
+    when T is SomeFloat:
+      # 0.0 vs -0.0 should map to same hash to avoid weird behavior.
+      # the only non nan value that can cause clash is 0 according to
+      # https://stackoverflow.com/questions/31087915/are-there-denormalized-floats-that-evaluate-to-the-same-value-apart-from-0-0
+      # bugfix: the previous code was using `x = x + 1.0` (presumably for
+      # handling negative 0), however this leads to collisions for small x due
+      # to FP finite precision.
+      let x: BiggestInt =
+        if x == 0: 0.BiggestInt
+        else:
+          when sizeof(BiggestInt) == sizeof(T):
+            cast[BiggestInt](x)
+          else: # for nimvm
+            cast[int32](x).BiggestInt
+    else:
+      let x = x.BiggestInt
+    hashBiggestInt(x)
+  else:
+    # empirically better for small types, the collision risk is limited anyway
+    # due to cardinality of at most 2^16=65536
+    ord(x)
+
 when defined(js):
   var objectID = 0
 
diff --git a/tests/collections/ttables.nim b/tests/collections/ttables.nim
index 0a5a013679aaf..66db5f79af323 100644
--- a/tests/collections/ttables.nim
+++ b/tests/collections/ttables.nim
@@ -165,9 +165,10 @@ block tableconstr:
 block ttables2:
   proc TestHashIntInt() =
     var tab = initTable[int,int]()
-    for i in 1..1_000_000:
+    const n = 1_000_000 # bottleneck: 50 seconds on OSX in debug mode
+    for i in 1..n:
       tab[i] = i
-    for i in 1..1_000_000:
+    for i in 1..n:
       var x = tab[i]
       if x != i : echo "not found ", i
 
@@ -233,7 +234,7 @@ block tablesref:
       for y in 0..1:
         assert t[(x,y)] == $x & $y
     assert($t ==
-      "{(x: 0, y: 1): \"01\", (x: 0, y: 0): \"00\", (x: 1, y: 0): \"10\", (x: 1, y: 1): \"11\"}")
+      """{(x: 0, y: 1): "01", (x: 1, y: 0): "10", (x: 0, y: 0): "00", (x: 1, y: 1): "11"}""")
 
   block tableTest2:
     var t = newTable[string, float]()
diff --git a/tests/collections/ttablesthreads.nim b/tests/collections/ttablesthreads.nim
index 7fe4c79b13474..a17627fec185e 100644
--- a/tests/collections/ttablesthreads.nim
+++ b/tests/collections/ttablesthreads.nim
@@ -48,7 +48,7 @@ block tableTest1:
     for y in 0..1:
       assert t[(x,y)] == $x & $y
   assert($t ==
-    "{(x: 0, y: 1): \"01\", (x: 0, y: 0): \"00\", (x: 1, y: 0): \"10\", (x: 1, y: 1): \"11\"}")
+    """{(x: 0, y: 0): "00", (x: 1, y: 0): "10", (x: 0, y: 1): "01", (x: 1, y: 1): "11"}""")
 
 block tableTest2:
   var t = initTable[string, float]()
diff --git a/tests/vm/tcompiletimetable.nim b/tests/vm/tcompiletimetable.nim
index e78c06536cc7d..11fbc58db2b52 100644
--- a/tests/vm/tcompiletimetable.nim
+++ b/tests/vm/tcompiletimetable.nim
@@ -47,3 +47,29 @@ addStuff("Hey"): echo "Hey"
 addStuff("Hi"): echo "Hi"
 dump()
 
+import std/hashes
+block:
+  # check CT vs RT produces same results for Table
+  template callFun(T) =
+    block:
+      proc fun(): string =
+        var t: Table[T, string]
+        let n = 10
+        for i in 0..<n:
+          let i2 = when T.sizeof == type(i).sizeof: i else: i.int32
+          let k = cast[T](i2)
+            # cast intentional for regression testing,
+            # producing small values
+          doAssert k notin t
+          t[k] = $(i, k)
+          doAssert k in t
+        $t
+      const s1 = fun()
+      let s2 = fun()
+      # echo s1 # for debugging
+      doAssert s1 == s2
+      doAssert s1 == s2
+      doAssert hash(0.0) == hash(-0.0)
+  callFun(float)
+  callFun(float32)
+  callFun(int64)