From 0e24c5e09d825466ada63ffe861ea9b34f98cf7c Mon Sep 17 00:00:00 2001 From: Charles Blake Date: Thu, 19 Nov 2020 14:33:19 -0500 Subject: [PATCH 01/16] Speed up parsejson 3.25x (with a gcc-10.2 PGO build) on number heavy input. Also add a couple convenience iterators and update `changelo.md`. The basic idea here is to process the ASCII just once in a combination validation/classification/parse rather than validating/classifying a number in `parsejson.parseNumber`, then doing the same work again in `parseFloat`/`parseInt` and finally doing it a 3rd time in the C library `strtod`/etc. The last is especially slow on some libc's/common CPUs due to `long double`/80-bit extended precision arithmetic on each digit. In any event, the new fully functional `parseNumber` is not actually much more code than the old classifying-only `parseNumber`. Because we aim to do this work just once and the output of the work is an int64|float, this PR adds those exported fields to `JsonParser`. (It also documents two existing but undocumented fields.) One simple optimization done here is pow10. It uses a small lookup table for the power of 10 multiplier. The full 2048*8B = 16 KiB one is bigger than is useful. In truth most numbers in any given run will likely be of similar orders of magnitude, meaning the cache cost would not be heavy, but probably best to not rely only likelihood. So fall back to a fast integer-exponentiation algorithm when out of range. The new `parseNumber` itself is more or less a straight-line parse of scientific notation where the '.' can be anywhere in the number. To do the right power-of-10 scaling later, we need to bookkeep a little more than the old `parseNumber` as noted in side comments in the code. Note that calling code that always wants a `float` even if the textual representation looks like an integer can do something like `if p.tok == tkInt: float(p.i) else: p.f` or similar, depending on context. Note that since we form the final float by integer * powerOf10 and since powers of 10 have some intrinsic binary representational error and since some alternative approaches (on just some CPUs) use 80-bit floats, it is possible for the number to mismatch those other approaches in the least significant mantissa bit (or for the ASCII->binary->ASCII round-trip to not be the identity). On those CPUs only, better matching results can maybe be gotten from an `emit` using `long double` if desired (also with a long double table for powers of 10 and the powers of 10 calculation). This strikes me as unlikely to be truly needed long-term, though. --- changelog.md | 5 ++ lib/pure/parsejson.nim | 161 ++++++++++++++++++++++++++++++----------- 2 files changed, 125 insertions(+), 41 deletions(-) diff --git a/changelog.md b/changelog.md index d53f5b0381cc7..e7ba39e80eb4e 100644 --- a/changelog.md +++ b/changelog.md @@ -22,6 +22,11 @@ literals remain in the "raw" string form so that client code can easily treat small and large numbers uniformly. +- The parsejson module now combines number validation & parsing to net a speed + up of over 2x on number heavy inputs. It also allows *not* retaining origin + strings for integers & floats for another 1.5x speed up (over 3X overall). + A couple convenience iterators were also added. + - Added `randState` template that exposes the default random number generator. Useful for library authors. diff --git a/lib/pure/parsejson.nim b/lib/pure/parsejson.nim index 18e6037f3cf64..d1364504977b2 100644 --- a/lib/pure/parsejson.nim +++ b/lib/pure/parsejson.nim @@ -63,13 +63,15 @@ type stateExpectObjectComma, stateExpectColon, stateExpectValue JsonParser* = object of BaseLexer ## the parser object. - a*: string - tok*: TokKind + a*: string ## last valid string + i*: int64 ## last valid integer + f*: float ## last valid float + tok*: TokKind ## current token kind kind: JsonEventKind err: JsonError state: seq[ParserState] filename: string - rawStringLiterals: bool + rawStringLiterals, strIntegers, strFloats: bool JsonKindError* = object of ValueError ## raised by the ``to`` macro if the ## JSON kind is incorrect. @@ -102,17 +104,25 @@ const ] proc open*(my: var JsonParser, input: Stream, filename: string; - rawStringLiterals = false) = + rawStringLiterals = false, strIntegers = true, strFloats = true) = ## initializes the parser with an input stream. `Filename` is only used ## for nice error messages. If `rawStringLiterals` is true, string literals ## are kept with their surrounding quotes and escape sequences in them are - ## left untouched too. + ## left untouched too. If `strIntegers` is true, the `a` field is set to the + ## substring used to build the integer `i`. If `strFloats` is true, the `a` + ## field is set to the substring used to build the float `f`. These are + ## distinct from `rawFloats` & `rawIntegers` in `json` module, but must be + ## true for those raw* forms to work correctly. Parsing is about 1.5x faster + ## with all 3 flags false vs. all true, but false `str` defaults are needed + ## for backward compatibility. lexbase.open(my, input) my.filename = filename my.state = @[stateStart] my.kind = jsonError my.a = "" my.rawStringLiterals = rawStringLiterals + my.strIntegers = strIntegers + my.strFloats = strFloats proc close*(my: var JsonParser) {.inline.} = ## closes the parser `my` and its associated input stream. @@ -317,35 +327,79 @@ proc skip(my: var JsonParser) = break my.bufpos = pos -proc parseNumber(my: var JsonParser) = - var pos = my.bufpos - if my.buf[pos] == '-': - add(my.a, '-') - inc(pos) - if my.buf[pos] == '.': - add(my.a, "0.") - inc(pos) - else: - while my.buf[pos] in Digits: - add(my.a, my.buf[pos]) - inc(pos) - if my.buf[pos] == '.': - add(my.a, '.') - inc(pos) - # digits after the dot: - while my.buf[pos] in Digits: - add(my.a, my.buf[pos]) - inc(pos) - if my.buf[pos] in {'E', 'e'}: - add(my.a, my.buf[pos]) - inc(pos) - if my.buf[pos] in {'+', '-'}: - add(my.a, my.buf[pos]) - inc(pos) - while my.buf[pos] in Digits: - add(my.a, my.buf[pos]) - inc(pos) - my.bufpos = pos +proc i64(c: char): int64 {.inline.} = int64(ord(c) - ord('0')) + +proc pow10(e: int64): float {.inline.} = + const p10 = [1e-22, 1e-21, 1e-20, 1e-19, 1e-18, 1e-17, 1e-16, 1e-15, 1e-14, + 1e-13, 1e-12, 1e-11, 1e-10, 1e-09, 1e-08, 1e-07, 1e-06, 1e-05, + 1e-4, 1e-3, 1e-2, 1e-1, 1.0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, + 1e8, 1e9] # 4*64B cache lines = 32 slots + if -22 <= e and e <= 9: + return p10[e + 22] # common case=small table lookup + result = 1.0 + var base = 10.0 + var e = e + if e < 0: + e = -e + base = 0.1 + while e != 0: + if (e and 1) != 0: + result *= base + e = e shr 1 + base *= base + +proc parseNumber(my: var JsonParser): TokKind {.inline.} = + # Parse/validate/classify all at once, both setting & returning token kind + # and, if not `tkError`, leaving the binary numeric answer in `my.[if]`. + const Sign = {'+', '-'} # NOTE: `parseFloat` can generalize this to INF/NAN. + var i = my.bufpos # NUL ('\0') terminated + var noDot = false + var exp = 0'i64 + var p10 = 0 + var pnt = -1 # find '.' (point); do digits + var nD = 0 + my.i = 0'i64 # build my.i up from zero.. + if my.buf[i] in Sign: + i.inc # skip optional sign + while my.buf[i] != '\0': # ..and track scale/pow10. + if my.buf[i] notin Digits: + if my.buf[i] != '.' or pnt >= 0: + break # a second '.' is forbidden + pnt = nD # save location of '.' (point) + nD.dec # undo loop's nD.inc + elif nD < 20: # 2**63==9.2e18 => 19 digits ok + my.i = 10 * my.i + my.buf[i].i64 # core ASCII->binary transform + else: # 20+ digits before decimal + p10.inc # any digit moves implicit '.' + i.inc + nD.inc + if my.buf[my.bufpos] == '-': + my.i = -my.i # adjust sign + if pnt < 0: # never saw '.' + pnt = nD; noDot = true # so set to number of digits + elif nD == 1: + return tkError # ONLY "[+-]*\.*" + if my.buf[i] in {'E', 'e'}: # optional exponent + i.inc + let i0 = i + if my.buf[i] in Sign: + i.inc # skip optional sign + while my.buf[i] in Digits: # build exponent + exp = 10 * exp + my.buf[i].i64 + i.inc + if my.buf[i0] == '-': + exp = -exp # adjust sign + elif noDot: # and my.i < (1'i64 shl 53'i64) ? # No '.' & No [Ee]xponent + my.bufpos = i + if my.strIntegers: + my.a = my.buf[my.bufpos.. Date: Fri, 20 Nov 2020 03:08:44 -0500 Subject: [PATCH 02/16] Make the default mode quite a bit faster with setLen+copyMem instead of slice assignment. Within 1.03x of non-copy mode in a PGO build (1.10x if the caller must save the last valid string). One might think that this argues for ditching `strFloats` & `strIntegers` entirely and just always copying. In some sense, it does. However, the non-copy mode is a useful common case here, as shown in `jsonTokens(string)` example code. So, it is a bit of a judgement call whether supporting that last 10% of perf in a useful common case matters. --- lib/pure/parsejson.nim | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/pure/parsejson.nim b/lib/pure/parsejson.nim index d1364504977b2..e8d8b9ceaae51 100644 --- a/lib/pure/parsejson.nim +++ b/lib/pure/parsejson.nim @@ -392,12 +392,14 @@ proc parseNumber(my: var JsonParser): TokKind {.inline.} = elif noDot: # and my.i < (1'i64 shl 53'i64) ? # No '.' & No [Ee]xponent my.bufpos = i if my.strIntegers: - my.a = my.buf[my.bufpos.. Date: Fri, 20 Nov 2020 04:09:38 -0500 Subject: [PATCH 03/16] NimScript uses `json` and thus `parsejson`, but does not support `copyMem`. --- lib/pure/parsejson.nim | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/lib/pure/parsejson.nim b/lib/pure/parsejson.nim index e8d8b9ceaae51..27abbef34f1a7 100644 --- a/lib/pure/parsejson.nim +++ b/lib/pure/parsejson.nim @@ -392,14 +392,20 @@ proc parseNumber(my: var JsonParser): TokKind {.inline.} = elif noDot: # and my.i < (1'i64 shl 53'i64) ? # No '.' & No [Ee]xponent my.bufpos = i if my.strIntegers: - my.a.setLen i - my.bufpos - copyMem my.a[0].addr, my.buf[my.bufpos].addr, i - my.bufpos + when nimvm: + my.a = my.buf[my.bufpos.. Date: Fri, 20 Nov 2020 04:14:37 -0500 Subject: [PATCH 04/16] Gack. I guess one needs defined(nimscript) not nimvm. (Someone should make `copyMem` just work everywhere, IMO.) --- lib/pure/parsejson.nim | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/pure/parsejson.nim b/lib/pure/parsejson.nim index 27abbef34f1a7..382a8ecc4ddbe 100644 --- a/lib/pure/parsejson.nim +++ b/lib/pure/parsejson.nim @@ -392,7 +392,7 @@ proc parseNumber(my: var JsonParser): TokKind {.inline.} = elif noDot: # and my.i < (1'i64 shl 53'i64) ? # No '.' & No [Ee]xponent my.bufpos = i if my.strIntegers: - when nimvm: + when defined(nimscript): my.a = my.buf[my.bufpos.. Date: Fri, 20 Nov 2020 04:35:24 -0500 Subject: [PATCH 05/16] Ok. This jsOrVmBlock works for `streams` so it should work here. Also, extract mess into a template. --- lib/pure/parsejson.nim | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/lib/pure/parsejson.nim b/lib/pure/parsejson.nim index 382a8ecc4ddbe..4ce1079d6eaaf 100644 --- a/lib/pure/parsejson.nim +++ b/lib/pure/parsejson.nim @@ -327,6 +327,25 @@ proc skip(my: var JsonParser) = break my.bufpos = pos +template jsOrVmBlock(caseJsOrVm, caseElse: untyped): untyped = + when nimvm: + block: + caseJsOrVm + else: + block: + when defined(js) or defined(nimscript): + # nimscript has to be here to avoid semantic checking of caseElse + caseJsOrVm + else: + caseElse + +template doCopy(a, b, start, endp1: untyped): untyped = + jsOrVmBlock: + a = b[start ..< endp1] + do: + a.setLen endp1 - start + copyMem a[0].addr, b[start].addr, endp1 - start + proc i64(c: char): int64 {.inline.} = int64(ord(c) - ord('0')) proc pow10(e: int64): float {.inline.} = @@ -391,21 +410,11 @@ proc parseNumber(my: var JsonParser): TokKind {.inline.} = exp = -exp # adjust sign elif noDot: # and my.i < (1'i64 shl 53'i64) ? # No '.' & No [Ee]xponent my.bufpos = i - if my.strIntegers: - when defined(nimscript): - my.a = my.buf[my.bufpos.. Date: Fri, 20 Nov 2020 04:54:16 -0500 Subject: [PATCH 06/16] Fix empty container problem. --- lib/pure/parsejson.nim | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/pure/parsejson.nim b/lib/pure/parsejson.nim index 4ce1079d6eaaf..5d191c5a9e66f 100644 --- a/lib/pure/parsejson.nim +++ b/lib/pure/parsejson.nim @@ -343,8 +343,10 @@ template doCopy(a, b, start, endp1: untyped): untyped = jsOrVmBlock: a = b[start ..< endp1] do: - a.setLen endp1 - start - copyMem a[0].addr, b[start].addr, endp1 - start + let n = endp1 - start + if n > 0: + a.setLen n + copyMem a[0].addr, b[start].addr, n proc i64(c: char): int64 {.inline.} = int64(ord(c) - ord('0')) From 05a735085143d6778e81ffdfea67351f93ebe9a6 Mon Sep 17 00:00:00 2001 From: Charles Blake Date: Fri, 20 Nov 2020 05:37:35 -0500 Subject: [PATCH 07/16] Have `getInt` & `getFloat` use the (also exported) always populated fields. --- lib/pure/parsejson.nim | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/pure/parsejson.nim b/lib/pure/parsejson.nim index 5d191c5a9e66f..b09d1535f77b5 100644 --- a/lib/pure/parsejson.nim +++ b/lib/pure/parsejson.nim @@ -137,12 +137,12 @@ proc str*(my: JsonParser): string {.inline.} = proc getInt*(my: JsonParser): BiggestInt {.inline.} = ## returns the number for the event: ``jsonInt`` assert(my.kind == jsonInt) - return parseBiggestInt(my.a) + return cast[BiggestInt](my.i) # A no-op unless BiggestInt changes proc getFloat*(my: JsonParser): float {.inline.} = ## returns the number for the event: ``jsonFloat`` assert(my.kind == jsonFloat) - return parseFloat(my.a) + return my.f proc kind*(my: JsonParser): JsonEventKind {.inline.} = ## returns the current event type for the JSON parser From 136fe92ac19e92a34f9f3c74f5e27f66336c01a4 Mon Sep 17 00:00:00 2001 From: Charles Blake Date: Fri, 20 Nov 2020 05:54:50 -0500 Subject: [PATCH 08/16] Fix assertion; `kind = jsonError` here. --- lib/pure/parsejson.nim | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/pure/parsejson.nim b/lib/pure/parsejson.nim index b09d1535f77b5..c3bf3ec38b548 100644 --- a/lib/pure/parsejson.nim +++ b/lib/pure/parsejson.nim @@ -136,12 +136,12 @@ proc str*(my: JsonParser): string {.inline.} = proc getInt*(my: JsonParser): BiggestInt {.inline.} = ## returns the number for the event: ``jsonInt`` - assert(my.kind == jsonInt) + assert(my.tok == tkInt) return cast[BiggestInt](my.i) # A no-op unless BiggestInt changes proc getFloat*(my: JsonParser): float {.inline.} = ## returns the number for the event: ``jsonFloat`` - assert(my.kind == jsonFloat) + assert(my.tok == tkFloat) return my.f proc kind*(my: JsonParser): JsonEventKind {.inline.} = From 7fd9287f36334779391ad47e1dddd362f3ae07b9 Mon Sep 17 00:00:00 2001 From: Charles Blake Date: Fri, 20 Nov 2020 05:55:53 -0500 Subject: [PATCH 09/16] These two spots should always have used the `get(Int|Float)` abstractions. --- lib/pure/json.nim | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/pure/json.nim b/lib/pure/json.nim index 063fad8b45d55..213349758f6b9 100644 --- a/lib/pure/json.nim +++ b/lib/pure/json.nim @@ -813,7 +813,7 @@ proc parseJson(p: var JsonParser; rawIntegers, rawFloats: bool): JsonNode = result = newJRawNumber(p.a) else: try: - result = newJInt(parseBiggestInt(p.a)) + result = newJInt(p.getInt) except ValueError: result = newJRawNumber(p.a) discard getTok(p) @@ -822,7 +822,7 @@ proc parseJson(p: var JsonParser; rawIntegers, rawFloats: bool): JsonNode = result = newJRawNumber(p.a) else: try: - result = newJFloat(parseFloat(p.a)) + result = newJFloat(p.getFloat) except ValueError: result = newJRawNumber(p.a) discard getTok(p) From 5b98112bece2b9bbe18e2b427469bc0b495dc0d1 Mon Sep 17 00:00:00 2001 From: Charles Blake Date: Fri, 20 Nov 2020 06:24:01 -0500 Subject: [PATCH 10/16] Fix round-trip in test being of by just the least significant bit. --- tests/stdlib/tjsonutils.nim | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/stdlib/tjsonutils.nim b/tests/stdlib/tjsonutils.nim index 28f05ecbe0f4c..40290cf4ffc59 100644 --- a/tests/stdlib/tjsonutils.nim +++ b/tests/stdlib/tjsonutils.nim @@ -4,6 +4,7 @@ discard """ import std/jsonutils import std/json +import std/math proc testRoundtrip[T](t: T, expected: string) = let j = t.toJson @@ -238,7 +239,7 @@ template fn() = doAssert not foo.b doAssert foo.f == 0.0 doAssert foo.c == 1 - doAssert foo.c1 == 3.14159 + doAssert almostEqual(foo.c1, 3.14159, 1) block testExceptionOnWrongDiscirminatBranchInJson: var foo = Foo(b: false, f: 3.14159, c: 0, c0: 42) @@ -247,7 +248,7 @@ template fn() = fromJson(foo, json, Joptions(allowMissingKeys: true)) # Test that the original fields are not reset. doAssert not foo.b - doAssert foo.f == 3.14159 + doAssert almostEqual(foo.f, 3.14159, 1) doAssert foo.c == 0 doAssert foo.c0 == 42 @@ -258,7 +259,7 @@ template fn() = doAssert not foo.b doAssert foo.f == 2.71828 doAssert foo.c == 1 - doAssert foo.c1 == 3.14159 + doAssert almostEqual(foo.c1, 3.14159, 1) block testAllowExtraKeysInJsonOnWrongDisciriminatBranch: var foo = Foo(b: false, f: 3.14159, c: 0, c0: 42) @@ -267,7 +268,7 @@ template fn() = allowExtraKeys: true)) # Test that the original fields are not reset. doAssert not foo.b - doAssert foo.f == 3.14159 + doAssert almostEqual(foo.f, 3.14159, 1) doAssert foo.c == 0 doAssert foo.c0 == 42 From 61f39cdbafca617674db229bda1cbbd93b9e6123 Mon Sep 17 00:00:00 2001 From: Charles Blake Date: Fri, 20 Nov 2020 06:54:04 -0500 Subject: [PATCH 11/16] Fix integer overflow errors. 18 digits is still plenty (IEEE 64-bit float has only 15.95 decimal digits). --- lib/pure/parsejson.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/pure/parsejson.nim b/lib/pure/parsejson.nim index c3bf3ec38b548..c4be6c7a04a23 100644 --- a/lib/pure/parsejson.nim +++ b/lib/pure/parsejson.nim @@ -388,7 +388,7 @@ proc parseNumber(my: var JsonParser): TokKind {.inline.} = break # a second '.' is forbidden pnt = nD # save location of '.' (point) nD.dec # undo loop's nD.inc - elif nD < 20: # 2**63==9.2e18 => 19 digits ok + elif nD < 18: # 2**63==9.2e18 => 19 digits ok my.i = 10 * my.i + my.buf[i].i64 # core ASCII->binary transform else: # 20+ digits before decimal p10.inc # any digit moves implicit '.' From 822d91cee059b30b742519a7b1df9a86c1b18139 Mon Sep 17 00:00:00 2001 From: Charles Blake Date: Fri, 20 Nov 2020 06:56:40 -0500 Subject: [PATCH 12/16] Oops..fix comment, too. --- lib/pure/parsejson.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/pure/parsejson.nim b/lib/pure/parsejson.nim index c4be6c7a04a23..5ac211bf026d8 100644 --- a/lib/pure/parsejson.nim +++ b/lib/pure/parsejson.nim @@ -388,7 +388,7 @@ proc parseNumber(my: var JsonParser): TokKind {.inline.} = break # a second '.' is forbidden pnt = nD # save location of '.' (point) nD.dec # undo loop's nD.inc - elif nD < 18: # 2**63==9.2e18 => 19 digits ok + elif nD < 18: # 2**63==9.2e18 => 18 digits ok my.i = 10 * my.i + my.buf[i].i64 # core ASCII->binary transform else: # 20+ digits before decimal p10.inc # any digit moves implicit '.' From 88d984567d311ba0f8905fcd49520d164fb58e06 Mon Sep 17 00:00:00 2001 From: Charles Blake Date: Fri, 20 Nov 2020 08:41:26 -0500 Subject: [PATCH 13/16] This may pass all tests, but it is preliminary to see if, in fact, this is the only remaining problem. Before merging we should make overflow detection more precise than just a digit count for both `tkInt` and `tkFloat`. --- lib/pure/json.nim | 18 ++++-------------- lib/pure/parsejson.nim | 9 +++++++-- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/lib/pure/json.nim b/lib/pure/json.nim index 213349758f6b9..794d2a9349d19 100644 --- a/lib/pure/json.nim +++ b/lib/pure/json.nim @@ -809,22 +809,12 @@ proc parseJson(p: var JsonParser; rawIntegers, rawFloats: bool): JsonNode = p.a = "" discard getTok(p) of tkInt: - if rawIntegers: - result = newJRawNumber(p.a) - else: - try: - result = newJInt(p.getInt) - except ValueError: - result = newJRawNumber(p.a) + result = if rawIntegers or p.giant: newJRawNumber(p.a) + else: newJInt(p.getInt) discard getTok(p) of tkFloat: - if rawFloats: - result = newJRawNumber(p.a) - else: - try: - result = newJFloat(p.getFloat) - except ValueError: - result = newJRawNumber(p.a) + result = if rawFloats or p.giant: newJRawNumber(p.a) + else: newJFloat(p.getFloat) discard getTok(p) of tkTrue: result = newJBool(true) diff --git a/lib/pure/parsejson.nim b/lib/pure/parsejson.nim index 5ac211bf026d8..b5074b345a5c1 100644 --- a/lib/pure/parsejson.nim +++ b/lib/pure/parsejson.nim @@ -66,6 +66,7 @@ type a*: string ## last valid string i*: int64 ## last valid integer f*: float ## last valid float + giant*: bool ## true if tkInt or tkFloat overflow native bounds tok*: TokKind ## current token kind kind: JsonEventKind err: JsonError @@ -379,6 +380,7 @@ proc parseNumber(my: var JsonParser): TokKind {.inline.} = var p10 = 0 var pnt = -1 # find '.' (point); do digits var nD = 0 + my.giant = false my.i = 0'i64 # build my.i up from zero.. if my.buf[i] in Sign: i.inc # skip optional sign @@ -391,6 +393,7 @@ proc parseNumber(my: var JsonParser): TokKind {.inline.} = elif nD < 18: # 2**63==9.2e18 => 18 digits ok my.i = 10 * my.i + my.buf[i].i64 # core ASCII->binary transform else: # 20+ digits before decimal + my.giant = true #XXX condition should be more precise than "18 digits" p10.inc # any digit moves implicit '.' i.inc nD.inc @@ -412,11 +415,13 @@ proc parseNumber(my: var JsonParser): TokKind {.inline.} = exp = -exp # adjust sign elif noDot: # and my.i < (1'i64 shl 53'i64) ? # No '.' & No [Ee]xponent my.bufpos = i - if my.strIntegers: doCopy(my.a, my.buf, my.bufpos, i) + if my.strIntegers or my.giant: + doCopy(my.a, my.buf, my.bufpos, i) return tkInt # mark as integer exp += pnt - nD + p10 # combine explicit&implicit exp my.f = my.i.float * pow10(exp) # has round-off vs. 80-bit - if my.strFloats: doCopy(my.a, my.buf, my.bufpos, i) + if my.strFloats or my.giant: + doCopy(my.a, my.buf, my.bufpos, i) my.bufpos = i return tkFloat # mark as float From 7a076c2f9af60fb4735196630c0f2e572c55b5f7 Mon Sep 17 00:00:00 2001 From: Charles Blake Date: Mon, 23 Nov 2020 09:49:22 -0500 Subject: [PATCH 14/16] Remove token iterators as requested by https://github.com/nim-lang/Nim/pull/16055#discussion_r528753581 --- lib/pure/parsejson.nim | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/lib/pure/parsejson.nim b/lib/pure/parsejson.nim index b5074b345a5c1..291f0e28661f0 100644 --- a/lib/pure/parsejson.nim +++ b/lib/pure/parsejson.nim @@ -599,31 +599,3 @@ proc raiseParseErr*(p: JsonParser, msg: string) {.noinline, noreturn.} = proc eat*(p: var JsonParser, tok: TokKind) = if p.tok == tok: discard getTok(p) else: raiseParseErr(p, tokToStr[tok]) - -iterator jsonTokens*(input: Stream, path: string, rawStringLiterals = false, - strIntegers = true, strFloats = true): ptr JsonParser = - ## Yield each successive `ptr JsonParser` while parsing `input` with error - ## label `path`. - var p: JsonParser - p.open(input, path, rawStringLiterals = rawStringLiterals, - strIntegers = strIntegers, strFloats = strFloats) - try: - var tok = p.getTok - while tok notin {tkEof,tkError}: - yield p.addr # `JsonParser` is a pretty big struct - tok = p.getTok - finally: - p.close() - -iterator jsonTokens*(path: string, rawStringLiterals = false, - strIntegers = true, strFloats = true): ptr JsonParser = - ## Yield each successive `ptr JsonParser` while parsing file data at `path`. - ## Example usage: - ## - ## .. code-block:: nim - ## var sum = 0.0 # total all json floats named "myKey" in JSON file `path`. - ## for tok in jsonTokens(path, false, false, false): - ## if t.tok == tkFloat and t.a == "myKey": sum += t.f - for tok in jsonTokens(newFileStream(path), path, - rawStringLiterals, strIntegers, strFloats): - yield tok From 24192b55d6a881673bcef53846d1b89cede75b3f Mon Sep 17 00:00:00 2001 From: Charles Blake Date: Mon, 23 Nov 2020 12:40:57 -0500 Subject: [PATCH 15/16] Replace public fields with accessors updating comments a bit. --- lib/pure/json.nim | 4 ++-- lib/pure/parsejson.nim | 15 ++++++++++----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/lib/pure/json.nim b/lib/pure/json.nim index 794d2a9349d19..d07a04fc7c234 100644 --- a/lib/pure/json.nim +++ b/lib/pure/json.nim @@ -809,11 +809,11 @@ proc parseJson(p: var JsonParser; rawIntegers, rawFloats: bool): JsonNode = p.a = "" discard getTok(p) of tkInt: - result = if rawIntegers or p.giant: newJRawNumber(p.a) + result = if rawIntegers or p.isGiant: newJRawNumber(p.a) else: newJInt(p.getInt) discard getTok(p) of tkFloat: - result = if rawFloats or p.giant: newJRawNumber(p.a) + result = if rawFloats or p.isGiant: newJRawNumber(p.a) else: newJFloat(p.getFloat) discard getTok(p) of tkTrue: diff --git a/lib/pure/parsejson.nim b/lib/pure/parsejson.nim index 291f0e28661f0..0ae6715c00746 100644 --- a/lib/pure/parsejson.nim +++ b/lib/pure/parsejson.nim @@ -64,9 +64,9 @@ type JsonParser* = object of BaseLexer ## the parser object. a*: string ## last valid string - i*: int64 ## last valid integer - f*: float ## last valid float - giant*: bool ## true if tkInt or tkFloat overflow native bounds + i: int64 # last valid integer + f: float # last valid float + giant: bool # true if tkInt or tkFloat overflow native bounds tok*: TokKind ## current token kind kind: JsonEventKind err: JsonError @@ -135,13 +135,18 @@ proc str*(my: JsonParser): string {.inline.} = assert(my.kind in {jsonInt, jsonFloat, jsonString}) return my.a +proc isGiant*(my: JsonParser): bool {.inline.} = + ## returns whether the last ``tkInt|tkFloat`` token was not CPU native + assert(my.tok == tkInt) + return cast[BiggestInt](my.i) # A no-op unless BiggestInt changes + proc getInt*(my: JsonParser): BiggestInt {.inline.} = - ## returns the number for the event: ``jsonInt`` + ## returns the number for the last ``tkInt`` token as a ``BiggestInt`` assert(my.tok == tkInt) return cast[BiggestInt](my.i) # A no-op unless BiggestInt changes proc getFloat*(my: JsonParser): float {.inline.} = - ## returns the number for the event: ``jsonFloat`` + ## returns the number for the last ``tkFloat`` token as a ``float``. assert(my.tok == tkFloat) return my.f From e8c234f97efa8472e702aa2a1b2ad831cdd0eed9 Mon Sep 17 00:00:00 2001 From: Charles Blake Date: Mon, 23 Nov 2020 12:50:21 -0500 Subject: [PATCH 16/16] Oops - correct ``isGiant`` implementation to do what was intended. --- lib/pure/parsejson.nim | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/pure/parsejson.nim b/lib/pure/parsejson.nim index 0ae6715c00746..a008324dde47c 100644 --- a/lib/pure/parsejson.nim +++ b/lib/pure/parsejson.nim @@ -137,8 +137,8 @@ proc str*(my: JsonParser): string {.inline.} = proc isGiant*(my: JsonParser): bool {.inline.} = ## returns whether the last ``tkInt|tkFloat`` token was not CPU native - assert(my.tok == tkInt) - return cast[BiggestInt](my.i) # A no-op unless BiggestInt changes + assert(my.tok in {tkInt, tkFloat}) + return my.giant proc getInt*(my: JsonParser): BiggestInt {.inline.} = ## returns the number for the last ``tkInt`` token as a ``BiggestInt``