pluots · tgross35 · Dec 30, 2022 · Dec 22, 2022 · Dec 23, 2022 · Dec 23, 2022
diff --git a/.github/workflows/validation-rust.yaml b/.github/workflows/validation-rust.yaml
@@ -197,3 +197,11 @@ jobs:
           curl -sSL https://github.com/rust-lang/mdBook/releases/download/v0.4.21/mdbook-v0.4.21-x86_64-unknown-linux-gnu.tar.gz \
           | tar -xz --directory=$(echo ~)/mdbook
       - run: ~/mdbook/mdbook test
+
+  # Make sure we turned the clippy lint off
+  # Fails the workflow when any "FIXME:CRIT" marker is still present in the
+  # checked-out tree (build output, git metadata and this file are excluded).
+  verify_fixme_critical:
+    name: "Verify critical fixmes"
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      # `return` is only valid inside a shell function or sourced script, so
+      # use `exit` instead. grep exits 0 when it finds a match, which is the
+      # failure case here.
+      - run: |
+          if grep -r "FIXME:CRIT" --exclude-dir="target" --exclude-dir=".git" --exclude="validation-rust.yaml"; then
+            echo "Unresolved FIXME:CRIT markers found"
+            exit 1
+          fi
diff --git a/Cargo.toml b/Cargo.toml
@@ -2,8 +2,8 @@
 
 members = [
     "crates/zspell",
-    "crates/zspell-py",
-    "crates/zspell-cli"
+    # "crates/zspell-py",
+    "crates/zspell-cli",
 ]
 
 # Build with `cargo build --profile=release-debug`

diff --git a/Pipfile b/Pipfile
@@ -8,6 +8,7 @@ maturin = "*"
 sphinx = "*"
 
 [dev-packages]
+black = "*"
 
 [requires]
 python_version = "3.10"
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # ZSpell
 
-This project is a spellchecker written completely in rust, that maintains
+This project is a spellchecker written completely in Rust that maintains
 compatibility with the venerable Hunspell dictionary format. It is entirely
 native and does not rely on any other backends (Enchant, Hunspell, Aspell,
 etc.). This library also has the goal of being usable via WASM.
@@ -54,10 +54,10 @@ added at some point.
 | Feature                        | Available via Library | Available via CLI | Tracking Issue |
 |--------------------------------|-----------------------|-------------------|----------------|
 | Basic spellcheck functionality | ✓                     | ✓                 |                |
-| Forbidden word handling        | ✕                     | ✕                 | [#17](https://github.com/pluots/zspell/issues/17) |
-| Suggestions                    | ✕                     | ✕                 | [#16](https://github.com/pluots/zspell/issues/16) |
+| Forbidden word handling        | WIP                   | ✕                 | [#17](https://github.com/pluots/zspell/issues/17) |
+| Suggestions                    | WIP                   | ✕                 | [#16](https://github.com/pluots/zspell/issues/16) |
 | Compound word handling         | ✕                     | ✕                 |                |
-| Full Morph/Phone Handling      | ✕                     | ✕                 |                |
+| Full Morph/Phone Handling      | WIP                   | ✕                 |                |
 | Python Interface               | ✕                     | ✕                 | [#18](https://github.com/pluots/zspell/issues/18) |
 | Prebuilt WASM bindings         | ✕                     | ✕                 | [#19](https://github.com/pluots/zspell/issues/19) |
 
@@ -66,7 +66,7 @@ added at some point.
 This repository has the goal of highly prioritizing the most expected usage,
 i.e., that most words to be checked are correct. With optimizations based around
 this concept and with the modern computers now able to store entire compiled
-word lists in memory (~2 MiB), `zspell` tends to outperform other spellcheckers.
+word lists in memory (~20 MiB), `zspell` tends to outperform other spellcheckers.
 
 ## License
 

diff --git a/benches/findings.md b/benches/findings.md
@@ -1,5 +1,7 @@
 # Benchmark Findings
 
+Simple notes from benchmarks that have been run.
+
 ## Collection types
 
 Four collections were compared; `Vec` (as a baseline),
@@ -22,3 +24,9 @@ problem for our applications).
 For some reason, the improvements going from `std` to `hashbrown` don't really
 seem to show up for the dictionary integration tests. This will take some
 looking into.
+
+## Slice `contains` vs. `binary_search`
+
+Overall, the price of sorting doesn't seem to have any payoff, especially for
+our use cases of short arrays. If the slice is already sorted, then
+`binary_search` saves time, about 20% on average.
diff --git a/benches/logbench.py b/benches/logbench.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+"""Run `cargo bench`, print the output with CPU information to a timestamped
+file.
+"""
+
+
+import platform
+import subprocess as sp
+import sys
+from datetime import datetime
+from inspect import cleandoc
+from pathlib import Path
+
+
+def decode_sp_out(b: bytes) -> str:
+    return b.decode(errors="ignore").strip()
+
+
+def get_dtime() -> str:
+    return datetime.utcnow().strftime(r"%Y-%m-%d_%H%M")
+
+
+def git_describe() -> str:
+    args = ["git", "describe", "--always", "--tags"]
+    return decode_sp_out(sp.check_output(args))
+
+
+def get_fpath(dtime: str, describe: str) -> tuple[str, Path]:
+    fname = f"{describe}_{dtime}.bench"
+    fpath = Path(__file__).resolve().parents[0] / "results" / fname
+    return (fname, fpath)
+
+
+def rustc_version() -> str:
+    return decode_sp_out(sp.check_output(["rustc", "--version"]))
+
+
+def get_cpu_info() -> str:
+    s = ""
+    if platform.system() == "Darwin":
+        cmd = ["sysctl", "-n", "machdep.cpu.brand_string"]
+        s += decode_sp_out(sp.check_output(cmd))
+    else:
+        tmp = sp.check_output("lscpu")
+        for line in tmp.splitlines():
+            if (
+                "Architecture" in line
+                or "Model name" in line
+                or "Socket" in line
+                or "Thread" in line
+                or "CPU(s)" in line
+                or "MHz" in line
+            ):
+                s += line
+    return s
+
+
+def main():
+    dtime = get_dtime()
+    describe = git_describe()
+    fname, fpath = get_fpath(dtime, describe)
+    version = rustc_version()
+    cpu_info = get_cpu_info()
+    cmd = ["cargo", "bench"]
+    cmd += sys.argv[1:]
+
+    header_str = (
+        cleandoc(
+            f"""
+        {fname}
+
+        Benchmark from {dtime} on commit {describe}
+        {version}
+
+        CPU Information:
+        {cpu_info}
+
+        Running: '{" ".join(cmd)}'
+        """
+        )
+        + "\n\n\n"
+    )
+
+    print(header_str)
+    output = header_str
+
+    with sp.Popen(cmd, stdout=sp.PIPE, bufsize=1, universal_newlines=True) as p:
+        for line in p.stdout:
+            print(line, end="")  # process line here
+            output += line
+
+    if p.returncode != 0:
+        print("\nCommand did not complete successfully")
+        exit(p.returncode)
+
+    print("\nWriting file...", end="")
+
+    with open(fpath, "w") as f:
+        f.write(output)
+
+    print(" Done!")
+
+
+# Run the benchmark logger only when executed as a script, not on import.
+if __name__ == "__main__":
+    main()
diff --git a/benches/results/v0.2.2-27-g87c32f5_2022-12-25_2135.bench b/benches/results/v0.2.2-27-g87c32f5_2022-12-25_2135.bench
@@ -0,0 +1,174 @@
+v0.2.2-27-g87c32f5_2022-12-25_2135.bench
+
+Benchmark from 2022-12-25_2135 on commit v0.2.2-27-g87c32f5
+rustc 1.68.0-nightly (b569c9dc5 2022-12-21)
+
+CPU Information:
+Intel(R) Core(TM) i5-5257U CPU @ 2.70GHz
+
+Running: 'cargo bench --bench datastructure'
+
+
+Vec contains true       time:   [569.22 µs 583.36 µs 600.88 µs]
+                        change: [+13.517% +17.843% +22.890%] (p = 0.00 < 0.05)
+                        Performance has regressed.
+Found 2 outliers among 100 measurements (2.00%)
+  2 (2.00%) high severe
+
+Vec contains false      time:   [1.7272 ms 1.7367 ms 1.7486 ms]
+                        change: [-2.3433% -0.7517% +0.5590%] (p = 0.35 > 0.05)
+                        No change in performance detected.
+Found 6 outliers among 100 measurements (6.00%)
+  2 (2.00%) high mild
+  4 (4.00%) high severe
+
+BTree contains true     time:   [2.0813 µs 2.0988 µs 2.1284 µs]
+                        change: [-33.216% -18.618% -5.3123%] (p = 0.02 < 0.05)
+                        Performance has improved.
+Found 5 outliers among 100 measurements (5.00%)
+  5 (5.00%) high severe
+
+BTree contains false    time:   [2.4196 µs 2.4301 µs 2.4494 µs]
+                        change: [-2.6558% -2.0096% -1.1657%] (p = 0.00 < 0.05)
+                        Performance has improved.
+Found 12 outliers among 100 measurements (12.00%)
+  2 (2.00%) high mild
+  10 (10.00%) high severe
+
+HashSet contains true   time:   [478.58 ns 479.76 ns 481.22 ns]
+                        change: [-1.7564% -1.4741% -1.2080%] (p = 0.00 < 0.05)
+                        Performance has improved.
+Found 12 outliers among 100 measurements (12.00%)
+  3 (3.00%) high mild
+  9 (9.00%) high severe
+
+HashSet contains false  time:   [393.25 ns 434.25 ns 490.96 ns]
+                        change: [-0.3539% +3.7872% +10.037%] (p = 0.19 > 0.05)
+                        No change in performance detected.
+Found 18 outliers among 100 measurements (18.00%)
+  2 (2.00%) high mild
+  16 (16.00%) high severe
+
+HashBrownSet contains true
+                        time:   [329.93 ns 358.98 ns 394.22 ns]
+                        change: [+11.832% +15.944% +20.455%] (p = 0.00 < 0.05)
+                        Performance has regressed.
+Found 4 outliers among 100 measurements (4.00%)
+  1 (1.00%) high mild
+  3 (3.00%) high severe
+
+HashBrownSet contains false
+                        time:   [209.08 ns 233.62 ns 266.77 ns]
+                        change: [+59.011% +105.51% +168.79%] (p = 0.00 < 0.05)
+                        Performance has regressed.
+Found 16 outliers among 100 measurements (16.00%)
+  8 (8.00%) high mild
+  8 (8.00%) high severe
+
+VecMap contains true    time:   [1.3709 ms 1.5246 ms 1.6924 ms]
+                        change: [+63.260% +80.742% +100.35%] (p = 0.00 < 0.05)
+                        Performance has regressed.
+Found 7 outliers among 100 measurements (7.00%)
+  6 (6.00%) high mild
+  1 (1.00%) high severe
+
+VecMap contains false   time:   [3.7792 ms 4.1292 ms 4.5332 ms]
+                        change: [+46.003% +60.872% +75.969%] (p = 0.00 < 0.05)
+                        Performance has regressed.
+Found 4 outliers among 100 measurements (4.00%)
+  2 (2.00%) high mild
+  2 (2.00%) high severe
+
+VecMap get true         time:   [834.35 µs 884.64 µs 937.83 µs]
+                        change: [+15.380% +22.165% +29.051%] (p = 0.00 < 0.05)
+                        Performance has regressed.
+Found 7 outliers among 100 measurements (7.00%)
+  5 (5.00%) high mild
+  2 (2.00%) high severe
+
+VecMap get false        time:   [4.9732 ms 5.5763 ms 6.2481 ms]
+                        change: [+91.748% +117.22% +146.27%] (p = 0.00 < 0.05)
+                        Performance has regressed.
+Found 4 outliers among 100 measurements (4.00%)
+  3 (3.00%) high mild
+  1 (1.00%) high severe
+
+BTreeMap contains true  time:   [2.1304 µs 2.1587 µs 2.1911 µs]
+                        change: [+3.8901% +6.2983% +9.0957%] (p = 0.00 < 0.05)
+                        Performance has regressed.
+Found 7 outliers among 100 measurements (7.00%)
+  5 (5.00%) high mild
+  2 (2.00%) high severe
+
+BTreeMap contains false time:   [3.3337 µs 4.1256 µs 4.9965 µs]
+                        change: [+18.633% +34.493% +54.500%] (p = 0.00 < 0.05)
+                        Performance has regressed.
+Found 17 outliers among 100 measurements (17.00%)
+  2 (2.00%) high mild
+  15 (15.00%) high severe
+
+BTreeMap get true       time:   [2.2010 µs 2.2381 µs 2.2866 µs]
+                        change: [-13.097% -7.2918% -1.8966%] (p = 0.01 < 0.05)
+                        Performance has improved.
+Found 4 outliers among 100 measurements (4.00%)
+  3 (3.00%) high mild
+  1 (1.00%) high severe
+
+BTreeMap get false      time:   [2.6190 µs 2.6996 µs 2.7880 µs]
+                        change: [-31.970% -14.791% +1.5950%] (p = 0.19 > 0.05)
+                        No change in performance detected.
+Found 5 outliers among 100 measurements (5.00%)
+  5 (5.00%) high mild
+
+HashMap contains true   time:   [521.98 ns 532.74 ns 544.49 ns]
+                        change: [-4.4057% +1.0033% +6.5691%] (p = 0.73 > 0.05)
+                        No change in performance detected.
+Found 6 outliers among 100 measurements (6.00%)
+  3 (3.00%) high mild
+  3 (3.00%) high severe
+
+HashMap contains false  time:   [439.96 ns 506.63 ns 592.46 ns]
+                        change: [-14.677% -3.6708% +7.9234%] (p = 0.56 > 0.05)
+                        No change in performance detected.
+Found 13 outliers among 100 measurements (13.00%)
+  5 (5.00%) high mild
+  8 (8.00%) high severe
+
+HashMap get true        time:   [515.70 ns 522.12 ns 529.77 ns]
+                        change: [+8.3012% +13.486% +19.008%] (p = 0.00 < 0.05)
+                        Performance has regressed.
+Found 14 outliers among 100 measurements (14.00%)
+  6 (6.00%) high mild
+  8 (8.00%) high severe
+
+HashMap get false       time:   [418.13 ns 441.50 ns 478.80 ns]
+                        change: [-33.490% -21.735% -9.8900%] (p = 0.00 < 0.05)
+                        Performance has improved.
+Found 12 outliers among 100 measurements (12.00%)
+  7 (7.00%) high mild
+  5 (5.00%) high severe
+
+HashBrownMap contains true
+                        time:   [287.48 ns 288.11 ns 288.81 ns]
+                        change: [-27.185% -21.547% -15.816%] (p = 0.00 < 0.05)
+                        Performance has improved.
+Found 17 outliers among 100 measurements (17.00%)
+  4 (4.00%) high mild
+  13 (13.00%) high severe
+
+HashBrownMap contains false
+                        time:   [190.02 ns 233.14 ns 291.77 ns]
+                        change: [+15.667% +28.449% +44.790%] (p = 0.00 < 0.05)
+                        Performance has regressed.
+Found 8 outliers among 100 measurements (8.00%)
+  3 (3.00%) high mild
+  5 (5.00%) high severe
+
+HashBrownMap get true   time:   [302.78 ns 336.35 ns 388.80 ns]
+Found 8 outliers among 100 measurements (8.00%)
+  5 (5.00%) high mild
+  3 (3.00%) high severe
+
+HashBrownMap get false  time:   [172.41 ns 175.16 ns 178.23 ns]
+Found 1 outliers among 100 measurements (1.00%)
+  1 (1.00%) high mild
diff --git a/clippy.toml b/clippy.toml
@@ -0,0 +1,3 @@
+# for performance, we always want to use HashBrown
+disallowed-types = ["std::collections::HashMap", "std::collections::HashSet"]
+doc-valid-idents = ["ZSpell"]
diff --git a/crates/zspell-cli/Cargo.toml b/crates/zspell-cli/Cargo.toml
@@ -14,9 +14,8 @@ name = "zspell"
 path = "src/main.rs"
 
 [dependencies]
-anyhow = "1.0.66"
 cfg-if = "1.0.0"
-clap = { version = "4.0.18", features = ["derive", "wrap_help"] }
+clap = { version = "4.0.32", features = ["derive", "wrap_help"] }
 futures-util = "0.3.25"
 hex = "0.4.3"
 indicatif = "0.17.1"
@@ -28,7 +27,8 @@ sha1 = "0.10.5"
 stringmetrics = "2.1"
 termcolor = "1.1.3"
 tokio = "1.21.2"
-zspell = { path = "../zspell" }
+zspell = { path = "../zspell", features = ["zspell-unstable"] }
+anyhow = "1.0.68"
 
 [dev-dependencies]
 criterion = "0.4"