Added character equivalencies (closes #11)

haukex · Jul 27, 2024 · eae3963 · eae3963
1 parent 6752400
commit eae3963
Show file tree

Hide file tree

Showing 8 changed files with 221 additions and 28 deletions.
diff --git a/README.md b/README.md
@@ -34,9 +34,6 @@ Thanks
 - [@Tekl (Wolfgang Kreutz)](https://github.com/Tekl), author of the
   [macOS Beolingus German-English Dictionary Plugin](https://tekl.de/lexikon-plug-ins/beolingus-deutsch-englisch-lexikon-plugin),
   for [suggesting searching via URL query](https://github.com/haukex/de-en-dict/issues/7)
-- [@sindresorhus (Sindre Sorhus)](https://github.com/sindresorhus)
-  for [escape-string-regexp](https://github.com/sindresorhus/escape-string-regexp),
-  from which I've borrowed a line of code
 
 Author, Copyright, and License
 ------------------------------
@@ -59,3 +56,14 @@ GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this project; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+
+### Used Libraries
+
+This project makes use of the following libraries:
+
+- [{} Simple.css](https://simplecss.org/),
+  Copyright © 2020 Simple.css (Kev Quirk),
+  [MIT License](https://github.com/kevquirk/simple.css/blob/main/LICENSE)
+- [escape-string-regexp](https://www.npmjs.com/package/escape-string-regexp),
+  Copyright © Sindre Sorhus \<sindresorhus@gmail.com\> (https://sindresorhus.com),
+  [MIT License](https://github.com/sindresorhus/escape-string-regexp/blob/main/license)
diff --git a/dict-check.pl b/dict-check.pl
@@ -27,6 +27,7 @@
 my $LINE_GRAMMAR = qr{
     (?(DEFINE)
         (?<TOKEN>
+            # REMEMBER to keep all of the special characters here in sync with equiv.ts !
             (?<LETTER>
                 [ a-z A-Z
                 \N{LATIN SMALL LETTER A WITH DIAERESIS}

diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -22,6 +22,7 @@
     "typescript-eslint": "^7.16.1"
   },
   "dependencies": {
-    "@parcel/service-worker": "^2.12.0"
+    "@parcel/service-worker": "^2.12.0",
+    "escape-string-regexp": "^5.0.0"
   }
 }
diff --git a/src/index.html b/src/index.html
@@ -135,9 +135,16 @@ <h3>Zusammenarbeit</h3>
       <a href="https://www.gnu.org/licenses/old-licenses/gpl-2.0-standalone.html" target="_blank">GNU
       General Public License Version 2</a> or later.
       The source code is <a href="https://github.com/haukex/de-en-dict" target="_blank">on GitHub</a>.
-      This page uses <a href="https://simplecss.org/" target="_blank" class="text-nowrap">{} Simple.css</a>,
-      Copyright &copy; 2020 Simple.css (Kev Quirk), terms:
-      <a href="https://github.com/kevquirk/simple.css/blob/main/LICENSE" target="_blank">MIT License</a>.
+      This page/app makes use of the following libraries:
+      <ul>
+        <li><a href="https://simplecss.org/" target="_blank" class="text-nowrap">{} Simple.css</a>,
+          Copyright &copy; 2020 Simple.css (Kev Quirk),
+          <a href="https://github.com/kevquirk/simple.css/blob/main/LICENSE" target="_blank">MIT License</a>.</li>
+        <li><a href="https://www.npmjs.com/package/escape-string-regexp" target="_blank">escape-string-regexp</a>,
+          Copyright &copy; Sindre Sorhus &lt;sindresorhus@gmail.com&gt;
+          (<a href="https://sindresorhus.com" target="_blank">https://sindresorhus.com</a>),
+          <a href="https://github.com/sindresorhus/escape-string-regexp/blob/main/license">MIT License</a>.</li>
+      </ul>
     </details>
 
     <details>

diff --git a/src/js/equiv.ts b/src/js/equiv.ts
@@ -0,0 +1,153 @@
+/**
+ * German-English Dictionary
+ * =========================
+ *
+ * Copyright © 2024 Hauke Dämpfling, haukex@zero-g.net
+ *
+ * Source code: https://github.com/haukex/de-en-dict
+ *
+ * This project is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This project is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this project; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+import escapeStringRegexp from 'escape-string-regexp'
+
+/**
+ * The following is the list of character equivalencies.
+ * If the user searches for any of the strings in the left list, then the search should
+ * result in matches for what the user entered, plus the alternatives in the right list.
+ * If the right list is empty, it is taken to be identical to the left list.
+ * The idea is that users can enter search terms in ASCII and still find Unicode chars.
+ * REMEMBER to keep this list in sync with the special characters in the grammar in dict-check.pl !
+ */
+const EQUIV :[string[], string[]][] = [  // each entry: [ left list (what the user may type), right list (what it should also find) ]
+  // ä
+  // searches for "a" should match "a" or "ä", and "A" matches "A" or "Ä"
+  [ ['a'],                    ['ä']      ],
+  [ ['A'],                    ['Ä']      ],
+  // searches for "ae" or "ä" should both match either "ae" or "ä" (same for uppercase)
+  [ ['ae', 'ä'],              []         ],
+  [ ['AE', 'Ae', 'Ä'],        []         ],
+  // ö (same scheme as for ä)
+  [ ['o'],                    ['ö']      ],
+  [ ['O'],                    ['Ö']      ],
+  [ ['oe', 'ö'],              []         ],
+  [ ['OE', 'Oe', 'Ö'],        []         ],
+  // ü (same scheme as for ä)
+  [ ['u'],                    ['ü']      ],
+  [ ['U'],                    ['Ü']      ],
+  [ ['ue', 'ü'],              []         ],
+  [ ['UE', 'Ue', 'Ü'],        []         ],
+  // ß: searches for "ss" or "sz" should also match "ß", while "ß" should also match "ss"
+  [ ['ss', 'sz'],             ['ß']      ],
+  [ ['ß'],                    ['ss']     ],
+  // other accented letters: the plain letter also matches the accented one (a left string may appear in several entries)
+  [ ['e'],                    ['ë']      ],
+  [ ['i'],                    ['ï']      ],
+  [ ['A'],                    ['Á']      ],
+  [ ['E'],                    ['É']      ],
+  [ ['I'],                    ['Î']      ],
+  [ ['a'],                    ['á']      ],
+  [ ['e'],                    ['é']      ],
+  [ ['i'],                    ['í']      ],
+  [ ['o'],                    ['ó']      ],
+  [ ['a'],                    ['à']      ],
+  [ ['e'],                    ['è']      ],
+  [ ['i'],                    ['ì']      ],
+  [ ['o'],                    ['ò']      ],
+  [ ['a'],                    ['â']      ],
+  [ ['e'],                    ['ê']      ],
+  [ ['i'],                    ['î']      ],
+  [ ['o'],                    ['ô']      ],
+  [ ['u'],                    ['û']      ],
+  [ ['a'],                    ['ã']      ],
+  [ ['n'],                    ['ñ']      ],
+  [ ['i'],                    ['ī']      ],
+  [ ['c'],                    ['ç']      ],
+  [ ['S'],                    ['Š']      ],
+  [ ['a'],                    ['å']      ],
+  [ ['ae'],                   ['æ']      ],
+  // Greek letters, found via their spelled-out names
+  [ ['alpha'],                ['α']      ],
+  [ ['lambda', 'lamda'],      ['λ']      ],
+  [ ['omega', 'ohm'],         ['Ω']      ],
+  // special chars: the right list is empty, so each variant matches all variants in its left list
+  [ ['\'', '’'],              []         ],
+  [ ['-', '–'],               []         ],
+  [ ['...', '…'],             []         ],
+  [ ['"', '“', '”', '„'],     []         ],
+  // other special sequences (ASCII spellings that should also match the symbol)
+  [ ['^2'],                   ['²']      ],
+  [ ['^3'],                   ['³']      ],
+  [ ['m2'],                   ['m²']     ],
+  [ ['m3'],                   ['m³']     ],
+  [ ['1/2'],                  ['½']      ],
+  [ ['*', 'x'],               ['×']      ],
+  [ ['(R)'],                  ['®']      ],
+  [ ['(c)', '(C)'],           ['©']      ],
+]
+
+function assert(condition: unknown, msg?: string): asserts condition {  // TypeScript assertion function: narrows `condition` for the code after the call
+  if (!condition) throw new Error(msg)  // any falsy value fails; msg is optional
+}
+
+interface IStringSetHash {  // plain object used as a dictionary: string key => set of strings
+  [details: string] : Set<string>;
+}
+interface IStringHash {  // plain object used as a dictionary: string key => string
+  [details: string]: string;
+}
+
+// this code builds the set of patterns to match and each of their replacements (top-level code, runs once when the module is loaded)
+const _pat_dict :IStringSetHash = {}  // pattern => set of all strings that pattern should match
+for (const [l,r] of EQUIV) {
+  assert(l && r)
+  for (const k of l) {
+    for (const v of (r.length ? [k].concat(r) : l) ) {  // non-empty right list: k itself plus the right list; empty: the whole left list
+      if (!(k in _pat_dict))
+        _pat_dict[k] = new Set()
+      assert(_pat_dict[k] instanceof Set)
+      _pat_dict[k].add(v)  // a Set, so several entries with the same left string are merged without duplicates
+    }
+  }
+}
+const _pats = Object.keys(_pat_dict).sort().sort((a,b) => b.length-a.length)  // longest first, so that e.g. "ae" is tried before "a" in the alternation
+const EQUIV_PAT = new RegExp( '(' + _pats.map(escapeStringRegexp).join('|') + ')', 'g')  // one capturing group, so split() also returns the matched patterns
+const EQUIV_REPL :IStringHash = {}  // pattern => regex source string matching any of its equivalents
+for (const pat of _pats) {
+  const s = _pat_dict[pat]
+  assert(s)
+  const repl = Array.from(s.values()).sort().sort((a,b) => b.length-a.length)  // same ordering as above: longest alternative first
+  EQUIV_REPL[pat] = '(?:' + repl.map(escapeStringRegexp).join('|') + ')'
+}
+//console.debug(EQUIV_PAT, EQUIV_REPL)
+
+/**
+ * Turns a search word into two regular expression source strings (runs of whitespace are first collapsed to one space).
+ * The first ("stricter") is the search word with regex metacharacters escaped, so that it only matches literally;
+ * it is intended for giving exact matches a higher score. The second ("looser") additionally replaces every
+ * sequence listed in EQUIV with an alternation of all of its equivalents, which will result in more matches.
+ */
+export function makeSearchPattern(what :string) : [string, string] {
+  what = what.replaceAll(/\s+/g, ' ')
+  const stricter = escapeStringRegexp(what)
+  let withEquiv = ''
+  for (const part of what.split(EQUIV_PAT) ) {  // EQUIV_PAT has a capturing group, so the matched sequences are returned too
+    if (Object.prototype.hasOwnProperty.call(EQUIV_REPL, part))  // own keys only: `in` would also accept inherited names like "constructor"
+      withEquiv += EQUIV_REPL[part]  // special chars already escaped
+    else
+      withEquiv += escapeStringRegexp(part)
+  }
+  return [stricter, withEquiv]
+}
diff --git a/src/js/flags.ts b/src/js/flags.ts
@@ -1,3 +1,25 @@
+/**
+ * German-English Dictionary
+ * =========================
+ *
+ * Copyright © 2024 Hauke Dämpfling, haukex@zero-g.net
+ *
+ * Source code: https://github.com/haukex/de-en-dict
+ *
+ * This project is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This project is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this project; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
 
 interface IFlagList { de: string[], en: string[] }
 

diff --git a/src/js/main.ts b/src/js/main.ts
@@ -23,6 +23,7 @@
 
 import {DB_URL, DB_VER_URL, DB_CACHE_NAME, cacheFirst} from './common'
 import {init_flags} from './flags'
+import {makeSearchPattern} from './equiv'
 
 // for the parcel development environment:
 if (module.hot) module.hot.accept()
@@ -124,8 +125,8 @@ async function loadDict() :Promise<string[]> {
       throw new Error('Failed to load dict')
     // unpack the dictionary file
     return (await gunzipUTF8(dictResp.body))
-      // these two replaces fix some oversights that I guess happened on conversion from CP1252 to UTF-8 (?)
-      .replaceAll(String.fromCodePoint(0x92),'\u2019').replaceAll(String.fromCodePoint(0x96),'\u2013')
+      // this fixes an oversight that I guess happened on conversion from CP1252 to UTF-8 (?)
+      .replaceAll(String.fromCodePoint(0x96),'\u2013')
       // split the text into lines, trim the lines, remove blank lines and comments
       .split(/\r?\n|\r(?!\n)/g).map((line) => line.trim()).filter((line) => line.length && !line.startsWith('#'))
   } catch (error) {
@@ -196,19 +197,7 @@ window.addEventListener('DOMContentLoaded', async () => {
     document.title = what ? `${TITLE_PREFIX}: ${what}` : TITLE_PREFIX
 
     // turn the search term into a regex
-    const whatPatStricter = what.replaceAll(/\s+/g, ' ')
-      // escape characters that are special to a regex
-      // https://github.com/sindresorhus/escape-string-regexp/blob/6ced614e/index.js#L8
-      .replaceAll(/[|\\{}()[\]^$+*?.]/g, '\\$&').replaceAll(/-/g, '\\x2d')
-      // searching for different kinds of quotation marks finds all of them
-      .replaceAll(/[\u0027\u2019]/g, '[\\u0027\\u2019]')
-      .replaceAll(/[\u0022\u201C\u201D\u201E]/g, '[\\u0022\\u201C\\u201D\\u201E]')
-    // we differentiate between a stricter and looser match for scoring
-    const whatPat = whatPatStricter
-      // searching for umlaut replacements searches for the umlauts as well (e.g. for users of non-German keyboard layouts)
-      // (note the search will be case-insensitive anyway, so we don't need to care about case here)
-      .replaceAll(/ae/ig, '(?:ae|ä)').replaceAll(/oe/ig, '(?:oe|ö)').replaceAll(/ue/ig, '(?:ue|ü)')
-      .replaceAll(/s[sz]/ig, '(?:$&|ß)')
+    const [whatPatStricter, whatPat] = makeSearchPattern(what)
     // generate a regex that matches the search term
     const whatRe = new RegExp(whatPat, 'ig')