From eae39638e3f4c108f217622e2c9d37df585bbc4b Mon Sep 17 00:00:00 2001 From: Hauke D Date: Sat, 27 Jul 2024 13:56:07 +0000 Subject: [PATCH] Added character equivalencies (closes #11) --- README.md | 14 ++++- dict-check.pl | 1 + package-lock.json | 24 ++++++-- package.json | 3 +- src/index.html | 13 +++- src/js/equiv.ts | 153 ++++++++++++++++++++++++++++++++++++++++++++++ src/js/flags.ts | 22 +++++++ src/js/main.ts | 19 ++---- 8 files changed, 221 insertions(+), 28 deletions(-) create mode 100644 src/js/equiv.ts diff --git a/README.md b/README.md index 0f97894..a06f4fc 100644 --- a/README.md +++ b/README.md @@ -34,9 +34,6 @@ Thanks - [@Tekl (Wolfgang Kreutz)](https://github.com/Tekl), author of the [macOS Beolingus German-English Dictionary Plugin](https://tekl.de/lexikon-plug-ins/beolingus-deutsch-englisch-lexikon-plugin), for [suggesting searching via URL query](https://github.com/haukex/de-en-dict/issues/7) -- [@sindresorhus (Sindre Sorhus)](https://github.com/sindresorhus) - for [escape-string-regexp](https://github.com/sindresorhus/escape-string-regexp), - from which I've borrowed a line of code Author, Copyright, and License ------------------------------ @@ -59,3 +56,14 @@ GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this project; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ +### Used Libraries + +This project makes use of the following libraries: + +- [{} Simple.css](https://simplecss.org/), + Copyright © 2020 Simple.css (Kev Quirk), + [MIT License](https://github.com/kevquirk/simple.css/blob/main/LICENSE) +- [escape-string-regexp](https://www.npmjs.com/package/escape-string-regexp), + Copyright © Sindre Sorhus \ (https://sindresorhus.com), + [MIT License](https://github.com/sindresorhus/escape-string-regexp/blob/main/license) diff --git a/dict-check.pl b/dict-check.pl index 15b0511..73f1ec2 100755 --- a/dict-check.pl +++ b/dict-check.pl @@ -27,6 +27,7 @@ my $LINE_GRAMMAR = qr{ (?(DEFINE) (? + # REMEMBER to keep all of the special characters here in sync with equiv.ts ! (? [ a-z A-Z \N{LATIN SMALL LETTER A WITH DIAERESIS} diff --git a/package-lock.json b/package-lock.json index 80a6aab..f3d664c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -5,7 +5,8 @@ "packages": { "": { "dependencies": { - "@parcel/service-worker": "^2.12.0" + "@parcel/service-worker": "^2.12.0", + "escape-string-regexp": "^5.0.0" }, "devDependencies": { "@eslint/js": "^9.7.0", @@ -4078,12 +4079,11 @@ } }, "node_modules/escape-string-regexp": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz", - "integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==", - "dev": true, + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-5.0.0.tgz", + "integrity": "sha512-/veY75JbMK4j1yjvuUxuVsiS/hr/4iHs9FTT6cgTexxdE0Ly/glccBAkloH/DofkjRbZU3bnoj38mOmhkZ0lHw==", "engines": { - "node": ">=10" + "node": ">=12" }, "funding": { "url": "https://github.com/sponsors/sindresorhus" @@ -4181,6 +4181,18 @@ "node": "^12.22.0 || ^14.17.0 || >=16.0.0" } }, + "node_modules/eslint/node_modules/escape-string-regexp": { + "version": "4.0.0", + "resolved": 
"https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz", + "integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/espree": { "version": "9.6.1", "resolved": "https://registry.npmjs.org/espree/-/espree-9.6.1.tgz", diff --git a/package.json b/package.json index d15edbf..990724f 100644 --- a/package.json +++ b/package.json @@ -22,6 +22,7 @@ "typescript-eslint": "^7.16.1" }, "dependencies": { - "@parcel/service-worker": "^2.12.0" + "@parcel/service-worker": "^2.12.0", + "escape-string-regexp": "^5.0.0" } } diff --git a/src/index.html b/src/index.html index bc9d374..5e9e513 100644 --- a/src/index.html +++ b/src/index.html @@ -135,9 +135,16 @@

Zusammenarbeit

GNU General Public License Version 2 or later. The source code is on GitHub. - This page uses {} Simple.css, - Copyright © 2020 Simple.css (Kev Quirk), terms: - MIT License. + This page/app makes use of the following libraries: +
diff --git a/src/js/equiv.ts b/src/js/equiv.ts new file mode 100644 index 0000000..c52878b --- /dev/null +++ b/src/js/equiv.ts @@ -0,0 +1,153 @@ +/** + * German-English Dictionary + * ========================= + * + * Copyright © 2024 Hauke Dämpfling, haukex@zero-g.net + * + * Source code: https://github.com/haukex/de-en-dict + * + * This project is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This project is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this project; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +import escapeStringRegexp from 'escape-string-regexp' + +/** + * The following is the list of character equivalencies. + * If the user searches for any of the strings in the left list, then the search should + * result in matches for what the user entered, plus the alternatives in the right list. + * If the right list is empty, it is taken to be identical to the left list. + * The idea is that users can enter search terms in ASCII and still find Unicode chars. + * REMEMBER to keep this list in sync with the special characters in the grammar in dict-check.pl ! 
+ */ +const EQUIV :[string[], string[]][] = [ + // ä + // searches for "a" should match "a" or "ä", and "A" matches "A" or "Ä" + [ ['a'], ['ä'] ], + [ ['A'], ['Ä'] ], + // searches for "ae" or "ä" should both match either "ae" or "ä" (same for uppercase) + [ ['ae', 'ä'], [] ], + [ ['AE', 'Ae', 'Ä'], [] ], + // ö + [ ['o'], ['ö'] ], + [ ['O'], ['Ö'] ], + [ ['oe', 'ö'], [] ], + [ ['OE', 'Oe', 'Ö'], [] ], + // ü + [ ['u'], ['ü'] ], + [ ['U'], ['Ü'] ], + [ ['ue', 'ü'], [] ], + [ ['UE', 'Ue', 'Ü'], [] ], + // ß + [ ['ss', 'sz'], ['ß'] ], + [ ['ß'], ['ss'] ], + // others + [ ['e'], ['ë'] ], + [ ['i'], ['ï'] ], + [ ['A'], ['Á'] ], + [ ['E'], ['É'] ], + [ ['I'], ['Î'] ], + [ ['a'], ['á'] ], + [ ['e'], ['é'] ], + [ ['i'], ['í'] ], + [ ['o'], ['ó'] ], + [ ['a'], ['à'] ], + [ ['e'], ['è'] ], + [ ['i'], ['ì'] ], + [ ['o'], ['ò'] ], + [ ['a'], ['â'] ], + [ ['e'], ['ê'] ], + [ ['i'], ['î'] ], + [ ['o'], ['ô'] ], + [ ['u'], ['û'] ], + [ ['a'], ['ã'] ], + [ ['n'], ['ñ'] ], + [ ['i'], ['ī'] ], + [ ['c'], ['ç'] ], + [ ['S'], ['Š'] ], + [ ['a'], ['å'] ], + [ ['ae'], ['æ'] ], + // greek letters + [ ['alpha'], ['α'] ], + [ ['lambda', 'lamda'], ['λ'] ], + [ ['omega', 'ohm'], ['Ω'] ], + // special chars + [ ['\'', '’'], [] ], + [ ['-', '–'], [] ], + [ ['...', '…'], [] ], + [ ['"', '“', '”', '„'], [] ], + // other special sequences + [ ['^2'], ['²'] ], + [ ['^3'], ['³'] ], + [ ['m2'], ['m²'] ], + [ ['m3'], ['m³'] ], + [ ['1/2'], ['½'] ], + [ ['*', 'x'], ['×'] ], + [ ['(R)'], ['®'] ], + [ ['(c)', '(C)'], ['©'] ], +] + +function assert(condition: unknown, msg?: string): asserts condition { + if (!condition) throw new Error(msg) +} + +interface IStringSetHash { + [details: string] : Set; +} +interface IStringHash { + [details: string]: string; +} + +// this code builds the set of patterns to match and each of their replacements +const _pat_dict :IStringSetHash = {} +for (const [l,r] of EQUIV) { + assert(l && r) + for (const k of l) { + for (const v of (r.length ? 
[k].concat(r) : l) ) { + if (!(k in _pat_dict)) + _pat_dict[k] = new Set() + assert(_pat_dict[k] instanceof Set) + _pat_dict[k].add(v) + } + } +} +const _pats = Object.keys(_pat_dict).sort().sort((a,b) => b.length-a.length) +const EQUIV_PAT = new RegExp( '(' + _pats.map(escapeStringRegexp).join('|') + ')', 'g') +const EQUIV_REPL :IStringHash = {} +for (const pat of _pats) { + const s = _pat_dict[pat] + assert(s) + const repl = Array.from(s.values()).sort().sort((a,b) => b.length-a.length) + EQUIV_REPL[pat] = '(?:' + repl.map(escapeStringRegexp).join('|') + ')' +} +//console.debug(EQUIV_PAT, EQUIV_REPL) + +/** + * This function takes a search word and turns it into a string suitable for use in a regular expression. + * Two strings are returned: The first is the search word simply turned into a regular expression + * (a stricter version, intended for use in giving exact matches a higher score), while the second string + * contains mappings for the character equivalents (a looser pattern that will result in more matches). 
+ */ +export function makeSearchPattern(what :string) : [string, string] { + what = what.replaceAll(/\s+/g, ' ') + const stricter = escapeStringRegexp(what) + let withEquiv = '' + for (const part of what.split(EQUIV_PAT) ) { + if (part in EQUIV_REPL) + withEquiv += EQUIV_REPL[part] // special chars already escaped + else + withEquiv += escapeStringRegexp(part) + } + return [stricter, withEquiv] +} diff --git a/src/js/flags.ts b/src/js/flags.ts index 5f33a94..70f43df 100644 --- a/src/js/flags.ts +++ b/src/js/flags.ts @@ -1,3 +1,25 @@ +/** + * German-English Dictionary + * ========================= + * + * Copyright © 2024 Hauke Dämpfling, haukex@zero-g.net + * + * Source code: https://github.com/haukex/de-en-dict + * + * This project is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This project is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this project; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ interface IFlagList { de: string[], en: string[] } diff --git a/src/js/main.ts b/src/js/main.ts index afa0e59..5dd46a7 100644 --- a/src/js/main.ts +++ b/src/js/main.ts @@ -23,6 +23,7 @@ import {DB_URL, DB_VER_URL, DB_CACHE_NAME, cacheFirst} from './common' import {init_flags} from './flags' +import {makeSearchPattern} from './equiv' // for the parcel development environment: if (module.hot) module.hot.accept() @@ -124,8 +125,8 @@ async function loadDict() :Promise { throw new Error('Failed to load dict') // unpack the dictionary file return (await gunzipUTF8(dictResp.body)) - // these two replaces fix some oversights that I guess happened on conversion from CP1252 to UTF-8 (?) - .replaceAll(String.fromCodePoint(0x92),'\u2019').replaceAll(String.fromCodePoint(0x96),'\u2013') + // this fixes an oversight that I guess happened on conversion from CP1252 to UTF-8 (?) + .replaceAll(String.fromCodePoint(0x96),'\u2013') // split the text into lines, trim the lines, remove blank lines and comments .split(/\r?\n|\r(?!\n)/g).map((line) => line.trim()).filter((line) => line.length && !line.startsWith('#')) } catch (error) { @@ -196,19 +197,7 @@ window.addEventListener('DOMContentLoaded', async () => { document.title = what ? `${TITLE_PREFIX}: ${what}` : TITLE_PREFIX // turn the search term into a regex - const whatPatStricter = what.replaceAll(/\s+/g, ' ') - // escape characters that are special to a regex - // https://github.com/sindresorhus/escape-string-regexp/blob/6ced614e/index.js#L8 - .replaceAll(/[|\\{}()[\]^$+*?.]/g, '\\$&').replaceAll(/-/g, '\\x2d') - // searching for different kinds of quotation marks finds all of them - .replaceAll(/[\u0027\u2019]/g, '[\\u0027\\u2019]') - .replaceAll(/[\u0022\u201C\u201D\u201E]/g, '[\\u0022\\u201C\\u201D\\u201E]') - // we differentiate between a stricter and looser match for scoring - const whatPat = whatPatStricter - // searching for umlaut replacements searches for the umlauts as well (e.g. 
for users of non-German keyboard layouts) - // (note the search will be case-insensitive anyway, so we don't need to care about case here) - .replaceAll(/ae/ig, '(?:ae|ä)').replaceAll(/oe/ig, '(?:oe|ö)').replaceAll(/ue/ig, '(?:ue|ü)') - .replaceAll(/s[sz]/ig, '(?:$&|ß)') + const [whatPatStricter, whatPat] = makeSearchPattern(what) // generate a regex that matches the search term const whatRe = new RegExp(whatPat, 'ig')