diff --git a/ext/js/language/ja/japanese-text-preprocessors.js b/ext/js/language/ja/japanese-text-preprocessors.js index 2d0d23b320..cdd8ce9ad9 100644 --- a/ext/js/language/ja/japanese-text-preprocessors.js +++ b/ext/js/language/ja/japanese-text-preprocessors.js @@ -24,6 +24,7 @@ import { convertHalfWidthKanaToFullWidth, convertHiraganaToKatakana as convertHiraganaToKatakanaFunction, convertKatakanaToHiragana as convertKatakanaToHiraganaFunction, + normalizeCombiningCharacters as normalizeCombiningCharactersFunction, } from './japanese.js'; /** @type {import('language').TextProcessor} */ @@ -90,3 +91,11 @@ export const collapseEmphaticSequences = { return str; }, }; + +/** @type {import('language').TextProcessor} */ +export const normalizeCombiningCharacters = { + name: 'Normalize combining characters', + description: 'ド → ド (U+30C8 U+3099 → U+30C9)', + options: basicTextProcessorOptions, + process: (str, setting) => (setting ? normalizeCombiningCharactersFunction(str) : str), +}; diff --git a/ext/js/language/ja/japanese.js b/ext/js/language/ja/japanese.js index 3a009ebbef..231d8f624e 100644 --- a/ext/js/language/ja/japanese.js +++ b/ext/js/language/ja/japanese.js @@ -560,6 +560,65 @@ export function getKanaDiacriticInfo(character) { return typeof info !== 'undefined' ? {character: info.character, type: info.type} : null; } +/** + * @param {number} codePoint + * @returns {boolean} + */ +function dakutenAllowed(codePoint) { + // To reduce processing time some characters which shouldn't have dakuten but are highly unlikely to have a combining character attached are included + // かがきぎくぐけげこごさざしじすずせぜそぞただちぢっつづてでとはばぱひびぴふぶぷへべぺほ + // カガキギクグケゲコゴサザシジスズセゼソゾタダチヂッツヅテデトハバパヒビピフブプヘベペホ + return ((codePoint >= 0x304B && codePoint <= 0x3068) || + (codePoint >= 0x306F && codePoint <= 0x307B) || + (codePoint >= 0x30AB && codePoint <= 0x30C8) || + (codePoint >= 0x30CF && codePoint <= 0x30DB)); +} + +/** + * @param {number} codePoint + * @returns {boolean} + */ +function handakutenAllowed(codePoint) { + // To reduce processing time some characters which shouldn't have handakuten but are highly unlikely to have a combining character attached are included + // はばぱひびぴふぶぷへべぺほ + // ハバパヒビピフブプヘベペホ + return ((codePoint >= 0x306F && codePoint <= 0x307B) || + (codePoint >= 0x30CF && codePoint <= 0x30DB)); +} + +/** + * @param {string} text + * @returns {string} + */ +export function normalizeCombiningCharacters(text) { + let result = ''; + let i = text.length - 1; + // Ignoring the first character is intentional, it cannot combine with anything + while (i > 0) { + if (text[i] === '\u3099') { + const dakutenCombinee = text[i - 1].codePointAt(0); + if (dakutenCombinee && dakutenAllowed(dakutenCombinee)) { + result = String.fromCodePoint(dakutenCombinee + 1) + result; + i -= 2; + continue; + } + } else if (text[i] === '\u309A') { + const handakutenCombinee = text[i - 1].codePointAt(0); + if (handakutenCombinee && handakutenAllowed(handakutenCombinee)) { + result = String.fromCodePoint(handakutenCombinee + 2) + result; + i -= 2; + continue; + } + } + result = text[i] + result; + i--; + } + // i === -1 when first two characters are combined + if (i === 0) { + result = text[0] + result; + } + return result; +} // Furigana distribution diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js index 7965ff30d5..f9fb4f09eb 100644 --- a/ext/js/language/language-descriptors.js +++ b/ext/js/language/language-descriptors.js @@ -26,6 +26,7 @@ import { collapseEmphaticSequences, convertHalfWidthCharacters, convertHiraganaToKatakana, + normalizeCombiningCharacters, } from './ja/japanese-text-preprocessors.js'; import {japaneseTransforms} from './ja/japanese-transforms.js'; import {isStringPartiallyJapanese} from './ja/japanese.js'; @@ -36,9 +37,9 @@ import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js import {oldIrishTransforms} from './sga/old-irish-transforms.js'; import {removeSerboCroatianAccentMarks} from './sh/serbo-croatian-text-preprocessors.js'; import {albanianTransforms} from './sq/albanian-transforms.js'; -import {normalizeDiacritics} from './vi/viet-text-preprocessors.js'; import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js'; -import {normalizePinyin, isStringPartiallyChinese} from './zh/chinese.js'; +import {normalizeDiacritics} from './vi/viet-text-preprocessors.js'; +import {isStringPartiallyChinese, normalizePinyin} from './zh/chinese.js'; const capitalizationPreprocessors = { decapitalize, @@ -155,6 +156,7 @@ const languageDescriptors = [ textPreprocessors: { convertHalfWidthCharacters, alphabeticToHiragana, + normalizeCombiningCharacters, alphanumericWidthVariants, convertHiraganaToKatakana, collapseEmphaticSequences, diff --git a/test/language/japanese-preprocessors.test.js b/test/language/japanese-preprocessors.test.js new file mode 100644 index 0000000000..90313abd56 --- /dev/null +++ b/test/language/japanese-preprocessors.test.js @@ -0,0 +1,202 @@ +/* + * Copyright (C) 2024 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +import {describe, expect, test} from 'vitest'; +import {normalizeCombiningCharacters} from '../../ext/js/language/ja/japanese-text-preprocessors.js'; + +const testCasesDakuten = [ + ['か\u3099', 'が'], + ['き\u3099', 'ぎ'], + ['く\u3099', 'ぐ'], + ['け\u3099', 'げ'], + ['こ\u3099', 'ご'], + ['さ\u3099', 'ざ'], + ['し\u3099', 'じ'], + ['す\u3099', 'ず'], + ['せ\u3099', 'ぜ'], + ['そ\u3099', 'ぞ'], + ['た\u3099', 'だ'], + ['ち\u3099', 'ぢ'], + ['つ\u3099', 'づ'], + ['て\u3099', 'で'], + ['と\u3099', 'ど'], + ['は\u3099', 'ば'], + ['ひ\u3099', 'び'], + ['ふ\u3099', 'ぶ'], + ['へ\u3099', 'べ'], + ['ほ\u3099', 'ぼ'], + ['カ\u3099', 'ガ'], + ['キ\u3099', 'ギ'], + ['ク\u3099', 'グ'], + ['ケ\u3099', 'ゲ'], + ['コ\u3099', 'ゴ'], + ['サ\u3099', 'ザ'], + ['シ\u3099', 'ジ'], + ['ス\u3099', 'ズ'], + ['セ\u3099', 'ゼ'], + ['ソ\u3099', 'ゾ'], + ['タ\u3099', 'ダ'], + ['チ\u3099', 'ヂ'], + ['ツ\u3099', 'ヅ'], + ['テ\u3099', 'デ'], + ['ト\u3099', 'ド'], + ['ハ\u3099', 'バ'], + ['ヒ\u3099', 'ビ'], + ['フ\u3099', 'ブ'], + ['ヘ\u3099', 'ベ'], + ['ホ\u3099', 'ボ'], +]; + +const testCasesHandakuten = [ + ['は\u309A', 'ぱ'], + ['ひ\u309A', 'ぴ'], + ['ふ\u309A', 'ぷ'], + ['へ\u309A', 'ぺ'], + ['ほ\u309A', 'ぽ'], + ['ハ\u309A', 'パ'], + ['ヒ\u309A', 'ピ'], + ['フ\u309A', 'プ'], + ['ヘ\u309A', 'ペ'], + ['ホ\u309A', 'ポ'], +]; + +const testCasesIgnored = [ + ['な\u3099', 'な\u3099'], + ['な\u309A', 'な\u309A'], + ['に\u3099', 'に\u3099'], + ['に\u309A', 'に\u309A'], + ['ぬ\u3099', 'ぬ\u3099'], + ['ぬ\u309A', 'ぬ\u309A'], + ['ね\u3099', 'ね\u3099'], + ['ね\u309A', 'ね\u309A'], + ['の\u3099', 'の\u3099'], + ['の\u309A', 'の\u309A'], + ['ま\u3099', 'ま\u3099'], + ['ま\u309A', 'ま\u309A'], + ['み\u3099', 'み\u3099'], + ['み\u309A', 'み\u309A'], + ['む\u3099', 'む\u3099'], + ['む\u309A', 'む\u309A'], + ['め\u3099', 'め\u3099'], + ['め\u309A', 'め\u309A'], + ['も\u3099', 'も\u3099'], + ['も\u309A', 'も\u309A'], + ['ゃ\u3099', 'ゃ\u3099'], + ['ゃ\u309A', 'ゃ\u309A'], + ['や\u3099', 'や\u3099'], + ['や\u309A', 'や\u309A'], + ['ゅ\u3099', 'ゅ\u3099'], + ['ゅ\u309A', 'ゅ\u309A'], + ['ゆ\u3099', 'ゆ\u3099'], + ['ゆ\u309A', 'ゆ\u309A'], + ['ょ\u3099', 'ょ\u3099'], + ['ょ\u309A', 'ょ\u309A'], + ['よ\u3099', 'よ\u3099'], + ['よ\u309A', 'よ\u309A'], + ['ら\u3099', 'ら\u3099'], + ['ら\u309A', 'ら\u309A'], + ['り\u3099', 'り\u3099'], + ['り\u309A', 'り\u309A'], + ['る\u3099', 'る\u3099'], + ['る\u309A', 'る\u309A'], + ['れ\u3099', 'れ\u3099'], + ['れ\u309A', 'れ\u309A'], + ['ろ\u3099', 'ろ\u3099'], + ['ろ\u309A', 'ろ\u309A'], + ['ゎ\u3099', 'ゎ\u3099'], + ['ゎ\u309A', 'ゎ\u309A'], + ['わ\u3099', 'わ\u3099'], + ['わ\u309A', 'わ\u309A'], + ['ゐ\u3099', 'ゐ\u3099'], + ['ゐ\u309A', 'ゐ\u309A'], + ['ゑ\u3099', 'ゑ\u3099'], + ['ゑ\u309A', 'ゑ\u309A'], + ['を\u3099', 'を\u3099'], + ['を\u309A', 'を\u309A'], + ['ん\u3099', 'ん\u3099'], + ['ん\u309A', 'ん\u309A'], + ['ナ\u3099', 'ナ\u3099'], + ['ナ\u309A', 'ナ\u309A'], + ['ニ\u3099', 'ニ\u3099'], + ['ニ\u309A', 'ニ\u309A'], + ['ヌ\u3099', 'ヌ\u3099'], + ['ヌ\u309A', 'ヌ\u309A'], + ['ネ\u3099', 'ネ\u3099'], + ['ネ\u309A', 'ネ\u309A'], + ['ノ\u3099', 'ノ\u3099'], + ['ノ\u309A', 'ノ\u309A'], + ['マ\u3099', 'マ\u3099'], + ['マ\u309A', 'マ\u309A'], + ['ミ\u3099', 'ミ\u3099'], + ['ミ\u309A', 'ミ\u309A'], + ['ム\u3099', 'ム\u3099'], + ['ム\u309A', 'ム\u309A'], + ['メ\u3099', 'メ\u3099'], + ['メ\u309A', 'メ\u309A'], + ['モ\u3099', 'モ\u3099'], + ['モ\u309A', 'モ\u309A'], + ['ャ\u3099', 'ャ\u3099'], + ['ャ\u309A', 'ャ\u309A'], + ['ヤ\u3099', 'ヤ\u3099'], + ['ヤ\u309A', 'ヤ\u309A'], + ['ュ\u3099', 'ュ\u3099'], + ['ュ\u309A', 'ュ\u309A'], + ['ユ\u3099', 'ユ\u3099'], + ['ユ\u309A', 'ユ\u309A'], + ['ョ\u3099', 'ョ\u3099'], + ['ョ\u309A', 'ョ\u309A'], + ['ヨ\u3099', 'ヨ\u3099'], + ['ヨ\u309A', 'ヨ\u309A'], + ['ラ\u3099', 'ラ\u3099'], + ['ラ\u309A', 'ラ\u309A'], + ['リ\u3099', 'リ\u3099'], + ['リ\u309A', 'リ\u309A'], + ['ル\u3099', 'ル\u3099'], + ['ル\u309A', 'ル\u309A'], + ['レ\u3099', 'レ\u3099'], + ['レ\u309A', 'レ\u309A'], + ['ロ\u3099', 'ロ\u3099'], + ['ロ\u309A', 'ロ\u309A'], + ['ヮ\u3099', 'ヮ\u3099'], + ['ヮ\u309A', 'ヮ\u309A'], + ['ワ\u3099', 'ワ\u3099'], + ['ワ\u309A', 'ワ\u309A'], + ['ヰ\u3099', 'ヰ\u3099'], + ['ヰ\u309A', 'ヰ\u309A'], + ['ヱ\u3099', 'ヱ\u3099'], + ['ヱ\u309A', 'ヱ\u309A'], + ['ヲ\u3099', 'ヲ\u3099'], + ['ヲ\u309A', 'ヲ\u309A'], + ['ン\u3099', 'ン\u3099'], + ['ン\u309A', 'ン\u309A'], +]; + +const textCasesMisc = [ + ['', ''], + ['\u3099ハ', '\u3099ハ'], + ['\u309Aハ', '\u309Aハ'], + ['さくらし\u3099また\u3099いこん', 'さくらじまだいこん'], + ['いっほ\u309Aん', 'いっぽん'], +]; + +describe('combining dakuten/handakuten normalization', () => { + const {process} = normalizeCombiningCharacters; + const testCases = [...testCasesDakuten, ...testCasesHandakuten, ...testCasesIgnored, ...textCasesMisc]; + test.each(testCases)('%s normalizes to %s', (input, expected) => { + expect(process(input, true)).toStrictEqual(expected); + }); +}); diff --git a/types/ext/language-descriptors.d.ts b/types/ext/language-descriptors.d.ts index 423129373c..d0136d922a 100644 --- a/types/ext/language-descriptors.d.ts +++ b/types/ext/language-descriptors.d.ts @@ -120,6 +120,7 @@ type AllTextProcessors = { pre: { convertHalfWidthCharacters: TextProcessor; alphabeticToHiragana: TextProcessor; + normalizeCombiningCharacters: TextProcessor; alphanumericWidthVariants: BidirectionalConversionPreprocessor; convertHiraganaToKatakana: BidirectionalConversionPreprocessor; collapseEmphaticSequences: TextProcessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>;