themoeway · Kuuuube · Jun 26, 2024 · Jun 26, 2024 · Jun 26, 2024 · Jun 26, 2024
@@ -24,6 +24,7 @@ import {
     convertHalfWidthKanaToFullWidth,
     convertHiraganaToKatakana as convertHiraganaToKatakanaFunction,
     convertKatakanaToHiragana as convertKatakanaToHiraganaFunction,
+    normalizeCombiningCharacters as normalizeCombiningCharactersFunction,
 } from './japanese.js';
 
 /** @type {import('language').TextProcessor<boolean>} */
@@ -90,3 +91,11 @@ export const collapseEmphaticSequences = {
         return str;
     },
 };
+
+/**
+ * Boolean-toggled text preprocessor: when the setting is on, the text is passed through
+ * `normalizeCombiningCharactersFunction`; when it is off, the text is returned unchanged.
+ * @type {import('language').TextProcessor<boolean>}
+ */
+export const normalizeCombiningCharacters = {
+    name: 'Normalize combining characters',
+    description: 'ド → ド (U+30C8 U+3099 → U+30C9)',
+    options: basicTextProcessorOptions,
+    process: (str, setting) => {
+        if (!setting) {
+            return str;
+        }
+        return normalizeCombiningCharactersFunction(str);
+    },
+};
@@ -560,6 +560,63 @@ export function getKanaDiacriticInfo(character) {
     return typeof info !== 'undefined' ? {character: info.character, type: info.type} : null;
 }
 
+/**
+ * Checks whether a code point lies in one of the kana ranges that are treated as able to
+ * take a combining dakuten (U+3099).
+ *
+ * The ranges are contiguous spans rather than exact lists, so they also contain characters
+ * that should not take a dakuten (for example already-voiced kana). Those are highly unlikely
+ * to have a combining character attached, and keeping the spans contiguous keeps the check cheap.
+ * @param {number} codePoint
+ * @returns {boolean}
+ */
+function dakutenAllowed(codePoint) {
+    // Hiragana か (U+304B) through と (U+3068)
+    if (codePoint >= 0x304B && codePoint <= 0x3068) {
+        return true;
+    }
+    // Hiragana は (U+306F) through ほ (U+307B)
+    if (codePoint >= 0x306F && codePoint <= 0x307B) {
+        return true;
+    }
+    // Katakana カ (U+30AB) through ト (U+30C8)
+    if (codePoint >= 0x30AB && codePoint <= 0x30C8) {
+        return true;
+    }
+    // Katakana ハ (U+30CF) through ホ (U+30DB)
+    if (codePoint >= 0x30CF && codePoint <= 0x30DB) {
+        return true;
+    }
+    return false;
+}
+
+/**
+ * Checks whether a code point lies in one of the kana ranges that are treated as able to
+ * take a combining handakuten (U+309A).
+ *
+ * Each range is the contiguous span は..ほ / ハ..ホ, so it also contains the voiced and
+ * semi-voiced forms that should not take a handakuten. Those are highly unlikely to have a
+ * combining character attached, and keeping the spans contiguous keeps the check cheap.
+ * @param {number} codePoint
+ * @returns {boolean}
+ */
+function handakutenAllowed(codePoint) {
+    // Hiragana は (U+306F) through ほ (U+307B)
+    const isHiraganaHaRow = codePoint >= 0x306F && codePoint <= 0x307B;
+    // Katakana ハ (U+30CF) through ホ (U+30DB)
+    const isKatakanaHaRow = codePoint >= 0x30CF && codePoint <= 0x30DB;
+    return isHiraganaHaRow || isKatakanaHaRow;
+}
+
+/**
+ * Replaces a kana followed by a combining dakuten (U+3099) or combining handakuten (U+309A)
+ * with the single precomposed character, e.g. U+30C8 U+3099 → U+30C9.
+ * A combining mark whose preceding character is rejected by `dakutenAllowed` /
+ * `handakutenAllowed` is copied through unchanged. An empty string yields an empty string.
+ * @param {string} text
+ * @returns {string}
+ */
+export function normalizeCombiningCharacters(text) {
+    let result = '';
+    let i = text.length - 1;
+    // Scan right to left so each combining mark is seen before the character it attaches to.
+    // Index 0 is never inspected as a mark: nothing precedes it, so it cannot combine with anything.
+    while (i > 0) {
+        const mark = text[i];
+        const base = text.charCodeAt(i - 1);
+        // In the accepted ranges the dakuten form is 1 code point after the base kana and the
+        // handakuten form is 2 code points after it.
+        let offset = 0;
+        if (mark === '\u3099' && dakutenAllowed(base)) {
+            offset = 1;
+        } else if (mark === '\u309A' && handakutenAllowed(base)) {
+            offset = 2;
+        }
+        if (offset > 0) {
+            // Consume both the mark and its base character.
+            result = String.fromCodePoint(base + offset) + result;
+            i -= 2;
+        } else {
+            result = mark + result;
+            i--;
+        }
+    }
+    // i === 0 means the first character has not been emitted yet.
+    // i === -1 means the text was empty or the first two characters were merged above;
+    // prepending text[0] in that case would yield 'undefined' or duplicate the base kana.
+    if (i === 0) {
+        result = text[0] + result;
+    }
+    return result;
+}
 
 // Furigana distribution
 

@@ -26,6 +26,7 @@ import {
     collapseEmphaticSequences,
     convertHalfWidthCharacters,
     convertHiraganaToKatakana,
+    normalizeCombiningCharacters,
 } from './ja/japanese-text-preprocessors.js';
 import {japaneseTransforms} from './ja/japanese-transforms.js';
 import {isStringPartiallyJapanese} from './ja/japanese.js';
@@ -36,9 +37,9 @@ import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js
 import {oldIrishTransforms} from './sga/old-irish-transforms.js';
 import {removeSerboCroatianAccentMarks} from './sh/serbo-croatian-text-preprocessors.js';
 import {albanianTransforms} from './sq/albanian-transforms.js';
-import {normalizeDiacritics} from './vi/viet-text-preprocessors.js';
 import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js';
-import {normalizePinyin, isStringPartiallyChinese} from './zh/chinese.js';
+import {normalizeDiacritics} from './vi/viet-text-preprocessors.js';
+import {isStringPartiallyChinese, normalizePinyin} from './zh/chinese.js';
 
 const capitalizationPreprocessors = {
     decapitalize,
@@ -155,6 +156,7 @@ const languageDescriptors = [
         textPreprocessors: {
             convertHalfWidthCharacters,
             alphabeticToHiragana,
+            normalizeCombiningCharacters,
             alphanumericWidthVariants,
             convertHiraganaToKatakana,
             collapseEmphaticSequences,

@@ -120,6 +120,7 @@ type AllTextProcessors = {
         pre: {
             convertHalfWidthCharacters: TextProcessor<boolean>;
             alphabeticToHiragana: TextProcessor<boolean>;
+            normalizeCombiningCharacters: TextProcessor<boolean>;
             alphanumericWidthVariants: BidirectionalConversionPreprocessor;
             convertHiraganaToKatakana: BidirectionalConversionPreprocessor;
             collapseEmphaticSequences: TextProcessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>;