Skip to content
This repository has been archived by the owner on May 10, 2023. It is now read-only.

add korean sentence validation & cleanup #630

Merged
merged 1 commit into from
Aug 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions server/lib/validation/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ const eo = require('./languages/eo');
const ig = require('./languages/ig');
const it = require('./languages/it');
const kab = require( './languages/kab');
const ko = require( './languages/ko');
const ne = require('./languages/ne');
const or = require('./languages/or');
const ru = require('./languages/ru');
Expand All @@ -25,6 +26,7 @@ const VALIDATORS = {
ig,
it,
kab,
ko,
ne,
or,
ru,
Expand Down
55 changes: 55 additions & 0 deletions server/lib/validation/languages/ko.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
// Minimum number of characters a sentence must have to be accepted.
// Compared against `sentence.length` by the length rule in INVALIDATIONS.
const MIN_CHARACTERS = 1;

// Maximum number of characters allowed per sentence, to keep recordings at a manageable duration.
// Compared against `sentence.length` by the length rule in INVALIDATIONS.
const MAX_CHARACTERS = 50;
MichaelKohler marked this conversation as resolved.
Show resolved Hide resolved

// Validation rules for Korean sentences.
// Each rule carries either `fn` (a predicate over the sentence) or `regex`, plus the
// user-facing `error` message for that rule.
// NOTE(review): presumably a sentence is rejected when `fn` returns true or `regex` matches;
// confirm against the code that consumes these rules.
const INVALIDATIONS = [{
fn: (sentence) => {
// Properly tokenizing Korean requires heavy tokenizers (e.g. mecab-ko, nori, ...),
// but those are not necessary just to count letters, so the string length is used instead.
// Returns true when the length is outside [MIN_CHARACTERS, MAX_CHARACTERS].
return sentence.length < MIN_CHARACTERS || sentence.length > MAX_CHARACTERS;
},
// English: "A sentence must be at least MIN_CHARACTERS and at most MAX_CHARACTERS characters long."
error: `문장의 글자 수는 ${MIN_CHARACTERS}글자 이상, ${MAX_CHARACTERS}글자 이하여야 합니다.`,
}, {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the three rules here can be removed, as the last rule in this list would catch those anyway. What do you think?

// One Korean letter is composed of two or three letters,
// in the order consonant (1st) - vowel (2nd) - consonant (3rd, optional).
// Using them separately should not be allowed, since that could cause various pronunciation issues.
//
// This regex matches standalone consonants (ㄱ-ㅎ) and vowels (ㅏ-ㅣ), i.e. letters that are NOT
// part of a composed Unicode "Hangul Syllables" (U+AC00–U+D7A3) character (see below).
regex: /[ㄱ-ㅎㅏ-ㅣ]/,
// English: "A sentence must not contain standalone consonants or vowels."
error: '문장에는 자음이나 모음만 따로 있는 글자가 있어서는 안 됩니다.',
},
{
// Korean letters (Hangul) have two types of Unicode code points.
//
// - Composed form (Unicode "Hangul Syllables": U+AC00–U+D7A3)
//   - One Unicode code point contains two or three letters in a rectangular shape.
//   - These are the code points normally used.
// - Other forms
//   - Other Unicode code points treat Korean letters as separate vowels and consonants.
//   - This takes double the space in bytes.
//   - This only appears when a contributor is using the keyboard layout called "Sebeolsik", which is akin to Dvorak.
//   - After NFC normalization ( 5a86a81 ),
//     a composable combination of two or three characters (1st - 2nd - 3rd (optional)) becomes the
//     composed form ("Hangul Syllables"). Characters that cannot be combined may remain.
//
// This regex matches Hangul code points other than "Hangul Syllables" (U+AC00–U+D7A3):
// U+1100–U+11FF, U+A960–U+A97F and U+3130–U+318F.
regex: /[\u1100-\u11FF\uA960-\uA97F\u3130-\u318F]/u,
// English: "A sentence must not contain decomposed (initial-medial-final) letters.
// Please enter precomposed letters."
error: '문장에는 첫가끝 형태의 분해된 글자가 있어서는 안 됩니다. 완성형 글자를 입력해주세요.',
}, {
// Since there are so many kinds of letters that should not be allowed,
// it is more convenient to allow only certain types of characters.
// Examples of what gets excluded: CJK Chinese letters, Japanese letters, Korean-specific Chinese letters (aka hanja),
// unused symbols (semicolon, colon - native Korean sentences do not contain them),
// symbols that are better excluded (quote, tilde, ...),
// characters that can be normalized into normal characters with destructive NFKC normalization (ⓐ, ㈜, ...),
// historical Korean letters (aka 옛한글 - ㆆ, ㅿ, ㆁ, ...)
// ...
//
// This regex matches any character that is not a composed Hangul syllable (가-힣),
// a period, comma, question mark, exclamation mark, or space.
// Every character matched by the two regex rules above is also matched here.
regex: /[^가-힣.,?! ]/u,
// English: "A sentence may contain only Hangul, periods, commas, exclamation marks,
// question marks, and spaces."
error: '문장에는 한글과 마침표, 쉼표, 느낌표, 물음표, 공백만 들어있어야 합니다.',
}];

// Public surface of the Korean validator module: only the rule list is exposed.
module.exports = { INVALIDATIONS };