Skip to content
This repository has been archived by the owner on May 10, 2023. It is now read-only.

Commit

Permalink
fix: [th] Prevent 2 separate words from sticking together (#451)
Browse files Browse the repository at this point in the history
Replace each invalid character with a space instead of simply removing it, to prevent two separate words from sticking together after the removal.
If excessive spaces occur, they will be eliminated later.
  • Loading branch information
bact authored Jun 14, 2021
1 parent 3d16d47 commit 0ee3bae
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions server/lib/cleanup/languages/th.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ function clean(sentences) {
// remove zero-width chars (occurs in some Thai texts)
.replace(/[\u200b\u200c\u2063]/g, '')
// remove emoji
.replace(/\u00a9|\u00ae|[\u2000-\u3300]|[\u2580-\u27bf]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff]|[\ue000-\uf8ff]/g, '')
.replace(/\u00a9|\u00ae|[\u2000-\u3300]|[\u2580-\u27bf]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff]|[\ue000-\uf8ff]/g, ' ')
// remove erroneous tone marks, Phinthu, Thanthakhat, Nikhahit, Yamakkan, above/below vowels at the beginning of a word
.replace(/(^|\s)[\u0E48\u0E49\u0E4A\u0E4B\u0E3A\u0E4C\u0E4D\u0E4E\u0E31\u0E34\u0E35\u0E36\u0E37\u0E4D\u0E47\u0E38\u0E39]+/g, '')
.replace(/(^|\s)[\u0E48\u0E49\u0E4A\u0E4B\u0E3A\u0E4C\u0E4D\u0E4E\u0E31\u0E34\u0E35\u0E36\u0E37\u0E4D\u0E47\u0E38\u0E39]+/g, ' ')
.replace(/:/g, ' : ') // add a space before and after colon
.replace(/\?/g, ' ? ') // add a space before and after question mark
.replace(/!/g, ' ! ') // add a space before and after exclamation mark
Expand Down

0 comments on commit 0ee3bae

Please sign in to comment.