diff --git a/Taskfile.yaml b/Taskfile.yaml index ac77674..85578bb 100644 --- a/Taskfile.yaml +++ b/Taskfile.yaml @@ -315,8 +315,8 @@ tasks: cmds: - rm -f *.txt - echo "{{.TATN}}Downloading dictionary patterns from \"ctan.math.utah.edu\"{{.TOFF}}" - - wget -q -r -l1 --no-parent -nd -A.pat.txt http://ctan.math.utah.edu/ctan/tex-archive/language/hyph-utf8/tex/generic/hyph-utf8/patterns/txt - - wget -q -r -l1 --no-parent -nd -A.hyp.txt http://ctan.math.utah.edu/ctan/tex-archive/language/hyph-utf8/tex/generic/hyph-utf8/patterns/txt + - wget -q -r -l1 --no-parent -nd -A.pat.txt http://ctan.math.utah.edu/ctan/tex-archive/language/hyph-utf8/tex/patterns/txt + - wget -q -r -l1 --no-parent -nd -A.hyp.txt http://ctan.math.utah.edu/ctan/tex-archive/language/hyph-utf8/tex/patterns/txt - gzip -q -f hyph-*.txt status: - find -type f -name 'hyph-*.txt.gz' | grep -q . diff --git a/processor/sentences.go b/processor/sentences.go index a4e3a7a..4d4aac9 100644 --- a/processor/sentences.go +++ b/processor/sentences.go @@ -108,19 +108,31 @@ func splitSentences(t *tokenizer, in string) []string { // splitWords returns slice of words in sentence. func splitWords(_ *tokenizer, in string, ignoreNBSP bool) []string { - if ignoreNBSP { - // unicode.IsSpace will eat everything - for backward compatibility - return strings.Fields(in) + var ( + result = []string{} + word strings.Builder + ) + for _, sym := range in { + if isSep(sym, ignoreNBSP) { + result = append(result, word.String()) + word.Reset() + continue + } + word.WriteRune(sym) } - // exclude NBSP from the list of white space separators for latin1 symbols - return strings.FieldsFunc(in, func(r rune) bool { - if uint32(r) <= unicode.MaxLatin1 { - switch r { - case '\t', '\n', '\v', '\f', '\r', ' ', 0x85: - return true - } - return false + return append(result, word.String()) +} + +func isSep(r rune, ignoreNBSP bool) bool { + if uint32(r) <= unicode.MaxLatin1 { + switch r { + // exclude NBSP from the list of white space separators for latin1 symbols + case '\t', '\n', '\v', '\f', '\r', ' ', 0x85: + return true + case 0xA0: // NBSP + return ignoreNBSP } - return unicode.IsSpace(r) - }) + return false + } + return unicode.IsSpace(r) }