Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add text segmentation for extended grapheme clusters - part 1 #2

Merged
merged 1 commit into from
Oct 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@
# Ignore the example binaries
example

# Ignore the generated docs
# Ignore the generated files
generated-docs
ucd/gen
14 changes: 7 additions & 7 deletions package/CodePoint.roc
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ interface CodePoint
countUtf8Bytes,
]
imports [
Internal.{ CP, fromU32Unchecked },
InternalCP.{ CP, fromU32Unchecked },
]

## A [Unicode code point](http://www.unicode.org/glossary/#code_point).
Expand All @@ -22,7 +22,7 @@ CodePoint : CP
## Converts a [CodePoint] to its underlying [Unicode code point](http://www.unicode.org/glossary/#code_point)
## integer representation.
toU32 : CodePoint -> U32
# Thin alias: the conversion itself is implemented in the internal module.
toU32 = Internal.toU32
toU32 = InternalCP.toU32

## Converts a [U32] to a [CodePoint] by verifying that it is a valid [Unicode code point](http://www.unicode.org/glossary/#code_point)
## (that is, it's between `0` and `0x10FFFF`).
Expand All @@ -45,23 +45,23 @@ isValidScalar = \codePoint -> !(isHighSurrogate codePoint || isLowSurrogate code
## (`0xD800` to `0xDBFF`)
isHighSurrogate : CodePoint -> Bool
isHighSurrogate = \codePoint ->
u32 = Internal.toU32 codePoint
u32 = InternalCP.toU32 codePoint

# High surrogates span U+D800 to U+DBFF (inclusive). U+DC00 to U+DFFF is the
# low-surrogate range and is handled by isLowSurrogate.
u32 >= 0xD800 && u32 <= 0xDBFF

## Returns true if this is a [low-surrogate code point](https://www.unicode.org/glossary/#low_surrogate_code_point)
## U+DC00 to U+DFFF
isLowSurrogate : CodePoint -> Bool
isLowSurrogate = \codePoint ->
u32 = Internal.toU32 codePoint
u32 = InternalCP.toU32 codePoint

# Both bounds are inclusive.
u32 >= 0xDC00 && u32 <= 0xDFFF

## Zig docs: bytes the UTF-8 representation would require
## for the given codepoint.
utf8Len : CodePoint -> Result Nat [InvalidCodePoint]
utf8Len = \codePoint ->
u32 = Internal.toU32 codePoint
u32 = InternalCP.toU32 codePoint

if u32 < 0x80 then
Ok 1
Expand All @@ -77,7 +77,7 @@ utf8Len = \codePoint ->
## Encode a Scalar as UTF-8 bytes and append those bytes to an existing list of UTF-8 bytes.
appendUtf8 : List U8, CodePoint -> List U8
appendUtf8 = \bytes, codePoint ->
u32 = Internal.toU32 codePoint
u32 = InternalCP.toU32 codePoint

if u32 < 0x80 then
List.append bytes (Num.toU8 u32)
Expand Down Expand Up @@ -174,7 +174,7 @@ addContinuation = \original, continuationByte ->
## The number of UTF-8 bytes it takes to represent this Scalar.
countUtf8Bytes : CodePoint -> Nat
countUtf8Bytes = \codePoint ->
u32 = Internal.toU32 codePoint
u32 = InternalCP.toU32 codePoint

if u32 < 0x80 then
1
Expand Down
2 changes: 1 addition & 1 deletion package/Internal.roc → package/InternalCP.roc
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
interface Internal
interface InternalCP
exposes [
CP,
fromU32Unchecked,
Expand Down
25 changes: 25 additions & 0 deletions package/InternalGBP.roc
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
## WARNING This file is automatically generated. Do not edit it manually. ##
interface InternalGBP
exposes [
GraphemeBreakProperty,
]
imports []

GraphemeBreakProperty : [
CR,
LF,
Control,
Extend,
ZWL,
RI,
Prepend,
SpacingMark,
L,
V,
T,
LV,
LVT,
Other,
]

expect 1 == 1
3 changes: 2 additions & 1 deletion package/main.roc
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package "unicode"
exposes [
CodePoint,
Scalar,
# TODO enable
# Scalar,
]
packages {}
9 changes: 9 additions & 0 deletions rebuild.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
#
# Regenerates package/InternalGBP.roc with the ucd/GBP.roc generator and then
# runs the tests of the generated module. Paths are relative, so run this from
# the repository root.

# Stop at the first failing command (and on unset variables or a failing
# pipeline stage) so that a failed generation step is never followed by a test
# run against a stale package/InternalGBP.roc.
set -euo pipefail

# Generate the GBP internal module
echo "Generating package/InternalGBP.roc"
roc run ucd/GBP.roc -- package/

# Test the GBP internal module
echo "Testing package/InternalGBP.roc"
roc test package/InternalGBP.roc
233 changes: 233 additions & 0 deletions ucd/GBP.roc
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
app "gen"
packages {
pf: "https://github.com/roc-lang/basic-cli/releases/download/0.5.0/Cufzl36_SnJ4QbOoEmiJ5dIpUxBvdB3NEySvuH82Wio.tar.br",
parser: "https://github.com/lukewilliamboswell/roc-parser/releases/download/0.1.0/vPU-UZbWGIXsAfcJvAnmU3t3SWlHoG_GauZpqzJiBKA.tar.br",
}
imports [
pf.Stdout,
pf.Stderr,
pf.Task.{ Task },
pf.Path.{Path },
pf.Arg,
pf.File,
parser.Core.{ Parser, buildPrimitiveParser },
parser.String.{ parseStr },
# "GraphemeBreakProperty-15.1.0.txt" as gbpFile : Str,
"GBPTemplate.roc" as template : Str,
]
provides [main] to pf

## Plain U32 alias used for the numeric values this generator parses from hex text.
## The alias itself performs no range validation.
CodePoint : U32
## The property tags that graphemePropertyParser can produce.
## The same tag list is written out by GBPTemplate.roc.
## NOTE(review): `ZWL` is presumably a typo for `ZWJ` (zero width joiner), and `RI`
## presumably abbreviates Regional_Indicator, as named in UAX #29 - confirm before
## parsing the real data file, which would spell these differently.
GraphemeBreakProperty : [
CR,
LF,
Control,
Extend,
ZWL,
RI,
Prepend,
SpacingMark,
L,
V,
T,
LV,
LVT,
Other,
]

## Entry point: resolves the output path from the command line, writes the
## generated module there, and prints any error message to stderr.
main : Task {} I32
main =
    generate =
        path <- Task.await getFilePath
        writeToFile path

    # Every failure in the pipeline is already a human-readable string
    Task.onErr generate \message -> Stderr.line "\(message)"

## Builds the path of the `InternalGBP.roc` file to generate from the command-line
## argument at index 1, which is expected to be the package directory.
## A trailing slash on that argument is tolerated.
## Fails with a usage message when the argument is missing.
getFilePath : Task Path Str
getFilePath =
    args <- Arg.list |> Task.await

    when List.get args 1 is
        Ok arg -> Task.ok (Path.fromStr "\(removeTrailingSlash arg)/InternalGBP.roc")
        # The program being run is the generator (GBP.roc); InternalGBP.roc is its output
        Err _ -> Task.err "USAGE: roc run GBP.roc -- path/to/package/"

## Writes the embedded template to `path`.
## Any write failure is mapped to an error string naming the path; on success a
## confirmation with the template's line count is printed to stdout.
writeToFile : Path -> Task {} Str
writeToFile = \path ->
    File.writeUtf8 path template
    |> Task.mapErr \_ -> "ERROR: unable to write to \(Path.display path)"
    |> Task.await \_ -> Stdout.line "\nSuccessfully wrote \(lineCountStr) lines to \(Path.display path)\n"

## Trims surrounding whitespace from `str` and then removes at most one trailing `/`.
removeTrailingSlash : Str -> Str
removeTrailingSlash = \str ->
    trimmed = Str.trim str

    if Str.endsWith trimmed "/" then
        # Drop the final byte by reversing, dropping the head, and reversing back
        withoutSlash =
            trimmed
            |> Str.toUtf8
            |> List.reverse
            |> List.drop 1
            |> List.reverse

        Str.fromUtf8 withoutSlash |> Result.withDefault ""
    else
        trimmed

expect removeTrailingSlash "abc " == "abc"
expect removeTrailingSlash " abc/package/ " == "abc/package"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I love quick-and-easy tests like this! 🤗


## Number of "\n"-separated segments in the embedded template, rendered as text.
## NOTE(review): if the template ends with a newline, the trailing empty segment is
## presumably counted as well - confirm against the reported line count.
lineCountStr =
    segments = Str.split template "\n"

    Num.toStr (List.len segments)

## Lookup table pairing each property's UTF-8 spelling (and its byte length) with its tag.
props : List {bytes : List U8, property : GraphemeBreakProperty, len : Nat}
props =
    # Builds one table row, computing the byte length once
    entry = \name, property ->
        bytes = Str.toUtf8 name

        { bytes, property, len: List.len bytes }

    # NOTE ordering matters here: graphemePropertyParser takes the first entry
    # that prefixes the input, so a longer name must come before any name that
    # is a prefix of it (LVT, then LV, then L).
    # NOTE(review): "ZWL" is presumably meant to be "ZWJ" - confirm.
    [
        entry "CR" CR,
        entry "Control" Control,
        entry "Extend" Extend,
        entry "ZWL" ZWL,
        entry "RI" RI,
        entry "Prepend" Prepend,
        entry "SpacingMark" SpacingMark,
        entry "V" V,
        entry "T" T,
        entry "LF" LF,
        entry "LVT" LVT,
        entry "LV" LV,
        entry "L" L,
        entry "Other" Other,
    ]

## Parses one property name from the front of the input.
## Succeeds with the matching tag and the input that follows the name, or fails
## with `ParsingFailure "Not a GBP"` when no entry of `props` prefixes the input.
graphemePropertyParser : Parser (List U8) GraphemeBreakProperty
graphemePropertyParser =
    input <- buildPrimitiveParser

    # `props` is ordered longest-first among names sharing a prefix, so the
    # first entry that prefixes the input is the longest match
    firstHit = List.findFirst props \candidate -> List.startsWith input candidate.bytes

    when firstHit is
        Ok { property, len } -> Ok { val: property, input: List.drop input len }
        Err _ -> Err (ParsingFailure "Not a GBP")

expect parseStr graphemePropertyParser "L" == Ok L
expect parseStr graphemePropertyParser "LF" == Ok LF
expect parseStr graphemePropertyParser "LV" == Ok LV
expect parseStr graphemePropertyParser "LVT" == Ok LVT
expect parseStr graphemePropertyParser "Other" == Ok Other
expect parseStr graphemePropertyParser "# ===" == Err (ParsingFailure "Not a GBP")

## Parses a run of upper-case hex digits from the front of the input into a code point.
## Succeeds with the numeric value and the input that follows the digits, or fails
## with `ParsingFailure "No hex bytes"` when the input does not start with a hex digit.
codePointParser : Parser (List U8) CodePoint
codePointParser =
    input <- buildPrimitiveParser

    { val: hexBytes, rest } = takeHexBytes { val: [], rest: input }

    when hexBytes is
        [] -> Err (ParsingFailure "No hex bytes")
        # takeHexBytes has already removed the digits from `rest`; dropping
        # them a second time would swallow the bytes that follow the code point
        _ -> Ok { val: hexBytesToU32 hexBytes, input: rest }

expect parseStr codePointParser "0000" == Ok 0
expect parseStr codePointParser "16FF1" == Ok 94193
expect parseStr codePointParser "# ===" == Err (ParsingFailure "No hex bytes")

## Converts a list of hex digit bytes, most significant digit first, to its numeric value.
## An empty list yields 0. Bytes that are not hex digits contribute 0, as decided by hexToDec.
hexBytesToU32 : List U8 -> CodePoint
hexBytesToU32 = \bytes ->
    # Left fold: shift the running value one hex digit and add the next digit.
    # This needs no list reversal and no per-digit exponentiation, and leading
    # zeros cannot overflow an intermediate power of 16.
    List.walk bytes 0 \accum, byte -> accum * 16 + hexToDec byte

expect hexBytesToU32 ['0', '0', '0', '0'] == 0
expect hexBytesToU32 ['0', '0', '0', '1'] == 1
expect hexBytesToU32 ['0', '0', '0', 'F'] == 15
expect hexBytesToU32 ['0', '0', '1', '0'] == 16
expect hexBytesToU32 ['0', '0', 'F', 'F'] == 255
expect hexBytesToU32 ['0', '1', '0', '0'] == 256
expect hexBytesToU32 ['0', 'F', 'F', 'F'] == 4095
expect hexBytesToU32 ['1', '0', '0', '0'] == 4096
expect hexBytesToU32 ['1', '6', 'F', 'F', '1'] == 94193

## Moves leading hex digit bytes from `rest` to the end of `val`, one at a time,
## stopping at the first byte that is not a hex digit or when `rest` is empty.
takeHexBytes : { val : List U8, rest : List U8} -> { val : List U8, rest : List U8}
takeHexBytes = \{ val, rest } ->
    when rest is
        [next, ..] if isHex next ->
            # consume one digit and keep going
            takeHexBytes { val: List.append val next, rest: List.drop rest 1 }

        _ -> { val, rest }

expect
bytes = [35, 32, 61, 61, 61] # "# ==="
takeHexBytes {val: [], rest: bytes} == {val: [], rest: bytes}

expect
bytes = [68, 54, 69, 49, 46, 46, 68, 54, 70, 66, 32, 32] # "D6E1..D6FB "
takeHexBytes {val: [], rest: bytes} == {val: [68, 54, 69, 49], rest: [46, 46, 68, 54, 70, 66, 32, 32]}

## True when the byte is an ASCII hex digit: '0' to '9' or upper-case 'A' to 'F'.
## Lower-case 'a' to 'f' are not accepted.
isHex : U8 -> Bool
isHex = \u8 ->
    isDecimalDigit = u8 >= '0' && u8 <= '9'
    isUpperLetter = u8 >= 'A' && u8 <= 'F'

    isDecimalDigit || isUpperLetter

expect isHex '0'
expect isHex 'A'
expect isHex 'F'
expect !(isHex ';')
expect !(isHex '#')

## Numeric value of a single ASCII hex digit byte ('0' to '9', upper-case 'A' to 'F').
## Any other byte yields 0.
hexToDec : U8 -> U32
hexToDec = \byte ->
    if byte >= '0' && byte <= '9' then
        Num.toU32 (byte - '0')
    else if byte >= 'A' && byte <= 'F' then
        Num.toU32 (byte - 'A') + 10
    else
        0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can totally be in a package someday...or maybe a builtin? 🤔


expect hexToDec '0' == 0
expect hexToDec 'F' == 15
25 changes: 25 additions & 0 deletions ucd/GBPTemplate.roc
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
## WARNING This file is automatically generated. Do not edit it manually. ##
interface InternalGBP
exposes [
GraphemeBreakProperty,
]
imports []

GraphemeBreakProperty : [
CR,
LF,
Control,
Extend,
ZWL,
RI,
Prepend,
SpacingMark,
L,
V,
T,
LV,
LVT,
Other,
]

expect 1 == 1
Loading