Skip to content

Commit

Permalink
Narrow MASKS and UTF8Byte.value's type, assign MASKS by left shift in UTF32ToUTF8 (#13310)
Browse files Browse the repository at this point in the history

* Change MASKS from int[] to byte[], and assign it with left shift.

* Only set first byte for tmpUTF8.

* Only set first byte value for tmp utf8.

* Change value type from int to byte.

* Remove stale comment.
  • Loading branch information
vsop-479 authored and mikemccand committed Apr 23, 2024
1 parent a2f48d8 commit 9287167
Showing 1 changed file with 33 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -35,21 +35,19 @@ public final class UTF32ToUTF8 {
private static final int[] startCodes = new int[] {0, 128, 2048, 65536};
private static final int[] endCodes = new int[] {127, 2047, 65535, 1114111};

static int[] MASKS = new int[8];
static byte[] MASKS = new byte[8];

static {
int v = 2;
for (int i = 0; i < 7; i++) {
MASKS[i + 1] = v - 1;
v *= 2;
MASKS[i + 1] = (byte) ((2 << i) - 1);
}
}

// Represents one of the N utf8 bytes that (in sequence)
// define a code point. value is the byte value; bits is
// how many bits are "used" by utf8 at that byte
private static class UTF8Byte {
int value; // TODO: change to byte
byte value;
byte bits;
}

Expand All @@ -67,7 +65,7 @@ public UTF8Sequence() {
}

public int byteAt(int idx) {
return bytes[idx].value;
return bytes[idx].value & 0xFF;
}

public int numBits(int idx) {
Expand All @@ -77,33 +75,54 @@ public int numBits(int idx) {
private void set(int code) {
if (code < 128) {
// 0xxxxxxx
bytes[0].value = code;
bytes[0].value = (byte) code;
bytes[0].bits = 7;
len = 1;
} else if (code < 2048) {
// 110yyyxx 10xxxxxx
bytes[0].value = (6 << 5) | (code >> 6);
bytes[0].value = (byte) ((6 << 5) | (code >> 6));
bytes[0].bits = 5;
setRest(code, 1);
len = 2;
} else if (code < 65536) {
// 1110yyyy 10yyyyxx 10xxxxxx
bytes[0].value = (14 << 4) | (code >> 12);
bytes[0].value = (byte) ((14 << 4) | (code >> 12));
bytes[0].bits = 4;
setRest(code, 2);
len = 3;
} else {
// 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
bytes[0].value = (30 << 3) | (code >> 18);
bytes[0].value = (byte) ((30 << 3) | (code >> 18));
bytes[0].bits = 3;
setRest(code, 3);
len = 4;
}
}

/**
 * Lightweight counterpart of {@code set} for scratch sequences: stores only the leading UTF-8
 * byte of {@code code} in {@code bytes[0].value} and records the encoded length in {@code len}.
 * Continuation bytes and all {@code bits} fields are left untouched.
 *
 * @param code the code point whose leading UTF-8 byte and encoded length are recorded
 */
private void setFirstByte(int code) {
  final int lead;
  final int byteCount;
  if (code >= 65536) {
    // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
    lead = 0xF0 | (code >> 18);
    byteCount = 4;
  } else if (code >= 2048) {
    // 1110yyyy 10yyyyxx 10xxxxxx
    lead = 0xE0 | (code >> 12);
    byteCount = 3;
  } else if (code >= 128) {
    // 110yyyxx 10xxxxxx
    lead = 0xC0 | (code >> 6);
    byteCount = 2;
  } else {
    // 0xxxxxxx
    lead = code;
    byteCount = 1;
  }
  // Same write order as before: the byte value first, then the length.
  bytes[0].value = (byte) lead;
  len = byteCount;
}

private void setRest(int code, int numBytes) {
for (int i = 0; i < numBytes; i++) {
bytes[numBytes - i].value = 128 | (code & MASKS[6]);
bytes[numBytes - i].value = (byte) (128 | (code & MASKS[6]));
bytes[numBytes - i].bits = 6;
code = code >> 6;
}
Expand All @@ -116,7 +135,7 @@ public String toString() {
if (i > 0) {
b.append(' ');
}
b.append(Integer.toBinaryString(bytes[i].value));
b.append(Integer.toBinaryString(byteAt(i)));
}
return b.toString();
}
Expand Down Expand Up @@ -183,10 +202,8 @@ private void build(int start, int end, UTF8Sequence startUTF8, UTF8Sequence endU
int byteCount = 1 + startUTF8.len - upto;
final int limit = endUTF8.len - upto;
while (byteCount < limit) {
// wasteful: we only need first byte, and, we should
// statically encode this first byte:
tmpUTF8a.set(startCodes[byteCount - 1]);
tmpUTF8b.set(endCodes[byteCount - 1]);
tmpUTF8a.setFirstByte(startCodes[byteCount - 1]);
tmpUTF8b.setFirstByte(endCodes[byteCount - 1]);
all(start, end, tmpUTF8a.byteAt(0), tmpUTF8b.byteAt(0), tmpUTF8a.len - 1);
byteCount++;
}
Expand Down

0 comments on commit 9287167

Please sign in to comment.