Skip to content

Commit

Permalink
UTF-8 cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
pull-vert committed Oct 14, 2024
1 parent a5a9044 commit 7a6841c
Show file tree
Hide file tree
Showing 13 changed files with 181 additions and 92 deletions.
57 changes: 30 additions & 27 deletions core/src/main/java/jayo/ByteString.java
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
* <b>Immutability is guaranteed:</b> ByteString copies data on creation as well as on conversion back to {@code byte[]},
* thus guaranteeing that subsequent modification of source data or data returned from {@link #toByteArray()} won't
* mutate the byte string itself.
*
* @see Utf8 a UTF-8 specific implementation of {@code ByteString}.
*/
public sealed interface ByteString extends Serializable, Comparable<ByteString>
Expand All @@ -84,7 +85,9 @@ public sealed interface ByteString extends Serializable, Comparable<ByteString>
* @throws IndexOutOfBoundsException if {@code offset} or {@code byteCount} is out of range of
* {@code data} indices.
*/
static @NonNull ByteString of(final byte @NonNull [] data, final int offset, final int byteCount) {
static @NonNull ByteString of(final byte @NonNull [] data,
final @NonNegative int offset,
final @NonNegative int byteCount) {
return new RealByteString(data, offset, byteCount);
}

Expand All @@ -108,24 +111,6 @@ public sealed interface ByteString extends Serializable, Comparable<ByteString>
return new RealByteString(string.getBytes(charset));
}

/**
* Reads {@code byteCount} bytes from {@code in} and wraps them into a byte string.
*
* @throws JayoEOFException if {@code in} has fewer than {@code byteCount} bytes to read.
* @throws IllegalArgumentException if {@code byteCount} is negative.
*/
static @NonNull ByteString read(final @NonNull InputStream in, final @NonNegative int byteCount) {
// Reject a negative count up front with an explicit message.
if (byteCount < 0) {
throw new IllegalArgumentException("byteCount < 0: " + byteCount);
}

try {
// NOTE(review): InputStream.readNBytes returns a shorter array when the stream ends early instead of
// throwing, and no length check is visible here; confirm where the documented JayoEOFException is raised.
return new RealByteString(in.readNBytes(byteCount));
} catch (IOException e) {
// Route the checked IOException through the project's exception builder.
throw JayoException.buildJayoException(e);
}
}

/**
* Decodes the Base64-encoded bytes from {@code charSequence} and wraps them into a byte string. Returns
* {@code null} if this is not a valid Base64-encoded sequence of bytes.
Expand All @@ -140,14 +125,14 @@ public sealed interface ByteString extends Serializable, Comparable<ByteString>
}

/**
* Decodes the Hex-encoded bytes from {@code charSequence} and wraps them into a byte string.
* Decodes the hexadecimal-encoded bytes from {@code charSequence} and wraps them into a byte string.
*
* @param charSequence the char sequence to decode hexadecimal-encoded bytes from.
* @throws IllegalArgumentException if {@code charSequence} is not a valid Hex char sequence.
* @throws IllegalArgumentException if {@code charSequence} is not a valid hexadecimal-encoded char sequence.
*/
static @NonNull ByteString decodeHex(final @NonNull CharSequence charSequence) {
if (charSequence.length() % 2 != 0) {
throw new IllegalArgumentException("Unexpected Hex char sequence: " + charSequence);
throw new IllegalArgumentException("Unexpected hexadecimal-encoded char sequence: " + charSequence);
}

final var result = new byte[charSequence.length() / 2];
Expand All @@ -160,11 +145,29 @@ public sealed interface ByteString extends Serializable, Comparable<ByteString>
}

/**
* @return either a new String by decoding all the bytes from this byte string using UTF-8, or the cached one
* Reads {@code byteCount} bytes from {@code in} and wraps them into a byte string.
*
* @throws JayoEOFException if {@code in} has fewer than {@code byteCount} bytes to read.
* @throws IllegalArgumentException if {@code byteCount} is negative.
*/
static @NonNull ByteString read(final @NonNull InputStream in, final @NonNegative int byteCount) {
// Reject a negative count up front with an explicit message.
if (byteCount < 0) {
throw new IllegalArgumentException("byteCount < 0: " + byteCount);
}

try {
// NOTE(review): InputStream.readNBytes returns a shorter array when the stream ends early instead of
// throwing, and no length check is visible here; confirm where the documented JayoEOFException is raised.
return new RealByteString(in.readNBytes(byteCount));
} catch (IOException e) {
// Route the checked IOException through the project's exception builder.
throw JayoException.buildJayoException(e);
}
}

/**
* @return either a new String by decoding all the bytes from this byte string using UTF-8, or the cached one if
* available.
*/
@NonNull
String decodeToUtf8();
String decodeToString();

/**
* Constructs a new String by decoding all the bytes from this byte string using {@code charset}.
Expand Down Expand Up @@ -424,14 +427,14 @@ void copyInto(final @NonNegative int offset,
int lastIndexOf(final byte @NonNull [] other, final @NonNegative int startIndex);

/**
* Returns a string representation of this byte string. A string representation consists of {@code size} and a
* @return a string representation of this byte string. A string representation consists of {@code size} and a
* hexadecimal-encoded string of the bytes wrapped by this byte string.
* <p>
* The string representation has the following format: {@code ByteString(size=3 hex=ABCDEF)}. For empty byte strings
* it is always {@code ByteString(size=0)}.
* <p>
* Note that a string representation includes the whole byte string content encoded. Due to limitations exposed for
* the maximum string length, an attempt to return a string representation of too long byte string may fail.
* the maximum string length, an attempt to return a string representation of a too long byte string may fail.
*/
@Override
@NonNull
Expand All @@ -445,7 +448,7 @@ private static int decodeHexDigit(char c) {
} else if (c >= 'A' && c <= 'F') {
return c - 'A' + 10;
} else {
throw new IllegalArgumentException("Unexpected hex digit: " + c);
throw new IllegalArgumentException("Unexpected hexadecimal-encoded digit: " + c);
}
}
}
89 changes: 78 additions & 11 deletions core/src/main/java/jayo/Utf8.java
Original file line number Diff line number Diff line change
Expand Up @@ -34,41 +34,87 @@ public sealed interface Utf8 extends ByteString permits RealUtf8, SegmentedUtf8

/**
* @param data a sequence of bytes to be wrapped.
* @return a new byte string containing a copy of all the bytes of {@code data}.
* @return a new UTF-8 byte string containing a copy of all the UTF-8 bytes of {@code data}.
*/
static @NonNull Utf8 of(final byte... data) {
static @NonNull Utf8 of(final byte @NonNull ... data) {
Objects.requireNonNull(data);
return new RealUtf8(data.clone(), false);
}

/**
* @param data a sequence of bytes to be wrapped.
* @return a new UTF-8 byte string containing a copy of all the ASCII bytes of {@code data}.
*/
static @NonNull Utf8 ofAscii(final byte @NonNull ... data) {
    // Null-check first, then take a private copy so later changes to the caller's array are not observed.
    final byte[] ownedCopy = Objects.requireNonNull(data).clone();
    // The `true` flag marks the content as ASCII.
    return new RealUtf8(ownedCopy, true);
}

/**
* @param offset the start offset (inclusive) in the {@code data} byte array.
* @param byteCount the number of bytes to copy.
* @return a new byte string containing a copy of {@code byteCount} bytes of {@code data} starting at
* @return a new UTF-8 byte string containing a copy of {@code byteCount} UTF-8 bytes of {@code data} starting at
* {@code offset}.
* @throws IndexOutOfBoundsException if {@code offset} or {@code byteCount} is out of range of
* {@code data} indices.
*/
static @NonNull Utf8 of(final byte @NonNull [] data, final int offset, final int byteCount) {
return new RealUtf8(data, offset, byteCount);
static @NonNull Utf8 of(final byte @NonNull [] data,
final @NonNegative int offset,
final @NonNegative int byteCount) {
return new RealUtf8(data, offset, byteCount, false);
}

/**
* Creates a UTF-8 byte string from a range of {@code data} that is flagged as ASCII.
* <p>
* NOTE(review): no check that the bytes are actually ASCII is visible in this method; presumably this is the
* caller's responsibility - confirm.
*
* @param data the byte array to copy from.
* @param offset the start offset (inclusive) in the {@code data} byte array.
* @param byteCount the number of bytes to copy.
* @return a new UTF-8 byte string containing a copy of {@code byteCount} ASCII bytes of {@code data} starting at
* {@code offset}.
* @throws IndexOutOfBoundsException if {@code offset} or {@code byteCount} is out of range of
* {@code data} indices.
*/
static @NonNull Utf8 ofAscii(final byte @NonNull [] data,
final @NonNegative int offset,
final @NonNegative int byteCount) {
// The trailing `true` is the isAscii flag of the RealUtf8 constructor.
return new RealUtf8(data, offset, byteCount, true);
}

/**
* @param data a byte buffer from which we will copy the remaining bytes.
* @return a new byte string containing a copy of the remaining bytes of {@code data}.
* @return a new UTF-8 byte string containing a copy of the remaining UTF-8 bytes of {@code data}.
*/
static @NonNull Utf8 of(final @NonNull ByteBuffer data) {
    // Fail fast on null, then size the destination to exactly what is left in the buffer.
    final var remainingBytes = new byte[Objects.requireNonNull(data).remaining()];
    // Bulk relative get: copies the remaining bytes into the new array.
    data.get(remainingBytes);
    // `false`: the content is not flagged as ASCII.
    return new RealUtf8(remainingBytes, false);
}

/**
* Reads {@code byteCount} bytes from {@code in} and wraps them into a byte string.
* @param data a byte buffer from which we will copy the remaining bytes.
* @return a new UTF-8 byte string containing a copy of the remaining ASCII bytes of {@code data}.
*/
static @NonNull Utf8 ofAscii(final @NonNull ByteBuffer data) {
    // Fail fast on null, then size the destination to exactly what is left in the buffer.
    final var remainingBytes = new byte[Objects.requireNonNull(data).remaining()];
    // Bulk relative get: copies the remaining bytes into the new array.
    data.get(remainingBytes);
    // `true`: the content is flagged as ASCII.
    return new RealUtf8(remainingBytes, true);
}

/**
* Encodes {@code string} using UTF-8 and wraps these bytes into a UTF-8 byte string.
*
* @param string the string to encode.
* @return a new UTF-8 byte string built from {@code string}.
*/
static @NonNull Utf8 encode(final @NonNull String string) {
return new RealUtf8(string);
}

/**
* Reads {@code byteCount} UTF-8 bytes from {@code in} and wraps them into a UTF-8 byte string.
*
* @throws JayoEOFException if {@code in} has fewer than {@code byteCount} bytes to read.
* @throws IllegalArgumentException if {@code byteCount} is negative.
*/
static @NonNull Utf8 read(final @NonNull InputStream in, final @NonNegative int byteCount) {
Objects.requireNonNull(in);
if (byteCount < 0) {
throw new IllegalArgumentException("byteCount < 0: " + byteCount);
}
Expand All @@ -81,17 +127,29 @@ public sealed interface Utf8 extends ByteString permits RealUtf8, SegmentedUtf8
}

/**
* Encodes {@code string} using UTF-8 and wraps these bytes into a byte string.
* Reads {@code byteCount} ASCII bytes from {@code in} and wraps them into a UTF-8 byte string.
*
* @throws JayoEOFException if {@code in} has fewer than {@code byteCount} bytes to read.
* @throws IllegalArgumentException if {@code byteCount} is negative.
*/
static @NonNull Utf8 encode(final @NonNull String string) {
return new RealUtf8(string);
static @NonNull Utf8 readAscii(final @NonNull InputStream in, final @NonNegative int byteCount) {
// Validate arguments before touching the stream: null stream first, then a negative count.
Objects.requireNonNull(in);
if (byteCount < 0) {
throw new IllegalArgumentException("byteCount < 0: " + byteCount);
}

try {
// The `true` flag marks the content as ASCII; the bytes themselves are not inspected in this method.
// NOTE(review): InputStream.readNBytes returns a shorter array when the stream ends early instead of
// throwing, and no length check is visible here; confirm where the documented JayoEOFException is raised.
return new RealUtf8(in.readNBytes(byteCount), true);
} catch (IOException e) {
// Route the checked IOException through the project's exception builder.
throw JayoException.buildJayoException(e);
}
}

/**
* @return the length of this UTF-8 bytes sequence. The length is equal to the number of
* {@linkplain java.lang.Character Unicode code units} in this UTF-8 bytes sequence.
* @implNote Result of this method is the same as {@link String#length()} you would get by calling
* {@code decodeToUtf8().length()}.
* {@code decodeToString().length()}.
*/
@NonNegative
int length();
Expand Down Expand Up @@ -146,6 +204,15 @@ public sealed interface Utf8 extends ByteString permits RealUtf8, SegmentedUtf8
@NonNull
Utf8 substring(final @NonNegative int startIndex, final @NonNegative int endIndex);

/**
* @return either a new String by decoding all the bytes from this byte string using UTF-8, or the cached one if
* available. The {@link String#length()} of the obtained string will be the {@link #length()} of this UTF-8 byte
* string.
*/
@Override
@NonNull
String toString();

/**
* @param prefix the prefix to check for.
* @return true if this UTF-8 byte string starts with the {@code prefix}.
Expand Down
2 changes: 1 addition & 1 deletion core/src/main/java/jayo/Utf8Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
* </tr>
* <tr>
* <td>Decode a string</td>
* <td>{@link ByteString#decodeToUtf8()}</td>
* <td>{@link ByteString#decodeToString()}</td>
* <td>{@link Reader#readString()}, {@link Reader#readString(long)}</td>
* </tr>
* <tr>
Expand Down
7 changes: 4 additions & 3 deletions core/src/main/java/jayo/internal/RealByteString.java
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ public RealByteString(final byte @NonNull [] data) {
public RealByteString(final byte @NonNull [] data,
final @NonNegative int offset,
final @NonNegative int byteCount) {
checkOffsetAndCount(Objects.requireNonNull(data).length, offset, byteCount);
Objects.requireNonNull(data);
checkOffsetAndCount(data.length, offset, byteCount);
this.data = Arrays.copyOfRange(data, offset, offset + byteCount);
utf8 = null;
}
Expand All @@ -79,7 +80,7 @@ public RealByteString(final @NonNull String string) {
}

@Override
public @NonNull String decodeToUtf8() {
public @NonNull String decodeToString() {
var utf8String = utf8;
if (utf8String == null) {
// We don't care if we double-allocate in racy code.
Expand All @@ -93,7 +94,7 @@ public RealByteString(final @NonNull String string) {
public @NonNull String decodeToString(final @NonNull Charset charset) {
Objects.requireNonNull(charset);
if (charset == StandardCharsets.UTF_8) {
return decodeToUtf8();
return decodeToString();
}
return new String(data, charset);
}
Expand Down
20 changes: 15 additions & 5 deletions core/src/main/java/jayo/internal/RealUtf8.java
Original file line number Diff line number Diff line change
Expand Up @@ -45,16 +45,21 @@ public RealUtf8(final byte @NonNull [] data, final boolean isAscii) {
public RealUtf8(final byte @NonNull [] data, final boolean isAscii, final boolean allowCompactString) {
super(data);
this.allowCompactString = allowCompactString;
this.isAscii = isAscii;
if (isAscii) {
this.isAscii = true;
length = data.length;
}
}

public RealUtf8(final byte @NonNull [] data,
final @NonNegative int offset,
final @NonNegative int byteCount) {
final @NonNegative int byteCount,
final boolean isAscii) {
super(data, offset, byteCount);
this.isAscii = isAscii;
if (isAscii) {
length = byteCount;
}
this.allowCompactString = UNSAFE_AVAILABLE && SUPPORT_COMPACT_STRING;
}

Expand All @@ -69,7 +74,7 @@ public RealUtf8(final @NonNull String string) {
}

@Override
public @NonNull String decodeToUtf8() {
public @NonNull String decodeToString() {
return decodeToUtf8Static(this, isAscii, allowCompactString);
}

Expand All @@ -89,8 +94,8 @@ public RealUtf8(final @NonNull String string) {
}
} else {
utf8 = new String(byteString.internalArray(), StandardCharsets.UTF_8);
byteString.length = utf8.length();
}
byteString.length = utf8.length();
byteString.utf8 = utf8;
return utf8;
}
Expand Down Expand Up @@ -220,6 +225,11 @@ public Utf8 substring(final @NonNegative int startIndex) {
return (uppercase != null) ? new RealUtf8(uppercase, isAscii, allowCompactString) : this;
}

@Override
public @NonNull String toString() {
// The string form of a UTF-8 byte string is its decoded text; delegate to decodeToString().
return decodeToString();
}

private void fullScan() {
var byteIndex = 0;

Expand All @@ -232,9 +242,9 @@ private void fullScan() {
byteIndex++;
}

this.isAscii = isAscii;
if (isAscii) {
length = data.length;
this.isAscii = true;
return;
}

Expand Down
2 changes: 1 addition & 1 deletion core/src/main/java/jayo/internal/SegmentedByteString.java
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,7 @@ public final int hashCode() {
}

@Override
public final @NonNull String toString() {
public @NonNull String toString() {
return toByteString().toString();
}

Expand Down
7 changes: 6 additions & 1 deletion core/src/main/java/jayo/internal/SegmentedUtf8.java
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ private SegmentedUtf8(final @NonNull Segment[] segments, final int @NonNull [] d
}

@Override
public @NonNull String decodeToUtf8() {
public @NonNull String decodeToString() {
return decodeToUtf8Static(this, isAscii, UNSAFE_AVAILABLE && SUPPORT_COMPACT_STRING);
}

Expand Down Expand Up @@ -227,6 +227,11 @@ public Utf8 substring(final @NonNegative int startIndex) {
return toByteString().toAsciiUppercase();
}

@Override
public @NonNull String toString() {
// The string form of a UTF-8 byte string is its decoded text; delegate to decodeToString().
return decodeToString();
}

/**
* Returns a copy as a non-segmented byte string.
*/
Expand Down
Loading

0 comments on commit 7a6841c

Please sign in to comment.