From ab746939fdf8b1caee50c971badba5dd0e8e01f3 Mon Sep 17 00:00:00 2001 From: Tatu Saloranta Date: Fri, 29 Jan 2021 19:04:27 -0800 Subject: [PATCH] Fix #236 --- .../jackson/dataformat/cbor/CBORParser.java | 63 ++++++++++++++----- .../ParseInvalidUTF8String236Test.java | 49 +++++++++++++-- release-notes/CREDITS-2.x | 6 ++ release-notes/VERSION-2.x | 5 ++ 4 files changed, 103 insertions(+), 20 deletions(-) diff --git a/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java b/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java index 7a21179cc..527799205 100644 --- a/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java +++ b/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java @@ -89,7 +89,7 @@ private Feature(boolean defaultState) { * I/O context for this reader. It handles buffer allocation * for the reader. */ - final protected IOContext _ioContext; + protected final IOContext _ioContext; /** * Flag that indicates whether parser is closed or not. Gets @@ -2157,17 +2157,21 @@ protected String _finishTextToken(int ch) throws IOException _finishChunkedText(); return _textBuffer.contentsAsString(); } - if (len > (_inputEnd - _inputPtr)) { - // or if not, could we read? - if (len >= _inputBuffer.length) { - // If not enough space, need handling similar to chunked - _finishLongText(len); - return _textBuffer.contentsAsString(); - } - _loadToHaveAtLeast(len); + // 29-Jan-2021, tatu: as per [dataformats-binary#238] must keep in mind that + // the longest individual unit is 4 bytes (surrogate pair) so we + // actually need len+3 bytes to avoid bounds checks + final int needed = len + 3; + final int available = _inputEnd - _inputPtr; + + if ((available >= needed) + // if not, could we read? NOTE: we do not require it, just attempt to read + || ((_inputBuffer.length >= needed) + && _tryToLoadToHaveAtLeast(needed))) { + return _finishShortText(len); } - // offline for better optimization - return _finishShortText(len); + // If not enough space, need handling similar to chunked + _finishLongText(len); + return _textBuffer.contentsAsString(); } private final String _finishShortText(int len) throws IOException @@ -2240,7 +2244,7 @@ private final void _finishLongText(int len) throws IOException continue; } if ((len -= code) < 0) { // may need to improve error here but... - throw _constructError("Malformed UTF-8 character at end of long (non-chunked) text segment"); + throw _constructError("Malformed UTF-8 character at the end of a (non-chunked) text segment"); } switch (code) { @@ -2260,13 +2264,13 @@ private final void _finishLongText(int len) throws IOException break; case 3: // 4-byte UTF c = _decodeUTF8_4(c); - // Let's add first part right away: - outBuf[outPtr++] = (char) (0xD800 | (c >> 10)); if (outPtr >= outBuf.length) { outBuf = _textBuffer.finishCurrentSegment(); outPtr = 0; outEnd = outBuf.length; } + // Let's add first part right away: + outBuf[outPtr++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 0x3FF); // And let the other char output down below break; @@ -3367,6 +3371,37 @@ protected final void _loadToHaveAtLeast(int minAvailable) throws IOException } } + // @since 2.12.2 + protected final boolean _tryToLoadToHaveAtLeast(int minAvailable) throws IOException + { + // No input stream, no leading (either we are closed, or have non-stream input source) + if (_inputStream == null) { + return false; + } + // Need to move remaining data in front? + int amount = _inputEnd - _inputPtr; + if (amount > 0 && _inputPtr > 0) { + //_currInputRowStart -= _inputPtr; + System.arraycopy(_inputBuffer, _inputPtr, _inputBuffer, 0, amount); + _inputEnd = amount; + } else { + _inputEnd = 0; + } + // Needs to be done here, as per [dataformats-binary#178] + _currInputProcessed += _inputPtr; + _inputPtr = 0; + while (_inputEnd < minAvailable) { + int count = _inputStream.read(_inputBuffer, _inputEnd, _inputBuffer.length - _inputEnd); + if (count < 1) { + // End of input; not ideal but we'll accept it here + _closeInput(); + return false; + } + _inputEnd += count; + } + return true; + } + protected ByteArrayBuilder _getByteArrayBuilder() { if (_byteArrayBuilder == null) { _byteArrayBuilder = new ByteArrayBuilder(); diff --git a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/failing/ParseInvalidUTF8String236Test.java b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/failing/ParseInvalidUTF8String236Test.java index e368e4b1c..72568ebab 100644 --- a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/failing/ParseInvalidUTF8String236Test.java +++ b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/failing/ParseInvalidUTF8String236Test.java @@ -1,20 +1,57 @@ package com.fasterxml.jackson.dataformat.cbor.failing; -import com.fasterxml.jackson.core.JsonToken; +import com.fasterxml.jackson.core.*; +import com.fasterxml.jackson.core.exc.StreamReadException; import com.fasterxml.jackson.dataformat.cbor.CBORParser; import com.fasterxml.jackson.dataformat.cbor.CBORTestBase; public class ParseInvalidUTF8String236Test extends CBORTestBase { - // [dataformats-binary#236]: Ends with the first byte of alleged 2-byte - // UTF-8 character; parser trying to access second byte beyond end. - public void testArrayIssue236() throws Exception + // [dataformats-binary#236]: Original version; broken UTF-8 all around. + // but gets hit by end-of-input only (since content not validated) + public void testShortString236Original() throws Exception { final byte[] input = {0x66, (byte) 0xef, 0x7d, 0x7d, 0xa, 0x2d, (byte) 0xda}; try (CBORParser p = cborParser(input)) { assertToken(JsonToken.VALUE_STRING, p.nextToken()); - assertEquals("foobar", p.getText()); - assertNull(p.nextToken()); + try { + String str = p.getText(); + fail("Should have failed, did not, String = '"+str+"'"); + } catch (StreamReadException e) { + verifyException(e, "Invalid UTF-8 middle byte 0x7d"); + } + } + } + + // Variant where the length would be valid, but the last byte is partial UTF-8 + // code point + public void testShortString236EndsWithPartialUTF8() throws Exception + { + final byte[] input = {0x63, 0x41, 0x2d, (byte) 0xda}; + try (CBORParser p = cborParser(input)) { + assertToken(JsonToken.VALUE_STRING, p.nextToken()); + try { + String str = p.getText(); + fail("Should have failed, did not, String = '"+str+"'"); + } catch (StreamReadException e) { + verifyException(e, "Malformed UTF-8 character at the end of"); + } + } + } + + // Variant where the length itself exceeds buffer + public void testShortString236TruncatedString() throws Exception + { + // String with length of 6 bytes claimed; only 5 provided + final byte[] input = {0x63, 0x41, 0x2d }; + try (CBORParser p = cborParser(input)) { + assertToken(JsonToken.VALUE_STRING, p.nextToken()); + try { + String str = p.getText(); + fail("Should have failed, did not, String = '"+str+"'"); + } catch (StreamReadException e) { + verifyException(e, "Unexpected end-of-input in VALUE_STRING"); + } } } } diff --git a/release-notes/CREDITS-2.x b/release-notes/CREDITS-2.x index 2dd867b5c..00caf30ca 100644 --- a/release-notes/CREDITS-2.x +++ b/release-notes/CREDITS-2.x @@ -157,3 +157,9 @@ Josh Barr (jobarr-amzn@github) * Contributed #232: (ion) Allow disabling native type ids in IonMapper (2.12.1) + +Fabian Meumertzheim (fmeum@github) + +* Reported #236: `ArrayIndexOutOfBoundsException` in `CBORParser` for invalid UTF-8 String + (2.12.2) + diff --git a/release-notes/VERSION-2.x b/release-notes/VERSION-2.x index a61670d94..4d6e211ce 100644 --- a/release-notes/VERSION-2.x +++ b/release-notes/VERSION-2.x @@ -9,6 +9,11 @@ Project: jackson-datatypes-binaryModules: === Releases === ------------------------------------------------------------------------ +2.12.2 (not yet released) + +#236: `ArrayIndexOutOfBoundsException` in `CBORParser` for invalid UTF-8 String + (reported by Fabian M) + 2.12.1 (08-Jan-2021) #232: (ion) Allow disabling native type ids in IonMapper