Skip to content

Commit

Permalink
Account for bytes processed by encoding detection
Browse files Browse the repository at this point in the history
UTF8StreamJsonParser tracks the read pointer (offset) and the number of
bytes processed separately, and uses both to generate JsonLocation. When
the byte payload starts with a UTF BOM, ByteSourceJsonBootstrapper consumes
a few bytes ahead of the parser, advances the offset, and passes the
newly computed offset to the parser without telling it that some bytes have
already been pre-processed.
With this change, the number of bytes pre-processed for encoding
detection is passed to the parser. JsonLocation instances returned by
the parser now point to the correct byte offset when the payload has a BOM.

Issue: FasterXML#533
  • Loading branch information
fabienrenaud authored and Fabien Renaud committed May 21, 2019
1 parent a8fbb07 commit 3e73f0b
Show file tree
Hide file tree
Showing 4 changed files with 139 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,9 @@ public JsonParser constructParser(ObjectReadContext readCtxt,
ByteQuadsCanonicalizer rootByteSymbols, CharsToNameCanonicalizer rootCharSymbols,
int factoryFeatures) throws IOException
{
int prevInputPtr = _inputPtr;
JsonEncoding enc = detectEncoding();
int bytesProcessed = _inputPtr - prevInputPtr;

if (enc == JsonEncoding.UTF8) {
/* and without canonicalization, byte-based approach is not performant; just use std UTF-8 reader
Expand All @@ -252,7 +254,7 @@ public JsonParser constructParser(ObjectReadContext readCtxt,
ByteQuadsCanonicalizer can = rootByteSymbols.makeChild(factoryFeatures);
return new UTF8StreamJsonParser(readCtxt, _context,
streamReadFeatures, formatReadFeatures, _in, can,
_inputBuffer, _inputPtr, _inputEnd, _bufferRecyclable);
_inputBuffer, _inputPtr, _inputEnd, bytesProcessed, _bufferRecyclable);
}
}
return new ReaderBasedJsonParser(readCtxt, _context, streamReadFeatures, formatReadFeatures,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,11 +118,21 @@ public class UTF8StreamJsonParser
/**********************************************************
*/

/**
 * Overload without a {@code bytesPreProcessed} argument: forwards every
 * argument unchanged to the constructor that also takes
 * {@code bytesPreProcessed}, supplying {@code 0} for that value.
 */
public UTF8StreamJsonParser(ObjectReadContext readCtxt, IOContext ctxt,
int stdFeatures, int formatReadFeatures,
InputStream in,
ByteQuadsCanonicalizer sym,
byte[] inputBuffer, int start, int end,
boolean bufferRecyclable) {
// the literal 0 fills the bytesPreProcessed slot of the delegated-to constructor
this(readCtxt, ctxt, stdFeatures, formatReadFeatures, in, sym,
inputBuffer, start, end, 0, bufferRecyclable);
}

public UTF8StreamJsonParser(ObjectReadContext readCtxt, IOContext ctxt,
int stdFeatures, int formatReadFeatures,
InputStream in,
ByteQuadsCanonicalizer sym,
byte[] inputBuffer, int start, int end,
byte[] inputBuffer, int start, int end, int bytesPreProcessed,
boolean bufferRecyclable)
{
super(readCtxt, ctxt, stdFeatures, formatReadFeatures);
Expand All @@ -131,9 +141,9 @@ public UTF8StreamJsonParser(ObjectReadContext readCtxt, IOContext ctxt,
_inputBuffer = inputBuffer;
_inputPtr = start;
_inputEnd = end;
_currInputRowStart = start;
_currInputRowStart = start - bytesPreProcessed;
// If we have offset, need to omit that from byte offset, so:
_currInputProcessed = -start;
_currInputProcessed = -start + bytesPreProcessed;
_bufferRecyclable = bufferRecyclable;
}

Expand Down
125 changes: 120 additions & 5 deletions src/test/java/com/fasterxml/jackson/core/json/LocationOffsetsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ public void testSimpleInitialOffsets() throws Exception
assertEquals(0L, loc.getCharOffset());
assertEquals(1, loc.getLineNr());
assertEquals(1, loc.getColumnNr());

loc = p.getCurrentLocation();
assertEquals(-1L, loc.getByteOffset());
assertEquals(1L, loc.getCharOffset());
Expand All @@ -33,7 +33,7 @@ public void testSimpleInitialOffsets() throws Exception
p.close();

// then byte-based

p = JSON_F.createParser(ObjectReadContext.empty(), DOC.getBytes("UTF-8"));
assertToken(JsonToken.START_OBJECT, p.nextToken());

Expand All @@ -42,7 +42,7 @@ public void testSimpleInitialOffsets() throws Exception
assertEquals(-1L, loc.getCharOffset());
assertEquals(1, loc.getLineNr());
assertEquals(1, loc.getColumnNr());

loc = p.getCurrentLocation();
assertEquals(1L, loc.getByteOffset());
assertEquals(-1L, loc.getCharOffset());
Expand All @@ -61,15 +61,15 @@ public void testOffsetWithInputOffset() throws Exception
byte[] b = " { } ".getBytes("UTF-8");

// and then peel them off
p = JSON_F.createParser(ObjectReadContext.empty(), b, 3, b.length-5);
p = JSON_F.createParser(ObjectReadContext.empty(), b, 3, b.length - 5);
assertToken(JsonToken.START_OBJECT, p.nextToken());

loc = p.getTokenLocation();
assertEquals(0L, loc.getByteOffset());
assertEquals(-1L, loc.getCharOffset());
assertEquals(1, loc.getLineNr());
assertEquals(1, loc.getColumnNr());

loc = p.getCurrentLocation();
assertEquals(1L, loc.getByteOffset());
assertEquals(-1L, loc.getCharOffset());
Expand All @@ -78,4 +78,119 @@ public void testOffsetWithInputOffset() throws Exception

p.close();
}

/**
 * Checks reported byte offsets and columns when the whole buffer is handed to
 * the parser (no input offset/length): leading padding is part of the input
 * and therefore counts towards the reported location.
 */
public void testOffsetWithoutInputOffset() throws Exception
{
    JsonLocation loc;
    JsonParser p;
    // 3 spaces before, 2 after, just for padding
    byte[] b = "   { }  ".getBytes("UTF-8");

    // parse the whole buffer, padding included
    p = JSON_F.createParser(ObjectReadContext.empty(), b);
    assertToken(JsonToken.START_OBJECT, p.nextToken());

    // '{' follows the 3 padding bytes: 0-based byte offset 3, 1-based column 4
    loc = p.getTokenLocation();
    assertEquals(3L, loc.getByteOffset());
    assertEquals(-1L, loc.getCharOffset());
    assertEquals(1, loc.getLineNr());
    assertEquals(4, loc.getColumnNr());

    // current location is just past the '{'
    loc = p.getCurrentLocation();
    assertEquals(4L, loc.getByteOffset());
    assertEquals(-1L, loc.getCharOffset());
    assertEquals(1, loc.getLineNr());
    assertEquals(5, loc.getColumnNr());

    p.close();
}

// for [core#533]
/**
 * Checks that the 3 bytes of a leading UTF-8 BOM are included in the byte
 * offset and column reported for the first token.
 */
public void testUtf8Bom() throws Exception
{
    JsonLocation loc;
    JsonParser p;

    // explicit charset: do not depend on the platform default encoding
    byte[] b = withUtf8Bom("{ }".getBytes("UTF-8"));

    // parse the whole buffer, BOM included
    p = JSON_F.createParser(ObjectReadContext.empty(), b);
    assertToken(JsonToken.START_OBJECT, p.nextToken());

    // '{' follows the 3 BOM bytes: 0-based byte offset 3, 1-based column 4
    loc = p.getTokenLocation();
    assertEquals(3L, loc.getByteOffset());
    assertEquals(-1L, loc.getCharOffset());
    assertEquals(1, loc.getLineNr());
    assertEquals(4, loc.getColumnNr());

    // current location is just past the '{'
    loc = p.getCurrentLocation();
    assertEquals(4L, loc.getByteOffset());
    assertEquals(-1L, loc.getCharOffset());
    assertEquals(1, loc.getLineNr());
    assertEquals(5, loc.getColumnNr());

    p.close();
}

/**
 * Checks reported locations when a UTF-8 BOM is followed by whitespace
 * padding: both the 3 BOM bytes and the 3 padding bytes count towards the
 * byte offset and column of the first token.
 */
public void testUtf8BomWithPadding() throws Exception
{
    JsonLocation loc;
    JsonParser p;

    // BOM (3 bytes) + 3 spaces of padding before the document;
    // explicit charset so the bytes do not depend on the platform default
    byte[] b = withUtf8Bom("   { }".getBytes("UTF-8"));

    // parse the whole buffer, BOM and padding included
    p = JSON_F.createParser(ObjectReadContext.empty(), b);
    assertToken(JsonToken.START_OBJECT, p.nextToken());

    // '{' is preceded by 3 + 3 bytes: 0-based byte offset 6, 1-based column 7
    loc = p.getTokenLocation();
    assertEquals(6L, loc.getByteOffset());
    assertEquals(-1L, loc.getCharOffset());
    assertEquals(1, loc.getLineNr());
    assertEquals(7, loc.getColumnNr());

    // current location is just past the '{'
    loc = p.getCurrentLocation();
    assertEquals(7L, loc.getByteOffset());
    assertEquals(-1L, loc.getCharOffset());
    assertEquals(1, loc.getLineNr());
    assertEquals(8, loc.getColumnNr());

    p.close();
}

/**
 * Checks reported locations when a BOM-prefixed document sits at a non-zero
 * offset inside a larger buffer. Locations are expected to be relative to the
 * start of the parsed window, so the values match the no-offset case: the
 * 3 BOM bytes plus 3 padding bytes precede the first token.
 */
public void testUtf8BomWithInputOffset() throws Exception
{
    JsonLocation loc;
    JsonParser p;

    // BOM (3 bytes) + 3 spaces of padding before the document;
    // explicit charset so the bytes do not depend on the platform default
    byte[] doc = withUtf8Bom("   { }".getBytes("UTF-8"));

    // embed the document in a larger buffer; bytes outside the window are
    // junk that must never be looked at by the parser
    final int inputOffset = 2;
    byte[] b = new byte[inputOffset + doc.length + 2];
    java.util.Arrays.fill(b, (byte) 'x');
    System.arraycopy(doc, 0, b, inputOffset, doc.length);

    // parse only the window that holds BOM + padding + document
    p = JSON_F.createParser(ObjectReadContext.empty(), b, inputOffset, doc.length);
    assertToken(JsonToken.START_OBJECT, p.nextToken());

    // relative to the window start: 0-based byte offset 6, 1-based column 7
    loc = p.getTokenLocation();
    assertEquals(6L, loc.getByteOffset());
    assertEquals(-1L, loc.getCharOffset());
    assertEquals(1, loc.getLineNr());
    assertEquals(7, loc.getColumnNr());

    // current location is just past the '{'
    loc = p.getCurrentLocation();
    assertEquals(7L, loc.getByteOffset());
    assertEquals(-1L, loc.getCharOffset());
    assertEquals(1, loc.getLineNr());
    assertEquals(8, loc.getColumnNr());

    p.close();
}

/**
 * Returns a new array holding the 3-byte UTF-8 BOM ({@code EF BB BF})
 * followed by a copy of the given bytes; the argument is not modified.
 */
private byte[] withUtf8Bom(byte[] bytes) {
    final byte[] bom = { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF };
    final byte[] prefixed = new byte[bom.length + bytes.length];
    // BOM first, payload directly after it
    System.arraycopy(bom, 0, prefixed, 0, bom.length);
    System.arraycopy(bytes, 0, prefixed, bom.length, bytes.length);
    return prefixed;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -432,15 +432,9 @@ public void testUtf8BOMHandling() throws Exception

JsonParser p = JSON_FACTORY.createParser(ObjectReadContext.empty(), input);
assertEquals(JsonToken.START_ARRAY, p.nextToken());
// should also have skipped first 3 bytes of BOM; but do we have offset available?
/* 08-Oct-2013, tatu: Alas, due to [core#111], we have to omit BOM in calculations
* as we do not know what the offset is due to -- may need to revisit, if this
* discrepancy becomes an issue. For now it just means that BOM is considered
* "out of stream" (not part of input).
*/

JsonLocation loc = p.getTokenLocation();
// so if BOM was consider in-stream (part of input), this should expect 3:
assertEquals(0, loc.getByteOffset());
assertEquals(3, loc.getByteOffset());
assertEquals(-1, loc.getCharOffset());
assertEquals(JsonToken.VALUE_NUMBER_INT, p.nextToken());
assertEquals(JsonToken.END_ARRAY, p.nextToken());
Expand All @@ -449,7 +443,7 @@ public void testUtf8BOMHandling() throws Exception
p = JSON_FACTORY.createParser(ObjectReadContext.empty(),
new MockDataInput(input));
assertEquals(JsonToken.START_ARRAY, p.nextToken());
// same BOM, but DataInput is more restrctive so can skip but offsets
// same BOM, but DataInput is more restrictive so can skip but offsets
// are not reliable...
loc = p.getTokenLocation();
assertNotNull(loc);
Expand Down

0 comments on commit 3e73f0b

Please sign in to comment.