Account for bytes processed by encoding detection

UTF8StreamJsonParser tracks the read pointer (offset) and the number of bytes processed separately, and uses both to generate JsonLocation. When the byte payload starts with a UTF BOM, ByteSourceJsonBootstrapper consumes those bytes before the parser is constructed, advances the offset, and passes the newly computed offset to the parser without telling it that some bytes have already been processed. With this change, the number of bytes pre-processed for encoding detection is passed to the parser, so JsonLocation instances returned by the parser now point to the correct byte offset when the payload has a BOM. Issue: FasterXML#533
fabienrenaud · May 21, 2019 · 3e73f0b · 3e73f0b
1 parent a8fbb07
commit 3e73f0b
Show file tree

Hide file tree

Showing 4 changed files with 139 additions and 18 deletions.
diff --git a/src/main/java/com/fasterxml/jackson/core/json/ByteSourceJsonBootstrapper.java b/src/main/java/com/fasterxml/jackson/core/json/ByteSourceJsonBootstrapper.java
@@ -242,7 +242,9 @@ public JsonParser constructParser(ObjectReadContext readCtxt,
             ByteQuadsCanonicalizer rootByteSymbols, CharsToNameCanonicalizer rootCharSymbols,
             int factoryFeatures) throws IOException
     {
+        int prevInputPtr = _inputPtr;
         JsonEncoding enc = detectEncoding();
+        int bytesProcessed = _inputPtr - prevInputPtr;
 
         if (enc == JsonEncoding.UTF8) {
             /* and without canonicalization, byte-based approach is not performant; just use std UTF-8 reader
@@ -252,7 +254,7 @@ public JsonParser constructParser(ObjectReadContext readCtxt,
                 ByteQuadsCanonicalizer can = rootByteSymbols.makeChild(factoryFeatures);
                 return new UTF8StreamJsonParser(readCtxt, _context,
                         streamReadFeatures, formatReadFeatures, _in, can,
-                        _inputBuffer, _inputPtr, _inputEnd, _bufferRecyclable);
+                        _inputBuffer, _inputPtr, _inputEnd, bytesProcessed, _bufferRecyclable);
             }
         }
         return new ReaderBasedJsonParser(readCtxt, _context, streamReadFeatures, formatReadFeatures,

diff --git a/src/main/java/com/fasterxml/jackson/core/json/UTF8StreamJsonParser.java b/src/main/java/com/fasterxml/jackson/core/json/UTF8StreamJsonParser.java
@@ -118,11 +118,21 @@ public class UTF8StreamJsonParser
     /**********************************************************
      */
 
+    public UTF8StreamJsonParser(ObjectReadContext readCtxt, IOContext ctxt,
+                                int stdFeatures, int formatReadFeatures,
+                                InputStream in,
+                                ByteQuadsCanonicalizer sym,
+                                byte[] inputBuffer, int start, int end,
+                                boolean bufferRecyclable) {
+        // Delegates to the full constructor with bytesPreProcessed = 0
+        this(readCtxt, ctxt, stdFeatures, formatReadFeatures, in, sym,
+                inputBuffer, start, end, 0, bufferRecyclable);
+    }
+
     public UTF8StreamJsonParser(ObjectReadContext readCtxt, IOContext ctxt,
             int stdFeatures, int formatReadFeatures,
             InputStream in,
             ByteQuadsCanonicalizer sym,
-            byte[] inputBuffer, int start, int end,
+            byte[] inputBuffer, int start, int end, int bytesPreProcessed,
             boolean bufferRecyclable)
     {
         super(readCtxt, ctxt, stdFeatures, formatReadFeatures);
@@ -131,9 +141,9 @@ public UTF8StreamJsonParser(ObjectReadContext readCtxt, IOContext ctxt,
         _inputBuffer = inputBuffer;
         _inputPtr = start;
         _inputEnd = end;
-        _currInputRowStart = start;
+        _currInputRowStart = start - bytesPreProcessed;
         // If we have offset, need to omit that from byte offset, so:
-        _currInputProcessed = -start;
+        _currInputProcessed = -start + bytesPreProcessed;
         _bufferRecyclable = bufferRecyclable;
     }
 

diff --git a/src/test/java/com/fasterxml/jackson/core/json/LocationOffsetsTest.java b/src/test/java/com/fasterxml/jackson/core/json/LocationOffsetsTest.java
@@ -23,7 +23,7 @@ public void testSimpleInitialOffsets() throws Exception
         assertEquals(0L, loc.getCharOffset());
         assertEquals(1, loc.getLineNr());
         assertEquals(1, loc.getColumnNr());
-        
+
         loc = p.getCurrentLocation();
         assertEquals(-1L, loc.getByteOffset());
         assertEquals(1L, loc.getCharOffset());
@@ -33,7 +33,7 @@ public void testSimpleInitialOffsets() throws Exception
         p.close();
 
         // then byte-based
-        
+
         p = JSON_F.createParser(ObjectReadContext.empty(), DOC.getBytes("UTF-8"));
         assertToken(JsonToken.START_OBJECT, p.nextToken());
 
@@ -42,7 +42,7 @@ public void testSimpleInitialOffsets() throws Exception
         assertEquals(-1L, loc.getCharOffset());
         assertEquals(1, loc.getLineNr());
         assertEquals(1, loc.getColumnNr());
-        
+
         loc = p.getCurrentLocation();
         assertEquals(1L, loc.getByteOffset());
         assertEquals(-1L, loc.getCharOffset());
@@ -61,15 +61,15 @@ public void testOffsetWithInputOffset() throws Exception
         byte[] b = "   { }  ".getBytes("UTF-8");
 
         // and then peel them off
-        p = JSON_F.createParser(ObjectReadContext.empty(), b, 3, b.length-5);
+        p = JSON_F.createParser(ObjectReadContext.empty(), b, 3, b.length - 5);
         assertToken(JsonToken.START_OBJECT, p.nextToken());
 
         loc = p.getTokenLocation();
         assertEquals(0L, loc.getByteOffset());
         assertEquals(-1L, loc.getCharOffset());
         assertEquals(1, loc.getLineNr());
         assertEquals(1, loc.getColumnNr());
-        
+
         loc = p.getCurrentLocation();
         assertEquals(1L, loc.getByteOffset());
         assertEquals(-1L, loc.getCharOffset());
@@ -78,4 +78,119 @@ public void testOffsetWithInputOffset() throws Exception
 
         p.close();
     }
+
+    public void testOffsetWithoutInputOffset() throws Exception
+    {
+        JsonLocation loc;
+        JsonParser p;
+        // 3 spaces before, 2 after, just for padding
+        byte[] b = "   { }  ".getBytes("UTF-8");
+
+        // no offset/length passed: padding stays part of the input and is counted in offsets
+        p = JSON_F.createParser(ObjectReadContext.empty(), b);
+        assertToken(JsonToken.START_OBJECT, p.nextToken());
+
+        loc = p.getTokenLocation();
+        assertEquals(3L, loc.getByteOffset());
+        assertEquals(-1L, loc.getCharOffset());
+        assertEquals(1, loc.getLineNr());
+        assertEquals(4, loc.getColumnNr());
+
+        loc = p.getCurrentLocation();
+        assertEquals(4L, loc.getByteOffset());
+        assertEquals(-1L, loc.getCharOffset());
+        assertEquals(1, loc.getLineNr());
+        assertEquals(5, loc.getColumnNr());
+
+        p.close();
+    }
+
+    // for [core#533]
+    public void testUtf8Bom() throws Exception
+    {
+        JsonLocation loc;
+        JsonParser p;
+
+        byte[] b = withUtf8Bom("{ }".getBytes("UTF-8"));
+
+        // BOM occupies bytes 0-2 and counts as input, so '{' is expected at byte offset 3
+        p = JSON_F.createParser(ObjectReadContext.empty(), b);
+        assertToken(JsonToken.START_OBJECT, p.nextToken());
+
+        loc = p.getTokenLocation();
+        assertEquals(3L, loc.getByteOffset());
+        assertEquals(-1L, loc.getCharOffset());
+        assertEquals(1, loc.getLineNr());
+        assertEquals(4, loc.getColumnNr());
+
+        loc = p.getCurrentLocation();
+        assertEquals(4L, loc.getByteOffset());
+        assertEquals(-1L, loc.getCharOffset());
+        assertEquals(1, loc.getLineNr());
+        assertEquals(5, loc.getColumnNr());
+
+        p.close();
+    }
+
+    public void testUtf8BomWithPadding() throws Exception
+    {
+        JsonLocation loc;
+        JsonParser p;
+
+        byte[] b = withUtf8Bom("   { }".getBytes("UTF-8"));
+
+        // 3 BOM bytes + 3 spaces of padding precede '{', so it is expected at byte offset 6
+        p = JSON_F.createParser(ObjectReadContext.empty(), b);
+        assertToken(JsonToken.START_OBJECT, p.nextToken());
+
+        loc = p.getTokenLocation();
+        assertEquals(6L, loc.getByteOffset());
+        assertEquals(-1L, loc.getCharOffset());
+        assertEquals(1, loc.getLineNr());
+        assertEquals(7, loc.getColumnNr());
+
+        loc = p.getCurrentLocation();
+        assertEquals(7L, loc.getByteOffset());
+        assertEquals(-1L, loc.getCharOffset());
+        assertEquals(1, loc.getLineNr());
+        assertEquals(8, loc.getColumnNr());
+
+        p.close();
+    }
+
+    public void testUtf8BomWithInputOffset() throws Exception
+    {
+        JsonLocation loc;
+        JsonParser p;
+        // BOM + "   { }" placed after 3 junk bytes that are skipped via the input offset
+        byte[] doc = withUtf8Bom("   { }".getBytes("UTF-8"));
+        byte[] b = new byte[3 + doc.length];
+        System.arraycopy(doc, 0, b, 3, doc.length);
+        p = JSON_F.createParser(ObjectReadContext.empty(), b, 3, doc.length);
+        assertToken(JsonToken.START_OBJECT, p.nextToken());
+
+        loc = p.getTokenLocation();
+        assertEquals(6L, loc.getByteOffset());
+        assertEquals(-1L, loc.getCharOffset());
+        assertEquals(1, loc.getLineNr());
+        assertEquals(7, loc.getColumnNr());
+
+        loc = p.getCurrentLocation();
+        assertEquals(7L, loc.getByteOffset());
+        assertEquals(-1L, loc.getCharOffset());
+        assertEquals(1, loc.getLineNr());
+        assertEquals(8, loc.getColumnNr());
+
+        p.close();
+    }
+
+    private byte[] withUtf8Bom(byte[] bytes) {
+        byte[] arr = new byte[bytes.length + 3]; // 3 extra slots for the BOM
+        // write UTF-8 BOM (EF BB BF) into the first 3 slots
+        arr[0] = (byte) 0xEF;
+        arr[1] = (byte) 0xBB;
+        arr[2] = (byte) 0xBF;
+        System.arraycopy(bytes, 0, arr, 3, bytes.length); // payload follows the BOM
+        return arr;
+    }
 }
diff --git a/src/test/java/com/fasterxml/jackson/core/read/JsonParserTest.java b/src/test/java/com/fasterxml/jackson/core/read/JsonParserTest.java
@@ -432,15 +432,9 @@ public void testUtf8BOMHandling() throws Exception
 
         JsonParser p = JSON_FACTORY.createParser(ObjectReadContext.empty(), input);
         assertEquals(JsonToken.START_ARRAY, p.nextToken());
-        // should also have skipped first 3 bytes of BOM; but do we have offset available?
-        /* 08-Oct-2013, tatu: Alas, due to [core#111], we have to omit BOM in calculations
-         *   as we do not know what the offset is due to -- may need to revisit, if this
-         *   discrepancy becomes an issue. For now it just means that BOM is considered
-         *   "out of stream" (not part of input).
-         */
+
         JsonLocation loc = p.getTokenLocation();
-        // so if BOM was consider in-stream (part of input), this should expect 3:
-        assertEquals(0, loc.getByteOffset());
+        assertEquals(3, loc.getByteOffset());
         assertEquals(-1, loc.getCharOffset());
         assertEquals(JsonToken.VALUE_NUMBER_INT, p.nextToken());
         assertEquals(JsonToken.END_ARRAY, p.nextToken());
@@ -449,7 +443,7 @@ public void testUtf8BOMHandling() throws Exception
         p = JSON_FACTORY.createParser(ObjectReadContext.empty(),
                 new MockDataInput(input));
         assertEquals(JsonToken.START_ARRAY, p.nextToken());
-        // same BOM, but DataInput is more restrctive so can skip but offsets
+        // same BOM, but DataInput is more restrictive so can skip but offsets
         // are not reliable...
         loc = p.getTokenLocation();
         assertNotNull(loc);