Skip to content

Commit

Permalink
yajsv: UTF-16 and BOM handling
Browse files Browse the repository at this point in the history
  • Loading branch information
neilpa committed Sep 18, 2020
1 parent 6f93deb commit 806c9cd
Showing 1 changed file with 63 additions and 2 deletions.
65 changes: 63 additions & 2 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@
// a provided JSON Schema - https://json-schema.org/
package main

//go:generate go run gen_testdata.go

import (
"bufio"
"bytes"
"flag"
"fmt"
"io"
Expand All @@ -15,21 +18,37 @@ import (
"strings"
"sync"

"golang.org/x/text/encoding"
"golang.org/x/text/encoding/unicode"

"github.com/ghodss/yaml"
"github.com/mitchellh/go-homedir"
"github.com/xeipuuv/gojsonschema"
)

var (
version = "v1.3.0-dev"
version = "v1.4.0-dev"
schemaFlag = flag.String("s", "", "primary JSON schema to validate against, required")
quietFlag = flag.Bool("q", false, "quiet, only print validation failures and errors")
versionFlag = flag.Bool("v", false, "print version and exit")
bomFlag = flag.Bool("b", false, "allow BOM in JSON files, error if seen and unset")

listFlags stringFlags
refFlags stringFlags
)

// https://en.wikipedia.org/wiki/Byte_order_mark#Byte_order_marks_by_encoding
const (
bomUTF8 = "\xEF\xBB\xBF"
bomUTF16BE = "\xFE\xFF"
bomUTF16LE = "\xFF\xFE"
)

var (
encUTF16BE = unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM)
encUTF16LE = unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM)
)

func init() {
flag.Var(&listFlags, "l", "validate JSON documents from newline separated paths and/or globs in a text file (relative to the basename of the file itself)")
flag.Var(&refFlags, "r", "referenced schema(s), can be globs and/or used multiple times")
Expand Down Expand Up @@ -131,7 +150,6 @@ func realMain(args []string, w io.Writer) int {
sem <- 0
defer func() { <-sem }()


loader, err := jsonLoader(path)
if err != nil {
msg := fmt.Sprintf("%s: error: load doc: %s", path, err)
Expand Down Expand Up @@ -190,14 +208,57 @@ func jsonLoader(path string) (gojsonschema.JSONLoader, error) {
}
switch filepath.Ext(path) {
case ".yml", ".yaml":
// TODO YAML requires the precense of a BOM to detect UTF-16
// text. Is there a decent hueristic to detect UTF-16 text
// missing a BOM so we can provide a better error message?
buf, err = yaml.YAMLToJSON(buf)
default:
buf, err = jsonDecodeCharset(buf)
}
if err != nil {
return nil, err
}
// TODO What if we have an empty document?
return gojsonschema.NewBytesLoader(buf), nil
}

// jsonDecodeCharset attempts to detect UTF-16 (LE or BE) JSON text and
// decode as appropriate. It also skips a BOM at the start of the buffer
// if `-b` was specified. Presence of a BOM is an error otherwise.
func jsonDecodeCharset(buf []byte) ([]byte, error) {
if len(buf) < 2 { // UTF-8
return buf, nil
}

bom := ""
var enc encoding.Encoding
switch {
case bytes.HasPrefix(buf, []byte(bomUTF8)):
bom = bomUTF8
case bytes.HasPrefix(buf, []byte(bomUTF16BE)):
bom = bomUTF16BE
enc = encUTF16BE
case bytes.HasPrefix(buf, []byte(bomUTF16LE)):
bom = bomUTF16LE
enc = encUTF16LE
case buf[0] == 0:
enc = encUTF16BE
case buf[1] == 0:
enc = encUTF16LE
}

if bom != "" {
if !*bomFlag {
return nil, fmt.Errorf("unexpected BOM, see `-b` flag")
}
buf = buf[len(bom):]
}
if enc != nil {
return enc.NewDecoder().Bytes(buf)
}
return buf, nil
}

func printUsage() {
fmt.Fprintf(os.Stderr, `Usage: %s -s schema.(json|yml) [options] document.(json|yml) ...
Expand Down

0 comments on commit 806c9cd

Please sign in to comment.