Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UTF-16 support for JSON and option to skip BOMs #14

Merged
merged 4 commits into from
Oct 6, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions gen_testdata.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// +build ignore

// gen_testdata clones the UTF-8 test data to the other
// Unicode encodings and adds BOM variants of each.
package main

import (
"io/ioutil"
"log"
"os"
"path/filepath"

"golang.org/x/text/encoding"
"golang.org/x/text/encoding/unicode"
)

// main regenerates the derived test fixtures. Every file matched by
// testdata/utf-8/* is written, prefixed with the UTF-8 BOM, to
// testdata/utf-8_bom, and is re-encoded as UTF-16 (big and little endian)
// into testdata/utf-16be and testdata/utf-16le, with BOM-prefixed twins in
// the matching "_bom" directories. Any failure aborts via log.Fatal.
func main() {
	// Each transform pairs a destination directory with the encoding to
	// apply and the BOM bytes passed to write for the "_bom" variant.
	xforms := []struct {
		dir, bom string
		enc      encoding.Encoding
	}{
		{"testdata/utf-16be", "\xFE\xFF", unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM)},
		{"testdata/utf-16le", "\xFF\xFE", unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM)},
	}

	// Report a Glob failure instead of discarding it, consistent with the
	// handling of every other error in this program.
	paths, err := filepath.Glob("testdata/utf-8/*")
	if err != nil {
		log.Fatal(err)
	}

	for _, p := range paths {
		src, err := ioutil.ReadFile(p)
		if err != nil {
			log.Fatal(err)
		}

		// The UTF-8 variant needs no re-encoding, only the BOM prefix.
		write("testdata/utf-8_bom", p, "\xEF\xBB\xBF", src)

		for _, xform := range xforms {
			dst, err := xform.enc.NewEncoder().Bytes(src)
			if err != nil {
				log.Fatal(err)
			}
			// The same encoded bytes are emitted twice: bare, and with
			// the BOM in front.
			write(xform.dir, p, "", dst)
			write(xform.dir+"_bom", p, xform.bom, dst)
		}
	}
}

// write creates (or truncates) the file named after the base name of orig
// inside dir and fills it with bom followed by buf. dir is created first if
// it does not exist yet. Any failure terminates the program via log.Fatal.
func write(dir, orig, bom string, buf []byte) {
	if err := os.MkdirAll(dir, 0755); err != nil {
		log.Fatal(err)
	}
	f, err := os.Create(filepath.Join(dir, filepath.Base(orig)))
	if err != nil {
		log.Fatal(err)
	}
	if _, err = f.WriteString(bom); err != nil {
		log.Fatal(err)
	}
	if _, err = f.Write(buf); err != nil {
		log.Fatal(err)
	}
	// Close explicitly rather than with defer: log.Fatal exits without
	// running deferred calls, and a failed Close must not go unnoticed.
	// This also releases the descriptor before the next file is created.
	if err = f.Close(); err != nil {
		log.Fatal(err)
	}
}
93 changes: 80 additions & 13 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@
// a provided JSON Schema - https://json-schema.org/
package main

//go:generate go run gen_testdata.go

import (
"bufio"
"bytes"
"flag"
"fmt"
"io"
Expand All @@ -15,21 +18,37 @@ import (
"strings"
"sync"

"golang.org/x/text/encoding"
"golang.org/x/text/encoding/unicode"

"github.com/ghodss/yaml"
"github.com/mitchellh/go-homedir"
"github.com/xeipuuv/gojsonschema"
)

var (
version = "v1.3.0-dev"
version = "v1.4.0-dev"
schemaFlag = flag.String("s", "", "primary JSON schema to validate against, required")
quietFlag = flag.Bool("q", false, "quiet, only print validation failures and errors")
versionFlag = flag.Bool("v", false, "print version and exit")
bomFlag = flag.Bool("b", false, "allow BOM in JSON files, error if seen and unset")

listFlags stringFlags
refFlags stringFlags
)

// Byte order marks (BOMs) as raw byte strings, used to match the leading
// bytes of a document.
// https://en.wikipedia.org/wiki/Byte_order_mark#Byte_order_marks_by_encoding
const (
bomUTF8 = "\xEF\xBB\xBF" // UTF-8
bomUTF16BE = "\xFE\xFF" // UTF-16, big endian
bomUTF16LE = "\xFF\xFE" // UTF-16, little endian
)

// UTF-16 encodings for each byte order, both created with unicode.IgnoreBOM.
// jsonDecodeCharset picks one of them and removes any leading BOM from the
// buffer itself before decoding.
var (
encUTF16BE = unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM)
encUTF16LE = unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM)
)

func init() {
flag.Var(&listFlags, "l", "validate JSON documents from newline separated paths and/or globs in a text file (relative to the basename of the file itself)")
flag.Var(&refFlags, "r", "referenced schema(s), can be globs and/or used multiple times")
Expand Down Expand Up @@ -60,7 +79,7 @@ func realMain(args []string, w io.Writer) int {
dir := filepath.Dir(list)
f, err := os.Open(list)
if err != nil {
log.Fatalf("%s: %s\n", list, err)
return schemaError("%s: %s", list, err)
}
defer f.Close()

Expand All @@ -74,7 +93,7 @@ func realMain(args []string, w io.Writer) int {
docs = append(docs, glob(pattern)...)
}
if err := scanner.Err(); err != nil {
log.Fatalf("%s: invalid file list: %s\n", list, err)
return schemaError("%s: invalid file list: %s", list, err)
}
}
if len(docs) == 0 {
Expand All @@ -85,13 +104,13 @@ func realMain(args []string, w io.Writer) int {
sl := gojsonschema.NewSchemaLoader()
schemaPath, err := filepath.Abs(*schemaFlag)
if err != nil {
log.Fatalf("%s: unable to convert to absolute path: %s\n", *schemaFlag, err)
return schemaError("%s: unable to convert to absolute path: %s", *schemaFlag, err)
}
for _, ref := range refFlags {
for _, p := range glob(ref) {
absPath, err := filepath.Abs(p)
if err != nil {
log.Fatalf("%s: unable to convert to absolute path: %s\n", absPath, err)
return schemaError("%s: unable to convert to absolute path: %s", absPath, err)
}

if absPath == schemaPath {
Expand All @@ -100,22 +119,22 @@ func realMain(args []string, w io.Writer) int {

loader, err := jsonLoader(absPath)
if err != nil {
log.Fatalf("%s: unable to load schema ref: %s\n", *schemaFlag, err)
return schemaError("%s: unable to load schema ref: %s", *schemaFlag, err)
}

if err := sl.AddSchemas(loader); err != nil {
log.Fatalf("%s: invalid schema: %s\n", p, err)
return schemaError("%s: invalid schema: %s", p, err)
}
}
}

schemaLoader, err := jsonLoader(schemaPath)
if err != nil {
log.Fatalf("%s: unable to load schema: %s\n", *schemaFlag, err)
return schemaError("%s: unable to load schema: %s", *schemaFlag, err)
}
schema, err := sl.Compile(schemaLoader)
if err != nil {
log.Fatalf("%s: invalid schema: %s\n", *schemaFlag, err)
return schemaError("%s: invalid schema: %s", *schemaFlag, err)
}

// Validate the schema against each doc in parallel, limiting simultaneous
Expand All @@ -131,7 +150,6 @@ func realMain(args []string, w io.Writer) int {
sem <- 0
defer func() { <-sem }()


loader, err := jsonLoader(path)
if err != nil {
msg := fmt.Sprintf("%s: error: load doc: %s", path, err)
Expand Down Expand Up @@ -190,19 +208,62 @@ func jsonLoader(path string) (gojsonschema.JSONLoader, error) {
}
switch filepath.Ext(path) {
case ".yml", ".yaml":
// TODO YAML requires the presence of a BOM to detect UTF-16
// text. Is there a decent heuristic to detect UTF-16 text
// missing a BOM so we can provide a better error message?
buf, err = yaml.YAMLToJSON(buf)
default:
buf, err = jsonDecodeCharset(buf)
}
if err != nil {
return nil, err
}
// TODO What if we have an empty document?
return gojsonschema.NewBytesLoader(buf), nil
}

// jsonDecodeCharset normalizes the encoding of a JSON document held in buf.
//
// A leading UTF-8, UTF-16BE or UTF-16LE byte order mark is recognized; when
// none is present, a zero first byte is taken to mean UTF-16BE and a zero
// second byte UTF-16LE. Buffers shorter than two bytes, and buffers matching
// none of these, are returned unchanged.
//
// A BOM is only tolerated when the `-b` flag is set, in which case it is
// stripped; otherwise its presence is reported as an error. Text identified
// as UTF-16 is run through the corresponding decoder and the decoder's
// result is returned.
func jsonDecodeCharset(buf []byte) ([]byte, error) {
	if len(buf) < 2 {
		// Too short to inspect the first two bytes; hand it back as is.
		return buf, nil
	}

	// Known byte order marks in match order, each paired with the decoder
	// its text needs (nil when the bytes can be used directly).
	marks := []struct {
		bom string
		enc encoding.Encoding
	}{
		{bomUTF8, nil},
		{bomUTF16BE, encUTF16BE},
		{bomUTF16LE, encUTF16LE},
	}

	var (
		dec    encoding.Encoding
		bomLen int
	)
	for _, m := range marks {
		if bytes.HasPrefix(buf, []byte(m.bom)) {
			dec, bomLen = m.enc, len(m.bom)
			break
		}
	}

	if bomLen == 0 {
		// No BOM: fall back to looking for a zero byte in the first
		// two positions to spot UTF-16 and its byte order.
		if buf[0] == 0 {
			dec = encUTF16BE
		} else if buf[1] == 0 {
			dec = encUTF16LE
		}
	} else {
		if !*bomFlag {
			return nil, fmt.Errorf("unexpected BOM, see `-b` flag")
		}
		buf = buf[bomLen:]
	}

	if dec == nil {
		return buf, nil
	}
	return dec.NewDecoder().Bytes(buf)
}

func printUsage() {
fmt.Fprintf(os.Stderr, `Usage: %s -s schema.(json|yml) [options] document.(json|yml) ...

yajsv validates JSON and YAML document(s) against a schema. One of three statuses are
reported per document:
yajsv validates JSON and YAML document(s) against a schema. One of three status
results are reported per document:

pass: Document is valid relative to the schema
fail: Document is invalid relative to the schema
Expand All @@ -212,7 +273,8 @@ func printUsage() {
schema validation failure.

Sets the exit code to 1 on any failures, 2 on any errors, 3 on both, 4 on
invalid usage. Otherwise, 0 is returned if everything passes validation.
invalid usage, 5 on schema definition or file-list errors. Otherwise, 0 is
returned if everything passes validation.

Options:

Expand All @@ -227,6 +289,11 @@ func usageError(msg string) int {
return 4
}

// schemaError reports a schema definition or file-list problem. The message
// is formatted from format and args in the manner of fmt.Printf, terminated
// with a newline and written to standard error. It returns the exit code 5
// for the caller to propagate.
func schemaError(format string, args ...interface{}) int {
	const exitSchemaError = 5

	line := fmt.Sprintf(format+"\n", args...)
	fmt.Fprint(os.Stderr, line)
	return exitSchemaError
}

// glob is a wrapper that also resolves `~` since we may be skipping
// the shell expansion when single-quoting globs at the command line
func glob(pattern string) []string {
Expand Down
Loading