From aba7d200d8f5f9f34a8fbfc02b35e709392add62 Mon Sep 17 00:00:00 2001 From: alandonovan Date: Thu, 10 Dec 2020 13:46:54 -0500 Subject: [PATCH] starlark: add 'bytes' data type, for binary strings This change defines a 'bytes' data type, an immutable string of bytes. In this Go implementation of Starlark, ordinary strings are also strings of bytes, so the behavior of the two is very similar. However, that is not required by the spec. Other implementations of Starlark, notably in Java, may use strings of UTF-16 codes for the ordinary string type, and thus need a distinct type for byte strings. See testdata/bytes.star for a tour of the API, and some remaining questions. See the attached issue for an outline of the proposed spec change. A Java implementation is underway, but is greatly complicated by Bazel's unfortunate misdecoding of UTF-8 files as Latin1. The string.elems iterable view is now indexable. This change removes go.starlark.net.lib.proto.Bytes. Updates https://github.com/bazelbuild/starlark/issues/112 Change-Id: Ieccd177a2662ca2106016165b50073a670ae7f2c --- go.mod | 4 +- go.sum | 10 +- internal/compile/compile.go | 32 +++++-- internal/compile/serial.go | 22 +++-- lib/proto/proto.go | 87 +---------------- starlark/eval.go | 13 +++ starlark/eval_test.go | 1 + starlark/library.go | 151 +++++++++++++++++++++++++---- starlark/testdata/bytes.star | 131 +++++++++++++++++++++++++ starlark/testdata/json.star | 2 +- starlark/testdata/string.star | 26 ++++- starlark/value.go | 176 ++++++++++++++++++++++++++-------- syntax/parse.go | 7 +- syntax/parse_test.go | 7 +- syntax/quote.go | 40 +++++++- syntax/quote_test.go | 2 +- syntax/scan.go | 22 ++++- syntax/scan_test.go | 26 +++++ syntax/syntax.go | 4 +- 19 files changed, 573 insertions(+), 190 deletions(-) create mode 100644 starlark/testdata/bytes.star diff --git a/go.mod b/go.mod index 50bc0004..d14060ec 100644 --- a/go.mod +++ b/go.mod @@ -6,6 +6,8 @@ require ( github.com/chzyer/logex v1.1.10 // indirect 
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1 // indirect - golang.org/x/sys v0.0.0-20200803210538-64077c9b5642 + github.com/google/go-cmp v0.5.1 // indirect + golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f + golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect google.golang.org/protobuf v1.25.0 ) diff --git a/go.sum b/go.sum index b40c868a..90a8048b 100644 --- a/go.sum +++ b/go.sum @@ -24,8 +24,9 @@ github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5a github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.0 h1:/QaMHBdZ26BB3SSst0Iwl10Epc+xhTquomWX0oZEB6w= github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.1 h1:JFrFEBb2xKufg6XkJsJr+WbKb4FQlURi5RUcBveYu9k= +github.com/google/go-cmp v0.5.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= @@ -42,15 +43,16 @@ golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20200803210538-64077c9b5642 
h1:B6caxRw+hozq68X2MY7jEpZh/cr4/aHLv9xU8Kkadrw= -golang.org/x/sys v0.0.0-20200803210538-64077c9b5642/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f h1:+Nyd8tzPX9R7BWHguqsrbFdRx3WQ/1ib8I44HXV5yTA= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= diff --git a/internal/compile/compile.go b/internal/compile/compile.go index eb8e1627..86407136 100644 --- a/internal/compile/compile.go +++ b/internal/compile/compile.go @@ -33,6 +33,7 @@ import ( "os" "path/filepath" "strconv" + "strings" "sync" "go.starlark.net/resolve" @@ -46,7 +47,7 @@ var Disassemble = false const debug = false // make code generation verbose, for debugging the 
compiler // Increment this to force recompilation of saved bytecode files. -const Version = 10 +const Version = 11 type Opcode uint8 @@ -306,12 +307,15 @@ func (op Opcode) String() string { type Program struct { Loads []Binding // name (really, string) and position of each load stmt Names []string // names of attributes and predeclared variables - Constants []interface{} // = string | int64 | float64 | *big.Int + Constants []interface{} // = string | int64 | float64 | *big.Int | Bytes Functions []*Funcode Globals []Binding // for error messages and tracing Toplevel *Funcode // module initialization function } +// The type of a byte string literal value, to distinguish from text string. +type Bytes string + // A Funcode is the code of a compiled Starlark function. // // Funcodes are serialized by the encoder.function method, @@ -860,6 +864,8 @@ func PrintOp(fn *Funcode, pc uint32, op Opcode, arg uint32) { switch x := fn.Prog.Constants[arg].(type) { case string: comment = strconv.Quote(x) + case Bytes: + comment = "b" + strconv.Quote(string(x)) default: comment = fmt.Sprint(x) } @@ -1286,8 +1292,12 @@ func (fcomp *fcomp) expr(e syntax.Expr) { fcomp.lookup(e) case *syntax.Literal: - // e.Value is int64, float64, *bigInt, or string. - fcomp.emit1(CONSTANT, fcomp.pcomp.constantIndex(e.Value)) + // e.Value is int64, float64, *bigInt, string + v := e.Value + if e.Token == syntax.BYTES { + v = Bytes(v.(string)) + } + fcomp.emit1(CONSTANT, fcomp.pcomp.constantIndex(v)) case *syntax.ListExpr: for _, x := range e.List { @@ -1525,7 +1535,7 @@ func (fcomp *fcomp) plus(e *syntax.BinaryExpr) { } // addable reports whether e is a statically addable -// expression: a [s]tring, [l]ist, or [t]uple. +// expression: a [s]tring, [b]ytes, [l]ist, or [t]uple. 
func addable(e syntax.Expr) rune { switch e := e.(type) { case *syntax.Literal: @@ -1533,6 +1543,8 @@ func addable(e syntax.Expr) rune { switch e.Token { case syntax.STRING: return 's' + case syntax.BYTES: + return 'b' } case *syntax.ListExpr: return 'l' @@ -1547,12 +1559,16 @@ func addable(e syntax.Expr) rune { // The resulting syntax is degenerate, lacking position, etc. func add(code rune, args []summand) syntax.Expr { switch code { - case 's': - var buf bytes.Buffer + case 's', 'b': + var buf strings.Builder for _, arg := range args { buf.WriteString(arg.x.(*syntax.Literal).Value.(string)) } - return &syntax.Literal{Token: syntax.STRING, Value: buf.String()} + tok := syntax.STRING + if code == 'b' { + tok = syntax.BYTES + } + return &syntax.Literal{Token: tok, Value: buf.String()} case 'l': var elems []syntax.Expr for _, arg := range args { diff --git a/internal/compile/serial.go b/internal/compile/serial.go index 0107ef9c..adadabfc 100644 --- a/internal/compile/serial.go +++ b/internal/compile/serial.go @@ -51,9 +51,10 @@ package compile // // Constant: # type data // type varint # 0=string string -// data ... # 1=int varint -// # 2=float varint (bits as uint64) -// # 3=bigint string (decimal ASCII text) +// data ... # 1=bytes string +// # 2=int varint +// # 3=float varint (bits as uint64) +// # 4=bigint string (decimal ASCII text) // // The encoding starts with a four-byte magic number. 
// The next four bytes are a little-endian uint32 @@ -109,14 +110,17 @@ func (prog *Program) Encode() []byte { case string: e.int(0) e.string(c) - case int64: + case Bytes: e.int(1) + e.string(string(c)) + case int64: + e.int(2) e.int64(c) case float64: - e.int(2) + e.int(3) e.uint64(math.Float64bits(c)) case *big.Int: - e.int(3) + e.int(4) e.string(c.Text(10)) } } @@ -249,10 +253,12 @@ func DecodeProgram(data []byte) (_ *Program, err error) { case 0: c = d.string() case 1: - c = d.int64() + c = Bytes(d.string()) case 2: - c = math.Float64frombits(d.uint64()) + c = d.int64() case 3: + c = math.Float64frombits(d.uint64()) + case 4: c, _ = new(big.Int).SetString(d.string(), 10) } constants[i] = c diff --git a/lib/proto/proto.go b/lib/proto/proto.go index 84aa0d63..149162db 100644 --- a/lib/proto/proto.go +++ b/lib/proto/proto.go @@ -79,8 +79,6 @@ package proto // TODO(adonovan): Go and Starlark API improvements: -// - Contribute the 'bytes' data type to the core language. -// See https://github.com/bazelbuild/starlark/issues/112. // - Make Message and RepeatedField comparable. // (NOTE: proto.Equal works only with generated message types.) // - Support maps, oneof, any. But not messageset if we can avoid it. @@ -234,7 +232,7 @@ func marshal(_ *starlark.Thread, fn *starlark.Builtin, args starlark.Tuple, kwar if err != nil { return nil, fmt.Errorf("%s: %v", fn.Name(), err) } - return Bytes(data), nil + return starlark.Bytes(data), nil } else { text, err := prototext.MarshalOptions{Indent: " "}.Marshal(m.Message()) if err != nil { @@ -247,7 +245,7 @@ func marshal(_ *starlark.Thread, fn *starlark.Builtin, args starlark.Tuple, kwar // unmarshal(msg) decodes a binary protocol message to a Message. 
func unmarshal(thread *starlark.Thread, fn *starlark.Builtin, args starlark.Tuple, kwargs []starlark.Tuple) (starlark.Value, error) { var desc MessageDescriptor - var data Bytes + var data starlark.Bytes if err := starlark.UnpackPositionalArgs(fn.Name(), args, kwargs, 2, &desc, &data); err != nil { return nil, err } @@ -486,7 +484,7 @@ func toProto(fdesc protoreflect.FieldDescriptor, v starlark.Value) (protoreflect case protoreflect.StringKind: if s, ok := starlark.AsString(v); ok { return protoreflect.ValueOfString(s), nil - } else if b, ok := v.(Bytes); ok { + } else if b, ok := v.(starlark.Bytes); ok { // TODO(adonovan): allow bytes for string? Not friendly to a Java port. return protoreflect.ValueOfBytes([]byte(b)), nil } @@ -497,7 +495,7 @@ func toProto(fdesc protoreflect.FieldDescriptor, v starlark.Value) (protoreflect // Instead provide b"..." literals in the core // and a bytes(str) conversion. return protoreflect.ValueOfBytes([]byte(s)), nil - } else if b, ok := v.(Bytes); ok { + } else if b, ok := v.(starlark.Bytes); ok { return protoreflect.ValueOfBytes([]byte(b)), nil } @@ -588,7 +586,7 @@ func toStarlark1(typ protoreflect.FieldDescriptor, x protoreflect.Value, frozen return starlark.String(x.String()) case protoreflect.BytesKind: - return Bytes(x.Bytes()) + return starlark.Bytes(x.Bytes()) case protoreflect.DoubleKind, protoreflect.FloatKind: return starlark.Float(x.Float()) @@ -1232,78 +1230,3 @@ func (x EnumValueDescriptor) CompareSameType(op syntax.Token, y_ starlark.Value, return false, fmt.Errorf("%s %s %s not implemented", x.Type(), op, y_.Type()) } } - -// A Bytes is an immutable sequence of bytes. -// It is comparable, iterable, indexable, and sliceable. -// -// (In go.starlark.net, text Strings are also byte strings, -// but we shouldn't rely on that. -// See https://github.com/bazelbuild/starlark/issues/112.) 
-type Bytes string - -var ( - _ starlark.Comparable = Bytes("") - _ starlark.Iterable = Bytes("") - _ starlark.Sliceable = Bytes("") - _ starlark.Sequence = Bytes("") -) - -func (b Bytes) String() string { return fmt.Sprintf("<%d bytes>", len(b)) } -func (b Bytes) Type() string { return "bytes" } -func (b Bytes) Freeze() {} // immutable -func (b Bytes) Truth() starlark.Bool { return len(b) > 0 } -func (b Bytes) Hash() (uint32, error) { return starlark.String(b).Hash() } -func (b Bytes) Len() int { return len(b) } -func (b Bytes) Index(i int) starlark.Value { return starlark.MakeInt(int(b[i])) } - -func (b Bytes) Slice(start, end, step int) starlark.Value { - if step == 1 { - return b[start:end] - } - - sign := signum(step) - var str []byte - for i := start; signum(end-i) == sign; i += step { - str = append(str, b[i]) - } - return Bytes(str) -} - -// From Hacker's Delight, section 2.8. -func signum64(x int64) int { return int(uint64(x>>63) | uint64(-x)>>63) } -func signum(x int) int { return signum64(int64(x)) } - -func (b Bytes) Iterate() starlark.Iterator { return &bytesIterator{string(b)} } - -type bytesIterator struct{ string } - -func (it *bytesIterator) Next(p *starlark.Value) bool { - if it.string == "" { - return false - } - *p = starlark.MakeInt(int(it.string[0])) - it.string = it.string[1:] - return true -} - -func (it *bytesIterator) Done() {} - -func (x Bytes) CompareSameType(op syntax.Token, y_ starlark.Value, depth int) (bool, error) { - y := y_.(Bytes) - cmp := strings.Compare(string(x), string(y)) - switch op { - case syntax.EQL: - return cmp == 0, nil - case syntax.NEQ: - return cmp != 0, nil - case syntax.LE: - return cmp <= 0, nil - case syntax.LT: - return cmp < 0, nil - case syntax.GE: - return cmp >= 0, nil - case syntax.GT: - return cmp > 0, nil - } - panic(op) -} diff --git a/starlark/eval.go b/starlark/eval.go index c9bbb67b..9bc87709 100644 --- a/starlark/eval.go +++ b/starlark/eval.go @@ -478,6 +478,8 @@ func makeToplevelFunction(prog 
*compile.Program, predeclared StringDict) *Functi v = MakeBigInt(c) case string: v = String(c) + case compile.Bytes: + v = Bytes(c) case float64: v = Float(c) default: @@ -796,6 +798,8 @@ func Binary(op syntax.Token, x, y Value) (Value, error) { return xf * y, nil case String: return stringRepeat(y, x) + case Bytes: + return bytesRepeat(y, x) case *List: elems, err := tupleRepeat(Tuple(y.elems), x) if err != nil { @@ -820,6 +824,10 @@ func Binary(op syntax.Token, x, y Value) (Value, error) { if y, ok := y.(Int); ok { return stringRepeat(x, y) } + case Bytes: + if y, ok := y.(Int); ok { + return bytesRepeat(x, y) + } case *List: if y, ok := y.(Int); ok { elems, err := tupleRepeat(Tuple(x.elems), y) @@ -1138,6 +1146,11 @@ func tupleRepeat(elems Tuple, n Int) (Tuple, error) { return res, nil } +func bytesRepeat(b Bytes, n Int) (Bytes, error) { + res, err := stringRepeat(String(b), n) + return Bytes(res), err +} + func stringRepeat(s String, n Int) (String, error) { if s == "" { return "", nil diff --git a/starlark/eval_test.go b/starlark/eval_test.go index 81f8c580..b5c0a131 100644 --- a/starlark/eval_test.go +++ b/starlark/eval_test.go @@ -117,6 +117,7 @@ func TestExecFile(t *testing.T) { "testdata/assign.star", "testdata/bool.star", "testdata/builtins.star", + "testdata/bytes.star", "testdata/control.star", "testdata/dict.star", "testdata/float.star", diff --git a/starlark/library.go b/starlark/library.go index 17638240..cd73f089 100644 --- a/starlark/library.go +++ b/starlark/library.go @@ -42,6 +42,7 @@ func init() { "any": NewBuiltin("any", any), "all": NewBuiltin("all", all), "bool": NewBuiltin("bool", bool_), + "bytes": NewBuiltin("bytes", bytes_), "chr": NewBuiltin("chr", chr), "dict": NewBuiltin("dict", dict), "dir": NewBuiltin("dir", dir), @@ -73,6 +74,10 @@ func init() { // methods of built-in types // https://github.com/google/starlark-go/blob/master/doc/spec.md#built-in-methods var ( + bytesMethods = map[string]*Builtin{ + "elems": NewBuiltin("elems", 
bytes_elems), + } + dictMethods = map[string]*Builtin{ "clear": NewBuiltin("clear", dict_clear), "get": NewBuiltin("get", dict_get), @@ -198,6 +203,45 @@ func bool_(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, error return x.Truth(), nil } +// https://github.com/google/starlark-go/blob/master/doc/spec.md#bytes +func bytes_(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, error) { + if len(kwargs) > 0 { + return nil, fmt.Errorf("bytes does not accept keyword arguments") + } + if len(args) != 1 { + return nil, fmt.Errorf("bytes: got %d arguments, want exactly 1", len(args)) + } + switch x := args[0].(type) { + case Bytes: + return x, nil + case String: + // Invalid encodings are replaced by that of U+FFFD. + return Bytes(utf8Transcode(string(x))), nil + case Iterable: + // iterable of numeric byte values + var buf strings.Builder + if n := Len(x); n >= 0 { + // common case: known length + buf.Grow(n) + } + iter := x.Iterate() + defer iter.Done() + var elem Value + var b byte + for i := 0; iter.Next(&elem); i++ { + if err := AsInt(elem, &b); err != nil { + return nil, fmt.Errorf("bytes: at index %d, %s", i, err) + } + buf.WriteByte(b) + } + return Bytes(buf.String()), nil + + default: + // Unlike string(foo), which stringifies it, bytes(foo) is an error. 
+ return nil, fmt.Errorf("bytes: got %s, want string, bytes, or iterable of ints", x.Type()) + } +} + // https://github.com/google/starlark-go/blob/master/doc/spec.md#chr func chr(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, error) { if len(kwargs) > 0 { @@ -261,9 +305,6 @@ func enumerate(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, e } iter := iterable.Iterate() - if iter == nil { - return nil, fmt.Errorf("enumerate: got %s, want iterable", iterable.Type()) - } defer iter.Done() var pairs []Value @@ -693,16 +734,26 @@ func ord(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, error) if len(args) != 1 { return nil, fmt.Errorf("ord: got %d arguments, want 1", len(args)) } - s, ok := AsString(args[0]) - if !ok { - return nil, fmt.Errorf("ord: got %s, want string", args[0].Type()) - } - r, sz := utf8.DecodeRuneInString(s) - if sz == 0 || sz != len(s) { - n := utf8.RuneCountInString(s) - return nil, fmt.Errorf("ord: string encodes %d Unicode code points, want 1", n) + switch x := args[0].(type) { + case String: + // ord(string) returns int value of sole rune. + s := string(x) + r, sz := utf8.DecodeRuneInString(s) + if sz == 0 || sz != len(s) { + n := utf8.RuneCountInString(s) + return nil, fmt.Errorf("ord: string encodes %d Unicode code points, want 1", n) + } + return MakeInt(int(r)), nil + + case Bytes: + // ord(bytes) returns int value of sole byte. 
+ if len(x) != 1 { + return nil, fmt.Errorf("ord: bytes has length %d, want 1", len(x)) + } + return MakeInt(int(x[0])), nil + default: + return nil, fmt.Errorf("ord: got %s, want string or bytes", x.Type()) } - return MakeInt(int(r)), nil } // https://github.com/google/starlark-go/blob/master/doc/spec.md#print @@ -718,6 +769,8 @@ func print(thread *Thread, b *Builtin, args Tuple, kwargs []Tuple) (Value, error } if s, ok := AsString(v); ok { buf.WriteString(s) + } else if b, ok := v.(Bytes); ok { + buf.WriteString(string(b)) } else { writeValue(buf, v, nil) } @@ -995,11 +1048,29 @@ func str(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, error) if len(args) != 1 { return nil, fmt.Errorf("str: got %d arguments, want exactly 1", len(args)) } - x := args[0] - if _, ok := AsString(x); !ok { - x = String(x.String()) + switch x := args[0].(type) { + case String: + return x, nil + case Bytes: + // Invalid encodings are replaced by that of U+FFFD. + return String(utf8Transcode(string(x))), nil + default: + return String(x.String()), nil } - return x, nil +} + +// utf8Transcode returns the UTF-8-to-UTF-8 transcoding of s. +// The effect is that each code unit that is part of an +// invalid sequence is replaced by U+FFFD. 
+func utf8Transcode(s string) string { + if utf8.ValidString(s) { + return s + } + var out strings.Builder + for _, r := range s { + out.WriteRune(r) + } + return out.String() } // https://github.com/google/starlark-go/blob/master/doc/spec.md#tuple @@ -1376,13 +1447,51 @@ func string_iterable(_ *Thread, b *Builtin, args Tuple, kwargs []Tuple) (Value, if err := UnpackPositionalArgs(b.Name(), args, kwargs, 0); err != nil { return nil, err } - return stringIterable{ - s: b.Receiver().(String), - ords: b.Name()[len(b.Name())-2] == 'd', - codepoints: b.Name()[0] == 'c', - }, nil + s := b.Receiver().(String) + ords := b.Name()[len(b.Name())-2] == 'd' + codepoints := b.Name()[0] == 'c' + if codepoints { + return stringCodepoints{s, ords}, nil + } else { + return stringElems{s, ords}, nil + } +} + +// bytes_elems returns an unspecified iterable value whose +// iterator yields the numeric values of successive bytes. +func bytes_elems(_ *Thread, b *Builtin, args Tuple, kwargs []Tuple) (Value, error) { + if err := UnpackPositionalArgs(b.Name(), args, kwargs, 0); err != nil { + return nil, err + } + return bytesIterable{b.Receiver().(Bytes)}, nil } +// A bytesIterable is an iterable returned by bytes.elems(), +// whose iterator yields a sequence of numeric byte values. 
+type bytesIterable struct{ bytes Bytes } + +var _ Iterable = (*bytesIterable)(nil) + +func (bi bytesIterable) String() string { return bi.bytes.String() + ".elems()" } +func (bi bytesIterable) Type() string { return "bytes.elems" } +func (bi bytesIterable) Freeze() {} // immutable +func (bi bytesIterable) Truth() Bool { return True } +func (bi bytesIterable) Hash() (uint32, error) { return 0, fmt.Errorf("unhashable: %s", bi.Type()) } +func (bi bytesIterable) Iterate() Iterator { return &bytesIterator{bi.bytes} } + +type bytesIterator struct{ bytes Bytes } + +func (it *bytesIterator) Next(p *Value) bool { + if it.bytes == "" { + return false + } + *p = MakeInt(int(it.bytes[0])) + it.bytes = it.bytes[1:] + return true +} + +func (*bytesIterator) Done() {} + // https://github.com/google/starlark-go/blob/master/doc/spec.md#string·count func string_count(_ *Thread, b *Builtin, args Tuple, kwargs []Tuple) (Value, error) { var sub string diff --git a/starlark/testdata/bytes.star b/starlark/testdata/bytes.star new file mode 100644 index 00000000..d524cb44 --- /dev/null +++ b/starlark/testdata/bytes.star @@ -0,0 +1,131 @@ +# Tests of 'bytes' (immutable byte strings). 
+ +load("assert.star", "assert") + +# bytes(string) -- UTF-k to UTF-8 transcoding with U+FFFD replacement +hello = bytes("hello, 世界") +goodbye = bytes("goodbye") +empty = bytes("") +nonprinting = bytes("\t\n\u200D") # TAB, NEWLINE, DEL, ZERO_WIDTH_JOINER +assert.eq(bytes("hello, 世界"[:-1]), b"hello, 世��") + +# bytes(iterable of int) -- construct from numeric byte values +assert.eq(bytes([65, 66, 67]), b"ABC") +assert.eq(bytes((65, 66, 67)), b"ABC") +assert.eq(bytes([0xf0, 0x9f, 0x98, 0xbf]), b"😿") +assert.fails(lambda: bytes([300]), + "at index 0, 300 out of range .want value in unsigned 8-bit range") +assert.fails(lambda: bytes([b"a"]), + "at index 0, got bytes, want int") +assert.fails(lambda: bytes(1), "want string, bytes, or iterable of ints") + +# literals +assert.eq(b"hello, 世界", hello) +assert.eq(b"goodbye", goodbye) +assert.eq(b"", empty) +assert.eq(b"\t\n\u200D", nonprinting) +assert.ne("abc", b"abc") +assert.eq(b"\012\xff\u0400\U0001F63F", b"\n\xffЀ😿") # see scanner tests for more +assert.eq(rb"\r\n\t", b"\\r\\n\\t") # raw + +# type +assert.eq(type(hello), "bytes") + +# len +assert.eq(len(hello), 13) +assert.eq(len(goodbye), 7) +assert.eq(len(empty), 0) + +# truth +assert.true(hello) +assert.true(goodbye) +assert.true(not empty) + +# str(bytes) does UTF-8 to UTF-k transcoding. 
+assert.eq(str(hello), "hello, 世界") +assert.eq(str(hello[:-1]), "hello, 世��") # incomplete UTF-8 encoding => U+FFFD +assert.eq(str(goodbye), "goodbye") +assert.eq(str(empty), "") +assert.eq(str(nonprinting), "\t\n\u200d") +assert.eq(str(b"\udc00"), "�") # unpaired surrogate => U+FFFD + +# repr +assert.eq(repr(hello), r'b"hello, 世界"') +assert.eq(repr(hello[:-1]), r'b"hello, 世\xe7\x95"') # (incomplete UTF-8 encoding) +assert.eq(repr(goodbye), 'b"goodbye"') +assert.eq(repr(empty), 'b""') +assert.eq(repr(nonprinting), 'b"\\t\\n\\u200d"') + +# equality +assert.eq(hello, hello) +assert.ne(hello, goodbye) +assert.eq(bytes("goodbye"), goodbye) + +# ordered comparison +assert.lt(bytes("abc"), bytes("abd")) +assert.lt(bytes("abc"), bytes("abcd")) +assert.lt(bytes("\x7f"), bytes("\x80")) # bytes compare as uint8, not int8 + +# bytes are dict-hashable +dict = {hello: 1, goodbye: 2} +dict[bytes("goodbye")] = 3 +assert.eq(len(dict), 2) +assert.eq(dict[goodbye], 3) + +# indexing +assert.eq(goodbye[0], bytes("g")) +assert.eq(goodbye[-1], bytes("e")) +assert.fails(lambda: goodbye[100], "out of range") + +# slicing +assert.eq(goodbye[:4], bytes("good")) +assert.eq(goodbye[4:], bytes("bye")) +assert.eq(goodbye[::2], bytes("gobe")) +assert.eq(goodbye[3:4], bytes("d")) # special case: len=1 +assert.eq(goodbye[4:4], bytes("")) # special case: len=0 + +# ord +assert.eq(ord(b"a"), 97) +assert.fails(lambda: ord(b"ab"), "ord: bytes has length 2, want 1") +assert.fails(lambda: ord(b""), "ord: bytes has length 0, want 1") + +# repeat (bytes * int) +assert.eq(goodbye * 3, bytes("goodbyegoodbyegoodbye")) +assert.eq(3 * goodbye, bytes("goodbyegoodbyegoodbye")) + +# elems() returns an iterable value over numeric byte values. 
+assert.eq(type(hello.elems()), "bytes.elems") +assert.eq(str(hello.elems()), "b\"hello, 世界\".elems()") +assert.eq(list(hello.elems()), [104, 101, 108, 108, 111, 44, 32, 228, 184, 150, 231, 149, 140]) +assert.eq(bytes([104, 101, 108, 108, 111, 44, 32, 228, 184, 150, 231, 149, 140]), hello) +assert.eq(list(goodbye.elems()), [103, 111, 111, 100, 98, 121, 101]) +assert.eq(list(empty.elems()), []) + +# TODO(adonovan): +# - more methods: find, index, split, etc. +# - hash(bytes)? +# +# Summary of string operations (put this in spec). +# +# string to number: +# - bytes[i] returns numeric value of ith byte. +# - ord(string) returns numeric value of sole code point in string. +# - ord(string[i]) is not a useful operation: fails on non-ASCII; see below. +# Q. Perhaps ord should return the first (not sole) code point? Then it becomes a UTF-8 decoder. +# Perhaps ord(string, index=int) should apply the index and relax the len=1 check. +# - string.codepoints() iterates over 1-codepoint substrings. +# - string.codepoint_ords() iterates over numeric values of code points in string. +# - string.elems() iterates over 1-element (UTF-k code) substrings. +# - string.elem_ords() iterates over numeric UTF-k code values. +# - string.elem_ords()[i] returns numeric value of ith element (UTF-k code). +# - string.elems()[i] returns substring of a single element (UTF-k code). +# - int(string) parses string as decimal (or other) numeric literal. +# +# number to string: +# - chr(int) returns string, UTF-k encoding of Unicode code point (like Python). +# Redundant with '%c' % int (which Python2 calls 'unichr'). +# - bytes(chr(int)) returns byte string containing UTF-8 encoding of one code point. +# - bytes([int]) returns 1-byte string (with regrettable list allocation). +# - str(int) - format number as decimal. + +# - TODO(adonovan): how to create a 1-element string? 
(analogous to bytes([int])) diff --git a/starlark/testdata/json.star b/starlark/testdata/json.star index ef33d91a..7c7b316c 100644 --- a/starlark/testdata/json.star +++ b/starlark/testdata/json.star @@ -23,7 +23,7 @@ assert.eq(json.encode(range(3)), "[0,1,2]") # a built-in iterable assert.eq(json.encode(dict(x = 1, y = "two")), '{"x":1,"y":"two"}') assert.eq(json.encode(dict(y = "two", x = 1)), '{"x":1,"y":"two"}') # key, not insertion, order assert.eq(json.encode(struct(x = 1, y = "two")), '{"x":1,"y":"two"}') # a user-defined HasAttrs -assert.eq(json.encode("\x80"), '"\\ufffd"') # invalid UTF-8 -> replacement char +assert.eq(json.encode("😹"[:1]), '"\\ufffd"') # invalid UTF-8 -> replacement char def encode_error(expr, error): assert.fails(lambda: json.encode(expr), error) diff --git a/starlark/testdata/string.star b/starlark/testdata/string.star index 859f6458..c46e8745 100644 --- a/starlark/testdata/string.star +++ b/starlark/testdata/string.star @@ -6,6 +6,14 @@ load("assert.star", "assert") # raw string literals: assert.eq(r"a\bc", "a\\bc") +# Hex and octal escapes may encode any byte value +# even in a "text" (not 'bytes') string. +# This is not required by the spec, but is necessary +# in the Go implementation so that repr(x) works for +# invalid Unicode, such as half an emoji. 
+assert.eq(list("\x80\377".elems()), ["\x80", "\xff"]) +assert.eq(list("\x80\377".codepoints()), ["�", "�"]) + # truth assert.true("abc") assert.true(chr(0)) @@ -46,28 +54,34 @@ assert.fails(lambda: ord(""), "string encodes 0 Unicode code points, want 1") assert.fails(lambda: ord("😿"[1:]), "string encodes 3 Unicode code points, want 1") # 3 x 0xFFFD # string.codepoint_ords -assert.eq(type("abcЙ😿".codepoint_ords()), "codepoints") +assert.eq(type("abcЙ😿".codepoint_ords()), "string.codepoints") assert.eq(str("abcЙ😿".codepoint_ords()), '"abcЙ😿".codepoint_ords()') assert.eq(list("abcЙ😿".codepoint_ords()), [97, 98, 99, 1049, 128575]) assert.eq(list(("A" + "😿Z"[1:]).codepoint_ords()), [ord("A"), 0xFFFD, 0xFFFD, 0xFFFD, ord("Z")]) assert.eq(list("".codepoint_ords()), []) +assert.fails(lambda: "abcЙ😿".codepoint_ords()[2], "unhandled index") # not indexable +assert.fails(lambda: len("abcЙ😿".codepoint_ords()), "no len") # unknown length # string.codepoints -assert.eq(type("abcЙ😿".codepoints()), "codepoints") +assert.eq(type("abcЙ😿".codepoints()), "string.codepoints") assert.eq(str("abcЙ😿".codepoints()), '"abcЙ😿".codepoints()') assert.eq(list("abcЙ😿".codepoints()), ["a", "b", "c", "Й", "😿"]) -assert.eq(list(("A" + "😿Z"[1:]).codepoints()), ["A", "\x9f", "\x98", "\xbf", "Z"]) +assert.eq(list(("A" + "😿Z"[1:]).codepoints()), ["A", "�", "�", "�", "Z"]) assert.eq(list("".codepoints()), []) +assert.fails(lambda: "abcЙ😿".codepoints()[2], "unhandled index") # not indexable +assert.fails(lambda: len("abcЙ😿".codepoints()), "no len") # unknown length # string.elem_ords -assert.eq(type("abcЙ😿".elem_ords()), "elems") +assert.eq(type("abcЙ😿".elem_ords()), "string.elems") assert.eq(str("abcЙ😿".elem_ords()), '"abcЙ😿".elem_ords()') assert.eq(list("abcЙ😿".elem_ords()), [97, 98, 99, 208, 153, 240, 159, 152, 191]) assert.eq(list(("A" + "😿Z"[1:]).elem_ords()), [65, 159, 152, 191, 90]) assert.eq(list("".elem_ords()), []) +assert.eq("abcЙ😿".elem_ords()[2], 99) # indexable 
+assert.eq(len("abcЙ😿".elem_ords()), 9) # known length # string.elems -assert.eq(type("abcЙ😿".elems()), "elems") +assert.eq(type("abcЙ😿".elems()), "string.elems") assert.eq(str("abcЙ😿".elems()), '"abcЙ😿".elems()') assert.eq( list("abcЙ😿".elems()), @@ -78,6 +92,8 @@ assert.eq( ["A", "\x9f", "\x98", "\xbf", "Z"], ) assert.eq(list("".elems()), []) +assert.eq("abcЙ😿".elems()[2], "c") # indexable +assert.eq(len("abcЙ😿".elems()), 9) # known length # indexing, x[i] assert.eq("Hello, 世界!"[0], "H") diff --git a/starlark/value.go b/starlark/value.go index bcec7508..ce6cf332 100644 --- a/starlark/value.go +++ b/starlark/value.go @@ -499,13 +499,20 @@ func (f Float) Unary(op syntax.Token) (Value, error) { return nil, nil } -// String is the type of a Starlark string. +// String is the type of a Starlark text string. // // A String encapsulates an an immutable sequence of bytes, // but strings are not directly iterable. Instead, iterate // over the result of calling one of these four methods: // codepoints, codepoint_ords, elems, elem_ords. // +// Strings typically contain text; use Bytes for binary strings. +// The Starlark spec defines text strings as sequences of UTF-k +// codes that encode Unicode code points. In this Go implementation, +// k=8, whereas in a Java implementation, k=16. For portability, +// operations on strings should aim to avoid assumptions about +// the value of k. +// // Warning: the contract of the Value interface's String method is that // it returns the value printed in Starlark notation, // so s.String() or fmt.Sprintf("%s", s) returns a quoted string. @@ -545,73 +552,106 @@ func (x String) CompareSameType(op syntax.Token, y_ Value, depth int) (bool, err func AsString(x Value) (string, bool) { v, ok := x.(String); return string(v), ok } -// A stringIterable is an iterable whose iterator yields a sequence of -// either Unicode code points or elements (bytes), -// either numerically or as successive substrings. 
-type stringIterable struct { - s String - ords bool - codepoints bool +// A stringElems is an iterable whose iterator yields a sequence of +// elements (bytes), either numerically or as successive substrings. +// It is an indexable sequence. +type stringElems struct { + s String + ords bool } -var _ Iterable = (*stringIterable)(nil) +var ( + _ Iterable = (*stringElems)(nil) + _ Indexable = (*stringElems)(nil) +) -func (si stringIterable) String() string { - var etype string - if si.codepoints { - etype = "codepoint" +func (si stringElems) String() string { + if si.ords { + return si.s.String() + ".elem_ords()" } else { - etype = "elem" + return si.s.String() + ".elems()" } +} +func (si stringElems) Type() string { return "string.elems" } +func (si stringElems) Freeze() {} // immutable +func (si stringElems) Truth() Bool { return True } +func (si stringElems) Hash() (uint32, error) { return 0, fmt.Errorf("unhashable: %s", si.Type()) } +func (si stringElems) Iterate() Iterator { return &stringElemsIterator{si, 0} } +func (si stringElems) Len() int { return len(si.s) } +func (si stringElems) Index(i int) Value { if si.ords { - return si.s.String() + "." + etype + "_ords()" + return MakeInt(int(si.s[i])) } else { - return si.s.String() + "." + etype + "s()" + // TODO(adonovan): opt: preallocate canonical 1-byte strings + // to avoid interface allocation. + return si.s[i : i+1] + } +} + +type stringElemsIterator struct { + si stringElems + i int +} + +func (it *stringElemsIterator) Next(p *Value) bool { + if it.i == len(it.si.s) { + return false } + *p = it.si.Index(it.i) + it.i++ + return true +} + +func (*stringElemsIterator) Done() {} + +// A stringCodepoints is an iterable whose iterator yields a sequence of +// Unicode code points, either numerically or as successive substrings. +// It is not indexable. 
+type stringCodepoints struct { + s String + ords bool } -func (si stringIterable) Type() string { - if si.codepoints { - return "codepoints" + +var _ Iterable = (*stringCodepoints)(nil) + +func (si stringCodepoints) String() string { + if si.ords { + return si.s.String() + ".codepoint_ords()" } else { - return "elems" + return si.s.String() + ".codepoints()" } } -func (si stringIterable) Freeze() {} // immutable -func (si stringIterable) Truth() Bool { return True } -func (si stringIterable) Hash() (uint32, error) { return 0, fmt.Errorf("unhashable: %s", si.Type()) } -func (si stringIterable) Iterate() Iterator { return &stringIterator{si, 0} } +func (si stringCodepoints) Type() string { return "string.codepoints" } +func (si stringCodepoints) Freeze() {} // immutable +func (si stringCodepoints) Truth() Bool { return True } +func (si stringCodepoints) Hash() (uint32, error) { return 0, fmt.Errorf("unhashable: %s", si.Type()) } +func (si stringCodepoints) Iterate() Iterator { return &stringCodepointsIterator{si, 0} } -type stringIterator struct { - si stringIterable +type stringCodepointsIterator struct { + si stringCodepoints i int } -func (it *stringIterator) Next(p *Value) bool { +func (it *stringCodepointsIterator) Next(p *Value) bool { s := it.si.s[it.i:] if s == "" { return false } - if it.si.codepoints { - r, sz := utf8.DecodeRuneInString(string(s)) - if !it.si.ords { - *p = s[:sz] + r, sz := utf8.DecodeRuneInString(string(s)) + if !it.si.ords { + if r == utf8.RuneError { + *p = String(r) } else { - *p = MakeInt(int(r)) + *p = s[:sz] } - it.i += sz } else { - b := int(s[0]) - if !it.si.ords { - *p = s[:1] - } else { - *p = MakeInt(b) - } - it.i += 1 + *p = MakeInt(int(r)) } + it.i += sz return true } -func (*stringIterator) Done() {} +func (*stringCodepointsIterator) Done() {} // A Function is a function defined by a Starlark def statement or lambda expression. // The initialization behavior of a Starlark module is also represented by a Function. 
@@ -1084,6 +1124,7 @@ func writeValue(out *strings.Builder, x Value, path []Value) { case nil: out.WriteString("") // indicates a bug + // These four cases are duplicates of T.String(), for efficiency. case NoneType: out.WriteString("None") @@ -1318,6 +1359,8 @@ func Len(x Value) int { switch x := x.(type) { case String: return x.Len() + case Indexable: + return x.Len() case Sequence: return x.Len() } @@ -1335,3 +1378,54 @@ func Iterate(x Value) Iterator { } return nil } + +// Bytes is the type of a Starlark binary string. +// +// A Bytes encapsulates an immutable sequence of bytes. +// It is comparable, indexable, and sliceable, but not directly iterable; +// use bytes.elems() for an iterable view. +// +// In this Go implementation, the elements of 'string' and 'bytes' are +// both bytes, but in other implementations, notably Java, the elements +// of a 'string' are UTF-16 codes (Java chars). The spec abstracts text +// strings as sequences of UTF-k codes that encode Unicode code points, +// and operations that convert from text to binary incur UTF-k-to-UTF-8 +// transcoding; conversely, conversion from binary to text incurs +// UTF-8-to-UTF-k transcoding. Because k=8 for Go, these operations +// are the identity function, at least for valid encodings of text.
+type Bytes string + +var ( + _ Comparable = Bytes("") + _ Sliceable = Bytes("") + _ Indexable = Bytes("") +) + +func (b Bytes) String() string { return fmt.Sprintf("b%q", string(b)) } +func (b Bytes) Type() string { return "bytes" } +func (b Bytes) Freeze() {} // immutable +func (b Bytes) Truth() Bool { return len(b) > 0 } +func (b Bytes) Hash() (uint32, error) { return String(b).Hash() } +func (b Bytes) Len() int { return len(b) } +func (b Bytes) Index(i int) Value { return b[i : i+1] } + +func (b Bytes) Attr(name string) (Value, error) { return builtinAttr(b, name, bytesMethods) } +func (b Bytes) AttrNames() []string { return builtinAttrNames(bytesMethods) } + +func (b Bytes) Slice(start, end, step int) Value { + if step == 1 { + return b[start:end] + } + + sign := signum(step) + var str []byte + for i := start; signum(end-i) == sign; i += step { + str = append(str, b[i]) + } + return Bytes(str) +} + +func (x Bytes) CompareSameType(op syntax.Token, y_ Value, depth int) (bool, error) { + y := y_.(Bytes) + return threeway(op, strings.Compare(string(x), string(y))), nil +} diff --git a/syntax/parse.go b/syntax/parse.go index 0281e4b8..e5548dde 100644 --- a/syntax/parse.go +++ b/syntax/parse.go @@ -771,8 +771,7 @@ func (p *parser) parseArgs() []Expr { } // primary = IDENT -// | INT | FLOAT -// | STRING +// | INT | FLOAT | STRING | BYTES // | '[' ... // list literal or comprehension // | '{' ... // dict literal or comprehension // | '(' ... 
// tuple or parenthesized expression @@ -782,7 +781,7 @@ func (p *parser) parsePrimary() Expr { case IDENT: return p.parseIdent() - case INT, FLOAT, STRING: + case INT, FLOAT, STRING, BYTES: var val interface{} tok := p.tok switch tok { @@ -794,7 +793,7 @@ func (p *parser) parsePrimary() Expr { } case FLOAT: val = p.tokval.float - case STRING: + case STRING, BYTES: val = p.tokval.string } raw := p.tokval.raw diff --git a/syntax/parse_test.go b/syntax/parse_test.go index 76f9eb38..e06fad02 100644 --- a/syntax/parse_test.go +++ b/syntax/parse_test.go @@ -361,9 +361,12 @@ func writeTree(out *bytes.Buffer, x reflect.Value) { case reflect.Struct: switch v := x.Interface().(type) { case syntax.Literal: - if v.Token == syntax.STRING { + switch v.Token { + case syntax.STRING: fmt.Fprintf(out, "%q", v.Value) - } else if v.Token == syntax.INT { + case syntax.BYTES: + fmt.Fprintf(out, "b%q", v.Value) + case syntax.INT: fmt.Fprintf(out, "%d", v.Value) } return diff --git a/syntax/quote.go b/syntax/quote.go index 49cb259c..1e1119c2 100644 --- a/syntax/quote.go +++ b/syntax/quote.go @@ -10,6 +10,7 @@ import ( "fmt" "strconv" "strings" + "unicode" ) // unesc maps single-letter chars following \ to their actual values. @@ -41,15 +42,20 @@ var esc = [256]byte{ } // unquote unquotes the quoted string, returning the actual -// string value, whether the original was triple-quoted, and -// an error describing invalid input. -func unquote(quoted string) (s string, triple bool, err error) { +// string value, whether the original was triple-quoted, +// whether it was a byte string, and an error describing invalid input. +func unquote(quoted string) (s string, triple, isByte bool, err error) { // Check for raw prefix: means don't interpret the inner \. raw := false if strings.HasPrefix(quoted, "r") { raw = true quoted = quoted[1:] } + // Check for bytes prefix. 
+ if strings.HasPrefix(quoted, "b") { + isByte = true + quoted = quoted[1:] + } if len(quoted) < 2 { err = fmt.Errorf("string literal too short") @@ -138,7 +144,7 @@ func unquote(quoted string) (s string, triple bool, err error) { quoted = quoted[2:] case '0', '1', '2', '3', '4', '5', '6', '7': - // Octal escape, up to 3 digits. + // Octal escape, up to 3 digits, \OOO. n := int(quoted[1] - '0') quoted = quoted[2:] for i := 1; i < 3; i++ { @@ -158,7 +164,7 @@ func unquote(quoted string) (s string, triple bool, err error) { buf.WriteByte(byte(n)) case 'x': - // Hexadecimal escape, exactly 2 digits. + // Hexadecimal escape, exactly 2 digits, \xXX. if len(quoted) < 4 { err = fmt.Errorf(`truncated escape sequence %s`, quoted) return @@ -170,6 +176,30 @@ func unquote(quoted string) (s string, triple bool, err error) { } buf.WriteByte(byte(n)) quoted = quoted[4:] + + case 'u', 'U': + // Unicode code point, 4 (\uXXXX) or 8 (\UXXXXXXXX) hex digits. + // Unpaired surrogates are allowed (unlike in Go). 
+ sz := 6 + if quoted[1] == 'U' { + sz = 10 + } + if len(quoted) < sz { + err = fmt.Errorf(`truncated escape sequence %s`, quoted) + return + } + n, err1 := strconv.ParseUint(quoted[2:sz], 16, 0) + if err1 != nil { + err = fmt.Errorf(`invalid escape sequence %s`, quoted[:sz]) + return + } + if n > unicode.MaxRune { + err = fmt.Errorf(`code point out of range: %s (max \U%08x)`, + quoted[:sz], n) + return + } + buf.WriteRune(rune(n)) + quoted = quoted[sz:] } } diff --git a/syntax/quote_test.go b/syntax/quote_test.go index f9068eee..a2471b4f 100644 --- a/syntax/quote_test.go +++ b/syntax/quote_test.go @@ -59,7 +59,7 @@ func TestQuote(t *testing.T) { func TestUnquote(t *testing.T) { for _, tt := range quoteTests { - s, triple, err := unquote(tt.q) + s, triple, _, err := unquote(tt.q) wantTriple := strings.HasPrefix(tt.q, `"""`) || strings.HasPrefix(tt.q, `'''`) if s != tt.s || triple != wantTriple || err != nil { t.Errorf("unquote(%s) = %#q, %v, %v want %#q, %v, nil", tt.q, s, triple, err, tt.s, wantTriple) diff --git a/syntax/scan.go b/syntax/scan.go index 53d9f5c5..caecc360 100644 --- a/syntax/scan.go +++ b/syntax/scan.go @@ -35,6 +35,7 @@ const ( INT // 123 FLOAT // 1.23e45 STRING // "foo" or 'foo' or '''foo''' or r'foo' or r"foo" + BYTES // b"foo", etc // Punctuation PLUS // + @@ -407,7 +408,7 @@ type tokenValue struct { int int64 // decoded int bigInt *big.Int // decoded integers > int64 float float64 // decoded float - string string // decoded string + string string // decoded string or bytes pos Position // start position of token } @@ -627,8 +628,15 @@ start: // identifier or keyword if isIdentStart(c) { - // raw string literal - if c == 'r' && len(sc.rest) > 1 && (sc.rest[1] == '"' || sc.rest[1] == '\'') { + if (c == 'r' || c == 'b') && len(sc.rest) > 1 && (sc.rest[1] == '"' || sc.rest[1] == '\'') { + // r"..." + // b"..." 
+ sc.readRune() + c = sc.peekRune() + return sc.scanString(val, c) + } else if c == 'r' && len(sc.rest) > 2 && sc.rest[1] == 'b' && (sc.rest[2] == '"' || sc.rest[2] == '\'') { + // rb"..." + sc.readRune() sc.readRune() c = sc.peekRune() return sc.scanString(val, c) @@ -872,12 +880,16 @@ func (sc *scanner) scanString(val *tokenValue, quote rune) Token { } val.raw = raw.String() - s, _, err := unquote(val.raw) + s, _, isByte, err := unquote(val.raw) if err != nil { sc.error(start, err.Error()) } val.string = s - return STRING + if isByte { + return BYTES + } else { + return STRING + } } func (sc *scanner) scanNumber(val *tokenValue, c rune) Token { diff --git a/syntax/scan_test.go b/syntax/scan_test.go index 0f2d9f22..daa3d4c4 100644 --- a/syntax/scan_test.go +++ b/syntax/scan_test.go @@ -44,6 +44,8 @@ func scan(src interface{}) (tokens string, err error) { fmt.Fprintf(&buf, "%e", val.float) case STRING: fmt.Fprintf(&buf, "%q", val.string) + case BYTES: + fmt.Fprintf(&buf, "b%q", val.string) default: buf.WriteString(tok.String()) } @@ -192,6 +194,30 @@ pass`, "pass newline pass EOF"}, // consecutive newlines are consolidated {`"\377"`, `"\xff" EOF`}, {`"\378"`, `"\x1f8" EOF`}, // = '\37' + '8' {`"\400"`, `foo.star:1:1: invalid escape sequence \400`}, // unlike Python 2 and 3 + // hex escapes + {`"\x00"`, `"\x00" EOF`}, + {`"\xff"`, `"\xff" EOF`}, + {`"\xFf"`, `"\xff" EOF`}, + {`"\xF"`, `foo.star:1:1: truncated escape sequence \xF`}, + {`"\x"`, `foo.star:1:1: truncated escape sequence \x`}, + {`"\xfg"`, `foo.star:1:1: invalid escape sequence \xfg`}, + // Unicode escapes + // \uXXXX + {`"\u0400"`, `"Ѐ" EOF`}, + {`"\u100"`, `foo.star:1:1: truncated escape sequence \u100`}, + {`"\u04000"`, `"Ѐ0" EOF`}, // = U+0400 + '0' + {`"\u100g"`, `foo.star:1:1: invalid escape sequence \u100g`}, + {`"\u4E16"`, `"世" EOF`}, + {`"\udc00"`, `"�" EOF`}, // unpaired surrogates ok + // \UXXXXXXXX + {`"\U00000400"`, `"Ѐ" EOF`}, + {`"\U0000400"`, `foo.star:1:1: truncated escape sequence 
\U0000400`}, + {`"\U000004000"`, `"Ѐ0" EOF`}, // = U+0400 + '0' + {`"\U1000000g"`, `foo.star:1:1: invalid escape sequence \U1000000g`}, + {`"\U0010FFFF"`, `"\U0010ffff" EOF`}, + {`"\U00110000"`, `foo.star:1:1: code point out of range: \U00110000 (max \U00110000)`}, + {`"\U0001F63F"`, `"😿" EOF`}, + {`"\U0000dc00"`, `"�" EOF`}, // unpaired surrogates ok // backslash escapes // As in Go, a backslash must escape something. diff --git a/syntax/syntax.go b/syntax/syntax.go index b4817c1a..20b28bb6 100644 --- a/syntax/syntax.go +++ b/syntax/syntax.go @@ -251,10 +251,10 @@ func (x *Ident) Span() (start, end Position) { // A Literal represents a literal string or number. type Literal struct { commentsRef - Token Token // = STRING | INT + Token Token // = STRING | BYTES | INT | FLOAT TokenPos Position Raw string // uninterpreted text - Value interface{} // = string | int64 | *big.Int + Value interface{} // = string | int64 | *big.Int | float64 } func (x *Literal) Span() (start, end Position) {