diff --git a/go.mod b/go.mod index 50bc0004..d14060ec 100644 --- a/go.mod +++ b/go.mod @@ -6,6 +6,8 @@ require ( github.com/chzyer/logex v1.1.10 // indirect github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1 // indirect - golang.org/x/sys v0.0.0-20200803210538-64077c9b5642 + github.com/google/go-cmp v0.5.1 // indirect + golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f + golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect google.golang.org/protobuf v1.25.0 ) diff --git a/go.sum b/go.sum index b40c868a..90a8048b 100644 --- a/go.sum +++ b/go.sum @@ -24,8 +24,9 @@ github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5a github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.0 h1:/QaMHBdZ26BB3SSst0Iwl10Epc+xhTquomWX0oZEB6w= github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.1 h1:JFrFEBb2xKufg6XkJsJr+WbKb4FQlURi5RUcBveYu9k= +github.com/google/go-cmp v0.5.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= @@ -42,15 +43,16 @@ golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/sys v0.0.0-20200803210538-64077c9b5642 h1:B6caxRw+hozq68X2MY7jEpZh/cr4/aHLv9xU8Kkadrw= -golang.org/x/sys v0.0.0-20200803210538-64077c9b5642/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f h1:+Nyd8tzPX9R7BWHguqsrbFdRx3WQ/1ib8I44HXV5yTA= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= diff --git a/internal/compile/compile.go b/internal/compile/compile.go index eb8e1627..86407136 100644 --- a/internal/compile/compile.go +++ b/internal/compile/compile.go @@ -33,6 +33,7 @@ import ( "os" "path/filepath" "strconv" 
+ "strings" "sync" "go.starlark.net/resolve" @@ -46,7 +47,7 @@ var Disassemble = false const debug = false // make code generation verbose, for debugging the compiler // Increment this to force recompilation of saved bytecode files. -const Version = 10 +const Version = 11 type Opcode uint8 @@ -306,12 +307,15 @@ func (op Opcode) String() string { type Program struct { Loads []Binding // name (really, string) and position of each load stmt Names []string // names of attributes and predeclared variables - Constants []interface{} // = string | int64 | float64 | *big.Int + Constants []interface{} // = string | int64 | float64 | *big.Int | Bytes Functions []*Funcode Globals []Binding // for error messages and tracing Toplevel *Funcode // module initialization function } +// The type of a byte string literal value, to distinguish from text string. +type Bytes string + // A Funcode is the code of a compiled Starlark function. // // Funcodes are serialized by the encoder.function method, @@ -860,6 +864,8 @@ func PrintOp(fn *Funcode, pc uint32, op Opcode, arg uint32) { switch x := fn.Prog.Constants[arg].(type) { case string: comment = strconv.Quote(x) + case Bytes: + comment = "b" + strconv.Quote(string(x)) default: comment = fmt.Sprint(x) } @@ -1286,8 +1292,12 @@ func (fcomp *fcomp) expr(e syntax.Expr) { fcomp.lookup(e) case *syntax.Literal: - // e.Value is int64, float64, *bigInt, or string. - fcomp.emit1(CONSTANT, fcomp.pcomp.constantIndex(e.Value)) + // e.Value is int64, float64, *bigInt, string + v := e.Value + if e.Token == syntax.BYTES { + v = Bytes(v.(string)) + } + fcomp.emit1(CONSTANT, fcomp.pcomp.constantIndex(v)) case *syntax.ListExpr: for _, x := range e.List { @@ -1525,7 +1535,7 @@ func (fcomp *fcomp) plus(e *syntax.BinaryExpr) { } // addable reports whether e is a statically addable -// expression: a [s]tring, [l]ist, or [t]uple. +// expression: a [s]tring, [b]ytes, [l]ist, or [t]uple. 
func addable(e syntax.Expr) rune { switch e := e.(type) { case *syntax.Literal: @@ -1533,6 +1543,8 @@ func addable(e syntax.Expr) rune { switch e.Token { case syntax.STRING: return 's' + case syntax.BYTES: + return 'b' } case *syntax.ListExpr: return 'l' @@ -1547,12 +1559,16 @@ func addable(e syntax.Expr) rune { // The resulting syntax is degenerate, lacking position, etc. func add(code rune, args []summand) syntax.Expr { switch code { - case 's': - var buf bytes.Buffer + case 's', 'b': + var buf strings.Builder for _, arg := range args { buf.WriteString(arg.x.(*syntax.Literal).Value.(string)) } - return &syntax.Literal{Token: syntax.STRING, Value: buf.String()} + tok := syntax.STRING + if code == 'b' { + tok = syntax.BYTES + } + return &syntax.Literal{Token: tok, Value: buf.String()} case 'l': var elems []syntax.Expr for _, arg := range args { diff --git a/internal/compile/serial.go b/internal/compile/serial.go index 0107ef9c..adadabfc 100644 --- a/internal/compile/serial.go +++ b/internal/compile/serial.go @@ -51,9 +51,10 @@ package compile // // Constant: # type data // type varint # 0=string string -// data ... # 1=int varint -// # 2=float varint (bits as uint64) -// # 3=bigint string (decimal ASCII text) +// data ... # 1=bytes string +// # 2=int varint +// # 3=float varint (bits as uint64) +// # 4=bigint string (decimal ASCII text) // // The encoding starts with a four-byte magic number. 
// The next four bytes are a little-endian uint32 @@ -109,14 +110,17 @@ func (prog *Program) Encode() []byte { case string: e.int(0) e.string(c) - case int64: + case Bytes: e.int(1) + e.string(string(c)) + case int64: + e.int(2) e.int64(c) case float64: - e.int(2) + e.int(3) e.uint64(math.Float64bits(c)) case *big.Int: - e.int(3) + e.int(4) e.string(c.Text(10)) } } @@ -249,10 +253,12 @@ func DecodeProgram(data []byte) (_ *Program, err error) { case 0: c = d.string() case 1: - c = d.int64() + c = Bytes(d.string()) case 2: - c = math.Float64frombits(d.uint64()) + c = d.int64() case 3: + c = math.Float64frombits(d.uint64()) + case 4: c, _ = new(big.Int).SetString(d.string(), 10) } constants[i] = c diff --git a/lib/proto/proto.go b/lib/proto/proto.go index 84aa0d63..149162db 100644 --- a/lib/proto/proto.go +++ b/lib/proto/proto.go @@ -79,8 +79,6 @@ package proto // TODO(adonovan): Go and Starlark API improvements: -// - Contribute the 'bytes' data type to the core language. -// See https://github.com/bazelbuild/starlark/issues/112. // - Make Message and RepeatedField comparable. // (NOTE: proto.Equal works only with generated message types.) // - Support maps, oneof, any. But not messageset if we can avoid it. @@ -234,7 +232,7 @@ func marshal(_ *starlark.Thread, fn *starlark.Builtin, args starlark.Tuple, kwar if err != nil { return nil, fmt.Errorf("%s: %v", fn.Name(), err) } - return Bytes(data), nil + return starlark.Bytes(data), nil } else { text, err := prototext.MarshalOptions{Indent: " "}.Marshal(m.Message()) if err != nil { @@ -247,7 +245,7 @@ func marshal(_ *starlark.Thread, fn *starlark.Builtin, args starlark.Tuple, kwar // unmarshal(msg) decodes a binary protocol message to a Message. 
func unmarshal(thread *starlark.Thread, fn *starlark.Builtin, args starlark.Tuple, kwargs []starlark.Tuple) (starlark.Value, error) { var desc MessageDescriptor - var data Bytes + var data starlark.Bytes if err := starlark.UnpackPositionalArgs(fn.Name(), args, kwargs, 2, &desc, &data); err != nil { return nil, err } @@ -486,7 +484,7 @@ func toProto(fdesc protoreflect.FieldDescriptor, v starlark.Value) (protoreflect case protoreflect.StringKind: if s, ok := starlark.AsString(v); ok { return protoreflect.ValueOfString(s), nil - } else if b, ok := v.(Bytes); ok { + } else if b, ok := v.(starlark.Bytes); ok { // TODO(adonovan): allow bytes for string? Not friendly to a Java port. return protoreflect.ValueOfBytes([]byte(b)), nil } @@ -497,7 +495,7 @@ func toProto(fdesc protoreflect.FieldDescriptor, v starlark.Value) (protoreflect // Instead provide b"..." literals in the core // and a bytes(str) conversion. return protoreflect.ValueOfBytes([]byte(s)), nil - } else if b, ok := v.(Bytes); ok { + } else if b, ok := v.(starlark.Bytes); ok { return protoreflect.ValueOfBytes([]byte(b)), nil } @@ -588,7 +586,7 @@ func toStarlark1(typ protoreflect.FieldDescriptor, x protoreflect.Value, frozen return starlark.String(x.String()) case protoreflect.BytesKind: - return Bytes(x.Bytes()) + return starlark.Bytes(x.Bytes()) case protoreflect.DoubleKind, protoreflect.FloatKind: return starlark.Float(x.Float()) @@ -1232,78 +1230,3 @@ func (x EnumValueDescriptor) CompareSameType(op syntax.Token, y_ starlark.Value, return false, fmt.Errorf("%s %s %s not implemented", x.Type(), op, y_.Type()) } } - -// A Bytes is an immutable sequence of bytes. -// It is comparable, iterable, indexable, and sliceable. -// -// (In go.starlark.net, text Strings are also byte strings, -// but we shouldn't rely on that. -// See https://github.com/bazelbuild/starlark/issues/112.) 
-type Bytes string - -var ( - _ starlark.Comparable = Bytes("") - _ starlark.Iterable = Bytes("") - _ starlark.Sliceable = Bytes("") - _ starlark.Sequence = Bytes("") -) - -func (b Bytes) String() string { return fmt.Sprintf("<%d bytes>", len(b)) } -func (b Bytes) Type() string { return "bytes" } -func (b Bytes) Freeze() {} // immutable -func (b Bytes) Truth() starlark.Bool { return len(b) > 0 } -func (b Bytes) Hash() (uint32, error) { return starlark.String(b).Hash() } -func (b Bytes) Len() int { return len(b) } -func (b Bytes) Index(i int) starlark.Value { return starlark.MakeInt(int(b[i])) } - -func (b Bytes) Slice(start, end, step int) starlark.Value { - if step == 1 { - return b[start:end] - } - - sign := signum(step) - var str []byte - for i := start; signum(end-i) == sign; i += step { - str = append(str, b[i]) - } - return Bytes(str) -} - -// From Hacker's Delight, section 2.8. -func signum64(x int64) int { return int(uint64(x>>63) | uint64(-x)>>63) } -func signum(x int) int { return signum64(int64(x)) } - -func (b Bytes) Iterate() starlark.Iterator { return &bytesIterator{string(b)} } - -type bytesIterator struct{ string } - -func (it *bytesIterator) Next(p *starlark.Value) bool { - if it.string == "" { - return false - } - *p = starlark.MakeInt(int(it.string[0])) - it.string = it.string[1:] - return true -} - -func (it *bytesIterator) Done() {} - -func (x Bytes) CompareSameType(op syntax.Token, y_ starlark.Value, depth int) (bool, error) { - y := y_.(Bytes) - cmp := strings.Compare(string(x), string(y)) - switch op { - case syntax.EQL: - return cmp == 0, nil - case syntax.NEQ: - return cmp != 0, nil - case syntax.LE: - return cmp <= 0, nil - case syntax.LT: - return cmp < 0, nil - case syntax.GE: - return cmp >= 0, nil - case syntax.GT: - return cmp > 0, nil - } - panic(op) -} diff --git a/starlark/eval.go b/starlark/eval.go index c9bbb67b..9bc87709 100644 --- a/starlark/eval.go +++ b/starlark/eval.go @@ -478,6 +478,8 @@ func makeToplevelFunction(prog 
*compile.Program, predeclared StringDict) *Functi v = MakeBigInt(c) case string: v = String(c) + case compile.Bytes: + v = Bytes(c) case float64: v = Float(c) default: @@ -796,6 +798,8 @@ func Binary(op syntax.Token, x, y Value) (Value, error) { return xf * y, nil case String: return stringRepeat(y, x) + case Bytes: + return bytesRepeat(y, x) case *List: elems, err := tupleRepeat(Tuple(y.elems), x) if err != nil { @@ -820,6 +824,10 @@ func Binary(op syntax.Token, x, y Value) (Value, error) { if y, ok := y.(Int); ok { return stringRepeat(x, y) } + case Bytes: + if y, ok := y.(Int); ok { + return bytesRepeat(x, y) + } case *List: if y, ok := y.(Int); ok { elems, err := tupleRepeat(Tuple(x.elems), y) @@ -1138,6 +1146,11 @@ func tupleRepeat(elems Tuple, n Int) (Tuple, error) { return res, nil } +func bytesRepeat(b Bytes, n Int) (Bytes, error) { + res, err := stringRepeat(String(b), n) + return Bytes(res), err +} + func stringRepeat(s String, n Int) (String, error) { if s == "" { return "", nil diff --git a/starlark/eval_test.go b/starlark/eval_test.go index 81f8c580..b5c0a131 100644 --- a/starlark/eval_test.go +++ b/starlark/eval_test.go @@ -117,6 +117,7 @@ func TestExecFile(t *testing.T) { "testdata/assign.star", "testdata/bool.star", "testdata/builtins.star", + "testdata/bytes.star", "testdata/control.star", "testdata/dict.star", "testdata/float.star", diff --git a/starlark/library.go b/starlark/library.go index 17638240..cd73f089 100644 --- a/starlark/library.go +++ b/starlark/library.go @@ -42,6 +42,7 @@ func init() { "any": NewBuiltin("any", any), "all": NewBuiltin("all", all), "bool": NewBuiltin("bool", bool_), + "bytes": NewBuiltin("bytes", bytes_), "chr": NewBuiltin("chr", chr), "dict": NewBuiltin("dict", dict), "dir": NewBuiltin("dir", dir), @@ -73,6 +74,10 @@ func init() { // methods of built-in types // https://github.com/google/starlark-go/blob/master/doc/spec.md#built-in-methods var ( + bytesMethods = map[string]*Builtin{ + "elems": NewBuiltin("elems", 
bytes_elems), + } + + dictMethods = map[string]*Builtin{ "clear": NewBuiltin("clear", dict_clear), "get": NewBuiltin("get", dict_get), @@ -198,6 +203,45 @@ func bool_(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, error return x.Truth(), nil } +// https://github.com/google/starlark-go/blob/master/doc/spec.md#bytes +func bytes_(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, error) { + if len(kwargs) > 0 { + return nil, fmt.Errorf("bytes does not accept keyword arguments") + } + if len(args) != 1 { + return nil, fmt.Errorf("bytes: got %d arguments, want exactly 1", len(args)) + } + switch x := args[0].(type) { + case Bytes: + return x, nil + case String: + // Invalid encodings are replaced by that of U+FFFD. + return Bytes(utf8Transcode(string(x))), nil + case Iterable: + // iterable of numeric byte values + var buf strings.Builder + if n := Len(x); n >= 0 { + // common case: known length + buf.Grow(n) + } + iter := x.Iterate() + defer iter.Done() + var elem Value + var b byte + for i := 0; iter.Next(&elem); i++ { + if err := AsInt(elem, &b); err != nil { + return nil, fmt.Errorf("bytes: at index %d, %s", i, err) + } + buf.WriteByte(b) + } + return Bytes(buf.String()), nil + + default: + // Unlike str(foo), which stringifies it, bytes(foo) is an error. 
+ return nil, fmt.Errorf("bytes: got %s, want string, bytes, or iterable of ints", x.Type()) + } +} + // https://github.com/google/starlark-go/blob/master/doc/spec.md#chr func chr(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, error) { if len(kwargs) > 0 { @@ -261,9 +305,6 @@ func enumerate(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, e } iter := iterable.Iterate() - if iter == nil { - return nil, fmt.Errorf("enumerate: got %s, want iterable", iterable.Type()) - } defer iter.Done() var pairs []Value @@ -693,16 +734,26 @@ func ord(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, error) if len(args) != 1 { return nil, fmt.Errorf("ord: got %d arguments, want 1", len(args)) } - s, ok := AsString(args[0]) - if !ok { - return nil, fmt.Errorf("ord: got %s, want string", args[0].Type()) - } - r, sz := utf8.DecodeRuneInString(s) - if sz == 0 || sz != len(s) { - n := utf8.RuneCountInString(s) - return nil, fmt.Errorf("ord: string encodes %d Unicode code points, want 1", n) + switch x := args[0].(type) { + case String: + // ord(string) returns int value of sole rune. + s := string(x) + r, sz := utf8.DecodeRuneInString(s) + if sz == 0 || sz != len(s) { + n := utf8.RuneCountInString(s) + return nil, fmt.Errorf("ord: string encodes %d Unicode code points, want 1", n) + } + return MakeInt(int(r)), nil + + case Bytes: + // ord(bytes) returns int value of sole byte. 
+ if len(x) != 1 { + return nil, fmt.Errorf("ord: bytes has length %d, want 1", len(x)) + } + return MakeInt(int(x[0])), nil + default: + return nil, fmt.Errorf("ord: got %s, want string or bytes", x.Type()) } - return MakeInt(int(r)), nil } // https://github.com/google/starlark-go/blob/master/doc/spec.md#print @@ -718,6 +769,8 @@ func print(thread *Thread, b *Builtin, args Tuple, kwargs []Tuple) (Value, error } if s, ok := AsString(v); ok { buf.WriteString(s) + } else if b, ok := v.(Bytes); ok { + buf.WriteString(string(b)) } else { writeValue(buf, v, nil) } @@ -995,11 +1048,29 @@ func str(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, error) if len(args) != 1 { return nil, fmt.Errorf("str: got %d arguments, want exactly 1", len(args)) } - x := args[0] - if _, ok := AsString(x); !ok { - x = String(x.String()) + switch x := args[0].(type) { + case String: + return x, nil + case Bytes: + // Invalid encodings are replaced by that of U+FFFD. + return String(utf8Transcode(string(x))), nil + default: + return String(x.String()), nil } - return x, nil +} + +// utf8Transcode returns the UTF-8-to-UTF-8 transcoding of s. +// The effect is that each code unit that is part of an +// invalid sequence is replaced by U+FFFD. 
+func utf8Transcode(s string) string { + if utf8.ValidString(s) { + return s + } + var out strings.Builder + for _, r := range s { + out.WriteRune(r) + } + return out.String() } // https://github.com/google/starlark-go/blob/master/doc/spec.md#tuple @@ -1376,13 +1447,51 @@ func string_iterable(_ *Thread, b *Builtin, args Tuple, kwargs []Tuple) (Value, if err := UnpackPositionalArgs(b.Name(), args, kwargs, 0); err != nil { return nil, err } - return stringIterable{ - s: b.Receiver().(String), - ords: b.Name()[len(b.Name())-2] == 'd', - codepoints: b.Name()[0] == 'c', - }, nil + s := b.Receiver().(String) + ords := b.Name()[len(b.Name())-2] == 'd' + codepoints := b.Name()[0] == 'c' + if codepoints { + return stringCodepoints{s, ords}, nil + } else { + return stringElems{s, ords}, nil + } +} + +// bytes_elems returns an unspecified iterable value whose +// iterator yields the int values of successive elements. +func bytes_elems(_ *Thread, b *Builtin, args Tuple, kwargs []Tuple) (Value, error) { + if err := UnpackPositionalArgs(b.Name(), args, kwargs, 0); err != nil { + return nil, err + } + return bytesIterable{b.Receiver().(Bytes)}, nil } +// A bytesIterable is an iterable returned by bytes.elems(), +// whose iterator yields a sequence of numeric byte values. 
+type bytesIterable struct{ bytes Bytes } + +var _ Iterable = (*bytesIterable)(nil) + +func (bi bytesIterable) String() string { return bi.bytes.String() + ".elems()" } +func (bi bytesIterable) Type() string { return "bytes.elems" } +func (bi bytesIterable) Freeze() {} // immutable +func (bi bytesIterable) Truth() Bool { return True } +func (bi bytesIterable) Hash() (uint32, error) { return 0, fmt.Errorf("unhashable: %s", bi.Type()) } +func (bi bytesIterable) Iterate() Iterator { return &bytesIterator{bi.bytes} } + +type bytesIterator struct{ bytes Bytes } + +func (it *bytesIterator) Next(p *Value) bool { + if it.bytes == "" { + return false + } + *p = MakeInt(int(it.bytes[0])) + it.bytes = it.bytes[1:] + return true +} + +func (*bytesIterator) Done() {} + // https://github.com/google/starlark-go/blob/master/doc/spec.md#string·count func string_count(_ *Thread, b *Builtin, args Tuple, kwargs []Tuple) (Value, error) { var sub string diff --git a/starlark/testdata/bytes.star b/starlark/testdata/bytes.star new file mode 100644 index 00000000..d524cb44 --- /dev/null +++ b/starlark/testdata/bytes.star @@ -0,0 +1,131 @@ +# Tests of 'bytes' (immutable byte strings). 
+ +load("assert.star", "assert") + +# bytes(string) -- UTF-k to UTF-8 transcoding with U+FFFD replacement +hello = bytes("hello, 世界") +goodbye = bytes("goodbye") +empty = bytes("") +nonprinting = bytes("\t\n\u200D") # TAB, NEWLINE, DEL, ZERO_WIDTH_JOINER +assert.eq(bytes("hello, 世界"[:-1]), b"hello, 世��") + +# bytes(iterable of int) -- construct from numeric byte values +assert.eq(bytes([65, 66, 67]), b"ABC") +assert.eq(bytes((65, 66, 67)), b"ABC") +assert.eq(bytes([0xf0, 0x9f, 0x98, 0xbf]), b"😿") +assert.fails(lambda: bytes([300]), + "at index 0, 300 out of range .want value in unsigned 8-bit range") +assert.fails(lambda: bytes([b"a"]), + "at index 0, got bytes, want int") +assert.fails(lambda: bytes(1), "want string, bytes, or iterable of ints") + +# literals +assert.eq(b"hello, 世界", hello) +assert.eq(b"goodbye", goodbye) +assert.eq(b"", empty) +assert.eq(b"\t\n\u200D", nonprinting) +assert.ne("abc", b"abc") +assert.eq(b"\012\xff\u0400\U0001F63F", b"\n\xffЀ😿") # see scanner tests for more +assert.eq(rb"\r\n\t", b"\\r\\n\\t") # raw + +# type +assert.eq(type(hello), "bytes") + +# len +assert.eq(len(hello), 13) +assert.eq(len(goodbye), 7) +assert.eq(len(empty), 0) + +# truth +assert.true(hello) +assert.true(goodbye) +assert.true(not empty) + +# str(bytes) does UTF-8 to UTF-k transcoding. 
+assert.eq(str(hello), "hello, 世界") +assert.eq(str(hello[:-1]), "hello, 世��") # incomplete UTF-8 encoding => U+FFFD +assert.eq(str(goodbye), "goodbye") +assert.eq(str(empty), "") +assert.eq(str(nonprinting), "\t\n\u200d") +assert.eq(str(b"\udc00"), "�") # unpaired surrogate => U+FFFD + +# repr +assert.eq(repr(hello), r'b"hello, 世界"') +assert.eq(repr(hello[:-1]), r'b"hello, 世\xe7\x95"') # (incomplete UTF-8 encoding ) +assert.eq(repr(goodbye), 'b"goodbye"') +assert.eq(repr(empty), 'b""') +assert.eq(repr(nonprinting), 'b"\\t\\n\\u200d"') + +# equality +assert.eq(hello, hello) +assert.ne(hello, goodbye) +assert.eq(bytes("goodbye"), goodbye) + +# ordered comparison +assert.lt(bytes("abc"), bytes("abd")) +assert.lt(bytes("abc"), bytes("abcd")) +assert.lt(bytes("\x7f"), bytes("\x80")) # bytes compare as uint8, not int8 + +# bytes are dict-hashable +dict = {hello: 1, goodbye: 2} +dict[bytes("goodbye")] = 3 +assert.eq(len(dict), 2) +assert.eq(dict[goodbye], 3) + +# indexing +assert.eq(goodbye[0], bytes("g")) +assert.eq(goodbye[-1], bytes("e")) +assert.fails(lambda: goodbye[100], "out of range") + +# slicing +assert.eq(goodbye[:4], bytes("good")) +assert.eq(goodbye[4:], bytes("bye")) +assert.eq(goodbye[::2], bytes("gobe")) +assert.eq(goodbye[3:4], bytes("d")) # special case: len=1 +assert.eq(goodbye[4:4], bytes("")) # special case: len=0 + +# ord +assert.eq(ord(b"a"), 97) +assert.fails(lambda: ord(b"ab"), "ord: bytes has length 2, want 1") +assert.fails(lambda: ord(b""), "ord: bytes has length 0, want 1") + +# repeat (bytes * int) +assert.eq(goodbye * 3, bytes("goodbyegoodbyegoodbye")) +assert.eq(3 * goodbye, bytes("goodbyegoodbyegoodbye")) + +# elems() returns an iterable value over the numeric values of successive bytes. 
+assert.eq(type(hello.elems()), "bytes.elems") +assert.eq(str(hello.elems()), "b\"hello, 世界\".elems()") +assert.eq(list(hello.elems()), [104, 101, 108, 108, 111, 44, 32, 228, 184, 150, 231, 149, 140]) +assert.eq(bytes([104, 101, 108, 108, 111, 44, 32, 228, 184, 150, 231, 149, 140]), hello) +assert.eq(list(goodbye.elems()), [103, 111, 111, 100, 98, 121, 101]) +assert.eq(list(empty.elems()), []) + +# TODO(adonovan): +# - more methods: find, index, split, etc. +# - hash(bytes)? +# +# Summary of string operations (put this in spec). +# +# string to number: +# - bytes[i] returns numeric value of ith byte. +# - ord(string) returns numeric value of sole code point in string. +# - ord(string[i]) is not a useful operation: fails on non-ASCII; see below. +# Q. Perhaps ord should return the first (not sole) code point? Then it becomes a UTF-8 decoder. +# Perhaps ord(string, index=int) should apply the index and relax the len=1 check. +# - string.codepoints() iterates over 1-codepoint substrings. +# - string.codepoint_ords() iterates over numeric values of code points in string. +# - string.elems() iterates over 1-element (UTF-k code) substrings. +# - string.elem_ords() iterates over numeric UTF-k code values. +# - string.elem_ords()[i] returns numeric value of ith element (UTF-k code). +# - string.elems()[i] returns substring of a single element (UTF-k code). +# - int(string) parses string as decimal (or other) numeric literal. +# +# number to string: +# - chr(int) returns string, UTF-k encoding of Unicode code point (like Python). +# Redundant with '%c' % int (which Python2 calls 'unichr'.) +# - bytes(chr(int)) returns byte string containing UTF-8 encoding of one code point. +# - bytes([int]) returns 1-byte string (with regrettable list allocation). +# - str(int) - format number as decimal. + +# - TODO(adonovan): how to create a 1-element string? 
(analogous to bytes([int])) diff --git a/starlark/testdata/json.star b/starlark/testdata/json.star index ef33d91a..7c7b316c 100644 --- a/starlark/testdata/json.star +++ b/starlark/testdata/json.star @@ -23,7 +23,7 @@ assert.eq(json.encode(range(3)), "[0,1,2]") # a built-in iterable assert.eq(json.encode(dict(x = 1, y = "two")), '{"x":1,"y":"two"}') assert.eq(json.encode(dict(y = "two", x = 1)), '{"x":1,"y":"two"}') # key, not insertion, order assert.eq(json.encode(struct(x = 1, y = "two")), '{"x":1,"y":"two"}') # a user-defined HasAttrs -assert.eq(json.encode("\x80"), '"\\ufffd"') # invalid UTF-8 -> replacement char +assert.eq(json.encode("😹"[:1]), '"\\ufffd"') # invalid UTF-8 -> replacement char def encode_error(expr, error): assert.fails(lambda: json.encode(expr), error) diff --git a/starlark/testdata/string.star b/starlark/testdata/string.star index 859f6458..c46e8745 100644 --- a/starlark/testdata/string.star +++ b/starlark/testdata/string.star @@ -6,6 +6,14 @@ load("assert.star", "assert") # raw string literals: assert.eq(r"a\bc", "a\\bc") +# Hex and octal escapes may encode any byte value +# even in a "text" (not 'bytes') string. +# This is not required by the spec, but is necessary +# in the Go implementation so that repr(x) works for +# invalid Unicode, such as half an emoji. 
+assert.eq(list("\x80\377".elems()), ["\x80", "\xff"]) +assert.eq(list("\x80\377".codepoints()), ["�", "�"]) + # truth assert.true("abc") assert.true(chr(0)) @@ -46,28 +54,34 @@ assert.fails(lambda: ord(""), "string encodes 0 Unicode code points, want 1") assert.fails(lambda: ord("😿"[1:]), "string encodes 3 Unicode code points, want 1") # 3 x 0xFFFD # string.codepoint_ords -assert.eq(type("abcЙ😿".codepoint_ords()), "codepoints") +assert.eq(type("abcЙ😿".codepoint_ords()), "string.codepoints") assert.eq(str("abcЙ😿".codepoint_ords()), '"abcЙ😿".codepoint_ords()') assert.eq(list("abcЙ😿".codepoint_ords()), [97, 98, 99, 1049, 128575]) assert.eq(list(("A" + "😿Z"[1:]).codepoint_ords()), [ord("A"), 0xFFFD, 0xFFFD, 0xFFFD, ord("Z")]) assert.eq(list("".codepoint_ords()), []) +assert.fails(lambda: "abcЙ😿".codepoint_ords()[2], "unhandled index") # not indexable +assert.fails(lambda: len("abcЙ😿".codepoint_ords()), "no len") # unknown length # string.codepoints -assert.eq(type("abcЙ😿".codepoints()), "codepoints") +assert.eq(type("abcЙ😿".codepoints()), "string.codepoints") assert.eq(str("abcЙ😿".codepoints()), '"abcЙ😿".codepoints()') assert.eq(list("abcЙ😿".codepoints()), ["a", "b", "c", "Й", "😿"]) -assert.eq(list(("A" + "😿Z"[1:]).codepoints()), ["A", "\x9f", "\x98", "\xbf", "Z"]) +assert.eq(list(("A" + "😿Z"[1:]).codepoints()), ["A", "�", "�", "�", "Z"]) assert.eq(list("".codepoints()), []) +assert.fails(lambda: "abcЙ😿".codepoints()[2], "unhandled index") # not indexable +assert.fails(lambda: len("abcЙ😿".codepoints()), "no len") # unknown length # string.elem_ords -assert.eq(type("abcЙ😿".elem_ords()), "elems") +assert.eq(type("abcЙ😿".elem_ords()), "string.elems") assert.eq(str("abcЙ😿".elem_ords()), '"abcЙ😿".elem_ords()') assert.eq(list("abcЙ😿".elem_ords()), [97, 98, 99, 208, 153, 240, 159, 152, 191]) assert.eq(list(("A" + "😿Z"[1:]).elem_ords()), [65, 159, 152, 191, 90]) assert.eq(list("".elem_ords()), []) +assert.eq("abcЙ😿".elem_ords()[2], 99) # indexable 
+assert.eq(len("abcЙ😿".elem_ords()), 9) # known length # string.elems -assert.eq(type("abcЙ😿".elems()), "elems") +assert.eq(type("abcЙ😿".elems()), "string.elems") assert.eq(str("abcЙ😿".elems()), '"abcЙ😿".elems()') assert.eq( list("abcЙ😿".elems()), @@ -78,6 +92,8 @@ assert.eq( ["A", "\x9f", "\x98", "\xbf", "Z"], ) assert.eq(list("".elems()), []) +assert.eq("abcЙ😿".elems()[2], "c") # indexable +assert.eq(len("abcЙ😿".elems()), 9) # known length # indexing, x[i] assert.eq("Hello, 世界!"[0], "H") diff --git a/starlark/value.go b/starlark/value.go index bcec7508..ce6cf332 100644 --- a/starlark/value.go +++ b/starlark/value.go @@ -499,13 +499,20 @@ func (f Float) Unary(op syntax.Token) (Value, error) { return nil, nil } -// String is the type of a Starlark string. +// String is the type of a Starlark text string. // // A String encapsulates an an immutable sequence of bytes, // but strings are not directly iterable. Instead, iterate // over the result of calling one of these four methods: // codepoints, codepoint_ords, elems, elem_ords. // +// Strings typically contain text; use Bytes for binary strings. +// The Starlark spec defines text strings as sequences of UTF-k +// codes that encode Unicode code points. In this Go implementation, +// k=8, whereas in a Java implementation, k=16. For portability, +// operations on strings should aim to avoid assumptions about +// the value of k. +// // Warning: the contract of the Value interface's String method is that // it returns the value printed in Starlark notation, // so s.String() or fmt.Sprintf("%s", s) returns a quoted string. @@ -545,73 +552,106 @@ func (x String) CompareSameType(op syntax.Token, y_ Value, depth int) (bool, err func AsString(x Value) (string, bool) { v, ok := x.(String); return string(v), ok } -// A stringIterable is an iterable whose iterator yields a sequence of -// either Unicode code points or elements (bytes), -// either numerically or as successive substrings. 
-type stringIterable struct { - s String - ords bool - codepoints bool +// A stringElems is an iterable whose iterator yields a sequence of +// elements (bytes), either numerically or as successive substrings. +// It is an indexable sequence. +type stringElems struct { + s String + ords bool } -var _ Iterable = (*stringIterable)(nil) +var ( + _ Iterable = (*stringElems)(nil) + _ Indexable = (*stringElems)(nil) +) -func (si stringIterable) String() string { - var etype string - if si.codepoints { - etype = "codepoint" +func (si stringElems) String() string { + if si.ords { + return si.s.String() + ".elem_ords()" } else { - etype = "elem" + return si.s.String() + ".elems()" } +} +func (si stringElems) Type() string { return "string.elems" } +func (si stringElems) Freeze() {} // immutable +func (si stringElems) Truth() Bool { return True } +func (si stringElems) Hash() (uint32, error) { return 0, fmt.Errorf("unhashable: %s", si.Type()) } +func (si stringElems) Iterate() Iterator { return &stringElemsIterator{si, 0} } +func (si stringElems) Len() int { return len(si.s) } +func (si stringElems) Index(i int) Value { if si.ords { - return si.s.String() + "." + etype + "_ords()" + return MakeInt(int(si.s[i])) } else { - return si.s.String() + "." + etype + "s()" + // TODO(adonovan): opt: preallocate canonical 1-byte strings + // to avoid interface allocation. + return si.s[i : i+1] + } +} + +type stringElemsIterator struct { + si stringElems + i int +} + +func (it *stringElemsIterator) Next(p *Value) bool { + if it.i == len(it.si.s) { + return false } + *p = it.si.Index(it.i) + it.i++ + return true +} + +func (*stringElemsIterator) Done() {} + +// A stringCodepoints is an iterable whose iterator yields a sequence of +// Unicode code points, either numerically or as successive substrings. +// It is not indexable. 
+type stringCodepoints struct { + s String + ords bool } -func (si stringIterable) Type() string { - if si.codepoints { - return "codepoints" + +var _ Iterable = (*stringCodepoints)(nil) + +func (si stringCodepoints) String() string { + if si.ords { + return si.s.String() + ".codepoint_ords()" } else { - return "elems" + return si.s.String() + ".codepoints()" } } -func (si stringIterable) Freeze() {} // immutable -func (si stringIterable) Truth() Bool { return True } -func (si stringIterable) Hash() (uint32, error) { return 0, fmt.Errorf("unhashable: %s", si.Type()) } -func (si stringIterable) Iterate() Iterator { return &stringIterator{si, 0} } +func (si stringCodepoints) Type() string { return "string.codepoints" } +func (si stringCodepoints) Freeze() {} // immutable +func (si stringCodepoints) Truth() Bool { return True } +func (si stringCodepoints) Hash() (uint32, error) { return 0, fmt.Errorf("unhashable: %s", si.Type()) } +func (si stringCodepoints) Iterate() Iterator { return &stringCodepointsIterator{si, 0} } -type stringIterator struct { - si stringIterable +type stringCodepointsIterator struct { + si stringCodepoints i int } -func (it *stringIterator) Next(p *Value) bool { +func (it *stringCodepointsIterator) Next(p *Value) bool { s := it.si.s[it.i:] if s == "" { return false } - if it.si.codepoints { - r, sz := utf8.DecodeRuneInString(string(s)) - if !it.si.ords { - *p = s[:sz] + r, sz := utf8.DecodeRuneInString(string(s)) + if !it.si.ords { + if r == utf8.RuneError { + *p = String(r) } else { - *p = MakeInt(int(r)) + *p = s[:sz] } - it.i += sz } else { - b := int(s[0]) - if !it.si.ords { - *p = s[:1] - } else { - *p = MakeInt(b) - } - it.i += 1 + *p = MakeInt(int(r)) } + it.i += sz return true } -func (*stringIterator) Done() {} +func (*stringCodepointsIterator) Done() {} // A Function is a function defined by a Starlark def statement or lambda expression. // The initialization behavior of a Starlark module is also represented by a Function. 
@@ -1084,6 +1124,7 @@ func writeValue(out *strings.Builder, x Value, path []Value) { case nil: out.WriteString("") // indicates a bug + // These four cases are duplicates of T.String(), for efficiency. case NoneType: out.WriteString("None") @@ -1318,6 +1359,8 @@ func Len(x Value) int { switch x := x.(type) { case String: return x.Len() + case Indexable: + return x.Len() case Sequence: return x.Len() } @@ -1335,3 +1378,54 @@ func Iterate(x Value) Iterator { } return nil } + +// Bytes is the type of a Starlark binary string. +// +// A Bytes encapsulates an immutable sequence of bytes. +// It is comparable, indexable, and sliceable, but not directly iterable; +// use bytes.elems() for an iterable view. +// +// In this Go implementation, the elements of 'string' and 'bytes' are +// both bytes, but in other implementations, notably Java, the elements +// of a 'string' are UTF-16 codes (Java chars). The spec abstracts text +// strings as sequences of UTF-k codes that encode Unicode code points, +// and operations that convert from text to binary incur UTF-k-to-UTF-8 +// transcoding; conversely, conversion from binary to text incurs +// UTF-8-to-UTF-k transcoding. Because k=8 for Go, these operations +// are the identity function, at least for valid encodings of text. 
+type Bytes string + +var ( + _ Comparable = Bytes("") + _ Sliceable = Bytes("") + _ Indexable = Bytes("") +) + +func (b Bytes) String() string { return fmt.Sprintf("b%q", string(b)) } +func (b Bytes) Type() string { return "bytes" } +func (b Bytes) Freeze() {} // immutable +func (b Bytes) Truth() Bool { return len(b) > 0 } +func (b Bytes) Hash() (uint32, error) { return String(b).Hash() } +func (b Bytes) Len() int { return len(b) } +func (b Bytes) Index(i int) Value { return b[i : i+1] } + +func (b Bytes) Attr(name string) (Value, error) { return builtinAttr(b, name, bytesMethods) } +func (b Bytes) AttrNames() []string { return builtinAttrNames(bytesMethods) } + +func (b Bytes) Slice(start, end, step int) Value { + if step == 1 { + return b[start:end] + } + + sign := signum(step) + var str []byte + for i := start; signum(end-i) == sign; i += step { + str = append(str, b[i]) + } + return Bytes(str) +} + +func (x Bytes) CompareSameType(op syntax.Token, y_ Value, depth int) (bool, error) { + y := y_.(Bytes) + return threeway(op, strings.Compare(string(x), string(y))), nil +} diff --git a/syntax/parse.go b/syntax/parse.go index 0281e4b8..e5548dde 100644 --- a/syntax/parse.go +++ b/syntax/parse.go @@ -771,8 +771,7 @@ func (p *parser) parseArgs() []Expr { } // primary = IDENT -// | INT | FLOAT -// | STRING +// | INT | FLOAT | STRING | BYTES // | '[' ... // list literal or comprehension // | '{' ... // dict literal or comprehension // | '(' ... 
// tuple or parenthesized expression @@ -782,7 +781,7 @@ func (p *parser) parsePrimary() Expr { case IDENT: return p.parseIdent() - case INT, FLOAT, STRING: + case INT, FLOAT, STRING, BYTES: var val interface{} tok := p.tok switch tok { @@ -794,7 +793,7 @@ func (p *parser) parsePrimary() Expr { } case FLOAT: val = p.tokval.float - case STRING: + case STRING, BYTES: val = p.tokval.string } raw := p.tokval.raw diff --git a/syntax/parse_test.go b/syntax/parse_test.go index 76f9eb38..e06fad02 100644 --- a/syntax/parse_test.go +++ b/syntax/parse_test.go @@ -361,9 +361,12 @@ func writeTree(out *bytes.Buffer, x reflect.Value) { case reflect.Struct: switch v := x.Interface().(type) { case syntax.Literal: - if v.Token == syntax.STRING { + switch v.Token { + case syntax.STRING: fmt.Fprintf(out, "%q", v.Value) - } else if v.Token == syntax.INT { + case syntax.BYTES: + fmt.Fprintf(out, "b%q", v.Value) + case syntax.INT: fmt.Fprintf(out, "%d", v.Value) } return diff --git a/syntax/quote.go b/syntax/quote.go index 49cb259c..1e1119c2 100644 --- a/syntax/quote.go +++ b/syntax/quote.go @@ -10,6 +10,7 @@ import ( "fmt" "strconv" "strings" + "unicode" ) // unesc maps single-letter chars following \ to their actual values. @@ -41,15 +42,20 @@ var esc = [256]byte{ } // unquote unquotes the quoted string, returning the actual -// string value, whether the original was triple-quoted, and -// an error describing invalid input. -func unquote(quoted string) (s string, triple bool, err error) { +// string value, whether the original was triple-quoted, +// whether it was a byte string, and an error describing invalid input. +func unquote(quoted string) (s string, triple, isByte bool, err error) { // Check for raw prefix: means don't interpret the inner \. raw := false if strings.HasPrefix(quoted, "r") { raw = true quoted = quoted[1:] } + // Check for bytes prefix. 
+ if strings.HasPrefix(quoted, "b") { + isByte = true + quoted = quoted[1:] + } if len(quoted) < 2 { err = fmt.Errorf("string literal too short") @@ -138,7 +144,7 @@ func unquote(quoted string) (s string, triple bool, err error) { quoted = quoted[2:] case '0', '1', '2', '3', '4', '5', '6', '7': - // Octal escape, up to 3 digits. + // Octal escape, up to 3 digits, \OOO. n := int(quoted[1] - '0') quoted = quoted[2:] for i := 1; i < 3; i++ { @@ -158,7 +164,7 @@ func unquote(quoted string) (s string, triple bool, err error) { buf.WriteByte(byte(n)) case 'x': - // Hexadecimal escape, exactly 2 digits. + // Hexadecimal escape, exactly 2 digits, \xXX. if len(quoted) < 4 { err = fmt.Errorf(`truncated escape sequence %s`, quoted) return @@ -170,6 +176,30 @@ func unquote(quoted string) (s string, triple bool, err error) { } buf.WriteByte(byte(n)) quoted = quoted[4:] + + case 'u', 'U': + // Unicode code point, 4 (\uXXXX) or 8 (\UXXXXXXXX) hex digits. + // Unpaired surrogates are allowed (unlike in Go). 
+ sz := 6 + if quoted[1] == 'U' { + sz = 10 + } + if len(quoted) < sz { + err = fmt.Errorf(`truncated escape sequence %s`, quoted) + return + } + n, err1 := strconv.ParseUint(quoted[2:sz], 16, 0) + if err1 != nil { + err = fmt.Errorf(`invalid escape sequence %s`, quoted[:sz]) + return + } + if n > unicode.MaxRune { + err = fmt.Errorf(`code point out of range: %s (max \U%08x)`, + quoted[:sz], n) + return + } + buf.WriteRune(rune(n)) + quoted = quoted[sz:] } } diff --git a/syntax/quote_test.go b/syntax/quote_test.go index f9068eee..a2471b4f 100644 --- a/syntax/quote_test.go +++ b/syntax/quote_test.go @@ -59,7 +59,7 @@ func TestQuote(t *testing.T) { func TestUnquote(t *testing.T) { for _, tt := range quoteTests { - s, triple, err := unquote(tt.q) + s, triple, _, err := unquote(tt.q) wantTriple := strings.HasPrefix(tt.q, `"""`) || strings.HasPrefix(tt.q, `'''`) if s != tt.s || triple != wantTriple || err != nil { t.Errorf("unquote(%s) = %#q, %v, %v want %#q, %v, nil", tt.q, s, triple, err, tt.s, wantTriple) diff --git a/syntax/scan.go b/syntax/scan.go index 53d9f5c5..caecc360 100644 --- a/syntax/scan.go +++ b/syntax/scan.go @@ -35,6 +35,7 @@ const ( INT // 123 FLOAT // 1.23e45 STRING // "foo" or 'foo' or '''foo''' or r'foo' or r"foo" + BYTES // b"foo", etc // Punctuation PLUS // + @@ -407,7 +408,7 @@ type tokenValue struct { int int64 // decoded int bigInt *big.Int // decoded integers > int64 float float64 // decoded float - string string // decoded string + string string // decoded string or bytes pos Position // start position of token } @@ -627,8 +628,15 @@ start: // identifier or keyword if isIdentStart(c) { - // raw string literal - if c == 'r' && len(sc.rest) > 1 && (sc.rest[1] == '"' || sc.rest[1] == '\'') { + if (c == 'r' || c == 'b') && len(sc.rest) > 1 && (sc.rest[1] == '"' || sc.rest[1] == '\'') { + // r"..." + // b"..." 
+ sc.readRune() + c = sc.peekRune() + return sc.scanString(val, c) + } else if c == 'r' && len(sc.rest) > 2 && sc.rest[1] == 'b' && (sc.rest[2] == '"' || sc.rest[2] == '\'') { + // rb"..." + sc.readRune() sc.readRune() c = sc.peekRune() return sc.scanString(val, c) @@ -872,12 +880,16 @@ func (sc *scanner) scanString(val *tokenValue, quote rune) Token { } val.raw = raw.String() - s, _, err := unquote(val.raw) + s, _, isByte, err := unquote(val.raw) if err != nil { sc.error(start, err.Error()) } val.string = s - return STRING + if isByte { + return BYTES + } else { + return STRING + } } func (sc *scanner) scanNumber(val *tokenValue, c rune) Token { diff --git a/syntax/scan_test.go b/syntax/scan_test.go index 0f2d9f22..daa3d4c4 100644 --- a/syntax/scan_test.go +++ b/syntax/scan_test.go @@ -44,6 +44,8 @@ func scan(src interface{}) (tokens string, err error) { fmt.Fprintf(&buf, "%e", val.float) case STRING: fmt.Fprintf(&buf, "%q", val.string) + case BYTES: + fmt.Fprintf(&buf, "b%q", val.string) default: buf.WriteString(tok.String()) } @@ -192,6 +194,30 @@ pass`, "pass newline pass EOF"}, // consecutive newlines are consolidated {`"\377"`, `"\xff" EOF`}, {`"\378"`, `"\x1f8" EOF`}, // = '\37' + '8' {`"\400"`, `foo.star:1:1: invalid escape sequence \400`}, // unlike Python 2 and 3 + // hex escapes + {`"\x00"`, `"\x00" EOF`}, + {`"\xff"`, `"\xff" EOF`}, + {`"\xFf"`, `"\xff" EOF`}, + {`"\xF"`, `foo.star:1:1: truncated escape sequence \xF`}, + {`"\x"`, `foo.star:1:1: truncated escape sequence \x`}, + {`"\xfg"`, `foo.star:1:1: invalid escape sequence \xfg`}, + // Unicode escapes + // \uXXXX + {`"\u0400"`, `"Ѐ" EOF`}, + {`"\u100"`, `foo.star:1:1: truncated escape sequence \u100`}, + {`"\u04000"`, `"Ѐ0" EOF`}, // = U+0400 + '0' + {`"\u100g"`, `foo.star:1:1: invalid escape sequence \u100g`}, + {`"\u4E16"`, `"世" EOF`}, + {`"\udc00"`, `"�" EOF`}, // unpaired surrogates ok + // \UXXXXXXXX + {`"\U00000400"`, `"Ѐ" EOF`}, + {`"\U0000400"`, `foo.star:1:1: truncated escape sequence 
\U0000400`}, + {`"\U000004000"`, `"Ѐ0" EOF`}, // = U+0400 + '0' + {`"\U1000000g"`, `foo.star:1:1: invalid escape sequence \U1000000g`}, + {`"\U0010FFFF"`, `"\U0010ffff" EOF`}, + {`"\U00110000"`, `foo.star:1:1: code point out of range: \U00110000 (max \U00110000)`}, + {`"\U0001F63F"`, `"😿" EOF`}, + {`"\U0000dc00"`, `"�" EOF`}, // unpaired surrogates ok // backslash escapes // As in Go, a backslash must escape something. diff --git a/syntax/syntax.go b/syntax/syntax.go index b4817c1a..20b28bb6 100644 --- a/syntax/syntax.go +++ b/syntax/syntax.go @@ -251,10 +251,10 @@ func (x *Ident) Span() (start, end Position) { // A Literal represents a literal string or number. type Literal struct { commentsRef - Token Token // = STRING | INT + Token Token // = STRING | BYTES | INT | FLOAT TokenPos Position Raw string // uninterpreted text - Value interface{} // = string | int64 | *big.Int + Value interface{} // = string | int64 | *big.Int | float64 } func (x *Literal) Span() (start, end Position) {