aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author alandonovan <adonovan@google.com> 2021-02-12 16:57:32 -0500
committer GitHub <noreply@github.com> 2021-02-12 16:57:32 -0500
commit ebe61bd709bf23d7baddbb34e79084d7d156be04 (patch)
tree 2efd7da6ec6557b9aaed7932d19e07e1beade288
parent 0a10e4fe7402e37a43d9b62c15bfeac1cd4ef272 (diff)
download starlark-go-ebe61bd709bf23d7baddbb34e79084d7d156be04.tar.gz
starlark: add 'bytes' data type, for binary strings (#330)
THIS IS AN INCOMPATIBLE LANGUAGE CHANGE; see below.

This change defines a 'bytes' data type, an immutable string of bytes. In this Go implementation of Starlark, ordinary strings are also strings of bytes, so the behavior of the two is very similar. However, that is not required by the spec. Other implementations of Starlark, notably in Java, may use strings of UTF-16 codes for the ordinary string type, and thus need a distinct type for byte strings.

See testdata/bytes.star for a tour of the API, and some remaining questions. See the attached issue for an outline of the proposed spec change. A Java implementation is underway, but is greatly complicated by Bazel's unfortunate misdecoding of UTF-8 files as Latin1.

The string.elems iterable view is now indexable. The old syntax.quote function (which was in fact not used except in tests) has been replaced by syntax.Quote, which is similar to Go's strconv.Quote.

This change removes go.starlark.net.lib.proto.Bytes.

IMPORTANT: string literals that previously used hex escapes \xXX or octal escapes \OOO to denote byte values greater than 127 will now result in a compile error advising you to use \u escapes instead if you want the UTF-8 encoding of a code point in the range U+80 to U+FF. A string literal can no longer denote invalid text, such as the 1-element string formerly written "\xff".

Updates https://github.com/bazelbuild/starlark/issues/112
Fixes https://github.com/google/starlark-go/issues/222
-rw-r--r-- go.mod 4
-rw-r--r-- go.sum 10
-rw-r--r-- internal/compile/compile.go 32
-rw-r--r-- internal/compile/serial.go 22
-rw-r--r-- lib/proto/proto.go 87
-rw-r--r-- starlark/eval.go 26
-rw-r--r-- starlark/eval_test.go 1
-rw-r--r-- starlark/hashtable.go 4
-rw-r--r-- starlark/library.go 179
-rw-r--r-- starlark/testdata/bytes.star 159
-rw-r--r-- starlark/testdata/json.star 2
-rw-r--r-- starlark/testdata/string.star 40
-rw-r--r-- starlark/value.go 180
-rw-r--r-- syntax/parse.go 7
-rw-r--r-- syntax/parse_test.go 7
-rw-r--r-- syntax/quote.go 170
-rw-r--r-- syntax/quote_test.go 21
-rw-r--r-- syntax/scan.go 24
-rw-r--r-- syntax/scan_test.go 46
-rw-r--r-- syntax/syntax.go 2
20 files changed, 739 insertions, 284 deletions
diff --git a/go.mod b/go.mod
index 50bc000..d14060e 100644
--- a/go.mod
+++ b/go.mod
@@ -6,6 +6,8 @@ require (
github.com/chzyer/logex v1.1.10 // indirect
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e
github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1 // indirect
- golang.org/x/sys v0.0.0-20200803210538-64077c9b5642
+ github.com/google/go-cmp v0.5.1 // indirect
+ golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f
+ golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
google.golang.org/protobuf v1.25.0
)
diff --git a/go.sum b/go.sum
index b40c868..90a8048 100644
--- a/go.sum
+++ b/go.sum
@@ -24,8 +24,9 @@ github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5a
github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
-github.com/google/go-cmp v0.5.0 h1:/QaMHBdZ26BB3SSst0Iwl10Epc+xhTquomWX0oZEB6w=
github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
+github.com/google/go-cmp v0.5.1 h1:JFrFEBb2xKufg6XkJsJr+WbKb4FQlURi5RUcBveYu9k=
+github.com/google/go-cmp v0.5.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
@@ -42,15 +43,16 @@ golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
-golang.org/x/sys v0.0.0-20200803210538-64077c9b5642 h1:B6caxRw+hozq68X2MY7jEpZh/cr4/aHLv9xU8Kkadrw=
-golang.org/x/sys v0.0.0-20200803210538-64077c9b5642/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f h1:+Nyd8tzPX9R7BWHguqsrbFdRx3WQ/1ib8I44HXV5yTA=
+golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
-golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE=
+golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=
diff --git a/internal/compile/compile.go b/internal/compile/compile.go
index ab67018..c314e6e 100644
--- a/internal/compile/compile.go
+++ b/internal/compile/compile.go
@@ -33,6 +33,7 @@ import (
"os"
"path/filepath"
"strconv"
+ "strings"
"sync"
"go.starlark.net/resolve"
@@ -46,7 +47,7 @@ var Disassemble = false
const debug = false // make code generation verbose, for debugging the compiler
// Increment this to force recompilation of saved bytecode files.
-const Version = 11
+const Version = 12
type Opcode uint8
@@ -309,12 +310,15 @@ func (op Opcode) String() string {
type Program struct {
Loads []Binding // name (really, string) and position of each load stmt
Names []string // names of attributes and predeclared variables
- Constants []interface{} // = string | int64 | float64 | *big.Int
+ Constants []interface{} // = string | int64 | float64 | *big.Int | Bytes
Functions []*Funcode
Globals []Binding // for error messages and tracing
Toplevel *Funcode // module initialization function
}
+// The type of a bytes literal value, to distinguish from text string.
+type Bytes string
+
// A Funcode is the code of a compiled Starlark function.
//
// Funcodes are serialized by the encoder.function method,
@@ -863,6 +867,8 @@ func PrintOp(fn *Funcode, pc uint32, op Opcode, arg uint32) {
switch x := fn.Prog.Constants[arg].(type) {
case string:
comment = strconv.Quote(x)
+ case Bytes:
+ comment = "b" + strconv.Quote(string(x))
default:
comment = fmt.Sprint(x)
}
@@ -1283,8 +1289,12 @@ func (fcomp *fcomp) expr(e syntax.Expr) {
fcomp.lookup(e)
case *syntax.Literal:
- // e.Value is int64, float64, *bigInt, or string.
- fcomp.emit1(CONSTANT, fcomp.pcomp.constantIndex(e.Value))
+ // e.Value is int64, float64, *bigInt, string
+ v := e.Value
+ if e.Token == syntax.BYTES {
+ v = Bytes(v.(string))
+ }
+ fcomp.emit1(CONSTANT, fcomp.pcomp.constantIndex(v))
case *syntax.ListExpr:
for _, x := range e.List {
@@ -1522,7 +1532,7 @@ func (fcomp *fcomp) plus(e *syntax.BinaryExpr) {
}
// addable reports whether e is a statically addable
-// expression: a [s]tring, [l]ist, or [t]uple.
+// expression: a [s]tring, [b]ytes, [l]ist, or [t]uple.
func addable(e syntax.Expr) rune {
switch e := e.(type) {
case *syntax.Literal:
@@ -1530,6 +1540,8 @@ func addable(e syntax.Expr) rune {
switch e.Token {
case syntax.STRING:
return 's'
+ case syntax.BYTES:
+ return 'b'
}
case *syntax.ListExpr:
return 'l'
@@ -1544,12 +1556,16 @@ func addable(e syntax.Expr) rune {
// The resulting syntax is degenerate, lacking position, etc.
func add(code rune, args []summand) syntax.Expr {
switch code {
- case 's':
- var buf bytes.Buffer
+ case 's', 'b':
+ var buf strings.Builder
for _, arg := range args {
buf.WriteString(arg.x.(*syntax.Literal).Value.(string))
}
- return &syntax.Literal{Token: syntax.STRING, Value: buf.String()}
+ tok := syntax.STRING
+ if code == 'b' {
+ tok = syntax.BYTES
+ }
+ return &syntax.Literal{Token: tok, Value: buf.String()}
case 'l':
var elems []syntax.Expr
for _, arg := range args {
diff --git a/internal/compile/serial.go b/internal/compile/serial.go
index 0107ef9..adadabf 100644
--- a/internal/compile/serial.go
+++ b/internal/compile/serial.go
@@ -51,9 +51,10 @@ package compile
//
// Constant: # type data
// type varint # 0=string string
-// data ... # 1=int varint
-// # 2=float varint (bits as uint64)
-// # 3=bigint string (decimal ASCII text)
+// data ... # 1=bytes string
+// # 2=int varint
+// # 3=float varint (bits as uint64)
+// # 4=bigint string (decimal ASCII text)
//
// The encoding starts with a four-byte magic number.
// The next four bytes are a little-endian uint32
@@ -109,14 +110,17 @@ func (prog *Program) Encode() []byte {
case string:
e.int(0)
e.string(c)
- case int64:
+ case Bytes:
e.int(1)
+ e.string(string(c))
+ case int64:
+ e.int(2)
e.int64(c)
case float64:
- e.int(2)
+ e.int(3)
e.uint64(math.Float64bits(c))
case *big.Int:
- e.int(3)
+ e.int(4)
e.string(c.Text(10))
}
}
@@ -249,10 +253,12 @@ func DecodeProgram(data []byte) (_ *Program, err error) {
case 0:
c = d.string()
case 1:
- c = d.int64()
+ c = Bytes(d.string())
case 2:
- c = math.Float64frombits(d.uint64())
+ c = d.int64()
case 3:
+ c = math.Float64frombits(d.uint64())
+ case 4:
c, _ = new(big.Int).SetString(d.string(), 10)
}
constants[i] = c
diff --git a/lib/proto/proto.go b/lib/proto/proto.go
index 84aa0d6..149162d 100644
--- a/lib/proto/proto.go
+++ b/lib/proto/proto.go
@@ -79,8 +79,6 @@
package proto
// TODO(adonovan): Go and Starlark API improvements:
-// - Contribute the 'bytes' data type to the core language.
-// See https://github.com/bazelbuild/starlark/issues/112.
// - Make Message and RepeatedField comparable.
// (NOTE: proto.Equal works only with generated message types.)
// - Support maps, oneof, any. But not messageset if we can avoid it.
@@ -234,7 +232,7 @@ func marshal(_ *starlark.Thread, fn *starlark.Builtin, args starlark.Tuple, kwar
if err != nil {
return nil, fmt.Errorf("%s: %v", fn.Name(), err)
}
- return Bytes(data), nil
+ return starlark.Bytes(data), nil
} else {
text, err := prototext.MarshalOptions{Indent: " "}.Marshal(m.Message())
if err != nil {
@@ -247,7 +245,7 @@ func marshal(_ *starlark.Thread, fn *starlark.Builtin, args starlark.Tuple, kwar
// unmarshal(msg) decodes a binary protocol message to a Message.
func unmarshal(thread *starlark.Thread, fn *starlark.Builtin, args starlark.Tuple, kwargs []starlark.Tuple) (starlark.Value, error) {
var desc MessageDescriptor
- var data Bytes
+ var data starlark.Bytes
if err := starlark.UnpackPositionalArgs(fn.Name(), args, kwargs, 2, &desc, &data); err != nil {
return nil, err
}
@@ -486,7 +484,7 @@ func toProto(fdesc protoreflect.FieldDescriptor, v starlark.Value) (protoreflect
case protoreflect.StringKind:
if s, ok := starlark.AsString(v); ok {
return protoreflect.ValueOfString(s), nil
- } else if b, ok := v.(Bytes); ok {
+ } else if b, ok := v.(starlark.Bytes); ok {
// TODO(adonovan): allow bytes for string? Not friendly to a Java port.
return protoreflect.ValueOfBytes([]byte(b)), nil
}
@@ -497,7 +495,7 @@ func toProto(fdesc protoreflect.FieldDescriptor, v starlark.Value) (protoreflect
// Instead provide b"..." literals in the core
// and a bytes(str) conversion.
return protoreflect.ValueOfBytes([]byte(s)), nil
- } else if b, ok := v.(Bytes); ok {
+ } else if b, ok := v.(starlark.Bytes); ok {
return protoreflect.ValueOfBytes([]byte(b)), nil
}
@@ -588,7 +586,7 @@ func toStarlark1(typ protoreflect.FieldDescriptor, x protoreflect.Value, frozen
return starlark.String(x.String())
case protoreflect.BytesKind:
- return Bytes(x.Bytes())
+ return starlark.Bytes(x.Bytes())
case protoreflect.DoubleKind, protoreflect.FloatKind:
return starlark.Float(x.Float())
@@ -1232,78 +1230,3 @@ func (x EnumValueDescriptor) CompareSameType(op syntax.Token, y_ starlark.Value,
return false, fmt.Errorf("%s %s %s not implemented", x.Type(), op, y_.Type())
}
}
-
-// A Bytes is an immutable sequence of bytes.
-// It is comparable, iterable, indexable, and sliceable.
-//
-// (In go.starlark.net, text Strings are also byte strings,
-// but we shouldn't rely on that.
-// See https://github.com/bazelbuild/starlark/issues/112.)
-type Bytes string
-
-var (
- _ starlark.Comparable = Bytes("")
- _ starlark.Iterable = Bytes("")
- _ starlark.Sliceable = Bytes("")
- _ starlark.Sequence = Bytes("")
-)
-
-func (b Bytes) String() string { return fmt.Sprintf("<%d bytes>", len(b)) }
-func (b Bytes) Type() string { return "bytes" }
-func (b Bytes) Freeze() {} // immutable
-func (b Bytes) Truth() starlark.Bool { return len(b) > 0 }
-func (b Bytes) Hash() (uint32, error) { return starlark.String(b).Hash() }
-func (b Bytes) Len() int { return len(b) }
-func (b Bytes) Index(i int) starlark.Value { return starlark.MakeInt(int(b[i])) }
-
-func (b Bytes) Slice(start, end, step int) starlark.Value {
- if step == 1 {
- return b[start:end]
- }
-
- sign := signum(step)
- var str []byte
- for i := start; signum(end-i) == sign; i += step {
- str = append(str, b[i])
- }
- return Bytes(str)
-}
-
-// From Hacker's Delight, section 2.8.
-func signum64(x int64) int { return int(uint64(x>>63) | uint64(-x)>>63) }
-func signum(x int) int { return signum64(int64(x)) }
-
-func (b Bytes) Iterate() starlark.Iterator { return &bytesIterator{string(b)} }
-
-type bytesIterator struct{ string }
-
-func (it *bytesIterator) Next(p *starlark.Value) bool {
- if it.string == "" {
- return false
- }
- *p = starlark.MakeInt(int(it.string[0]))
- it.string = it.string[1:]
- return true
-}
-
-func (it *bytesIterator) Done() {}
-
-func (x Bytes) CompareSameType(op syntax.Token, y_ starlark.Value, depth int) (bool, error) {
- y := y_.(Bytes)
- cmp := strings.Compare(string(x), string(y))
- switch op {
- case syntax.EQL:
- return cmp == 0, nil
- case syntax.NEQ:
- return cmp != 0, nil
- case syntax.LE:
- return cmp <= 0, nil
- case syntax.LT:
- return cmp < 0, nil
- case syntax.GE:
- return cmp >= 0, nil
- case syntax.GT:
- return cmp > 0, nil
- }
- panic(op)
-}
diff --git a/starlark/eval.go b/starlark/eval.go
index c9bbb67..d0ad91f 100644
--- a/starlark/eval.go
+++ b/starlark/eval.go
@@ -478,6 +478,8 @@ func makeToplevelFunction(prog *compile.Program, predeclared StringDict) *Functi
v = MakeBigInt(c)
case string:
v = String(c)
+ case compile.Bytes:
+ v = Bytes(c)
case float64:
v = Float(c)
default:
@@ -796,6 +798,8 @@ func Binary(op syntax.Token, x, y Value) (Value, error) {
return xf * y, nil
case String:
return stringRepeat(y, x)
+ case Bytes:
+ return bytesRepeat(y, x)
case *List:
elems, err := tupleRepeat(Tuple(y.elems), x)
if err != nil {
@@ -820,6 +824,10 @@ func Binary(op syntax.Token, x, y Value) (Value, error) {
if y, ok := y.(Int); ok {
return stringRepeat(x, y)
}
+ case Bytes:
+ if y, ok := y.(Int); ok {
+ return bytesRepeat(x, y)
+ }
case *List:
if y, ok := y.(Int); ok {
elems, err := tupleRepeat(Tuple(x.elems), y)
@@ -996,6 +1004,19 @@ func Binary(op syntax.Token, x, y Value) (Value, error) {
return nil, fmt.Errorf("'in <string>' requires string as left operand, not %s", x.Type())
}
return Bool(strings.Contains(string(y), string(needle))), nil
+ case Bytes:
+ switch needle := x.(type) {
+ case Bytes:
+ return Bool(strings.Contains(string(y), string(needle))), nil
+ case Int:
+ var b byte
+ if err := AsInt(needle, &b); err != nil {
+ return nil, fmt.Errorf("int in bytes: %s", err)
+ }
+ return Bool(strings.IndexByte(string(y), b) >= 0), nil
+ default:
+ return nil, fmt.Errorf("'in bytes' requires bytes or int as left operand, not %s", x.Type())
+ }
case rangeValue:
i, err := NumberToInt(x)
if err != nil {
@@ -1138,6 +1159,11 @@ func tupleRepeat(elems Tuple, n Int) (Tuple, error) {
return res, nil
}
+func bytesRepeat(b Bytes, n Int) (Bytes, error) {
+ res, err := stringRepeat(String(b), n)
+ return Bytes(res), err
+}
+
func stringRepeat(s String, n Int) (String, error) {
if s == "" {
return "", nil
diff --git a/starlark/eval_test.go b/starlark/eval_test.go
index 4ce08d3..9752fe8 100644
--- a/starlark/eval_test.go
+++ b/starlark/eval_test.go
@@ -115,6 +115,7 @@ func TestExecFile(t *testing.T) {
"testdata/assign.star",
"testdata/bool.star",
"testdata/builtins.star",
+ "testdata/bytes.star",
"testdata/control.star",
"testdata/dict.star",
"testdata/float.star",
diff --git a/starlark/hashtable.go b/starlark/hashtable.go
index d425019..27990b5 100644
--- a/starlark/hashtable.go
+++ b/starlark/hashtable.go
@@ -362,9 +362,9 @@ func hashString(s string) uint32 {
//go:linkname goStringHash runtime.stringHash
func goStringHash(s string, seed uintptr) uintptr
-// softHashString computes the FNV hash of s in software.
+// softHashString computes the 32-bit FNV-1a hash of s in software.
func softHashString(s string) uint32 {
- var h uint32
+ var h uint32 = 2166136261
for i := 0; i < len(s); i++ {
h ^= uint32(s[i])
h *= 16777619
diff --git a/starlark/library.go b/starlark/library.go
index 5645418..5620426 100644
--- a/starlark/library.go
+++ b/starlark/library.go
@@ -42,6 +42,7 @@ func init() {
"any": NewBuiltin("any", any),
"all": NewBuiltin("all", all),
"bool": NewBuiltin("bool", bool_),
+ "bytes": NewBuiltin("bytes", bytes_),
"chr": NewBuiltin("chr", chr),
"dict": NewBuiltin("dict", dict),
"dir": NewBuiltin("dir", dir),
@@ -73,6 +74,10 @@ func init() {
// methods of built-in types
// https://github.com/google/starlark-go/blob/master/doc/spec.md#built-in-methods
var (
+ bytesMethods = map[string]*Builtin{
+ "elems": NewBuiltin("elems", bytes_elems),
+ }
+
dictMethods = map[string]*Builtin{
"clear": NewBuiltin("clear", dict_clear),
"get": NewBuiltin("get", dict_get),
@@ -198,6 +203,45 @@ func bool_(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, error
return x.Truth(), nil
}
+// https://github.com/google/starlark-go/blob/master/doc/spec.md#bytes
+func bytes_(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, error) {
+ if len(kwargs) > 0 {
+ return nil, fmt.Errorf("bytes does not accept keyword arguments")
+ }
+ if len(args) != 1 {
+ return nil, fmt.Errorf("bytes: got %d arguments, want exactly 1", len(args))
+ }
+ switch x := args[0].(type) {
+ case Bytes:
+ return x, nil
+ case String:
+ // Invalid encodings are replaced by that of U+FFFD.
+ return Bytes(utf8Transcode(string(x))), nil
+ case Iterable:
+ // iterable of numeric byte values
+ var buf strings.Builder
+ if n := Len(x); n >= 0 {
+ // common case: known length
+ buf.Grow(n)
+ }
+ iter := x.Iterate()
+ defer iter.Done()
+ var elem Value
+ var b byte
+ for i := 0; iter.Next(&elem); i++ {
+ if err := AsInt(elem, &b); err != nil {
+ return nil, fmt.Errorf("bytes: at index %d, %s", i, err)
+ }
+ buf.WriteByte(b)
+ }
+ return Bytes(buf.String()), nil
+
+ default:
+ // Unlike string(foo), which stringifies it, bytes(foo) is an error.
+ return nil, fmt.Errorf("bytes: got %s, want string, bytes, or iterable of ints", x.Type())
+ }
+}
+
// https://github.com/google/starlark-go/blob/master/doc/spec.md#chr
func chr(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, error) {
if len(kwargs) > 0 {
@@ -261,9 +305,6 @@ func enumerate(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, e
}
iter := iterable.Iterate()
- if iter == nil {
- return nil, fmt.Errorf("enumerate: got %s, want iterable", iterable.Type())
- }
defer iter.Done()
var pairs []Value
@@ -433,19 +474,27 @@ func hasattr(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, err
// https://github.com/google/starlark-go/blob/master/doc/spec.md#hash
func hash(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, error) {
- var s string
- if err := UnpackPositionalArgs("hash", args, kwargs, 1, &s); err != nil {
+ var x Value
+ if err := UnpackPositionalArgs("hash", args, kwargs, 1, &x); err != nil {
return nil, err
}
- // The Starlark spec requires that the hash function be
- // deterministic across all runs, motivated by the need
- // for reproducibility of builds. Thus we cannot call
- // String.Hash, which uses the fastest implementation
- // available, because as varies across process restarts,
- // and may evolve with the implementation.
-
- return MakeInt(int(javaStringHash(s))), nil
+ var h int
+ switch x := x.(type) {
+ case String:
+ // The Starlark spec requires that the hash function be
+ // deterministic across all runs, motivated by the need
+ // for reproducibility of builds. Thus we cannot call
+ // String.Hash, which uses the fastest implementation
+ // available, because it varies across process restarts,
+ // and may evolve with the implementation.
+ h = int(javaStringHash(string(x)))
+ case Bytes:
+ h = int(softHashString(string(x))) // FNV32
+ default:
+ return nil, fmt.Errorf("hash: got %s, want string or bytes", x.Type())
+ }
+ return MakeInt(h), nil
}
// javaStringHash returns the same hash as would be produced by
@@ -691,16 +740,26 @@ func ord(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, error)
if len(args) != 1 {
return nil, fmt.Errorf("ord: got %d arguments, want 1", len(args))
}
- s, ok := AsString(args[0])
- if !ok {
- return nil, fmt.Errorf("ord: got %s, want string", args[0].Type())
- }
- r, sz := utf8.DecodeRuneInString(s)
- if sz == 0 || sz != len(s) {
- n := utf8.RuneCountInString(s)
- return nil, fmt.Errorf("ord: string encodes %d Unicode code points, want 1", n)
+ switch x := args[0].(type) {
+ case String:
+ // ord(string) returns int value of sole rune.
+ s := string(x)
+ r, sz := utf8.DecodeRuneInString(s)
+ if sz == 0 || sz != len(s) {
+ n := utf8.RuneCountInString(s)
+ return nil, fmt.Errorf("ord: string encodes %d Unicode code points, want 1", n)
+ }
+ return MakeInt(int(r)), nil
+
+ case Bytes:
+ // ord(bytes) returns int value of sole byte.
+ if len(x) != 1 {
+ return nil, fmt.Errorf("ord: bytes has length %d, want 1", len(x))
+ }
+ return MakeInt(int(x[0])), nil
+ default:
+ return nil, fmt.Errorf("ord: got %s, want string or bytes", x.Type())
}
- return MakeInt(int(r)), nil
}
// https://github.com/google/starlark-go/blob/master/doc/spec.md#print
@@ -716,6 +775,8 @@ func print(thread *Thread, b *Builtin, args Tuple, kwargs []Tuple) (Value, error
}
if s, ok := AsString(v); ok {
buf.WriteString(s)
+ } else if b, ok := v.(Bytes); ok {
+ buf.WriteString(string(b))
} else {
writeValue(buf, v, nil)
}
@@ -993,11 +1054,29 @@ func str(thread *Thread, _ *Builtin, args Tuple, kwargs []Tuple) (Value, error)
if len(args) != 1 {
return nil, fmt.Errorf("str: got %d arguments, want exactly 1", len(args))
}
- x := args[0]
- if _, ok := AsString(x); !ok {
- x = String(x.String())
+ switch x := args[0].(type) {
+ case String:
+ return x, nil
+ case Bytes:
+ // Invalid encodings are replaced by that of U+FFFD.
+ return String(utf8Transcode(string(x))), nil
+ default:
+ return String(x.String()), nil
}
- return x, nil
+}
+
+// utf8Transcode returns the UTF-8-to-UTF-8 transcoding of s.
+// The effect is that each code unit that is part of an
+// invalid sequence is replaced by U+FFFD.
+func utf8Transcode(s string) string {
+ if utf8.ValidString(s) {
+ return s
+ }
+ var out strings.Builder
+ for _, r := range s {
+ out.WriteRune(r)
+ }
+ return out.String()
}
// https://github.com/google/starlark-go/blob/master/doc/spec.md#tuple
@@ -1374,13 +1453,51 @@ func string_iterable(_ *Thread, b *Builtin, args Tuple, kwargs []Tuple) (Value,
if err := UnpackPositionalArgs(b.Name(), args, kwargs, 0); err != nil {
return nil, err
}
- return stringIterable{
- s: b.Receiver().(String),
- ords: b.Name()[len(b.Name())-2] == 'd',
- codepoints: b.Name()[0] == 'c',
- }, nil
+ s := b.Receiver().(String)
+ ords := b.Name()[len(b.Name())-2] == 'd'
+ codepoints := b.Name()[0] == 'c'
+ if codepoints {
+ return stringCodepoints{s, ords}, nil
+ } else {
+ return stringElems{s, ords}, nil
+ }
+}
+
+// bytes_elems returns an unspecified iterable value whose
+// iterator yields the int values of successive elements.
+func bytes_elems(_ *Thread, b *Builtin, args Tuple, kwargs []Tuple) (Value, error) {
+ if err := UnpackPositionalArgs(b.Name(), args, kwargs, 0); err != nil {
+ return nil, err
+ }
+ return bytesIterable{b.Receiver().(Bytes)}, nil
+}
+
+// A bytesIterable is an iterable returned by bytes.elems(),
+// whose iterator yields a sequence of numeric byte values.
+type bytesIterable struct{ bytes Bytes }
+
+var _ Iterable = (*bytesIterable)(nil)
+
+func (bi bytesIterable) String() string { return bi.bytes.String() + ".elems()" }
+func (bi bytesIterable) Type() string { return "bytes.elems" }
+func (bi bytesIterable) Freeze() {} // immutable
+func (bi bytesIterable) Truth() Bool { return True }
+func (bi bytesIterable) Hash() (uint32, error) { return 0, fmt.Errorf("unhashable: %s", bi.Type()) }
+func (bi bytesIterable) Iterate() Iterator { return &bytesIterator{bi.bytes} }
+
+type bytesIterator struct{ bytes Bytes }
+
+func (it *bytesIterator) Next(p *Value) bool {
+ if it.bytes == "" {
+ return false
+ }
+ *p = MakeInt(int(it.bytes[0]))
+ it.bytes = it.bytes[1:]
+ return true
}
+func (*bytesIterator) Done() {}
+
// https://github.com/google/starlark-go/blob/master/doc/spec.md#string·count
func string_count(_ *Thread, b *Builtin, args Tuple, kwargs []Tuple) (Value, error) {
var sub string
diff --git a/starlark/testdata/bytes.star b/starlark/testdata/bytes.star
new file mode 100644
index 0000000..d500403
--- /dev/null
+++ b/starlark/testdata/bytes.star
@@ -0,0 +1,159 @@
+# Tests of 'bytes' (immutable byte strings).
+
+load("assert.star", "assert")
+
+# bytes(string) -- UTF-k to UTF-8 transcoding with U+FFFD replacement
+hello = bytes("hello, 世界")
+goodbye = bytes("goodbye")
+empty = bytes("")
+nonprinting = bytes("\t\n\x7F\u200D") # TAB, NEWLINE, DEL, ZERO_WIDTH_JOINER
+assert.eq(bytes("hello, 世界"[:-1]), b"hello, 世��")
+
+# bytes(iterable of int) -- construct from numeric byte values
+assert.eq(bytes([65, 66, 67]), b"ABC")
+assert.eq(bytes((65, 66, 67)), b"ABC")
+assert.eq(bytes([0xf0, 0x9f, 0x98, 0xbf]), b"😿")
+assert.fails(lambda: bytes([300]),
+ "at index 0, 300 out of range .want value in unsigned 8-bit range")
+assert.fails(lambda: bytes([b"a"]),
+ "at index 0, got bytes, want int")
+assert.fails(lambda: bytes(1), "want string, bytes, or iterable of ints")
+
+# literals
+assert.eq(b"hello, 世界", hello)
+assert.eq(b"goodbye", goodbye)
+assert.eq(b"", empty)
+assert.eq(b"\t\n\x7F\u200D", nonprinting)
+assert.ne("abc", b"abc")
+assert.eq(b"\012\xff\u0400\U0001F63F", b"\n\xffЀ😿") # see scanner tests for more
+assert.eq(rb"\r\n\t", b"\\r\\n\\t") # raw
+
+# type
+assert.eq(type(hello), "bytes")
+
+# len
+assert.eq(len(hello), 13)
+assert.eq(len(goodbye), 7)
+assert.eq(len(empty), 0)
+assert.eq(len(b"A"), 1)
+assert.eq(len(b"Ѐ"), 2)
+assert.eq(len(b"世"), 3)
+assert.eq(len(b"😿"), 4)
+
+# truth
+assert.true(hello)
+assert.true(goodbye)
+assert.true(not empty)
+
+# str(bytes) does UTF-8 to UTF-k transcoding.
+# TODO(adonovan): specify.
+assert.eq(str(hello), "hello, 世界")
+assert.eq(str(hello[:-1]), "hello, 世��") # incomplete UTF-8 encoding => U+FFFD
+assert.eq(str(goodbye), "goodbye")
+assert.eq(str(empty), "")
+assert.eq(str(nonprinting), "\t\n\x7f\u200d")
+assert.eq(str(b"\xED\xB0\x80"), "���") # UTF-8 encoding of unpaired surrogate => U+FFFD x 3
+
+# repr
+assert.eq(repr(hello), r'b"hello, 世界"')
+assert.eq(repr(hello[:-1]), r'b"hello, 世\xe7\x95"') # (incomplete UTF-8 encoding )
+assert.eq(repr(goodbye), 'b"goodbye"')
+assert.eq(repr(empty), 'b""')
+assert.eq(repr(nonprinting), 'b"\\t\\n\\x7f\\u200d"')
+
+# equality
+assert.eq(hello, hello)
+assert.ne(hello, goodbye)
+assert.eq(b"goodbye", goodbye)
+
+# ordered comparison
+assert.lt(b"abc", b"abd")
+assert.lt(b"abc", b"abcd")
+assert.lt(b"\x7f", b"\x80") # bytes compare as uint8, not int8
+
+# bytes are dict-hashable
+dict = {hello: 1, goodbye: 2}
+dict[b"goodbye"] = 3
+assert.eq(len(dict), 2)
+assert.eq(dict[goodbye], 3)
+
+# hash(bytes) is 32-bit FNV-1a.
+assert.eq(hash(b""), 0x811c9dc5)
+assert.eq(hash(b"a"), 0xe40c292c)
+assert.eq(hash(b"ab"), 0x4d2505ca)
+assert.eq(hash(b"abc"), 0x1a47e90b)
+
+# indexing
+assert.eq(goodbye[0], b"g")
+assert.eq(goodbye[-1], b"e")
+assert.fails(lambda: goodbye[100], "out of range")
+
+# slicing
+assert.eq(goodbye[:4], b"good")
+assert.eq(goodbye[4:], b"bye")
+assert.eq(goodbye[::2], b"gobe")
+assert.eq(goodbye[3:4], b"d") # special case: len=1
+assert.eq(goodbye[4:4], b"") # special case: len=0
+
+# bytes in bytes
+assert.eq(b"bc" in b"abcd", True)
+assert.eq(b"bc" in b"dcab", False)
+assert.fails(lambda: "bc" in b"dcab", "requires bytes or int as left operand, not string")
+
+# int in bytes
+assert.eq(97 in b"abc", True) # 97='a'
+assert.eq(100 in b"abc", False) # 100='d'
+assert.fails(lambda: 256 in b"abc", "int in bytes: 256 out of range")
+assert.fails(lambda: -1 in b"abc", "int in bytes: -1 out of range")
+
+# ord TODO(adonovan): specify
+assert.eq(ord(b"a"), 97)
+assert.fails(lambda: ord(b"ab"), "ord: bytes has length 2, want 1")
+assert.fails(lambda: ord(b""), "ord: bytes has length 0, want 1")
+
+# repeat (bytes * int)
+assert.eq(goodbye * 3, b"goodbyegoodbyegoodbye")
+assert.eq(3 * goodbye, b"goodbyegoodbyegoodbye")
+
+# elems() returns an iterable value over the numeric values of successive bytes.
+assert.eq(type(hello.elems()), "bytes.elems")
+assert.eq(str(hello.elems()), "b\"hello, 世界\".elems()")
+assert.eq(list(hello.elems()), [104, 101, 108, 108, 111, 44, 32, 228, 184, 150, 231, 149, 140])
+assert.eq(bytes([104, 101, 108, 108, 111, 44, 32, 228, 184, 150, 231, 149, 140]), hello)
+assert.eq(list(goodbye.elems()), [103, 111, 111, 100, 98, 121, 101])
+assert.eq(list(empty.elems()), [])
+assert.eq(bytes(hello.elems()), hello) # bytes(iterable) is dual to bytes.elems()
+
+# x[i] = ...
+def f():
+ b"abc"[1] = b"B"
+
+assert.fails(f, "bytes.*does not support.*assignment")
+
+# TODO(adonovan): the specification is not finalized in many areas:
+# - chr, ord functions
+# - encoding/decoding bytes to string.
+# - methods: find, index, split, etc.
+#
+# Summary of string operations (put this in spec).
+#
+# string to number:
+# - bytes[i] returns numeric value of ith byte.
+# - ord(string) returns numeric value of sole code point in string.
+# - ord(string[i]) is not a useful operation: fails on non-ASCII; see below.
+# Q. Perhaps ord should return the first (not sole) code point? Then it becomes a UTF-8 decoder.
+# Perhaps ord(string, index=int) should apply the index and relax the len=1 check.
+# - string.codepoints() iterates over 1-codepoint substrings.
+# - string.codepoint_ords() iterates over numeric values of code points in string.
+# - string.elems() iterates over 1-element (UTF-k code) substrings.
+# - string.elem_ords() iterates over numeric UTF-k code values.
+# - string.elem_ords()[i] returns numeric value of ith element (UTF-k code).
+# - string.elems()[i] returns substring of a single element (UTF-k code).
+# - int(string) parses string as decimal (or other) numeric literal.
+#
+# number to string:
+# - chr(int) returns string, UTF-k encoding of Unicode code point (like Python).
+# Redundant with '%c' % int (which Python2 calls 'unichr'.)
+# - bytes(chr(int)) returns byte string containing UTF-8 encoding of one code point.
+# - bytes([int]) returns 1-byte string (with regrettable list allocation).
+# - str(int) - format number as decimal.
diff --git a/starlark/testdata/json.star b/starlark/testdata/json.star
index ef33d91..7c7b316 100644
--- a/starlark/testdata/json.star
+++ b/starlark/testdata/json.star
@@ -23,7 +23,7 @@ assert.eq(json.encode(range(3)), "[0,1,2]") # a built-in iterable
assert.eq(json.encode(dict(x = 1, y = "two")), '{"x":1,"y":"two"}')
assert.eq(json.encode(dict(y = "two", x = 1)), '{"x":1,"y":"two"}') # key, not insertion, order
assert.eq(json.encode(struct(x = 1, y = "two")), '{"x":1,"y":"two"}') # a user-defined HasAttrs
-assert.eq(json.encode("\x80"), '"\\ufffd"') # invalid UTF-8 -> replacement char
+assert.eq(json.encode("😹"[:1]), '"\\ufffd"') # invalid UTF-8 -> replacement char
def encode_error(expr, error):
assert.fails(lambda: json.encode(expr), error)
diff --git a/starlark/testdata/string.star b/starlark/testdata/string.star
index 84c6791..b317d1a 100644
--- a/starlark/testdata/string.star
+++ b/starlark/testdata/string.star
@@ -37,8 +37,9 @@ assert.eq(chr(1049), "Й") # 2-byte UTF-8 encoding
assert.eq(chr(0x1F63F), "😿") # 4-byte UTF-8 encoding
assert.fails(lambda: chr(-1), "Unicode code point -1 out of range \\(<0\\)")
assert.fails(lambda: chr(0x110000), "Unicode code point U\\+110000 out of range \\(>0x10FFFF\\)")
-assert.eq(ord("A"), 65)
-assert.eq(ord("Й"), 1049)
+assert.eq(ord("A"), 0x41)
+assert.eq(ord("Й"), 0x419)
+assert.eq(ord("世"), 0x4e16)
assert.eq(ord("😿"), 0x1F63F)
assert.eq(ord("Й"[1:]), 0xFFFD) # = Unicode replacement character
assert.fails(lambda: ord("abc"), "string encodes 3 Unicode code points, want 1")
@@ -46,42 +47,50 @@ assert.fails(lambda: ord(""), "string encodes 0 Unicode code points, want 1")
assert.fails(lambda: ord("😿"[1:]), "string encodes 3 Unicode code points, want 1") # 3 x 0xFFFD
# string.codepoint_ords
-assert.eq(type("abcЙ😿".codepoint_ords()), "codepoints")
+assert.eq(type("abcЙ😿".codepoint_ords()), "string.codepoints")
assert.eq(str("abcЙ😿".codepoint_ords()), '"abcЙ😿".codepoint_ords()')
assert.eq(list("abcЙ😿".codepoint_ords()), [97, 98, 99, 1049, 128575])
assert.eq(list(("A" + "😿Z"[1:]).codepoint_ords()), [ord("A"), 0xFFFD, 0xFFFD, 0xFFFD, ord("Z")])
assert.eq(list("".codepoint_ords()), [])
+assert.fails(lambda: "abcЙ😿".codepoint_ords()[2], "unhandled index") # not indexable
+assert.fails(lambda: len("abcЙ😿".codepoint_ords()), "no len") # unknown length
# string.codepoints
-assert.eq(type("abcЙ😿".codepoints()), "codepoints")
+assert.eq(type("abcЙ😿".codepoints()), "string.codepoints")
assert.eq(str("abcЙ😿".codepoints()), '"abcЙ😿".codepoints()')
assert.eq(list("abcЙ😿".codepoints()), ["a", "b", "c", "Й", "😿"])
-assert.eq(list(("A" + "😿Z"[1:]).codepoints()), ["A", "\x9f", "\x98", "\xbf", "Z"])
+assert.eq(list(("A" + "😿Z"[1:]).codepoints()), ["A", "�", "�", "�", "Z"])
assert.eq(list("".codepoints()), [])
+assert.fails(lambda: "abcЙ😿".codepoints()[2], "unhandled index") # not indexable
+assert.fails(lambda: len("abcЙ😿".codepoints()), "no len") # unknown length
# string.elem_ords
-assert.eq(type("abcЙ😿".elem_ords()), "elems")
+assert.eq(type("abcЙ😿".elem_ords()), "string.elems")
assert.eq(str("abcЙ😿".elem_ords()), '"abcЙ😿".elem_ords()')
assert.eq(list("abcЙ😿".elem_ords()), [97, 98, 99, 208, 153, 240, 159, 152, 191])
assert.eq(list(("A" + "😿Z"[1:]).elem_ords()), [65, 159, 152, 191, 90])
assert.eq(list("".elem_ords()), [])
+assert.eq("abcЙ😿".elem_ords()[2], 99) # indexable
+assert.eq(len("abcЙ😿".elem_ords()), 9) # known length
-# string.elems
-assert.eq(type("abcЙ😿".elems()), "elems")
+# string.elems (1-byte substrings, which are invalid text)
+assert.eq(type("abcЙ😿".elems()), "string.elems")
assert.eq(str("abcЙ😿".elems()), '"abcЙ😿".elems()')
assert.eq(
- list("abcЙ😿".elems()),
- ["a", "b", "c", "\xd0", "\x99", "\xf0", "\x9f", "\x98", "\xbf"],
+ repr(list("abcЙ😿".elems())),
+ r'["a", "b", "c", "\xd0", "\x99", "\xf0", "\x9f", "\x98", "\xbf"]',
)
assert.eq(
- list(("A" + "😿Z"[1:]).elems()),
- ["A", "\x9f", "\x98", "\xbf", "Z"],
+ repr(list(("A" + "😿Z"[1:]).elems())),
+ r'["A", "\x9f", "\x98", "\xbf", "Z"]',
)
assert.eq(list("".elems()), [])
+assert.eq("abcЙ😿".elems()[2], "c") # indexable
+assert.eq(len("abcЙ😿".elems()), 9) # known length
# indexing, x[i]
assert.eq("Hello, 世界!"[0], "H")
-assert.eq("Hello, 世界!"[7], "\xe4")
+assert.eq(repr("Hello, 世界!"[7]), r'"\xe4"') # (invalid text)
assert.eq("Hello, 世界!"[13], "!")
assert.fails(lambda: "abc"[-4], "out of range")
assert.eq("abc"[-3], "a")
@@ -93,10 +102,8 @@ assert.eq("abc"[2], "c")
assert.fails(lambda: "abc"[4], "out of range")
# x[i] = ...
-x2 = "abc"
-
def f():
- x2[1] = "B"
+ "abc"[1] = "B"
assert.fails(f, "string.*does not support.*assignment")
@@ -122,6 +129,7 @@ assert.eq("abc"[:3], "abc")
assert.eq("abc"[:4], "abc")
assert.eq("abc"[1:2], "b")
assert.eq("abc"[2:1], "")
+assert.eq(repr("😿"[:1]), r'"\xf0"') # (invalid text)
# non-unit strides
assert.eq("abcd"[0:4:1], "abcd")
diff --git a/starlark/value.go b/starlark/value.go
index bcec750..81e29ed 100644
--- a/starlark/value.go
+++ b/starlark/value.go
@@ -499,13 +499,20 @@ func (f Float) Unary(op syntax.Token) (Value, error) {
return nil, nil
}
-// String is the type of a Starlark string.
+// String is the type of a Starlark text string.
//
// A String encapsulates an immutable sequence of bytes,
// but strings are not directly iterable. Instead, iterate
// over the result of calling one of these four methods:
// codepoints, codepoint_ords, elems, elem_ords.
//
+// Strings typically contain text; use Bytes for binary strings.
+// The Starlark spec defines text strings as sequences of UTF-k
+// codes that encode Unicode code points. In this Go implementation,
+// k=8, whereas in a Java implementation, k=16. For portability,
+// operations on strings should aim to avoid assumptions about
+// the value of k.
+//
// Warning: the contract of the Value interface's String method is that
// it returns the value printed in Starlark notation,
// so s.String() or fmt.Sprintf("%s", s) returns a quoted string.
@@ -513,7 +520,7 @@ func (f Float) Unary(op syntax.Token) (Value, error) {
// of a Starlark string as a Go string.
type String string
-func (s String) String() string { return strconv.Quote(string(s)) }
+func (s String) String() string { return syntax.Quote(string(s), false) }
func (s String) GoString() string { return string(s) }
func (s String) Type() string { return "string" }
func (s String) Freeze() {} // immutable
@@ -545,73 +552,106 @@ func (x String) CompareSameType(op syntax.Token, y_ Value, depth int) (bool, err
func AsString(x Value) (string, bool) { v, ok := x.(String); return string(v), ok }
-// A stringIterable is an iterable whose iterator yields a sequence of
-// either Unicode code points or elements (bytes),
-// either numerically or as successive substrings.
-type stringIterable struct {
- s String
- ords bool
- codepoints bool
+// A stringElems is an iterable whose iterator yields a sequence of
+// elements (bytes), either numerically or as successive substrings.
+// It is an indexable sequence.
+type stringElems struct {
+ s String
+ ords bool
}
-var _ Iterable = (*stringIterable)(nil)
+var (
+ _ Iterable = (*stringElems)(nil)
+ _ Indexable = (*stringElems)(nil)
+)
-func (si stringIterable) String() string {
- var etype string
- if si.codepoints {
- etype = "codepoint"
+func (si stringElems) String() string {
+ if si.ords {
+ return si.s.String() + ".elem_ords()"
} else {
- etype = "elem"
+ return si.s.String() + ".elems()"
}
+}
+func (si stringElems) Type() string { return "string.elems" }
+func (si stringElems) Freeze() {} // immutable
+func (si stringElems) Truth() Bool { return True }
+func (si stringElems) Hash() (uint32, error) { return 0, fmt.Errorf("unhashable: %s", si.Type()) }
+func (si stringElems) Iterate() Iterator { return &stringElemsIterator{si, 0} }
+func (si stringElems) Len() int { return len(si.s) }
+func (si stringElems) Index(i int) Value {
if si.ords {
- return si.s.String() + "." + etype + "_ords()"
+ return MakeInt(int(si.s[i]))
} else {
- return si.s.String() + "." + etype + "s()"
+ // TODO(adonovan): opt: preallocate canonical 1-byte strings
+ // to avoid interface allocation.
+ return si.s[i : i+1]
+ }
+}
+
+type stringElemsIterator struct {
+ si stringElems
+ i int
+}
+
+func (it *stringElemsIterator) Next(p *Value) bool {
+ if it.i == len(it.si.s) {
+ return false
}
+ *p = it.si.Index(it.i)
+ it.i++
+ return true
+}
+
+func (*stringElemsIterator) Done() {}
+
+// A stringCodepoints is an iterable whose iterator yields a sequence of
+// Unicode code points, either numerically or as successive substrings.
+// It is not indexable.
+type stringCodepoints struct {
+ s String
+ ords bool
}
-func (si stringIterable) Type() string {
- if si.codepoints {
- return "codepoints"
+
+var _ Iterable = (*stringCodepoints)(nil)
+
+func (si stringCodepoints) String() string {
+ if si.ords {
+ return si.s.String() + ".codepoint_ords()"
} else {
- return "elems"
+ return si.s.String() + ".codepoints()"
}
}
-func (si stringIterable) Freeze() {} // immutable
-func (si stringIterable) Truth() Bool { return True }
-func (si stringIterable) Hash() (uint32, error) { return 0, fmt.Errorf("unhashable: %s", si.Type()) }
-func (si stringIterable) Iterate() Iterator { return &stringIterator{si, 0} }
+func (si stringCodepoints) Type() string { return "string.codepoints" }
+func (si stringCodepoints) Freeze() {} // immutable
+func (si stringCodepoints) Truth() Bool { return True }
+func (si stringCodepoints) Hash() (uint32, error) { return 0, fmt.Errorf("unhashable: %s", si.Type()) }
+func (si stringCodepoints) Iterate() Iterator { return &stringCodepointsIterator{si, 0} }
-type stringIterator struct {
- si stringIterable
+type stringCodepointsIterator struct {
+ si stringCodepoints
i int
}
-func (it *stringIterator) Next(p *Value) bool {
+func (it *stringCodepointsIterator) Next(p *Value) bool {
s := it.si.s[it.i:]
if s == "" {
return false
}
- if it.si.codepoints {
- r, sz := utf8.DecodeRuneInString(string(s))
- if !it.si.ords {
- *p = s[:sz]
+ r, sz := utf8.DecodeRuneInString(string(s))
+ if !it.si.ords {
+ if r == utf8.RuneError {
+ *p = String(r)
} else {
- *p = MakeInt(int(r))
+ *p = s[:sz]
}
- it.i += sz
} else {
- b := int(s[0])
- if !it.si.ords {
- *p = s[:1]
- } else {
- *p = MakeInt(b)
- }
- it.i += 1
+ *p = MakeInt(int(r))
}
+ it.i += sz
return true
}
-func (*stringIterator) Done() {}
+func (*stringCodepointsIterator) Done() {}
// A Function is a function defined by a Starlark def statement or lambda expression.
// The initialization behavior of a Starlark module is also represented by a Function.
@@ -1084,6 +1124,7 @@ func writeValue(out *strings.Builder, x Value, path []Value) {
case nil:
out.WriteString("<nil>") // indicates a bug
+ // These four cases are duplicates of T.String(), for efficiency.
case NoneType:
out.WriteString("None")
@@ -1098,7 +1139,7 @@ func writeValue(out *strings.Builder, x Value, path []Value) {
}
case String:
- fmt.Fprintf(out, "%q", string(x))
+ out.WriteString(syntax.Quote(string(x), false))
case *List:
out.WriteByte('[')
@@ -1318,6 +1359,8 @@ func Len(x Value) int {
switch x := x.(type) {
case String:
return x.Len()
+ case Indexable:
+ return x.Len()
case Sequence:
return x.Len()
}
@@ -1335,3 +1378,54 @@ func Iterate(x Value) Iterator {
}
return nil
}
+
+// Bytes is the type of a Starlark binary string.
+//
+// A Bytes encapsulates an immutable sequence of bytes.
+// It is comparable, indexable, and sliceable, but not directly iterable;
+// use bytes.elems() for an iterable view.
+//
+// In this Go implementation, the elements of 'string' and 'bytes' are
+// both bytes, but in other implementations, notably Java, the elements
+// of a 'string' are UTF-16 codes (Java chars). The spec abstracts text
+// strings as sequences of UTF-k codes that encode Unicode code points,
+// and operations that convert from text to binary incur UTF-k-to-UTF-8
+// transcoding; conversely, conversion from binary to text incurs
+// UTF-8-to-UTF-k transcoding. Because k=8 for Go, these operations
+// are the identity function, at least for valid encodings of text.
+type Bytes string
+
+var (
+ _ Comparable = Bytes("")
+ _ Sliceable = Bytes("")
+ _ Indexable = Bytes("")
+)
+
+func (b Bytes) String() string { return syntax.Quote(string(b), true) }
+func (b Bytes) Type() string { return "bytes" }
+func (b Bytes) Freeze() {} // immutable
+func (b Bytes) Truth() Bool { return len(b) > 0 }
+func (b Bytes) Hash() (uint32, error) { return String(b).Hash() }
+func (b Bytes) Len() int { return len(b) }
+func (b Bytes) Index(i int) Value { return b[i : i+1] }
+
+func (b Bytes) Attr(name string) (Value, error) { return builtinAttr(b, name, bytesMethods) }
+func (b Bytes) AttrNames() []string { return builtinAttrNames(bytesMethods) }
+
+func (b Bytes) Slice(start, end, step int) Value {
+ if step == 1 {
+ return b[start:end]
+ }
+
+ sign := signum(step)
+ var str []byte
+ for i := start; signum(end-i) == sign; i += step {
+ str = append(str, b[i])
+ }
+ return Bytes(str)
+}
+
+func (x Bytes) CompareSameType(op syntax.Token, y_ Value, depth int) (bool, error) {
+ y := y_.(Bytes)
+ return threeway(op, strings.Compare(string(x), string(y))), nil
+}
diff --git a/syntax/parse.go b/syntax/parse.go
index 50b8087..f4c8fff 100644
--- a/syntax/parse.go
+++ b/syntax/parse.go
@@ -771,8 +771,7 @@ func (p *parser) parseArgs() []Expr {
}
// primary = IDENT
-// | INT | FLOAT
-// | STRING
+// | INT | FLOAT | STRING | BYTES
// | '[' ... // list literal or comprehension
// | '{' ... // dict literal or comprehension
// | '(' ... // tuple or parenthesized expression
@@ -782,7 +781,7 @@ func (p *parser) parsePrimary() Expr {
case IDENT:
return p.parseIdent()
- case INT, FLOAT, STRING:
+ case INT, FLOAT, STRING, BYTES:
var val interface{}
tok := p.tok
switch tok {
@@ -794,7 +793,7 @@ func (p *parser) parsePrimary() Expr {
}
case FLOAT:
val = p.tokval.float
- case STRING:
+ case STRING, BYTES:
val = p.tokval.string
}
raw := p.tokval.raw
diff --git a/syntax/parse_test.go b/syntax/parse_test.go
index 6052e79..fedbb3e 100644
--- a/syntax/parse_test.go
+++ b/syntax/parse_test.go
@@ -361,9 +361,12 @@ func writeTree(out *bytes.Buffer, x reflect.Value) {
case reflect.Struct:
switch v := x.Interface().(type) {
case syntax.Literal:
- if v.Token == syntax.STRING {
+ switch v.Token {
+ case syntax.STRING:
fmt.Fprintf(out, "%q", v.Value)
- } else if v.Token == syntax.INT {
+ case syntax.BYTES:
+ fmt.Fprintf(out, "b%q", v.Value)
+ case syntax.INT:
fmt.Fprintf(out, "%d", v.Value)
}
return
diff --git a/syntax/quote.go b/syntax/quote.go
index 49cb259..741e106 100644
--- a/syntax/quote.go
+++ b/syntax/quote.go
@@ -10,6 +10,8 @@ import (
"fmt"
"strconv"
"strings"
+ "unicode"
+ "unicode/utf8"
)
// unesc maps single-letter chars following \ to their actual values.
@@ -41,15 +43,20 @@ var esc = [256]byte{
}
// unquote unquotes the quoted string, returning the actual
-// string value, whether the original was triple-quoted, and
-// an error describing invalid input.
-func unquote(quoted string) (s string, triple bool, err error) {
+// string value, whether the original was triple-quoted,
+// whether it was a byte string, and an error describing invalid input.
+func unquote(quoted string) (s string, triple, isByte bool, err error) {
// Check for raw prefix: means don't interpret the inner \.
raw := false
if strings.HasPrefix(quoted, "r") {
raw = true
quoted = quoted[1:]
}
+ // Check for bytes prefix.
+ if strings.HasPrefix(quoted, "b") {
+ isByte = true
+ quoted = quoted[1:]
+ }
if len(quoted) < 2 {
err = fmt.Errorf("string literal too short")
@@ -138,7 +145,7 @@ func unquote(quoted string) (s string, triple bool, err error) {
quoted = quoted[2:]
case '0', '1', '2', '3', '4', '5', '6', '7':
- // Octal escape, up to 3 digits.
+ // Octal escape, up to 3 digits, \OOO.
n := int(quoted[1] - '0')
quoted = quoted[2:]
for i := 1; i < 3; i++ {
@@ -148,6 +155,10 @@ func unquote(quoted string) (s string, triple bool, err error) {
n = n*8 + int(quoted[0]-'0')
quoted = quoted[1:]
}
+ if !isByte && n > 127 {
+ err = fmt.Errorf(`non-ASCII octal escape \%o (use \u%04X for the UTF-8 encoding of U+%04X)`, n, n, n)
+ return
+ }
if n >= 256 {
// NOTE: Python silently discards the high bit,
// so that '\541' == '\141' == 'a'.
@@ -158,7 +169,7 @@ func unquote(quoted string) (s string, triple bool, err error) {
buf.WriteByte(byte(n))
case 'x':
- // Hexadecimal escape, exactly 2 digits.
+ // Hexadecimal escape, exactly 2 digits, \xXX. [0-127]
if len(quoted) < 4 {
err = fmt.Errorf(`truncated escape sequence %s`, quoted)
return
@@ -168,8 +179,41 @@ func unquote(quoted string) (s string, triple bool, err error) {
err = fmt.Errorf(`invalid escape sequence %s`, quoted[:4])
return
}
+ if !isByte && n > 127 {
+ err = fmt.Errorf(`non-ASCII hex escape %s (use \u%04X for the UTF-8 encoding of U+%04X)`,
+ quoted[:4], n, n)
+ return
+ }
buf.WriteByte(byte(n))
quoted = quoted[4:]
+
+ case 'u', 'U':
+ // Unicode code point, 4 (\uXXXX) or 8 (\UXXXXXXXX) hex digits.
+ sz := 6
+ if quoted[1] == 'U' {
+ sz = 10
+ }
+ if len(quoted) < sz {
+ err = fmt.Errorf(`truncated escape sequence %s`, quoted)
+ return
+ }
+ n, err1 := strconv.ParseUint(quoted[2:sz], 16, 0)
+ if err1 != nil {
+ err = fmt.Errorf(`invalid escape sequence %s`, quoted[:sz])
+ return
+ }
+ if n > unicode.MaxRune {
+ err = fmt.Errorf(`code point out of range: %s (max \U%08x)`,
+ quoted[:sz], n)
+ return
+ }
+ // As in Go, surrogates are disallowed.
+ if 0xD800 <= n && n < 0xE000 {
+ err = fmt.Errorf(`invalid Unicode code point U+%04X`, n)
+ return
+ }
+ buf.WriteRune(rune(n))
+ quoted = quoted[sz:]
}
}
@@ -187,67 +231,79 @@ func indexByte(s string, b byte) int {
return -1
}
-// hex is a list of the hexadecimal digits, for use in quoting.
-// We always print lower-case hexadecimal.
-const hex = "0123456789abcdef"
+// Quote returns a Starlark literal that denotes s.
+// If b, it returns a bytes literal.
+func Quote(s string, b bool) string {
+ const hex = "0123456789abcdef"
+ var runeTmp [utf8.UTFMax]byte
-// quote returns the quoted form of the string value "x".
-// If triple is true, quote uses the triple-quoted form """x""".
-func quote(unquoted string, triple bool) string {
- q := `"`
- if triple {
- q = `"""`
+ buf := make([]byte, 0, 3*len(s)/2)
+ if b {
+ buf = append(buf, 'b')
}
-
- buf := new(strings.Builder)
- buf.WriteString(q)
-
- for i := 0; i < len(unquoted); i++ {
- c := unquoted[i]
- if c == '"' && triple && (i+1 < len(unquoted) && unquoted[i+1] != '"' || i+2 < len(unquoted) && unquoted[i+2] != '"') {
- // Can pass up to two quotes through, because they are followed by a non-quote byte.
- buf.WriteByte(c)
- if i+1 < len(unquoted) && unquoted[i+1] == '"' {
- buf.WriteByte(c)
- i++
- }
- continue
+ buf = append(buf, '"')
+ for width := 0; len(s) > 0; s = s[width:] {
+ r := rune(s[0])
+ width = 1
+ if r >= utf8.RuneSelf {
+ r, width = utf8.DecodeRuneInString(s)
}
- if triple && c == '\n' {
- // Can allow newline in triple-quoted string.
- buf.WriteByte(c)
+ if width == 1 && r == utf8.RuneError {
+ // String (!b) literals accept \xXX escapes only for ASCII,
+ // but we must use them here to represent invalid bytes.
+ // The result is not a legal literal.
+ buf = append(buf, `\x`...)
+ buf = append(buf, hex[s[0]>>4])
+ buf = append(buf, hex[s[0]&0xF])
continue
}
- if c == '\'' {
- // Can allow ' since we always use ".
- buf.WriteByte(c)
+ if r == '"' || r == '\\' { // always backslashed
+ buf = append(buf, '\\')
+ buf = append(buf, byte(r))
continue
}
- if esc[c] != 0 {
- buf.WriteByte('\\')
- buf.WriteByte(esc[c])
+ if strconv.IsPrint(r) {
+ n := utf8.EncodeRune(runeTmp[:], r)
+ buf = append(buf, runeTmp[:n]...)
continue
}
- if c < 0x20 || c >= 0x80 {
- // BUILD files are supposed to be Latin-1, so escape all control and high bytes.
- // I'd prefer to use \x here, but Blaze does not implement
- // \x in quoted strings (b/7272572).
- buf.WriteByte('\\')
- buf.WriteByte(hex[c>>6]) // actually octal but reusing hex digits 0-7.
- buf.WriteByte(hex[(c>>3)&7])
- buf.WriteByte(hex[c&7])
- /*
- buf.WriteByte('\\')
- buf.WriteByte('x')
- buf.WriteByte(hex[c>>4])
- buf.WriteByte(hex[c&0xF])
- */
- continue
+ switch r {
+ case '\a':
+ buf = append(buf, `\a`...)
+ case '\b':
+ buf = append(buf, `\b`...)
+ case '\f':
+ buf = append(buf, `\f`...)
+ case '\n':
+ buf = append(buf, `\n`...)
+ case '\r':
+ buf = append(buf, `\r`...)
+ case '\t':
+ buf = append(buf, `\t`...)
+ case '\v':
+ buf = append(buf, `\v`...)
+ default:
+ switch {
+ case r < ' ' || r == 0x7f:
+ buf = append(buf, `\x`...)
+ buf = append(buf, hex[byte(r)>>4])
+ buf = append(buf, hex[byte(r)&0xF])
+ case r > utf8.MaxRune:
+ r = 0xFFFD
+ fallthrough
+ case r < 0x10000:
+ buf = append(buf, `\u`...)
+ for s := 12; s >= 0; s -= 4 {
+ buf = append(buf, hex[r>>uint(s)&0xF])
+ }
+ default:
+ buf = append(buf, `\U`...)
+ for s := 28; s >= 0; s -= 4 {
+ buf = append(buf, hex[r>>uint(s)&0xF])
+ }
+ }
}
- buf.WriteByte(c)
- continue
}
-
- buf.WriteString(q)
- return buf.String()
+ buf = append(buf, '"')
+ return string(buf)
}
diff --git a/syntax/quote_test.go b/syntax/quote_test.go
index f9068ee..be7498b 100644
--- a/syntax/quote_test.go
+++ b/syntax/quote_test.go
@@ -22,17 +22,14 @@ var quoteTests = []struct {
{`'quote"here'`, `quote"here`, false},
{`"quote'here"`, `quote'here`, true},
{`'quote\'here'`, `quote'here`, false},
- {`"""hello " ' world "" asdf ''' foo"""`, `hello " ' world "" asdf ''' foo`, true},
- {`"""hello
-world"""`, "hello\nworld", true},
- {`"\a\b\f\n\r\t\v\000\377"`, "\a\b\f\n\r\t\v\000\xFF", true},
- {`"\a\b\f\n\r\t\v\x00\xff"`, "\a\b\f\n\r\t\v\000\xFF", false},
- {`"\a\b\f\n\r\t\v\000\xFF"`, "\a\b\f\n\r\t\v\000\xFF", false},
- {`"\a\b\f\n\r\t\v\000\377\"'\\\003\200"`, "\a\b\f\n\r\t\v\x00\xFF\"'\\\x03\x80", true},
- {`"\a\b\f\n\r\t\v\x00\xff\"'\\\x03\x80"`, "\a\b\f\n\r\t\v\x00\xFF\"'\\\x03\x80", false},
- {`"\a\b\f\n\r\t\v\000\xFF\"'\\\x03\x80"`, "\a\b\f\n\r\t\v\x00\xFF\"'\\\x03\x80", false},
- {`"\a\b\f\n\r\t\v\000\xFF\"\\\x03\x80"`, "\a\b\f\n\r\t\v\x00\xFF\"\\\x03\x80", false},
+ {`"\a\b\f\n\r\t\v\x00\x7f"`, "\a\b\f\n\r\t\v\000\x7F", true},
+ {`"\a\b\f\n\r\t\v\x00\x7f"`, "\a\b\f\n\r\t\v\000\x7F", false},
+ {`"\a\b\f\n\r\t\v\x00\x7f"`, "\a\b\f\n\r\t\v\000\x7F", false},
+ {`"\a\b\f\n\r\t\v\x00\x7f\"'\\\x03"`, "\a\b\f\n\r\t\v\x00\x7F\"'\\\x03", true},
+ {`"\a\b\f\n\r\t\v\x00\x7f\"'\\\x03"`, "\a\b\f\n\r\t\v\x00\x7F\"'\\\x03", false},
+ {`"\a\b\f\n\r\t\v\x00\x7f\"'\\\x03"`, "\a\b\f\n\r\t\v\x00\x7F\"'\\\x03", false},
+ {`"\a\b\f\n\r\t\v\x00\x7f\"\\\x03"`, "\a\b\f\n\r\t\v\x00\x7F\"\\\x03", false},
{
`"cat $(SRCS) | grep '\\s*ip_block:' | sed -e 's/\\s*ip_block: \"\\([^ ]*\\)\"/ \x27\\1\x27,/g' >> $@; "`,
"cat $(SRCS) | grep '\\s*ip_block:' | sed -e 's/\\s*ip_block: \"\\([^ ]*\\)\"/ '\\1',/g' >> $@; ",
@@ -50,7 +47,7 @@ func TestQuote(t *testing.T) {
if !tt.std {
continue
}
- q := quote(tt.s, strings.HasPrefix(tt.q, `"""`))
+ q := Quote(tt.s, false)
if q != tt.q {
t.Errorf("quote(%#q) = %s, want %s", tt.s, q, tt.q)
}
@@ -59,7 +56,7 @@ func TestQuote(t *testing.T) {
func TestUnquote(t *testing.T) {
for _, tt := range quoteTests {
- s, triple, err := unquote(tt.q)
+ s, triple, _, err := unquote(tt.q)
wantTriple := strings.HasPrefix(tt.q, `"""`) || strings.HasPrefix(tt.q, `'''`)
if s != tt.s || triple != wantTriple || err != nil {
t.Errorf("unquote(%s) = %#q, %v, %v want %#q, %v, nil", tt.q, s, triple, err, tt.s, wantTriple)
diff --git a/syntax/scan.go b/syntax/scan.go
index a162264..bb4165e 100644
--- a/syntax/scan.go
+++ b/syntax/scan.go
@@ -35,6 +35,7 @@ const (
INT // 123
FLOAT // 1.23e45
STRING // "foo" or 'foo' or '''foo''' or r'foo' or r"foo"
+ BYTES // b"foo", etc
// Punctuation
PLUS // +
@@ -268,7 +269,7 @@ func newScanner(filename string, src interface{}, keepComments bool) (*scanner,
lineStart: true,
keepComments: keepComments,
}
- sc.readline, _ = src.(func() ([]byte, error)) // REPL only
+ sc.readline, _ = src.(func() ([]byte, error)) // ParseCompoundStmt (REPL) only
if sc.readline == nil {
data, err := readSource(filename, src)
if err != nil {
@@ -422,7 +423,7 @@ type tokenValue struct {
int int64 // decoded int
bigInt *big.Int // decoded integers > int64
float float64 // decoded float
- string string // decoded string
+ string string // decoded string or bytes
pos Position // start position of token
}
@@ -642,8 +643,15 @@ start:
// identifier or keyword
if isIdentStart(c) {
- // raw string literal
- if c == 'r' && len(sc.rest) > 1 && (sc.rest[1] == '"' || sc.rest[1] == '\'') {
+ if (c == 'r' || c == 'b') && len(sc.rest) > 1 && (sc.rest[1] == '"' || sc.rest[1] == '\'') {
+ // r"..."
+ // b"..."
+ sc.readRune()
+ c = sc.peekRune()
+ return sc.scanString(val, c)
+ } else if c == 'r' && len(sc.rest) > 2 && sc.rest[1] == 'b' && (sc.rest[2] == '"' || sc.rest[2] == '\'') {
+ // rb"..."
+ sc.readRune()
sc.readRune()
c = sc.peekRune()
return sc.scanString(val, c)
@@ -887,12 +895,16 @@ func (sc *scanner) scanString(val *tokenValue, quote rune) Token {
}
val.raw = raw.String()
- s, _, err := unquote(val.raw)
+ s, _, isByte, err := unquote(val.raw)
if err != nil {
sc.error(start, err.Error())
}
val.string = s
- return STRING
+ if isByte {
+ return BYTES
+ } else {
+ return STRING
+ }
}
func (sc *scanner) scanNumber(val *tokenValue, c rune) Token {
diff --git a/syntax/scan_test.go b/syntax/scan_test.go
index 0f2d9f2..9582bd7 100644
--- a/syntax/scan_test.go
+++ b/syntax/scan_test.go
@@ -10,6 +10,7 @@ import (
"go/build"
"io/ioutil"
"path/filepath"
+ "strings"
"testing"
)
@@ -42,8 +43,8 @@ func scan(src interface{}) (tokens string, err error) {
}
case FLOAT:
fmt.Fprintf(&buf, "%e", val.float)
- case STRING:
- fmt.Fprintf(&buf, "%q", val.string)
+ case STRING, BYTES:
+ buf.WriteString(Quote(val.string, tok == BYTES))
default:
buf.WriteString(tok.String())
}
@@ -189,9 +190,34 @@ pass`, "pass newline pass EOF"}, // consecutive newlines are consolidated
{"i = 012934", `foo.star:1:5: invalid int literal`},
// octal escapes in string literals
{`"\037"`, `"\x1f" EOF`},
- {`"\377"`, `"\xff" EOF`},
- {`"\378"`, `"\x1f8" EOF`}, // = '\37' + '8'
- {`"\400"`, `foo.star:1:1: invalid escape sequence \400`}, // unlike Python 2 and 3
+ {`"\377"`, `foo.star:1:1: non-ASCII octal escape \377 (use \u00FF for the UTF-8 encoding of U+00FF)`},
+ {`"\378"`, `"\x1f8" EOF`}, // = '\37' + '8'
+ {`"\400"`, `foo.star:1:1: non-ASCII octal escape \400`}, // unlike Python 2 and 3
+ // hex escapes
+ {`"\x00\x20\x09\x41\x7e\x7f"`, `"\x00 \tA~\x7f" EOF`}, // DEL is non-printable
+ {`"\x80"`, `foo.star:1:1: non-ASCII hex escape`},
+ {`"\xff"`, `foo.star:1:1: non-ASCII hex escape`},
+ {`"\xFf"`, `foo.star:1:1: non-ASCII hex escape`},
+ {`"\xF"`, `foo.star:1:1: truncated escape sequence \xF`},
+ {`"\x"`, `foo.star:1:1: truncated escape sequence \x`},
+ {`"\xfg"`, `foo.star:1:1: invalid escape sequence \xfg`},
+ // Unicode escapes
+ // \uXXXX
+ {`"\u0400"`, `"Ѐ" EOF`},
+ {`"\u100"`, `foo.star:1:1: truncated escape sequence \u100`},
+ {`"\u04000"`, `"Ѐ0" EOF`}, // = U+0400 + '0'
+ {`"\u100g"`, `foo.star:1:1: invalid escape sequence \u100g`},
+ {`"\u4E16"`, `"世" EOF`},
+ {`"\udc00"`, `foo.star:1:1: invalid Unicode code point U+DC00`}, // surrogate
+ // \UXXXXXXXX
+ {`"\U00000400"`, `"Ѐ" EOF`},
+ {`"\U0000400"`, `foo.star:1:1: truncated escape sequence \U0000400`},
+ {`"\U000004000"`, `"Ѐ0" EOF`}, // = U+0400 + '0'
+ {`"\U1000000g"`, `foo.star:1:1: invalid escape sequence \U1000000g`},
+ {`"\U0010FFFF"`, `"\U0010ffff" EOF`},
+ {`"\U00110000"`, `foo.star:1:1: code point out of range: \U00110000 (max \U00110000)`},
+ {`"\U0001F63F"`, `"😿" EOF`},
+ {`"\U0000dc00"`, `foo.star:1:1: invalid Unicode code point U+DC00`}, // surrogate
// backslash escapes
// As in Go, a backslash must escape something.
@@ -218,6 +244,12 @@ pass`, "pass newline pass EOF"}, // consecutive newlines are consolidated
{`r'\"'`, `"\\\"" EOF`},
{`'a\zb'`, `foo.star:1:1: invalid escape sequence \z`},
{`"\o123"`, `foo.star:1:1: invalid escape sequence \o`},
+ // bytes literals (where they differ from text strings)
+ {`b"AЀ世😿"`, `b"AЀ世😿`}, // 1-4 byte encodings, literal
+ {`b"\x41\u0400\u4e16\U0001F63F"`, `b"AЀ世😿"`}, // same, as escapes
+ {`b"\377\378\x80\xff\xFf"`, `b"\xff\x1f8\x80\xff\xff" EOF`}, // hex/oct escapes allow non-ASCII
+ {`b"\400"`, `foo.star:1:2: invalid escape sequence \400`},
+ {`b"\udc00"`, `foo.star:1:2: invalid Unicode code point U+DC00`}, // (same as string)
// floats starting with octal digits
{"012934.", `1.293400e+04 EOF`},
{"012934.1", `1.293410e+04 EOF`},
@@ -243,7 +275,9 @@ pass`, "pass newline pass EOF"}, // consecutive newlines are consolidated
if err != nil {
got = err.(Error).Error()
}
- if test.want != got {
+ // Prefix match allows us to truncate errors in expectations.
+ // Success cases all end in EOF.
+ if !strings.HasPrefix(got, test.want) {
t.Errorf("scan `%s` = [%s], want [%s]", test.input, got, test.want)
}
}
diff --git a/syntax/syntax.go b/syntax/syntax.go
index 8bbf5c0..20b28bb 100644
--- a/syntax/syntax.go
+++ b/syntax/syntax.go
@@ -251,7 +251,7 @@ func (x *Ident) Span() (start, end Position) {
// A Literal represents a literal string or number.
type Literal struct {
commentsRef
- Token Token // = STRING | INT | FLOAT
+ Token Token // = STRING | BYTES | INT | FLOAT
TokenPos Position
Raw string // uninterpreted text
Value interface{} // = string | int64 | *big.Int | float64