diff options
Diffstat (limited to 'gopls/internal/lsp/glob/glob.go')
-rw-r--r-- | gopls/internal/lsp/glob/glob.go | 349 |
1 files changed, 349 insertions, 0 deletions
diff --git a/gopls/internal/lsp/glob/glob.go b/gopls/internal/lsp/glob/glob.go new file mode 100644 index 000000000..a540ebefa --- /dev/null +++ b/gopls/internal/lsp/glob/glob.go @@ -0,0 +1,349 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package glob implements an LSP-compliant glob pattern matcher for testing. +package glob + +import ( + "errors" + "fmt" + "strings" + "unicode/utf8" +) + +// A Glob is an LSP-compliant glob pattern, as defined by the spec: +// https://microsoft.github.io/language-server-protocol/specifications/lsp/3.17/specification/#documentFilter +// +// NOTE: this implementation is currently only intended for testing. In order +// to make it production ready, we'd need to: +// - verify it against the VS Code implementation +// - add more tests +// - microbenchmark, likely avoiding the element interface +// - resolve the question of what is meant by "character". If it's a UTF-16 +// code (as we suspect) it'll be a bit more work. +// +// Quoting from the spec: +// Glob patterns can have the following syntax: +// - `*` to match one or more characters in a path segment +// - `?` to match on one character in a path segment +// - `**` to match any number of path segments, including none +// - `{}` to group sub patterns into an OR expression. (e.g. `**/*.{ts,js}` +// matches all TypeScript and JavaScript files) +// - `[]` to declare a range of characters to match in a path segment +// (e.g., `example.[0-9]` to match on `example.0`, `example.1`, …) +// - `[!...]` to negate a range of characters to match in a path segment +// (e.g., `example.[!0-9]` to match on `example.a`, `example.b`, but +// not `example.0`) +// +// Expanding on this: +// - '/' matches one or more literal slashes. +// - any other character matches itself literally. +type Glob struct { + elems []element // pattern elements +} + +// Parse builds a Glob for the given pattern, returning an error if the pattern +// is invalid. +func Parse(pattern string) (*Glob, error) { + g, _, err := parse(pattern, false) + return g, err +} + +func parse(pattern string, nested bool) (*Glob, string, error) { + g := new(Glob) + for len(pattern) > 0 { + switch pattern[0] { + case '/': + pattern = pattern[1:] + g.elems = append(g.elems, slash{}) + + case '*': + if len(pattern) > 1 && pattern[1] == '*' { + if (len(g.elems) > 0 && g.elems[len(g.elems)-1] != slash{}) || (len(pattern) > 2 && pattern[2] != '/') { + return nil, "", errors.New("** may only be adjacent to '/'") + } + pattern = pattern[2:] + g.elems = append(g.elems, starStar{}) + break + } + pattern = pattern[1:] + g.elems = append(g.elems, star{}) + + case '?': + pattern = pattern[1:] + g.elems = append(g.elems, anyChar{}) + + case '{': + var gs group + for pattern[0] != '}' { + pattern = pattern[1:] + g, pat, err := parse(pattern, true) + if err != nil { + return nil, "", err + } + if len(pat) == 0 { + return nil, "", errors.New("unmatched '{'") + } + pattern = pat + gs = append(gs, g) + } + pattern = pattern[1:] + g.elems = append(g.elems, gs) + + case '}', ',': + if nested { + return g, pattern, nil + } + pattern = g.parseLiteral(pattern, false) + + case '[': + pattern = pattern[1:] + if len(pattern) == 0 { + return nil, "", errBadRange + } + negate := false + if pattern[0] == '!' { + pattern = pattern[1:] + negate = true + } + low, sz, err := readRangeRune(pattern) + if err != nil { + return nil, "", err + } + pattern = pattern[sz:] + if len(pattern) == 0 || pattern[0] != '-' { + return nil, "", errBadRange + } + pattern = pattern[1:] + high, sz, err := readRangeRune(pattern) + if err != nil { + return nil, "", err + } + pattern = pattern[sz:] + if len(pattern) == 0 || pattern[0] != ']' { + return nil, "", errBadRange + } + pattern = pattern[1:] + g.elems = append(g.elems, charRange{negate, low, high}) + + default: + pattern = g.parseLiteral(pattern, nested) + } + } + return g, "", nil +} + +// helper for decoding a rune in range elements, e.g. [a-z] +func readRangeRune(input string) (rune, int, error) { + r, sz := utf8.DecodeRuneInString(input) + var err error + if r == utf8.RuneError { + // See the documentation for DecodeRuneInString. + switch sz { + case 0: + err = errBadRange + case 1: + err = errInvalidUTF8 + } + } + return r, sz, err +} + +var ( + errBadRange = errors.New("'[' patterns must be of the form [x-y]") + errInvalidUTF8 = errors.New("invalid UTF-8 encoding") +) + +func (g *Glob) parseLiteral(pattern string, nested bool) string { + var specialChars string + if nested { + specialChars = "*?{[/}," + } else { + specialChars = "*?{[/" + } + end := strings.IndexAny(pattern, specialChars) + if end == -1 { + end = len(pattern) + } + g.elems = append(g.elems, literal(pattern[:end])) + return pattern[end:] +} + +func (g *Glob) String() string { + var b strings.Builder + for _, e := range g.elems { + fmt.Fprint(&b, e) + } + return b.String() +} + +// element holds a glob pattern element, as defined below. +type element fmt.Stringer + +// element types. +type ( + slash struct{} // One or more '/' separators + literal string // string literal, not containing /, *, ?, {}, or [] + star struct{} // * + anyChar struct{} // ? + starStar struct{} // ** + group []*Glob // {foo, bar, ...} grouping + charRange struct { // [a-z] character range + negate bool + low, high rune + } +) + +func (s slash) String() string { return "/" } +func (l literal) String() string { return string(l) } +func (s star) String() string { return "*" } +func (a anyChar) String() string { return "?" } +func (s starStar) String() string { return "**" } +func (g group) String() string { + var parts []string + for _, g := range g { + parts = append(parts, g.String()) + } + return "{" + strings.Join(parts, ",") + "}" +} +func (r charRange) String() string { + return "[" + string(r.low) + "-" + string(r.high) + "]" +} + +// Match reports whether the input string matches the glob pattern. +func (g *Glob) Match(input string) bool { + return match(g.elems, input) +} + +func match(elems []element, input string) (ok bool) { + var elem interface{} + for len(elems) > 0 { + elem, elems = elems[0], elems[1:] + switch elem := elem.(type) { + case slash: + if len(input) == 0 || input[0] != '/' { + return false + } + for input[0] == '/' { + input = input[1:] + } + + case starStar: + // Special cases: + // - **/a matches "a" + // - **/ matches everything + // + // Note that if ** is followed by anything, it must be '/' (this is + // enforced by Parse). + if len(elems) > 0 { + elems = elems[1:] + } + + // A trailing ** matches anything. + if len(elems) == 0 { + return true + } + + // Backtracking: advance pattern segments until the remaining pattern + // elements match. + for len(input) != 0 { + if match(elems, input) { + return true + } + _, input = split(input) + } + return false + + case literal: + if !strings.HasPrefix(input, string(elem)) { + return false + } + input = input[len(elem):] + + case star: + var segInput string + segInput, input = split(input) + + elemEnd := len(elems) + for i, e := range elems { + if e == (slash{}) { + elemEnd = i + break + } + } + segElems := elems[:elemEnd] + elems = elems[elemEnd:] + + // A trailing * matches the entire segment. + if len(segElems) == 0 { + break + } + + // Backtracking: advance characters until remaining subpattern elements + // match. + matched := false + for i := range segInput { + if match(segElems, segInput[i:]) { + matched = true + break + } + } + if !matched { + return false + } + + case anyChar: + if len(input) == 0 || input[0] == '/' { + return false + } + input = input[1:] + + case group: + // Append remaining pattern elements to each group member looking for a + // match. + var branch []element + for _, m := range elem { + branch = branch[:0] + branch = append(branch, m.elems...) + branch = append(branch, elems...) + if match(branch, input) { + return true + } + } + return false + + case charRange: + if len(input) == 0 || input[0] == '/' { + return false + } + c, sz := utf8.DecodeRuneInString(input) + if c < elem.low || c > elem.high { + return false + } + input = input[sz:] + + default: + panic(fmt.Sprintf("segment type %T not implemented", elem)) + } + } + + return len(input) == 0 +} + +// split returns the portion before and after the first slash +// (or sequence of consecutive slashes). If there is no slash +// it returns (input, nil). +func split(input string) (first, rest string) { + i := strings.IndexByte(input, '/') + if i < 0 { + return input, "" + } + first = input[:i] + for j := i; j < len(input); j++ { + if input[j] != '/' { + return first, input[j:] + } + } + return first, "" +} |