diff options
Diffstat (limited to 'internal/fuzzy/symbol.go')
-rw-r--r-- | internal/fuzzy/symbol.go | 237 |
1 files changed, 237 insertions, 0 deletions
diff --git a/internal/fuzzy/symbol.go b/internal/fuzzy/symbol.go new file mode 100644 index 000000000..073a4cd10 --- /dev/null +++ b/internal/fuzzy/symbol.go @@ -0,0 +1,237 @@ +// Copyright 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package fuzzy + +import ( + "unicode" +) + +// SymbolMatcher implements a fuzzy matching algorithm optimized for Go symbols +// of the form: +// +// example.com/path/to/package.object.field +// +// Knowing that we are matching symbols like this allows us to make the +// following optimizations: +// - We can incorporate right-to-left relevance directly into the score +// calculation. +// - We can match from right to left, discarding leading bytes if the input is +// too long. +// - We just take the right-most match without losing too much precision. This +// allows us to use an O(n) algorithm. +// - We can operate directly on chunked strings; in many cases we will +// be storing the package path and/or package name separately from the +// symbol or identifiers, so doing this avoids allocating strings. +// - We can return the index of the right-most match, allowing us to trim +// irrelevant qualification. +// +// This implementation is experimental, serving as a reference fast algorithm +// to compare to the fuzzy algorithm implemented by Matcher. +type SymbolMatcher struct { + // Using buffers of length 256 is both a reasonable size for most qualified + // symbols, and makes it easy to avoid bounds checks by using uint8 indexes. + pattern [256]rune + patternLen uint8 + inputBuffer [256]rune // avoid allocating when considering chunks + roles [256]uint32 // which roles does a rune play (word start, etc.) + segments [256]uint8 // how many segments from the right is each rune +} + +const ( + segmentStart uint32 = 1 << iota + wordStart + separator +) + +// NewSymbolMatcher creates a SymbolMatcher that may be used to match the given +// search pattern. +// +// Currently this matcher only accepts case-insensitive fuzzy patterns. +// +// An empty pattern matches no input. +func NewSymbolMatcher(pattern string) *SymbolMatcher { + m := &SymbolMatcher{} + for _, p := range pattern { + m.pattern[m.patternLen] = unicode.ToLower(p) + m.patternLen++ + if m.patternLen == 255 || int(m.patternLen) == len(pattern) { + // break at 255 so that we can represent patternLen with a uint8. + break + } + } + return m +} + +// Match looks for the right-most match of the search pattern within the symbol +// represented by concatenating the given chunks, returning its offset and +// score. +// +// If a match is found, the first return value will hold the absolute byte +// offset within all chunks for the start of the symbol. In other words, the +// index of the match within strings.Join(chunks, ""). If no match is found, +// the first return value will be -1. +// +// The second return value will be the score of the match, which is always +// between 0 and 1, inclusive. A score of 0 indicates no match. +func (m *SymbolMatcher) Match(chunks []string) (int, float64) { + // Explicit behavior for an empty pattern. + // + // As a minor optimization, this also avoids nilness checks later on, since + // the compiler can prove that m != nil. + if m.patternLen == 0 { + return -1, 0 + } + + // First phase: populate the input buffer with lower-cased runes. + // + // We could also check for a forward match here, but since we'd have to write + // the entire input anyway this has negligible impact on performance. + + var ( + inputLen = uint8(0) + modifiers = wordStart | segmentStart + ) + +input: + for _, chunk := range chunks { + for _, r := range chunk { + if r == '.' || r == '/' { + modifiers |= separator + } + // optimization: avoid calls to unicode.ToLower, which can't be inlined. + l := r + if r <= unicode.MaxASCII { + if 'A' <= r && r <= 'Z' { + l = r + 'a' - 'A' + } + } else { + l = unicode.ToLower(r) + } + if l != r { + modifiers |= wordStart + } + m.inputBuffer[inputLen] = l + m.roles[inputLen] = modifiers + inputLen++ + if m.roles[inputLen-1]&separator != 0 { + modifiers = wordStart | segmentStart + } else { + modifiers = 0 + } + // TODO: we should prefer the right-most input if it overflows, rather + // than the left-most as we're doing here. + if inputLen == 255 { + break input + } + } + } + + // Second phase: find the right-most match, and count segments from the + // right. + + var ( + pi = uint8(m.patternLen - 1) // pattern index + p = m.pattern[pi] // pattern rune + start = -1 // start offset of match + rseg = uint8(0) + ) + const maxSeg = 3 // maximum number of segments from the right to count, for scoring purposes. + + for ii := inputLen - 1; ; ii-- { + r := m.inputBuffer[ii] + if rseg < maxSeg && m.roles[ii]&separator != 0 { + rseg++ + } + m.segments[ii] = rseg + if p == r { + if pi == 0 { + start = int(ii) + break + } + pi-- + p = m.pattern[pi] + } + // Don't check ii >= 0 in the loop condition: ii is a uint8. + if ii == 0 { + break + } + } + + if start < 0 { + // no match: skip scoring + return -1, 0 + } + + // Third phase: find the shortest match, and compute the score. + + // Score is the average score for each character. + // + // A character score is the multiple of: + // 1. 1.0 if the character starts a segment, .8 if the character start a + // mid-segment word, otherwise 0.6. This carries over to immediately + // following characters. + // 2. For the final character match, the multiplier from (1) is reduced to + // .8 if the next character in the input is a mid-segment word, or 0.6 if + // the next character in the input is not a word or segment start. This + // ensures that we favor whole-word or whole-segment matches over prefix + // matches. + // 3. 1.0 if the character is part of the last segment, otherwise + // 1.0-.2*<segments from the right>, with a max segment count of 3. + // + // This is a very naive algorithm, but it is fast. There's lots of prior art + // here, and we should leverage it. For example, we could explicitly consider + // character distance, and exact matches of words or segments. + // + // Also note that this might not actually find the highest scoring match, as + // doing so could require a non-linear algorithm, depending on how the score + // is calculated. + + pi = 0 + p = m.pattern[pi] + + const ( + segStreak = 1.0 + wordStreak = 0.8 + noStreak = 0.6 + perSegment = 0.2 // we count at most 3 segments above + ) + + streakBonus := noStreak + totScore := 0.0 + for ii := uint8(start); ii < inputLen; ii++ { + r := m.inputBuffer[ii] + if r == p { + pi++ + p = m.pattern[pi] + // Note: this could be optimized with some bit operations. + switch { + case m.roles[ii]&segmentStart != 0 && segStreak > streakBonus: + streakBonus = segStreak + case m.roles[ii]&wordStart != 0 && wordStreak > streakBonus: + streakBonus = wordStreak + } + finalChar := pi >= m.patternLen + // finalCost := 1.0 + if finalChar && streakBonus > noStreak { + switch { + case ii == inputLen-1 || m.roles[ii+1]&segmentStart != 0: + // Full segment: no reduction + case m.roles[ii+1]&wordStart != 0: + streakBonus = wordStreak + default: + streakBonus = noStreak + } + } + totScore += streakBonus * (1.0 - float64(m.segments[ii])*perSegment) + if finalChar { + break + } + } else { + streakBonus = noStreak + } + } + + return start, totScore / float64(m.patternLen) +} |