diff options
Diffstat (limited to 'v2/classifier.go')
-rw-r--r-- | v2/classifier.go | 364 |
1 file changed, 364 insertions(+), 0 deletions(-)
diff --git a/v2/classifier.go b/v2/classifier.go new file mode 100644 index 0000000..4bb3e5c --- /dev/null +++ b/v2/classifier.go @@ -0,0 +1,364 @@ +// Copyright 2020 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package classifier + +import ( + "bytes" + "fmt" + "io" + "io/ioutil" + "os" + "path/filepath" + "sort" + "strings" +) + +// Match is the information about a single instance of a detected match. +type Match struct { + Name string + Confidence float64 + MatchType string + Variant string + StartLine int + EndLine int + StartTokenIndex int + EndTokenIndex int +} + +// Results captures the summary information and matches detected by the +// classifier. +type Results struct { + Matches Matches + TotalInputLines int +} + +// Matches is a sortable slice of Match. +type Matches []*Match + +// Swap two elements of Matches. +func (d Matches) Swap(i, j int) { d[i], d[j] = d[j], d[i] } +func (d Matches) Len() int { return len(d) } +func (d Matches) Less(i, j int) bool { + di, dj := d[i], d[j] + // Return matches ordered by confidence + if di.Confidence != dj.Confidence { + return di.Confidence > dj.Confidence + } + // Licenses of same confidence are ordered by their appearance + if di.StartTokenIndex != dj.StartTokenIndex { + return di.StartTokenIndex < dj.StartTokenIndex + } + // Should never get here, but tiebreak based on the larger license. 
+ return di.EndTokenIndex > dj.EndTokenIndex +} + +// Match reports instances of the supplied content in the corpus. +func (c *Classifier) match(in io.Reader) (Results, error) { + id, err := tokenizeStream(in, true, c.dict, false) + if err != nil { + return Results{}, err + } + + firstPass := make(map[string]*indexedDocument) + for l, d := range c.docs { + sim := id.tokenSimilarity(d) + + if c.tc.traceTokenize(l) { + c.tc.trace("Token similarity for %s: %.2f", l, sim) + } + + if sim >= c.threshold { + firstPass[l] = d + } + } + + if len(firstPass) == 0 { + return Results{ + Matches: nil, + TotalInputLines: 0, + }, nil + } + + // Perform the expensive work of generating a searchset to look for token runs. + id.generateSearchSet(c.q) + + var candidates Matches + candidates = append(candidates, id.Matches...) + + for l, d := range firstPass { + matches := c.findPotentialMatches(d.s, id.s, c.threshold) + for _, m := range matches { + startIndex := m.TargetStart + endIndex := m.TargetEnd + conf, startOffset, endOffset := c.score(l, id, d, startIndex, endIndex) + if conf >= c.threshold && (endIndex-startIndex-startOffset-endOffset) > 0 { + candidates = append(candidates, &Match{ + Name: LicenseName(l), + Variant: variantName(l), + MatchType: detectionType(l), + Confidence: conf, + StartLine: id.Tokens[startIndex+startOffset].Line, + EndLine: id.Tokens[endIndex-endOffset-1].Line, + StartTokenIndex: startIndex + startOffset, + EndTokenIndex: endIndex - endOffset - 1, + }) + } + + } + } + sort.Sort(candidates) + retain := make([]bool, len(candidates)) + for i, c := range candidates { + // Filter out overlapping licenses based primarily on confidence. Since + // the candidates slice is ordered by confidence, we look for overlaps and + // decide if we retain the record c. + + // For each candidate, only add it to the report unless we have a + // higher-quality hit that contains these lines. 
In the case of two + // licenses having overlap, we consider 'token density' to break ties. If a + // less confident match of a larger license has more matching tokens than a + // perfect match of a smaller license, we want to keep that. This handles + // licenses that include another license as a subtext. NPL contains MPL + // as a concrete example. + + keep := true + proposals := make(map[int]bool) + for j, o := range candidates { + if j == i { + break + } + // Make sure to only check containment on licenses that are still in consideration at this point. + if contains(c, o) && retain[j] { + // The license here can override a previous detection, but that isn't sufficient to be kept + // on its own. Consider the licenses Xnet, MPL-1.1 and NPL-1.1 in a file that just has MPL-1.1. + // The confidence rating on NPL-1.1 will cause Xnet to not be retained, which is correct, but it + // shouldn't be retained if the token confidence for MPL is higher than NPL since the NPL-specific + // bits are missing. + + ctoks := float64(c.EndTokenIndex - c.StartTokenIndex) + otoks := float64(o.EndTokenIndex - o.StartTokenIndex) + cconf := ctoks * c.Confidence + oconf := otoks * o.Confidence + + // If the two licenses are exactly the same confidence, that means we + // have an ambiguous detect and should retain both, so the caller can + // see and resolve the situation. 
+ if cconf > oconf { + proposals[j] = false + } else if oconf > cconf { + keep = false + } + } else if overlaps(c, o) && retain[j] { + // if the ending and start lines exactly overlap, it's OK to keep both + if c.StartLine != o.EndLine { + keep = false + } + } + + if !keep { + break + } + } + if keep { + retain[i] = true + for p, v := range proposals { + retain[p] = v + } + } + } + + var out Matches + for i, keep := range retain { + if keep { + out = append(out, candidates[i]) + } + } + return Results{ + Matches: out, + TotalInputLines: id.Tokens[len(id.Tokens)-1].Line, + }, nil +} + +// Classifier provides methods for identifying open source licenses in text +// content. +type Classifier struct { + tc *TraceConfiguration + dict *dictionary + docs map[string]*indexedDocument + threshold float64 + q int // The value of q for q-grams in this corpus +} + +// NewClassifier creates a classifier with an empty corpus. +func NewClassifier(threshold float64) *Classifier { + classifier := &Classifier{ + tc: new(TraceConfiguration), + dict: newDictionary(), + docs: make(map[string]*indexedDocument), + threshold: threshold, + q: computeQ(threshold), + } + return classifier +} + +// Normalize takes input content and applies the following transforms to aid in +// identifying license content. The return value of this function is +// line-separated text which is the basis for position values returned by the +// classifier. +// +// 1. Breaks up long lines of text. This helps with detecting licenses like in +// TODO(wcn):URL reference +// +// 2. Certain ignorable texts are removed to aid matching blocks of text. +// Introductory lines such as "The MIT License" are removed. Copyright notices +// are removed since the parties are variable and shouldn't impact matching. +// +// It is NOT necessary to call this function to simply identify licenses in a +// file. 
It should only be called to aid presenting this information to the user +// in context (for example, creating diffs of differences to canonical +// licenses). +// +// It is an invariant of the classifier that calling Match(Normalize(in)) will +// return the same results as Match(in). +func (c *Classifier) Normalize(in []byte) []byte { + doc, err := tokenizeStream(bytes.NewReader(in), false, c.dict, true) + if err != nil { + panic("should not be reachable, since bytes.NewReader().Read() should never fail") + } + + var buf bytes.Buffer + + switch len(doc.Tokens) { + case 0: + return nil + case 1: + buf.WriteString(c.dict.getWord(doc.Tokens[0].ID)) + return buf.Bytes() + } + + prevLine := 1 + buf.WriteString(c.dict.getWord(doc.Tokens[0].ID)) + for _, t := range doc.Tokens[1:] { + // Only write out an EOL token that incremented the line + if t.Line == prevLine+1 { + buf.WriteString(eol) + } + + // Only write tokens that aren't EOL + txt := c.dict.getWord(t.ID) + + if txt != eol { + // Only put a space between tokens if the previous token was on the same + // line. This prevents spaces after an EOL + if t.Line == prevLine { + buf.WriteString(" ") + } + buf.WriteString(txt) + } + + prevLine = t.Line + } + return buf.Bytes() +} + +// LoadLicenses adds the contents of the supplied directory to the corpus of the +// classifier. 
+func (c *Classifier) LoadLicenses(dir string) error { + var files []string + err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { + if err != nil { + return nil + } + if !strings.HasSuffix(path, "txt") { + return nil + } + files = append(files, path) + return nil + }) + if err != nil { + return err + } + + for _, f := range files { + relativePath := strings.Replace(f, dir, "", 1) + sep := fmt.Sprintf("%c", os.PathSeparator) + segments := strings.Split(relativePath, sep) + if len(segments) < 3 { + c.tc.trace("Insufficient segment count for path: %s", relativePath) + continue + } + category, name, variant := segments[1], segments[2], segments[3] + b, err := ioutil.ReadFile(f) + if err != nil { + return err + } + + c.AddContent(category, name, variant, []byte(string(b))) + } + return nil +} + +// SetTraceConfiguration installs a tracing configuration for the classifier. +func (c *Classifier) SetTraceConfiguration(in *TraceConfiguration) { + c.tc = in + c.tc.init() +} + +// Match finds matches within an unknown text. This will not modify the contents +// of the supplied byte slice. +func (c *Classifier) Match(in []byte) Results { + // Since bytes.NewReader().Read() will never return an error, tokenizeStream + // will never return an error so it's okay to ignore the return value in this + // case. + res, _ := c.MatchFrom(bytes.NewReader(in)) + return res +} + +// MatchFrom finds matches within the read content. +func (c *Classifier) MatchFrom(in io.Reader) (Results, error) { + return c.match(in) +} + +func detectionType(in string) string { + splits := strings.Split(in, fmt.Sprintf("%c", os.PathSeparator)) + return splits[0] +} + +func variantName(in string) string { + splits := strings.Split(in, fmt.Sprintf("%c", os.PathSeparator)) + return splits[2] +} + +// LicenseName produces the output name for a license, removing the internal structure +// of the filename in use. 
+func LicenseName(in string) string { + splits := strings.Split(in, fmt.Sprintf("%c", os.PathSeparator)) + return splits[1] +} + +// contains returns true iff b is completely inside a +func contains(a, b *Match) bool { + return a.StartLine <= b.StartLine && a.EndLine >= b.EndLine +} + +// returns true iff b <= a <= c +func between(a, b, c int) bool { + return b <= a && a <= c +} + +// returns true iff the ranges covered by a and b overlap. +func overlaps(a, b *Match) bool { + return between(a.StartLine, b.StartLine, b.EndLine) || between(a.EndLine, b.StartLine, b.EndLine) +} |