Diffstat (limited to 'v2/classifier.go')
 v2/classifier.go | 364 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 364 insertions(+), 0 deletions(-)
diff --git a/v2/classifier.go b/v2/classifier.go
new file mode 100644
index 0000000..4bb3e5c
--- /dev/null
+++ b/v2/classifier.go
@@ -0,0 +1,364 @@
+// Copyright 2020 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
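+// Package classifier provides methods for identifying open source licenses
+// in text content.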
+package classifier
+
+import (
+	"bytes"
+	"io"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+)
+
+// Match is the information about a single instance of a detected match.
+type Match struct {
+	Name            string  // license name, as produced by LicenseName
+	Confidence      float64 // match confidence, where 1.0 is a perfect match
+	MatchType       string  // detection category of the corpus entry that matched
+	Variant         string  // variant of the license that matched
+	StartLine       int     // first line of the match in the input
+	EndLine         int     // last line of the match in the input
+	StartTokenIndex int     // index of the first matched token
+	EndTokenIndex   int     // index of the last matched token
+}
+
+// Results captures the summary information and matches detected by the
+// classifier.
+type Results struct {
+	Matches         Matches
+	TotalInputLines int // the number of lines in the scanned input
+}
+
+// Matches is a sortable slice of Match.
+type Matches []*Match
+
+// Swap exchanges two elements of Matches.
+func (d Matches) Swap(i, j int) { d[i], d[j] = d[j], d[i] }
+
+// Len returns the number of matches.
+func (d Matches) Len() int { return len(d) }
+
+// Less orders matches by decreasing confidence, then by order of appearance,
+// with the longer license winning any remaining tie.
+func (d Matches) Less(i, j int) bool {
+	di, dj := d[i], d[j]
+	// Return matches ordered by decreasing confidence.
+	if di.Confidence != dj.Confidence {
+		return di.Confidence > dj.Confidence
+	}
+	// Matches of equal confidence are ordered by their appearance in the text.
+	if di.StartTokenIndex != dj.StartTokenIndex {
+		return di.StartTokenIndex < dj.StartTokenIndex
+	}
+	// Should never get here, but tiebreak in favor of the longer license.
+	return di.EndTokenIndex > dj.EndTokenIndex
+}
+
+// match reports the licenses from the classifier corpus that are detected
+// within the supplied content.
+func (c *Classifier) match(in io.Reader) (Results, error) {
+	id, err := tokenizeStream(in, true, c.dict, false)
+	if err != nil {
+		return Results{}, err
+	}
+
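+	// First pass: filter the corpus down to documents whose token similarity
+	// to the input meets the threshold, so the expensive search-set matching
+	// below only runs against plausible candidates.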
+	firstPass := make(map[string]*indexedDocument)
+	for l, d := range c.docs {
+		sim := id.tokenSimilarity(d)
+
+		if c.tc.traceTokenize(l) {
+			c.tc.trace("Token similarity for %s: %.2f", l, sim)
+		}
+
+		if sim >= c.threshold {
+			firstPass[l] = d
+		}
+	}
+
+	if len(firstPass) == 0 {
+		return Results{
+			Matches:         nil,
+			TotalInputLines: 0,
+		}, nil
+	}
+
+	// Perform the expensive work of generating a searchset to look for token runs.
+	id.generateSearchSet(c.q)
+
+	var candidates Matches
+	candidates = append(candidates, id.Matches...)
+
+	for l, d := range firstPass {
+		matches := c.findPotentialMatches(d.s, id.s, c.threshold)
+		for _, m := range matches {
+			startIndex := m.TargetStart
+			endIndex := m.TargetEnd
+			conf, startOffset, endOffset := c.score(l, id, d, startIndex, endIndex)
+			if conf >= c.threshold && (endIndex-startIndex-startOffset-endOffset) > 0 {
+				candidates = append(candidates, &Match{
+					Name:            LicenseName(l),
+					Variant:         variantName(l),
+					MatchType:       detectionType(l),
+					Confidence:      conf,
+					StartLine:       id.Tokens[startIndex+startOffset].Line,
+					EndLine:         id.Tokens[endIndex-endOffset-1].Line,
+					StartTokenIndex: startIndex + startOffset,
+					EndTokenIndex:   endIndex - endOffset - 1,
+				})
+			}
+		}
+	}
+	sort.Sort(candidates)
+	retain := make([]bool, len(candidates))
+	for i, c := range candidates {
+		// Filter out overlapping licenses based primarily on confidence. Since
+		// the candidates slice is ordered by confidence, we look for overlaps
+		// and decide whether to retain the record c.
+
+		// A candidate is only added to the report if no higher-quality hit
+		// contains its lines. When two licenses overlap, 'token density'
+		// breaks ties: if a less confident match of a larger license has more
+		// matching tokens than a perfect match of a smaller license, we keep
+		// the larger one. This handles licenses that include another license
+		// as a subtext; NPL contains MPL as a concrete example.
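+		// For instance (numbers here are illustrative, not measured), a
+		// 1000-token NPL candidate at confidence 0.95 carries 950
+		// token-weighted points and outranks a 200-token MPL candidate at a
+		// perfect 1.0, which carries only 200.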
+
+		keep := true
+		proposals := make(map[int]bool)
+		for j, o := range candidates {
+			if j == i {
+				break
+			}
+			// Only check containment against licenses that are still in
+			// consideration at this point.
+			if contains(c, o) && retain[j] {
+				// The license here can override a previous detection, but that
+				// isn't sufficient for it to be kept on its own. Consider the
+				// licenses Xnet, MPL-1.1, and NPL-1.1 in a file that contains just
+				// MPL-1.1. The confidence rating on NPL-1.1 will cause Xnet to not
+				// be retained, which is correct, but NPL-1.1 itself shouldn't be
+				// retained if the token confidence for MPL-1.1 is higher, since the
+				// NPL-specific bits are missing.
+
+				ctoks := float64(c.EndTokenIndex - c.StartTokenIndex)
+				otoks := float64(o.EndTokenIndex - o.StartTokenIndex)
+				cconf := ctoks * c.Confidence
+				oconf := otoks * o.Confidence
+
+				// If the two licenses have exactly the same token confidence, the
+				// detection is ambiguous; retain both so the caller can see and
+				// resolve the situation.
+				if cconf > oconf {
+					proposals[j] = false
+				} else if oconf > cconf {
+					keep = false
+				}
+			} else if overlaps(c, o) && retain[j] {
+				// If the matches only share a boundary line (the start of this
+				// match is the last line of the other), it's OK to keep both.
+				if c.StartLine != o.EndLine {
+					keep = false
+				}
+			}
+
+			if !keep {
+				break
+			}
+		}
+		if keep {
+			retain[i] = true
+			for p, v := range proposals {
+				retain[p] = v
+			}
+		}
+	}
+
+	var out Matches
+	for i, keep := range retain {
+		if keep {
+			out = append(out, candidates[i])
+		}
+	}
+	return Results{
+		Matches:         out,
+		TotalInputLines: id.Tokens[len(id.Tokens)-1].Line,
+	}, nil
+}
+
+// Classifier provides methods for identifying open source licenses in text
+// content.
+type Classifier struct {
+	tc        *TraceConfiguration
+	dict      *dictionary
+	docs      map[string]*indexedDocument
+	threshold float64
+	q         int // The value of q for q-grams in this corpus
+}
+
+// NewClassifier creates a classifier with an empty corpus.
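+//
+// A typical flow looks like the following sketch; the threshold value and
+// license directory here are illustrative, not values mandated by this
+// package:
+//
+//	c := NewClassifier(.8)
+//	if err := c.LoadLicenses("./licenses"); err != nil {
+//		// handle error
+//	}
+//	results := c.Match(contents)
+//	for _, m := range results.Matches {
+//		fmt.Printf("%s (%s/%s) %.2f lines %d-%d\n",
+//			m.Name, m.MatchType, m.Variant, m.Confidence, m.StartLine, m.EndLine)
+//	}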
+func NewClassifier(threshold float64) *Classifier {
+	classifier := &Classifier{
+		tc:        new(TraceConfiguration),
+		dict:      newDictionary(),
+		docs:      make(map[string]*indexedDocument),
+		threshold: threshold,
+		q:         computeQ(threshold),
+	}
+	return classifier
+}
+
+// Normalize takes input content and applies the following transforms to aid in
+// identifying license content. The return value of this function is
+// line-separated text which is the basis for position values returned by the
+// classifier.
+//
+// 1. Breaks up long lines of text. This helps with detecting licenses like in
+// TODO(wcn):URL reference
+//
+// 2. Certain ignorable texts are removed to aid matching blocks of text.
+// Introductory lines such as "The MIT License" are removed, as are copyright
+// notices, since the parties involved are variable and shouldn't impact
+// matching.
+//
+// It is NOT necessary to call this function to simply identify licenses in a
+// file. It should only be called when presenting this information to the user
+// in context (for example, when creating diffs against canonical licenses).
+//
+// It is an invariant of the classifier that calling Match(Normalize(in)) will
+// return the same results as Match(in).
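+//
+// For example:
+//
+//	normalized := c.Normalize(contents)
+//	// c.Match(normalized) reports the same matches as c.Match(contents).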
+func (c *Classifier) Normalize(in []byte) []byte {
+	doc, err := tokenizeStream(bytes.NewReader(in), false, c.dict, true)
+	if err != nil {
+		panic("should not be reachable, since bytes.NewReader().Read() should never fail")
+	}
+
+	var buf bytes.Buffer
+
+	switch len(doc.Tokens) {
+	case 0:
+		return nil
+	case 1:
+		buf.WriteString(c.dict.getWord(doc.Tokens[0].ID))
+		return buf.Bytes()
+	}
+
+	prevLine := 1
+	buf.WriteString(c.dict.getWord(doc.Tokens[0].ID))
+	for _, t := range doc.Tokens[1:] {
+		// Only write out an EOL if this token moved to the next line.
+		if t.Line == prevLine+1 {
+			buf.WriteString(eol)
+		}
+
+		// Only write out tokens that aren't themselves EOL markers.
+		txt := c.dict.getWord(t.ID)
+
+		if txt != eol {
+			// Only put a space between tokens if the previous token was on the
+			// same line. This prevents emitting spaces after an EOL.
+			if t.Line == prevLine {
+				buf.WriteString(" ")
+			}
+			buf.WriteString(txt)
+		}
+
+		prevLine = t.Line
+	}
+	return buf.Bytes()
+}
+
+// LoadLicenses adds the contents of the supplied directory to the corpus of the
+// classifier.
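+// Each license is expected to be a text file at a path of the form
+// {dir}/{category}/{name}/{variant}.txt; those path segments supply the
+// MatchType, Name, and Variant values reported on a Match. (This layout is
+// inferred from the path handling below.)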
+func (c *Classifier) LoadLicenses(dir string) error {
+	var files []string
+	err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
+		if err != nil {
+			return nil
+		}
+		if !strings.HasSuffix(path, ".txt") {
+			return nil
+		}
+		files = append(files, path)
+		return nil
+	})
+	if err != nil {
+		return err
+	}
+
+	for _, f := range files {
+		relativePath := strings.Replace(f, dir, "", 1)
+		sep := string(os.PathSeparator)
+		segments := strings.Split(relativePath, sep)
+		// When dir has no trailing separator, relativePath begins with one, so
+		// a well-formed {category}/{name}/{variant}.txt path yields at least
+		// four segments.
+		if len(segments) < 4 {
+			c.tc.trace("Insufficient segment count for path: %s", relativePath)
+			continue
+		}
+		category, name, variant := segments[1], segments[2], segments[3]
+		b, err := ioutil.ReadFile(f)
+		if err != nil {
+			return err
+		}
+
+		c.AddContent(category, name, variant, b)
+	}
+	return nil
+}
+
+// SetTraceConfiguration installs a tracing configuration for the classifier.
+func (c *Classifier) SetTraceConfiguration(in *TraceConfiguration) {
+	c.tc = in
+	c.tc.init()
+}
+
+// Match finds matches within an unknown text. This will not modify the contents
+// of the supplied byte slice.
+func (c *Classifier) Match(in []byte) Results {
+	// Since bytes.NewReader().Read() will never return an error, tokenizeStream
+	// will never return an error, so it's okay to ignore the return value in
+	// this case.
+	res, _ := c.MatchFrom(bytes.NewReader(in))
+	return res
+}
+
+// MatchFrom finds matches within the read content.
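+// It is useful when the content is streamed rather than held in memory, for
+// example (an illustrative sketch):
+//
+//	f, err := os.Open("some/input/file")
+//	if err != nil {
+//		// handle error
+//	}
+//	defer f.Close()
+//	res, err := c.MatchFrom(f)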
+func (c *Classifier) MatchFrom(in io.Reader) (Results, error) {
+	return c.match(in)
+}
+
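+// detectionType returns the detection category of a corpus entry, which is
+// the first segment of its internal {category}/{name}/{variant} key.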
+func detectionType(in string) string {
+	splits := strings.Split(in, string(os.PathSeparator))
+	return splits[0]
+}
+
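+// variantName returns the variant segment of a corpus entry's internal
+// {category}/{name}/{variant} key.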
+func variantName(in string) string {
+	splits := strings.Split(in, string(os.PathSeparator))
+	return splits[2]
+}
+
+// LicenseName produces the output name for a license, stripping the category
+// and variant segments of the internal key in use.
+func LicenseName(in string) string {
+	splits := strings.Split(in, string(os.PathSeparator))
+	return splits[1]
+}
+
+// contains returns true iff the lines covered by b lie completely within the
+// lines covered by a.
+func contains(a, b *Match) bool {
+	return a.StartLine <= b.StartLine && a.EndLine >= b.EndLine
+}
+
+// between returns true iff b <= a <= c.
+func between(a, b, c int) bool {
+	return b <= a && a <= c
+}
+
+// overlaps returns true iff the line ranges covered by a and b overlap.
+func overlaps(a, b *Match) bool {
+	return between(a.StartLine, b.StartLine, b.EndLine) || between(a.EndLine, b.StartLine, b.EndLine)
+}