Diffstat (limited to 'v2/classifier.go')
 v2/classifier.go | 364 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 364 insertions(+), 0 deletions(-)
diff --git a/v2/classifier.go b/v2/classifier.go
new file mode 100644
index 0000000..4bb3e5c
--- /dev/null
+++ b/v2/classifier.go
@@ -0,0 +1,364 @@
+// Copyright 2020 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
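+// Package classifier provides methods for identifying open source licenses
+// in text content.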
+package classifier
+
+import (
+	"bytes"
+	"io"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+)
+
+// Match is the information about a single instance of a detected match.
+type Match struct {
+	Name            string  // license name, as produced by LicenseName
+	Confidence      float64 // match confidence, where 1.0 is a perfect match
+	MatchType       string  // detection category of the corpus entry that matched
+	Variant         string  // variant of the license that matched
+	StartLine       int     // first line of the match in the input
+	EndLine         int     // last line of the match in the input
+	StartTokenIndex int     // index of the first matched token
+	EndTokenIndex   int     // index of the last matched token
+}
+
+// Results captures the summary information and matches detected by the
+// classifier.
+type Results struct {
+	Matches         Matches
+	TotalInputLines int // the number of lines in the scanned input
+}
+
+// Matches is a sortable slice of Match.
+type Matches []*Match
+
+// Swap exchanges two elements of Matches.
+func (d Matches) Swap(i, j int) { d[i], d[j] = d[j], d[i] }
+
+// Len returns the number of matches.
+func (d Matches) Len() int { return len(d) }
+
+// Less orders matches by decreasing confidence, then by order of appearance,
+// with the longer license winning any remaining tie.
+func (d Matches) Less(i, j int) bool {
+	di, dj := d[i], d[j]
+	// Return matches ordered by decreasing confidence.
+	if di.Confidence != dj.Confidence {
+		return di.Confidence > dj.Confidence
+	}
+	// Matches of equal confidence are ordered by their appearance in the text.
+	if di.StartTokenIndex != dj.StartTokenIndex {
+		return di.StartTokenIndex < dj.StartTokenIndex
+	}
+	// Should never get here, but tiebreak in favor of the longer license.
+	return di.EndTokenIndex > dj.EndTokenIndex
+}
+
+// match reports the licenses from the classifier corpus that are detected
+// within the supplied content.
+func (c *Classifier) match(in io.Reader) (Results, error) {
+	id, err := tokenizeStream(in, true, c.dict, false)
+	if err != nil {
+		return Results{}, err
+	}
+
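+	// First pass: filter the corpus down to documents whose token similarity
+	// to the input meets the threshold, so the expensive search-set matching
+	// below only runs against plausible candidates.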
+	firstPass := make(map[string]*indexedDocument)
+	for l, d := range c.docs {
+		sim := id.tokenSimilarity(d)
+
+		if c.tc.traceTokenize(l) {
+			c.tc.trace("Token similarity for %s: %.2f", l, sim)
+		}
+
+		if sim >= c.threshold {
+			firstPass[l] = d
+		}
+	}
+
+	if len(firstPass) == 0 {
+		return Results{
+			Matches:         nil,
+			TotalInputLines: 0,
+		}, nil
+	}
+
+	// Perform the expensive work of generating a searchset to look for token runs.
+	id.generateSearchSet(c.q)
+
+	var candidates Matches
+	candidates = append(candidates, id.Matches...)
+
+	for l, d := range firstPass {
+		matches := c.findPotentialMatches(d.s, id.s, c.threshold)
+		for _, m := range matches {
+			startIndex := m.TargetStart
+			endIndex := m.TargetEnd
+			conf, startOffset, endOffset := c.score(l, id, d, startIndex, endIndex)
+			if conf >= c.threshold && (endIndex-startIndex-startOffset-endOffset) > 0 {
+				candidates = append(candidates, &Match{
+					Name:            LicenseName(l),
+					Variant:         variantName(l),
+					MatchType:       detectionType(l),
+					Confidence:      conf,
+					StartLine:       id.Tokens[startIndex+startOffset].Line,
+					EndLine:         id.Tokens[endIndex-endOffset-1].Line,
+					StartTokenIndex: startIndex + startOffset,
+					EndTokenIndex:   endIndex - endOffset - 1,
+				})
+			}
+		}
+	}
+	sort.Sort(candidates)
+	retain := make([]bool, len(candidates))
+	for i, c := range candidates {
+		// Filter out overlapping licenses based primarily on confidence. Since
+		// the candidates slice is ordered by confidence, we look for overlaps
+		// and decide whether to retain the record c.
+
+		// A candidate is only added to the report if no higher-quality hit
+		// contains its lines. When two licenses overlap, 'token density'
+		// breaks ties: if a less confident match of a larger license has more
+		// matching tokens than a perfect match of a smaller license, we keep
+		// the larger one. This handles licenses that include another license
+		// as a subtext; NPL contains MPL as a concrete example.
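+		// For instance (numbers here are illustrative, not measured), a
+		// 1000-token NPL candidate at confidence 0.95 carries 950
+		// token-weighted points and outranks a 200-token MPL candidate at a
+		// perfect 1.0, which carries only 200.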
+
+		keep := true
+		proposals := make(map[int]bool)
+		for j, o := range candidates {
+			if j == i {
+				break
+			}
+			// Only check containment against licenses that are still in
+			// consideration at this point.
+			if contains(c, o) && retain[j] {
+				// The license here can override a previous detection, but that
+				// isn't sufficient for it to be kept on its own. Consider the
+				// licenses Xnet, MPL-1.1, and NPL-1.1 in a file that contains just
+				// MPL-1.1. The confidence rating on NPL-1.1 will cause Xnet to not
+				// be retained, which is correct, but NPL-1.1 itself shouldn't be
+				// retained if the token confidence for MPL-1.1 is higher, since the
+				// NPL-specific bits are missing.
+
+				ctoks := float64(c.EndTokenIndex - c.StartTokenIndex)
+				otoks := float64(o.EndTokenIndex - o.StartTokenIndex)
+				cconf := ctoks * c.Confidence
+				oconf := otoks * o.Confidence
+
+				// If the two licenses have exactly the same token confidence, the
+				// detection is ambiguous; retain both so the caller can see and
+				// resolve the situation.
+				if cconf > oconf {
+					proposals[j] = false
+				} else if oconf > cconf {
+					keep = false
+				}
+			} else if overlaps(c, o) && retain[j] {
+				// If the matches only share a boundary line (the start of this
+				// match is the last line of the other), it's OK to keep both.
+				if c.StartLine != o.EndLine {
+					keep = false
+				}
+			}
+
+			if !keep {
+				break
+			}
+		}
+		if keep {
+			retain[i] = true
+			for p, v := range proposals {
+				retain[p] = v
+			}
+		}
+	}
+
+	var out Matches
+	for i, keep := range retain {
+		if keep {
+			out = append(out, candidates[i])
+		}
+	}
+	return Results{
+		Matches:         out,
+		TotalInputLines: id.Tokens[len(id.Tokens)-1].Line,
+	}, nil
+}
+
+// Classifier provides methods for identifying open source licenses in text
+// content.
+type Classifier struct {
+	tc        *TraceConfiguration
+	dict      *dictionary
+	docs      map[string]*indexedDocument
+	threshold float64
+	q         int // The value of q for q-grams in this corpus
+}
+
+// NewClassifier creates a classifier with an empty corpus.
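+//
+// A typical flow looks like the following sketch; the threshold value and
+// license directory here are illustrative, not values mandated by this
+// package:
+//
+//	c := NewClassifier(.8)
+//	if err := c.LoadLicenses("./licenses"); err != nil {
+//		// handle error
+//	}
+//	results := c.Match(contents)
+//	for _, m := range results.Matches {
+//		fmt.Printf("%s (%s/%s) %.2f lines %d-%d\n",
+//			m.Name, m.MatchType, m.Variant, m.Confidence, m.StartLine, m.EndLine)
+//	}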
+func NewClassifier(threshold float64) *Classifier {
+	classifier := &Classifier{
+		tc:        new(TraceConfiguration),
+		dict:      newDictionary(),
+		docs:      make(map[string]*indexedDocument),
+		threshold: threshold,
+		q:         computeQ(threshold),
+	}
+	return classifier
+}
+
+// Normalize takes input content and applies the following transforms to aid in
+// identifying license content. The return value of this function is
+// line-separated text which is the basis for position values returned by the
+// classifier.
+//
+// 1. Breaks up long lines of text. This helps with detecting licenses like in
+// TODO(wcn):URL reference
+//
+// 2. Certain ignorable texts are removed to aid matching blocks of text.
+// Introductory lines such as "The MIT License" are removed, as are copyright
+// notices, since the parties involved are variable and shouldn't impact
+// matching.
+//
+// It is NOT necessary to call this function to simply identify licenses in a
+// file. It should only be called when presenting this information to the user
+// in context (for example, when creating diffs against canonical licenses).
+//
+// It is an invariant of the classifier that calling Match(Normalize(in)) will
+// return the same results as Match(in).
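+//
+// For example:
+//
+//	normalized := c.Normalize(contents)
+//	// c.Match(normalized) reports the same matches as c.Match(contents).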
+func (c *Classifier) Normalize(in []byte) []byte {
+	doc, err := tokenizeStream(bytes.NewReader(in), false, c.dict, true)
+	if err != nil {
+		panic("should not be reachable, since bytes.NewReader().Read() should never fail")
+	}
+
+	var buf bytes.Buffer
+
+	switch len(doc.Tokens) {
+	case 0:
+		return nil
+	case 1:
+		buf.WriteString(c.dict.getWord(doc.Tokens[0].ID))
+		return buf.Bytes()
+	}
+
+	prevLine := 1
+	buf.WriteString(c.dict.getWord(doc.Tokens[0].ID))
+	for _, t := range doc.Tokens[1:] {
+		// Only write out an EOL if this token moved to the next line.
+		if t.Line == prevLine+1 {
+			buf.WriteString(eol)
+		}
+
+		// Only write out tokens that aren't themselves EOL markers.
+		txt := c.dict.getWord(t.ID)
+
+		if txt != eol {
+			// Only put a space between tokens if the previous token was on the
+			// same line. This prevents emitting spaces after an EOL.
+			if t.Line == prevLine {
+				buf.WriteString(" ")
+			}
+			buf.WriteString(txt)
+		}
+
+		prevLine = t.Line
+	}
+	return buf.Bytes()
+}
+
+// LoadLicenses adds the contents of the supplied directory to the corpus of the
+// classifier.
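+// Each license is expected to be a text file at a path of the form
+// {dir}/{category}/{name}/{variant}.txt; those path segments supply the
+// MatchType, Name, and Variant values reported on a Match. (This layout is
+// inferred from the path handling below.)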
+func (c *Classifier) LoadLicenses(dir string) error {
+	var files []string
+	err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
+		if err != nil {
+			return nil
+		}
+		if !strings.HasSuffix(path, ".txt") {
+			return nil
+		}
+		files = append(files, path)
+		return nil
+	})
+	if err != nil {
+		return err
+	}
+
+	for _, f := range files {
+		relativePath := strings.Replace(f, dir, "", 1)
+		sep := string(os.PathSeparator)
+		segments := strings.Split(relativePath, sep)
+		// When dir has no trailing separator, relativePath begins with one, so
+		// a well-formed {category}/{name}/{variant}.txt path yields at least
+		// four segments.
+		if len(segments) < 4 {
+			c.tc.trace("Insufficient segment count for path: %s", relativePath)
+			continue
+		}
+		category, name, variant := segments[1], segments[2], segments[3]
+		b, err := ioutil.ReadFile(f)
+		if err != nil {
+			return err
+		}
+
+		c.AddContent(category, name, variant, b)
+	}
+	return nil
+}
+
+// SetTraceConfiguration installs a tracing configuration for the classifier.
+func (c *Classifier) SetTraceConfiguration(in *TraceConfiguration) {
+	c.tc = in
+	c.tc.init()
+}
+
+// Match finds matches within an unknown text. This will not modify the contents
+// of the supplied byte slice.
+func (c *Classifier) Match(in []byte) Results {
+	// Since bytes.NewReader().Read() will never return an error, tokenizeStream
+	// will never return an error, so it's okay to ignore the return value in
+	// this case.
+	res, _ := c.MatchFrom(bytes.NewReader(in))
+	return res
+}
+
+// MatchFrom finds matches within the read content.
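+// It is useful when the content is streamed rather than held in memory, for
+// example (an illustrative sketch):
+//
+//	f, err := os.Open("some/input/file")
+//	if err != nil {
+//		// handle error
+//	}
+//	defer f.Close()
+//	res, err := c.MatchFrom(f)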
+func (c *Classifier) MatchFrom(in io.Reader) (Results, error) {
+	return c.match(in)
+}
+
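+// detectionType returns the detection category of a corpus entry, which is
+// the first segment of its internal {category}/{name}/{variant} key.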
+func detectionType(in string) string {
+	splits := strings.Split(in, string(os.PathSeparator))
+	return splits[0]
+}
+
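+// variantName returns the variant segment of a corpus entry's internal
+// {category}/{name}/{variant} key.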
+func variantName(in string) string {
+	splits := strings.Split(in, string(os.PathSeparator))
+	return splits[2]
+}
+
+// LicenseName produces the output name for a license, stripping the category
+// and variant segments of the internal key in use.
+func LicenseName(in string) string {
+	splits := strings.Split(in, string(os.PathSeparator))
+	return splits[1]
+}
+
+// contains returns true iff the lines covered by b lie completely within the
+// lines covered by a.
+func contains(a, b *Match) bool {
+	return a.StartLine <= b.StartLine && a.EndLine >= b.EndLine
+}
+
+// between returns true iff b <= a <= c.
+func between(a, b, c int) bool {
+	return b <= a && a <= c
+}
+
+// overlaps returns true iff the line ranges covered by a and b overlap.
+func overlaps(a, b *Match) bool {
+	return between(a.StartLine, b.StartLine, b.EndLine) || between(a.EndLine, b.StartLine, b.EndLine)
+}