path: root/v2/document.go
// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package classifier provides the implementation of the v2 license classifier.
package classifier

import (
	"bytes"
	"fmt"
	"os"
	"strings"
)

type tokenID int // type to ensure safety when manipulating token identifiers.

// token provides detailed information about a single textual token in the document.
type token struct {
	Text     string // normalized text of the token
	Line     int    // line position of this token in the source
	Previous string // for the first token in a line, any previous text.
}

type indexedToken struct {
	Line int     // line position of this token in the source
	ID   tokenID // identifier of the text in the dictionary
}

type indexedDocument struct {
	Norm    string          // The normalized token sequence
	Tokens  []indexedToken  // ordered tokens of the document
	Matches Matches         // these are matches identified while processing the original, untokenized text via regexp matching
	f       *frequencyTable // frequencies computed for this document
	dict    *dictionary     // The corpus dictionary for this document
	s       *searchSet      // The searchset for this document
	runes   []rune
}

func (d *indexedDocument) generateSearchSet(q int) {
	d.s = newSearchSet(d, q)
}

func (d *indexedDocument) size() int {
	return len(d.Tokens)
}

// normalized returns a string of the document's normalized tokens joined by
// single spaces. This is used by the diff algorithm.
// TODO: it'd be more efficient to have the diff algorithm work with the raw tokens directly
// and avoid these ephemeral allocations.
func (d *indexedDocument) normalized() string {
	var w strings.Builder
	for i, t := range d.Tokens {
		w.WriteString(d.dict.getWord(t.ID))
		if (i + 1) != d.size() {
			w.WriteString(" ")
		}
	}
	return w.String()
}
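// For example (an illustrative sketch, not from the original source): a
// two-token document whose IDs map back to the words "permission" and
// "granted" normalizes to the string "permission granted".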

func computeQ(threshold float64) int {
	// q is the lower bound for token runs (q-grams) that must exist
	// in content that can be recognized at the specified threshold.
	// Imagine a document with 100 tokens and a threshold of 80%. In the
	// worst case, the 20 errors are evenly distributed, creating the
	// shortest possible runs of good tokens: a repeating sequence of 4 good
	// tokens followed by 1 errored token, occurring 20 times. This function
	// returns that minimum run length, clamped to a value of at least 1
	// (a threshold below 50% would otherwise produce a run of length 0,
	// which is meaningless).
	if threshold == 1.0 {
		return 10 // avoid divide by 0
	}

	return max(1, int((threshold)/(1.0-threshold)))
}
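// Worked examples (an illustrative sketch derived from the arithmetic above,
// not from the original source):
//
//	computeQ(0.8) // int(0.8 / 0.2) = 4: at 20% error, runs of 4 good tokens survive
//	computeQ(0.9) // int(0.9 / 0.1) = 9
//	computeQ(0.4) // int(0.4 / 0.6) = 0, clamped to 1
//	computeQ(1.0) // special-cased to 10 to avoid dividing by zero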

func max(a, b int) int {
	if a > b {
		return a
	}
	return b
}

// AddContent incorporates the provided textual content into the classifier for
// matching. This will not modify the supplied content.
func (c *Classifier) AddContent(category, name, variant string, content []byte) {
	// Since bytes.NewReader().Read() will never return an error, tokenizeStream
	// will never return an error either, so it's okay to ignore the returned
	// error in this case.
	doc, _ := tokenizeStream(bytes.NewReader(content), true, c.dict, true)
	c.addDocument(category, name, variant, doc)
}
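// Illustrative usage sketch. It assumes the package-level NewClassifier
// constructor; the threshold, category/name/variant strings, and license text
// are hypothetical examples, not taken from this file:
//
//	c := NewClassifier(0.8)
//	c.AddContent("License", "MIT", "license",
//		[]byte("Permission is hereby granted, free of charge, ..."))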

// addDocument takes a textual document and incorporates it into the classifier for matching.
func (c *Classifier) addDocument(category, name, variant string, id *indexedDocument) {
	// For documents that are part of the corpus, we add them to the dictionary and
	// compute their associated search data eagerly so they are ready for matching against
	// candidates.
	indexName := c.generateDocName(category, name, variant)
	id.generateSearchSet(c.q)
	id.s.origin = indexName
	c.docs[indexName] = id
}

// createTargetIndexedDocument creates an indexed document without adding the
// words to the classifier dictionary. This should be used for matching targets, not
// populating the corpus.
func (c *Classifier) createTargetIndexedDocument(in []byte) *indexedDocument {
	doc, _ := tokenizeStream(bytes.NewReader(in), true, c.dict, false)
	return doc
}
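// Continuing the sketch above (the target text is hypothetical): a matching
// target is indexed against the existing dictionary without adding its words
// to the corpus.
//
//	target := c.createTargetIndexedDocument([]byte("text of a candidate license"))
//	_ = target.size() // number of tokens in the target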

// generateDocName produces the key under which a corpus document is stored,
// joining the category, name, and variant with the OS path separator.
func (c *Classifier) generateDocName(category, name, variant string) string {
	return fmt.Sprintf("%s%c%s%c%s", category, os.PathSeparator, name, os.PathSeparator, variant)
}

// getIndexedDocument returns the previously added document for the given
// category, name, and variant, or nil if no such document has been added.
func (c *Classifier) getIndexedDocument(category, name, variant string) *indexedDocument {
	return c.docs[c.generateDocName(category, name, variant)]
}
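// For example (illustrative): on a system where os.PathSeparator is '/',
// generateDocName("License", "MIT", "license") returns "License/MIT/license",
// the key used to store and look up the document in c.docs.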

// dictionary is used to intern all the token words encountered in the text corpus.
// The words and indices maps are inverses of each other; the type is just a
// convenience wrapper over the pair of correlated maps.
type dictionary struct {
	words   map[tokenID]string
	indices map[string]tokenID
}

func newDictionary() *dictionary {
	return &dictionary{
		words:   make(map[tokenID]string),
		indices: make(map[string]tokenID),
	}
}

// add inserts the provided word into the dictionary if it does not already
// exist, and returns the word's token ID.
func (d *dictionary) add(word string) tokenID {
	if idx := d.getIndex(word); idx != unknownIndex {
		return idx
	}
	// token IDs start from 1, 0 is reserved for the invalid ID
	idx := tokenID(len(d.words) + 1)
	d.words[idx] = word
	d.indices[word] = idx
	return idx
}

var unknownWord = "UNKNOWN"
var unknownIndex = tokenID(0)

// getIndex returns the index of the supplied word, or unknownIndex (0) if the
// word is not in the dictionary.
func (d *dictionary) getIndex(word string) tokenID {
	if idx, found := d.indices[word]; found {
		return idx
	}
	return unknownIndex
}

// getWord returns the word associated with the index, or unknownWord if the
// index is not in the dictionary.
func (d *dictionary) getWord(index tokenID) string {
	if word, found := d.words[index]; found {
		return word
	}
	return unknownWord
}
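
// Illustrative sketch of the interning behavior above (the words are
// hypothetical examples):
//
//	d := newDictionary()
//	id := d.add("permission")   // assigns a new tokenID, starting at 1
//	_ = d.add("permission")     // returns the same tokenID; nothing is duplicated
//	_ = d.getWord(id)           // "permission"
//	_ = d.getIndex("missing")   // unknownIndex (0)
//	_ = d.getWord(unknownIndex) // unknownWord ("UNKNOWN")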