diff options
author | Bill Neubauer <wcn@google.com> | 2022-06-28 10:23:57 -0700 |
---|---|---|
committer | Bill Neubauer <bill.neubauer@gmail.com> | 2022-09-16 10:06:11 -0700 |
commit | ebfb5e6792acd76483c177dcdb6853b4f5b868c6 (patch) | |
tree | 40e83ddb64b28fefeb91cecbd8c3b77cc195e43d | |
parent | cb39e2ce0ae3978c8384a16d4ba4de007583e7fe (diff) | |
download | licenseclassifier-ebfb5e6792acd76483c177dcdb6853b4f5b868c6.tar.gz |
Removing the Index field from the token structures.
It's completely redundant since it's the position of the token in the slice. I
thought there would be a use for it, but it never materialized.
PiperOrigin-RevId: 457764131
-rw-r--r-- | v2/classifier.go | 4 | ||||
-rw-r--r-- | v2/classifier_test.go | 2 | ||||
-rw-r--r-- | v2/document.go | 11 | ||||
-rw-r--r-- | v2/searchset_test.go | 4 | ||||
-rw-r--r-- | v2/tokenizer.go | 6 | ||||
-rw-r--r-- | v2/tokenizer_test.go | 50 |
6 files changed, 31 insertions, 46 deletions
```diff
diff --git a/v2/classifier.go b/v2/classifier.go
index cc54d2e..b2f4d76 100644
--- a/v2/classifier.go
+++ b/v2/classifier.go
@@ -108,8 +108,8 @@ func (c *Classifier) match(in []byte) Results {
 				Confidence:      conf,
 				StartLine:       id.Tokens[startIndex+startOffset].Line,
 				EndLine:         id.Tokens[endIndex-endOffset-1].Line,
-				StartTokenIndex: id.Tokens[startIndex+startOffset].Index,
-				EndTokenIndex:   id.Tokens[endIndex-endOffset-1].Index,
+				StartTokenIndex: startIndex + startOffset,
+				EndTokenIndex:   endIndex - endOffset - 1,
 			})
 		}
 
diff --git a/v2/classifier_test.go b/v2/classifier_test.go
index e5e1471..7baf54d 100644
--- a/v2/classifier_test.go
+++ b/v2/classifier_test.go
@@ -20,6 +20,7 @@ import (
 	"io/ioutil"
 	"log"
 	"os"
+	"path"
 	"path/filepath"
 	"sort"
 	"strings"
@@ -28,7 +29,6 @@ import (
 
 	"github.com/davecgh/go-spew/spew"
 	"github.com/google/go-cmp/cmp"
-	"path"
 )
 
 type scenario struct {
diff --git a/v2/document.go b/v2/document.go
index fac5e65..73ccaab 100644
--- a/v2/document.go
+++ b/v2/document.go
@@ -26,7 +26,6 @@ type tokenID int // type to ensure safety when manipulating token identifiers.
 // token provides detailed information about a single textual token in the document.
 type token struct {
 	Text     string // normalized text of the token
-	Index    int    // the token's location in the tokenized document
 	Line     int    // line position of this token in the source
 	Previous string // for the first token in a line, any previous text.
 }
@@ -38,9 +37,8 @@ type document struct {
 }
 
 type indexedToken struct {
-	Index int     // the token's location in the tokenized document
-	Line  int     // line position of this token in the source
-	ID    tokenID // identifier of the text in the dictionary
+	Line int     // line position of this token in the source
+	ID   tokenID // identifier of the text in the dictionary
 }
 
 type indexedDocument struct {
@@ -138,9 +136,8 @@ func (c *Classifier) generateIndexedDocument(d *document, addWords bool) *indexe
 		}
 
 		id.Tokens = append(id.Tokens, indexedToken{
-			Index: t.Index,
-			Line:  t.Line,
-			ID:    tokID,
+			Line: t.Line,
+			ID:   tokID,
 		})
 	}
 
diff --git a/v2/searchset_test.go b/v2/searchset_test.go
index e4db813..ccaa3c3 100644
--- a/v2/searchset_test.go
+++ b/v2/searchset_test.go
@@ -75,8 +75,8 @@ func TestSearchSet_New(t *testing.T) {
 			q: 4,
 			want: &searchSet{
 				Tokens: []indexedToken{
-					{Index: 0, Line: 1, ID: 1},
-					{Index: 1, Line: 1, ID: 2},
+					{Line: 1, ID: 1},
+					{Line: 1, ID: 2},
 				},
 				Hashes:    hash{1957950203: tokenRanges{&tokenRange{Start: 0, End: 2}}},
 				Checksums: []uint32{1957950203},
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
index 2ab19ef..875cc7e 100644
--- a/v2/tokenizer.go
+++ b/v2/tokenizer.go
@@ -186,7 +186,7 @@ func cleanupTokens(in []*token, removeEol bool) []*token {
 			// If we are reconstructing a hyphenated word, don't append the EOL
 			// now, do it when the word is reconstructed.
 			if partialWord == "" {
-				out = append(out, &token{Text: eol, Line: tok.Line, Index: tokIdx})
+				out = append(out, &token{Text: eol, Line: tok.Line})
 				tokIdx++
 			}
 			continue
@@ -201,20 +201,18 @@ func cleanupTokens(in []*token, removeEol bool) []*token {
 			// Repair hyphenated words
 			tp := in[i-1]
 			tp.Text = partialWord + t
-			tp.Index = tokIdx
 			tp.Previous = ""
 			out = append(out, tp)
 			tokIdx++
 			if !removeEol {
 				// Append the EOL now that the whole word is recovered
-				out = append(out, &token{Text: eol, Line: tp.Line, Index: tokIdx})
+				out = append(out, &token{Text: eol, Line: tp.Line})
 				tokIdx++
 			}
 
 			partialWord = ""
 		} else {
 			tok.Text = t
-			tok.Index = tokIdx
 			tok.Previous = ""
 			out = append(out, tok)
 			tokIdx++
diff --git a/v2/tokenizer_test.go b/v2/tokenizer_test.go
index fec5b96..662685c 100644
--- a/v2/tokenizer_test.go
+++ b/v2/tokenizer_test.go
@@ -83,54 +83,44 @@ The AWESOME Project`,
 			output: &document{
 				Tokens: []*token{
 					{
-						Text:  "the",
-						Index: 0,
-						Line:  1,
+						Text: "the",
+						Line: 1,
 					},
 					{
-						Text:  "awesome",
-						Index: 1,
-						Line:  1,
+						Text: "awesome",
+						Line: 1,
 					},
 					{
-						Text:  "project",
-						Index: 2,
-						Line:  1,
+						Text: "project",
+						Line: 1,
 					},
 					{
-						Text:  "license",
-						Index: 3,
-						Line:  1,
+						Text: "license",
+						Line: 1,
 					},
 					{
-						Text:  "modifications",
-						Index: 4,
-						Line:  3,
+						Text: "modifications",
+						Line: 3,
 					},
 					{
-						Text:  "prohibited",
-						Index: 5,
-						Line:  4,
+						Text: "prohibited",
+						Line: 4,
 					},
 					{
-						Text:  "introduction",
-						Index: 6,
-						Line:  8,
+						Text: "introduction",
+						Line: 8,
 					},
 					{
-						Text:  "the",
-						Index: 7,
-						Line:  10,
+						Text: "the",
+						Line: 10,
 					},
 					{
-						Text:  "awesome",
-						Index: 8,
-						Line:  10,
+						Text: "awesome",
+						Line: 10,
 					},
 					{
-						Text:  "project",
-						Index: 9,
-						Line:  10,
+						Text: "project",
+						Line: 10,
 					},
 				},
 				Matches: Matches{&Match{Name: "Copyright", Confidence: 1.0, MatchType: "Copyright", StartLine: 6, EndLine: 6}},
```