author    Bill Neubauer <wcn@google.com>    2022-06-28 10:23:57 -0700
committer Bill Neubauer <bill.neubauer@gmail.com>    2022-09-16 10:06:11 -0700
commit    ebfb5e6792acd76483c177dcdb6853b4f5b868c6
tree      40e83ddb64b28fefeb91cecbd8c3b77cc195e43d
parent    cb39e2ce0ae3978c8384a16d4ba4de007583e7fe
Removing the Index field from the token structures.

It's completely redundant: the index is just the token's position in the slice that holds it. I thought there would be a use for it, but it never materialized.

PiperOrigin-RevId: 457764131
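As a minimal standalone sketch of why the field is redundant (illustrative only, not code from this repository): the position is already carried by the slice itself, so the index variable of a range loop recovers exactly what Index used to store.

package main

import "fmt"

// token mirrors the trimmed struct in v2/document.go after this change:
// there is no Index field, because a token's index is its slice position.
type token struct {
	Text string // normalized text of the token
	Line int    // line position of this token in the source
}

func main() {
	tokens := []token{
		{Text: "the", Line: 1},
		{Text: "awesome", Line: 1},
		{Text: "project", Line: 1},
	}
	// The range index recovers what the removed field stored.
	for i, t := range tokens {
		fmt.Printf("index=%d text=%q line=%d\n", i, t.Text, t.Line)
	}
}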
-rw-r--r--  v2/classifier.go       4
-rw-r--r--  v2/classifier_test.go  2
-rw-r--r--  v2/document.go        11
-rw-r--r--  v2/searchset_test.go   4
-rw-r--r--  v2/tokenizer.go        6
-rw-r--r--  v2/tokenizer_test.go  50
6 files changed, 31 insertions(+), 46 deletions(-)
diff --git a/v2/classifier.go b/v2/classifier.go
index cc54d2e..b2f4d76 100644
--- a/v2/classifier.go
+++ b/v2/classifier.go
@@ -108,8 +108,8 @@ func (c *Classifier) match(in []byte) Results {
 			Confidence:      conf,
 			StartLine:       id.Tokens[startIndex+startOffset].Line,
 			EndLine:         id.Tokens[endIndex-endOffset-1].Line,
-			StartTokenIndex: id.Tokens[startIndex+startOffset].Index,
-			EndTokenIndex:   id.Tokens[endIndex-endOffset-1].Index,
+			StartTokenIndex: startIndex + startOffset,
+			EndTokenIndex:   endIndex - endOffset - 1,
 		})
 	}
diff --git a/v2/classifier_test.go b/v2/classifier_test.go
index e5e1471..7baf54d 100644
--- a/v2/classifier_test.go
+++ b/v2/classifier_test.go
@@ -20,6 +20,7 @@ import (
 	"io/ioutil"
 	"log"
 	"os"
+	"path"
 	"path/filepath"
 	"sort"
 	"strings"
@@ -28,7 +29,6 @@ import (
 
 	"github.com/davecgh/go-spew/spew"
 	"github.com/google/go-cmp/cmp"
-	"path"
 )
 
 type scenario struct {
diff --git a/v2/document.go b/v2/document.go
index fac5e65..73ccaab 100644
--- a/v2/document.go
+++ b/v2/document.go
@@ -26,7 +26,6 @@ type tokenID int // type to ensure safety when manipulating token identifiers.
 // token provides detailed information about a single textual token in the document.
 type token struct {
 	Text     string // normalized text of the token
-	Index    int    // the token's location in the tokenized document
 	Line     int    // line position of this token in the source
 	Previous string // for the first token in a line, any previous text.
 }
@@ -38,9 +37,8 @@ type document struct {
 }
 
 type indexedToken struct {
-	Index int     // the token's location in the tokenized document
-	Line  int     // line position of this token in the source
-	ID    tokenID // identifier of the text in the dictionary
+	Line int     // line position of this token in the source
+	ID   tokenID // identifier of the text in the dictionary
 }
 
 type indexedDocument struct {
@@ -138,9 +136,8 @@ func (c *Classifier) generateIndexedDocument(d *document, addWords bool) *indexe
 		}
 
 		id.Tokens = append(id.Tokens, indexedToken{
-			Index: t.Index,
-			Line:  t.Line,
-			ID:    tokID,
+			Line: t.Line,
+			ID:   tokID,
 		})
 	}
diff --git a/v2/searchset_test.go b/v2/searchset_test.go
index e4db813..ccaa3c3 100644
--- a/v2/searchset_test.go
+++ b/v2/searchset_test.go
@@ -75,8 +75,8 @@ func TestSearchSet_New(t *testing.T) {
 			q: 4,
 			want: &searchSet{
 				Tokens: []indexedToken{
-					{Index: 0, Line: 1, ID: 1},
-					{Index: 1, Line: 1, ID: 2},
+					{Line: 1, ID: 1},
+					{Line: 1, ID: 2},
 				},
 				Hashes:    hash{1957950203: tokenRanges{&tokenRange{Start: 0, End: 2}}},
 				Checksums: []uint32{1957950203},
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
index 2ab19ef..875cc7e 100644
--- a/v2/tokenizer.go
+++ b/v2/tokenizer.go
@@ -186,7 +186,7 @@ func cleanupTokens(in []*token, removeEol bool) []*token {
 			// If we are reconstructing a hyphenated word, don't append the EOL
 			// now, do it when the word is reconstructed.
 			if partialWord == "" {
-				out = append(out, &token{Text: eol, Line: tok.Line, Index: tokIdx})
+				out = append(out, &token{Text: eol, Line: tok.Line})
 				tokIdx++
 			}
 			continue
@@ -201,20 +201,18 @@ func cleanupTokens(in []*token, removeEol bool) []*token {
 			// Repair hyphenated words
 			tp := in[i-1]
 			tp.Text = partialWord + t
-			tp.Index = tokIdx
 			tp.Previous = ""
 			out = append(out, tp)
 			tokIdx++
 			if !removeEol {
 				// Append the EOL now that the whole word is recovered
-				out = append(out, &token{Text: eol, Line: tp.Line, Index: tokIdx})
+				out = append(out, &token{Text: eol, Line: tp.Line})
 				tokIdx++
 			}
 			partialWord = ""
 		} else {
 			tok.Text = t
-			tok.Index = tokIdx
 			tok.Previous = ""
 			out = append(out, tok)
 			tokIdx++
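For readers skimming the diff, the surrounding logic stitches words that were hyphenated across a line break back together. A hypothetical, heavily simplified standalone sketch of that idea (not the repository's implementation, which also tracks Previous text and optionally keeps EOL tokens):

package main

import (
	"fmt"
	"strings"
)

// repairHyphenation joins a word split as "copy-" / EOL / "right"
// back into "copyright". EOL markers are dropped for simplicity.
func repairHyphenation(words []string) []string {
	var out []string
	partial := ""
	for _, w := range words {
		if w == "\n" { // end-of-line marker
			continue
		}
		if strings.HasSuffix(w, "-") {
			partial += strings.TrimSuffix(w, "-")
			continue
		}
		out = append(out, partial+w)
		partial = ""
	}
	return out
}

func main() {
	fmt.Println(repairHyphenation([]string{"copy-", "\n", "right", "reserved"}))
	// Prints: [copyright reserved]
}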
diff --git a/v2/tokenizer_test.go b/v2/tokenizer_test.go
index fec5b96..662685c 100644
--- a/v2/tokenizer_test.go
+++ b/v2/tokenizer_test.go
@@ -83,54 +83,44 @@ The AWESOME Project`,
 			output: &document{
 				Tokens: []*token{
 					{
-						Text:  "the",
-						Index: 0,
-						Line:  1,
+						Text: "the",
+						Line: 1,
 					},
 					{
-						Text:  "awesome",
-						Index: 1,
-						Line:  1,
+						Text: "awesome",
+						Line: 1,
 					},
 					{
-						Text:  "project",
-						Index: 2,
-						Line:  1,
+						Text: "project",
+						Line: 1,
 					},
 					{
-						Text:  "license",
-						Index: 3,
-						Line:  1,
+						Text: "license",
+						Line: 1,
 					},
 					{
-						Text:  "modifications",
-						Index: 4,
-						Line:  3,
+						Text: "modifications",
+						Line: 3,
 					},
 					{
-						Text:  "prohibited",
-						Index: 5,
-						Line:  4,
+						Text: "prohibited",
+						Line: 4,
 					},
 					{
-						Text:  "introduction",
-						Index: 6,
-						Line:  8,
+						Text: "introduction",
+						Line: 8,
 					},
 					{
-						Text:  "the",
-						Index: 7,
-						Line:  10,
+						Text: "the",
+						Line: 10,
 					},
 					{
-						Text:  "awesome",
-						Index: 8,
-						Line:  10,
+						Text: "awesome",
+						Line: 10,
 					},
 					{
-						Text:  "project",
-						Index: 9,
-						Line:  10,
+						Text: "project",
+						Line: 10,
 					},
 				},
 				Matches: Matches{&Match{Name: "Copyright", Confidence: 1.0, MatchType: "Copyright", StartLine: 6, EndLine: 6}},