author    Bill Neubauer <wcn@google.com>    2022-06-28 10:23:57 -0700
committer Bill Neubauer <bill.neubauer@gmail.com>    2022-09-16 10:06:11 -0700
commit    ebfb5e6792acd76483c177dcdb6853b4f5b868c6
tree      40e83ddb64b28fefeb91cecbd8c3b77cc195e43d
parent    cb39e2ce0ae3978c8384a16d4ba4de007583e7fe
Removing the Index field from the token structures.

It's completely redundant: the index is just the token's position in the slice that holds it. I thought there would be a use for it, but it never materialized.

PiperOrigin-RevId: 457764131
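As a minimal standalone sketch of why the field is redundant (illustrative only, not code from this repository): the position is already carried by the slice itself, so the index variable of a range loop recovers exactly what Index used to store.

package main

import "fmt"

// token mirrors the trimmed struct in v2/document.go after this change:
// there is no Index field, because a token's index is its slice position.
type token struct {
	Text string // normalized text of the token
	Line int    // line position of this token in the source
}

func main() {
	tokens := []token{
		{Text: "the", Line: 1},
		{Text: "awesome", Line: 1},
		{Text: "project", Line: 1},
	}
	// The range index recovers what the removed field stored.
	for i, t := range tokens {
		fmt.Printf("index=%d text=%q line=%d\n", i, t.Text, t.Line)
	}
}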
-rw-r--r--  v2/classifier.go       4
-rw-r--r--  v2/classifier_test.go  2
-rw-r--r--  v2/document.go        11
-rw-r--r--  v2/searchset_test.go   4
-rw-r--r--  v2/tokenizer.go        6
-rw-r--r--  v2/tokenizer_test.go  50
6 files changed, 31 insertions(+), 46 deletions(-)
diff --git a/v2/classifier.go b/v2/classifier.go
index cc54d2e..b2f4d76 100644
--- a/v2/classifier.go
+++ b/v2/classifier.go
@@ -108,8 +108,8 @@ func (c *Classifier) match(in []byte) Results {
 			Confidence:      conf,
 			StartLine:       id.Tokens[startIndex+startOffset].Line,
 			EndLine:         id.Tokens[endIndex-endOffset-1].Line,
-			StartTokenIndex: id.Tokens[startIndex+startOffset].Index,
-			EndTokenIndex:   id.Tokens[endIndex-endOffset-1].Index,
+			StartTokenIndex: startIndex + startOffset,
+			EndTokenIndex:   endIndex - endOffset - 1,
 		})
 	}
diff --git a/v2/classifier_test.go b/v2/classifier_test.go
index e5e1471..7baf54d 100644
--- a/v2/classifier_test.go
+++ b/v2/classifier_test.go
@@ -20,6 +20,7 @@ import (
 	"io/ioutil"
 	"log"
 	"os"
+	"path"
 	"path/filepath"
 	"sort"
 	"strings"
@@ -28,7 +29,6 @@ import (
 
 	"github.com/davecgh/go-spew/spew"
 	"github.com/google/go-cmp/cmp"
-	"path"
 )
 
 type scenario struct {
diff --git a/v2/document.go b/v2/document.go
index fac5e65..73ccaab 100644
--- a/v2/document.go
+++ b/v2/document.go
@@ -26,7 +26,6 @@ type tokenID int // type to ensure safety when manipulating token identifiers.
 // token provides detailed information about a single textual token in the document.
 type token struct {
 	Text     string // normalized text of the token
-	Index    int    // the token's location in the tokenized document
 	Line     int    // line position of this token in the source
 	Previous string // for the first token in a line, any previous text.
 }
@@ -38,9 +37,8 @@ type document struct {
 }
 
 type indexedToken struct {
-	Index int     // the token's location in the tokenized document
-	Line  int     // line position of this token in the source
-	ID    tokenID // identifier of the text in the dictionary
+	Line int     // line position of this token in the source
+	ID   tokenID // identifier of the text in the dictionary
 }
 
 type indexedDocument struct {
@@ -138,9 +136,8 @@ func (c *Classifier) generateIndexedDocument(d *document, addWords bool) *indexe
 		}
 
 		id.Tokens = append(id.Tokens, indexedToken{
-			Index: t.Index,
-			Line:  t.Line,
-			ID:    tokID,
+			Line: t.Line,
+			ID:   tokID,
 		})
 	}
diff --git a/v2/searchset_test.go b/v2/searchset_test.go
index e4db813..ccaa3c3 100644
--- a/v2/searchset_test.go
+++ b/v2/searchset_test.go
@@ -75,8 +75,8 @@ func TestSearchSet_New(t *testing.T) {
 			q: 4,
 			want: &searchSet{
 				Tokens: []indexedToken{
-					{Index: 0, Line: 1, ID: 1},
-					{Index: 1, Line: 1, ID: 2},
+					{Line: 1, ID: 1},
+					{Line: 1, ID: 2},
 				},
 				Hashes:    hash{1957950203: tokenRanges{&tokenRange{Start: 0, End: 2}}},
 				Checksums: []uint32{1957950203},
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
index 2ab19ef..875cc7e 100644
--- a/v2/tokenizer.go
+++ b/v2/tokenizer.go
@@ -186,7 +186,7 @@ func cleanupTokens(in []*token, removeEol bool) []*token {
 			// If we are reconstructing a hyphenated word, don't append the EOL
 			// now, do it when the word is reconstructed.
 			if partialWord == "" {
-				out = append(out, &token{Text: eol, Line: tok.Line, Index: tokIdx})
+				out = append(out, &token{Text: eol, Line: tok.Line})
 				tokIdx++
 			}
 			continue
@@ -201,20 +201,18 @@ func cleanupTokens(in []*token, removeEol bool) []*token {
 			// Repair hyphenated words
 			tp := in[i-1]
 			tp.Text = partialWord + t
-			tp.Index = tokIdx
 			tp.Previous = ""
 			out = append(out, tp)
 			tokIdx++
 			if !removeEol {
 				// Append the EOL now that the whole word is recovered
-				out = append(out, &token{Text: eol, Line: tp.Line, Index: tokIdx})
+				out = append(out, &token{Text: eol, Line: tp.Line})
 				tokIdx++
 			}
 			partialWord = ""
 		} else {
 			tok.Text = t
-			tok.Index = tokIdx
 			tok.Previous = ""
 			out = append(out, tok)
 			tokIdx++
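For readers skimming the diff, the surrounding logic stitches words that were hyphenated across a line break back together. A hypothetical, heavily simplified standalone sketch of that idea (not the repository's implementation, which also tracks Previous text and optionally keeps EOL tokens):

package main

import (
	"fmt"
	"strings"
)

// repairHyphenation joins a word split as "copy-" / EOL / "right"
// back into "copyright". EOL markers are dropped for simplicity.
func repairHyphenation(words []string) []string {
	var out []string
	partial := ""
	for _, w := range words {
		if w == "\n" { // end-of-line marker
			continue
		}
		if strings.HasSuffix(w, "-") {
			partial += strings.TrimSuffix(w, "-")
			continue
		}
		out = append(out, partial+w)
		partial = ""
	}
	return out
}

func main() {
	fmt.Println(repairHyphenation([]string{"copy-", "\n", "right", "reserved"}))
	// Prints: [copyright reserved]
}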
diff --git a/v2/tokenizer_test.go b/v2/tokenizer_test.go
index fec5b96..662685c 100644
--- a/v2/tokenizer_test.go
+++ b/v2/tokenizer_test.go
@@ -83,54 +83,44 @@ The AWESOME Project`,
 			output: &document{
 				Tokens: []*token{
 					{
-						Text:  "the",
-						Index: 0,
-						Line:  1,
+						Text: "the",
+						Line: 1,
 					},
 					{
-						Text:  "awesome",
-						Index: 1,
-						Line:  1,
+						Text: "awesome",
+						Line: 1,
 					},
 					{
-						Text:  "project",
-						Index: 2,
-						Line:  1,
+						Text: "project",
+						Line: 1,
 					},
 					{
-						Text:  "license",
-						Index: 3,
-						Line:  1,
+						Text: "license",
+						Line: 1,
 					},
 					{
-						Text:  "modifications",
-						Index: 4,
-						Line:  3,
+						Text: "modifications",
+						Line: 3,
 					},
 					{
-						Text:  "prohibited",
-						Index: 5,
-						Line:  4,
+						Text: "prohibited",
+						Line: 4,
 					},
 					{
-						Text:  "introduction",
-						Index: 6,
-						Line:  8,
+						Text: "introduction",
+						Line: 8,
 					},
 					{
-						Text:  "the",
-						Index: 7,
-						Line:  10,
+						Text: "the",
+						Line: 10,
 					},
 					{
-						Text:  "awesome",
-						Index: 8,
-						Line:  10,
+						Text: "awesome",
+						Line: 10,
 					},
 					{
-						Text:  "project",
-						Index: 9,
-						Line:  10,
+						Text: "project",
+						Line: 10,
 					},
 				},
 				Matches: Matches{&Match{Name: "Copyright", Confidence: 1.0, MatchType: "Copyright", StartLine: 6, EndLine: 6}},