author     Bill Neubauer <wcn@google.com>           2022-08-22 11:54:55 -0700
committer  Bill Neubauer <bill.neubauer@gmail.com>  2022-09-16 10:06:11 -0700
commit     9af65096d7ceb845f034197c5f6b762894c680eb (patch)
tree       b64f8d5894a9b05cf285fda107a7ce487efd695e
parent     bbfad6347cd1f2f7e28fb20144f64d60c700181b (diff)
Make the public-facing API be implemented in terms of io.Reader rather than
[]byte. This allows larger files or other inputs to be handled without
requiring the full contents to be stored in memory at one time.

PiperOrigin-RevId: 474376181
-rw-r--r--  v2/classifier.go     | 23
-rw-r--r--  v2/document.go       |  9
-rw-r--r--  v2/tokenizer.go      |  9
-rw-r--r--  v2/tokenizer_test.go | 11
4 files changed, 29 insertions, 23 deletions
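
For illustration, a minimal sketch of driving the new streaming entry point. The AddContent and MatchFrom signatures are taken from the diff below; the NewClassifier threshold and the file paths are assumptions made up for the example.

package main

import (
	"fmt"
	"log"
	"os"

	classifier "github.com/google/licenseclassifier/v2"
)

func main() {
	// Assumed setup: the 0.8 confidence threshold is illustrative.
	c := classifier.NewClassifier(0.8)

	// Corpus entries are still supplied as []byte; the path is hypothetical.
	lic, err := os.ReadFile("licenses/MIT.txt")
	if err != nil {
		log.Fatal(err)
	}
	c.AddContent("License", "MIT", "", lic)

	// The new io.Reader path streams the target through the tokenizer,
	// so the file's full contents never need to sit in memory at once.
	f, err := os.Open("COPYING") // hypothetical target file
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	res, err := c.MatchFrom(f)
	if err != nil {
		log.Fatalf("classification failed: %v", err)
	}
	fmt.Printf("%d matches in %d input lines\n", len(res.Matches), res.TotalInputLines)
}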
diff --git a/v2/classifier.go b/v2/classifier.go
index f163030..4bb3e5c 100644
--- a/v2/classifier.go
+++ b/v2/classifier.go
@@ -65,8 +65,11 @@ func (d Matches) Less(i, j int) bool {
}
// Match reports instances of the supplied content in the corpus.
-func (c *Classifier) match(in []byte) Results {
- id := c.createTargetIndexedDocument(in)
+func (c *Classifier) match(in io.Reader) (Results, error) {
+ id, err := tokenizeStream(in, true, c.dict, false)
+ if err != nil {
+ return Results{}, err
+ }
firstPass := make(map[string]*indexedDocument)
for l, d := range c.docs {
@@ -85,7 +88,7 @@ func (c *Classifier) match(in []byte) Results {
return Results{
Matches: nil,
TotalInputLines: 0,
- }
+ }, nil
}
// Perform the expensive work of generating a searchset to look for token runs.
@@ -185,7 +188,7 @@ func (c *Classifier) match(in []byte) Results {
return Results{
Matches: out,
TotalInputLines: id.Tokens[len(id.Tokens)-1].Line,
- }
+ }, nil
}
// Classifier provides methods for identifying open source licenses in text
@@ -316,16 +319,16 @@ func (c *Classifier) SetTraceConfiguration(in *TraceConfiguration) {
// Match finds matches within an unknown text. This will not modify the contents
// of the supplied byte slice.
func (c *Classifier) Match(in []byte) Results {
- return c.match(in)
+ // Since bytes.NewReader().Read() will never return an error, tokenizeStream
+ // will never return an error so it's okay to ignore the return value in this
+ // case.
+ res, _ := c.MatchFrom(bytes.NewReader(in))
+ return res
}
// MatchFrom finds matches within the read content.
func (c *Classifier) MatchFrom(in io.Reader) (Results, error) {
- b, err := ioutil.ReadAll(in)
- if err != nil {
- return Results{}, fmt.Errorf("classifier couldn't read: %w", err)
- }
- return c.Match(b), nil
+ return c.match(in)
}
func detectionType(in string) string {
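
The comment above leans on a property of bytes.Reader worth spelling out: its Read only ever reports io.EOF at end of data, which the tokenizer treats as normal termination rather than a failure, so Match can safely discard MatchFrom's error. A runnable sketch of that behavior:

package main

import (
	"bytes"
	"fmt"
)

func main() {
	r := bytes.NewReader([]byte("hello"))
	buf := make([]byte, 8)
	n, err := r.Read(buf)
	fmt.Println(n, err) // 5 <nil>: the data fits, no error
	_, err = r.Read(buf)
	fmt.Println(err) // EOF: the only "error" a bytes.Reader ever returns
}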
diff --git a/v2/document.go b/v2/document.go
index 6f3c1b5..dcedb67 100644
--- a/v2/document.go
+++ b/v2/document.go
@@ -16,6 +16,7 @@
package classifier
import (
+ "bytes"
"fmt"
"os"
"strings"
@@ -95,7 +96,10 @@ func max(a, b int) int {
// AddContent incorporates the provided textual content into the classifier for
// matching. This will not modify the supplied content.
func (c *Classifier) AddContent(category, name, variant string, content []byte) {
- doc := tokenize(content, c.dict, true)
+ // Since bytes.NewReader().Read() will never return an error, tokenizeStream
+ // will never return an error so it's okay to ignore the return value in this
+ // case.
+ doc, _ := tokenizeStream(bytes.NewReader(content), true, c.dict, true)
c.addDocument(category, name, variant, doc)
}
@@ -114,7 +118,8 @@ func (c *Classifier) addDocument(category, name, variant string, id *indexedDocu
// words to the classifier dictionary. This should be used for matching targets, not
// populating the corpus.
func (c *Classifier) createTargetIndexedDocument(in []byte) *indexedDocument {
- return tokenize(in, c.dict, false)
+ doc, _ := tokenizeStream(bytes.NewReader(in), true, c.dict, false)
+ return doc
}
func (c *Classifier) generateDocName(category, name, variant string) string {
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
index 0d3917e..607b0d4 100644
--- a/v2/tokenizer.go
+++ b/v2/tokenizer.go
@@ -15,7 +15,6 @@
package classifier
import (
- "bytes"
"html"
"io"
"regexp"
@@ -67,14 +66,6 @@ var ignorableTexts = []*regexp.Regexp{
regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
}
-func tokenize(in []byte, dict *dictionary, updateDict bool) *indexedDocument {
- // Since bytes.NewReader().Read() will never return an error, tokenizeStream
- // will never return an error so it's okay to ignore the return value in this
- // case.
- id, _ := tokenizeStream(bytes.NewReader(in), true, dict, updateDict)
- return id
-}
-
// tokenizeStream reads bytes from src and produces an indexedDocument of its
// content. tokenizeStream will never return an error of its own; it can only
// return an error from the provided Reader. If the provided Reader never
// returns an error, tokenizeStream will never return one either.
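
To make that contract concrete, a hypothetical failing Reader (errReader is illustrative, not part of the package) shows how a read failure now surfaces through the streaming path while Match stays infallible:

// errReader always fails, standing in for a network or disk fault.
type errReader struct{ err error }

func (e errReader) Read(p []byte) (int, error) { return 0, e.err }

// With a populated *Classifier c (assumed):
//
//	_, err := c.MatchFrom(errReader{err: io.ErrUnexpectedEOF})
//	// err is non-nil, propagated from the reader by tokenizeStream.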
diff --git a/v2/tokenizer_test.go b/v2/tokenizer_test.go
index 6ddab4c..4da91a6 100644
--- a/v2/tokenizer_test.go
+++ b/v2/tokenizer_test.go
@@ -15,6 +15,7 @@
package classifier
import (
+ "bytes"
"io"
"strings"
"testing"
@@ -144,7 +145,10 @@ The AWESOME Project`,
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
- d := tokenize([]byte(test.input), newDictionary(), true)
+ d, err := tokenizeStream(bytes.NewReader([]byte(test.input)), true, newDictionary(), true)
+ if err != nil {
+ t.Errorf("%s failed: got unexpected error %v", test.name, err)
+ }
if diff := cmp.Diff(d, test.output, cmpopts.IgnoreUnexported(indexedDocument{})); diff != "" {
t.Errorf("%s failed:\nDiff(+got,-want): %s", test.name, diff)
}
@@ -293,7 +297,10 @@ The FreeType Project`,
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
dict := newDictionary()
- d := tokenize([]byte(test.input), dict, true)
+ d, err := tokenizeStream(bytes.NewReader([]byte(test.input)), true, dict, true)
+ if err != nil {
+ t.Errorf("%s failed: got unexpected error %v", test.name, err)
+ }
var b strings.Builder
for _, tok := range d.Tokens {
b.WriteString(dict.getWord(tok.ID))