author     Bill Neubauer <wcn@google.com>           2022-08-22 11:54:55 -0700
committer  Bill Neubauer <bill.neubauer@gmail.com>  2022-09-16 10:06:11 -0700
commit     9af65096d7ceb845f034197c5f6b762894c680eb (patch)
tree       b64f8d5894a9b05cf285fda107a7ce487efd695e
parent     bbfad6347cd1f2f7e28fb20144f64d60c700181b (diff)
Make the public-facing API be implemented in terms of io.Reader rather than
[]byte. This allows larger files or other inputs to be handled without
requiring the full contents to be stored in memory at one time.

PiperOrigin-RevId: 474376181
-rw-r--r--  v2/classifier.go     | 23
-rw-r--r--  v2/document.go       |  9
-rw-r--r--  v2/tokenizer.go      |  9
-rw-r--r--  v2/tokenizer_test.go | 11
4 files changed, 29 insertions, 23 deletions
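
For illustration, a minimal sketch of driving the new streaming entry point. The AddContent and MatchFrom signatures are taken from the diff below; the NewClassifier threshold and the file paths are assumptions made up for the example.

package main

import (
	"fmt"
	"log"
	"os"

	classifier "github.com/google/licenseclassifier/v2"
)

func main() {
	// Assumed setup: the 0.8 confidence threshold is illustrative.
	c := classifier.NewClassifier(0.8)

	// Corpus entries are still supplied as []byte; the path is hypothetical.
	lic, err := os.ReadFile("licenses/MIT.txt")
	if err != nil {
		log.Fatal(err)
	}
	c.AddContent("License", "MIT", "", lic)

	// The new io.Reader path streams the target through the tokenizer,
	// so the file's full contents never need to sit in memory at once.
	f, err := os.Open("COPYING") // hypothetical target file
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	res, err := c.MatchFrom(f)
	if err != nil {
		log.Fatalf("classification failed: %v", err)
	}
	fmt.Printf("%d matches in %d input lines\n", len(res.Matches), res.TotalInputLines)
}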
diff --git a/v2/classifier.go b/v2/classifier.go
index f163030..4bb3e5c 100644
--- a/v2/classifier.go
+++ b/v2/classifier.go
@@ -65,8 +65,11 @@ func (d Matches) Less(i, j int) bool {
}
// Match reports instances of the supplied content in the corpus.
-func (c *Classifier) match(in []byte) Results {
- id := c.createTargetIndexedDocument(in)
+func (c *Classifier) match(in io.Reader) (Results, error) {
+ id, err := tokenizeStream(in, true, c.dict, false)
+ if err != nil {
+ return Results{}, err
+ }
firstPass := make(map[string]*indexedDocument)
for l, d := range c.docs {
@@ -85,7 +88,7 @@ func (c *Classifier) match(in []byte) Results {
return Results{
Matches: nil,
TotalInputLines: 0,
- }
+ }, nil
}
// Perform the expensive work of generating a searchset to look for token runs.
@@ -185,7 +188,7 @@ func (c *Classifier) match(in []byte) Results {
return Results{
Matches: out,
TotalInputLines: id.Tokens[len(id.Tokens)-1].Line,
- }
+ }, nil
}
// Classifier provides methods for identifying open source licenses in text
@@ -316,16 +319,16 @@ func (c *Classifier) SetTraceConfiguration(in *TraceConfiguration) {
// Match finds matches within an unknown text. This will not modify the contents
// of the supplied byte slice.
func (c *Classifier) Match(in []byte) Results {
- return c.match(in)
+ // Since bytes.NewReader().Read() will never return an error, tokenizeStream
+ // will never return an error so it's okay to ignore the return value in this
+ // case.
+ res, _ := c.MatchFrom(bytes.NewReader(in))
+ return res
}
// MatchFrom finds matches within the read content.
func (c *Classifier) MatchFrom(in io.Reader) (Results, error) {
- b, err := ioutil.ReadAll(in)
- if err != nil {
- return Results{}, fmt.Errorf("classifier couldn't read: %w", err)
- }
- return c.Match(b), nil
+ return c.match(in)
}
func detectionType(in string) string {
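
The comment above leans on a property of bytes.Reader worth spelling out: its Read only ever reports io.EOF at end of data, which the tokenizer treats as normal termination rather than a failure, so Match can safely discard MatchFrom's error. A runnable sketch of that behavior:

package main

import (
	"bytes"
	"fmt"
)

func main() {
	r := bytes.NewReader([]byte("hello"))
	buf := make([]byte, 8)
	n, err := r.Read(buf)
	fmt.Println(n, err) // 5 <nil>: the data fits, no error
	_, err = r.Read(buf)
	fmt.Println(err) // EOF: the only "error" a bytes.Reader ever returns
}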
diff --git a/v2/document.go b/v2/document.go
index 6f3c1b5..dcedb67 100644
--- a/v2/document.go
+++ b/v2/document.go
@@ -16,6 +16,7 @@
package classifier
import (
+ "bytes"
"fmt"
"os"
"strings"
@@ -95,7 +96,10 @@ func max(a, b int) int {
// AddContent incorporates the provided textual content into the classifier for
// matching. This will not modify the supplied content.
func (c *Classifier) AddContent(category, name, variant string, content []byte) {
- doc := tokenize(content, c.dict, true)
+ // Since bytes.NewReader().Read() will never return an error, tokenizeStream
+ // will never return an error so it's okay to ignore the return value in this
+ // case.
+ doc, _ := tokenizeStream(bytes.NewReader(content), true, c.dict, true)
c.addDocument(category, name, variant, doc)
}
@@ -114,7 +118,8 @@ func (c *Classifier) addDocument(category, name, variant string, id *indexedDocu
// words to the classifier dictionary. This should be used for matching targets, not
// populating the corpus.
func (c *Classifier) createTargetIndexedDocument(in []byte) *indexedDocument {
- return tokenize(in, c.dict, false)
+ doc, _ := tokenizeStream(bytes.NewReader(in), true, c.dict, false)
+ return doc
}
func (c *Classifier) generateDocName(category, name, variant string) string {
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
index 0d3917e..607b0d4 100644
--- a/v2/tokenizer.go
+++ b/v2/tokenizer.go
@@ -15,7 +15,6 @@
package classifier
import (
- "bytes"
"html"
"io"
"regexp"
@@ -67,14 +66,6 @@ var ignorableTexts = []*regexp.Regexp{
regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`),
}
-func tokenize(in []byte, dict *dictionary, updateDict bool) *indexedDocument {
- // Since bytes.NewReader().Read() will never return an error, tokenizeStream
- // will never return an error so it's okay to ignore the return value in this
- // case.
- id, _ := tokenizeStream(bytes.NewReader(in), true, dict, updateDict)
- return id
-}
-
// tokenizeStream reads bytes from src and produces an indexedDocument of its
// content. tokenizeStream will never return an error of its own; it can only
// return an error from the provided Reader. If the provided Reader never
// returns an error, tokenizeStream will never return one either.
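
To make that contract concrete, a hypothetical failing Reader (errReader is illustrative, not part of the package) shows how a read failure now surfaces through the streaming path while Match stays infallible:

// errReader always fails, standing in for a network or disk fault.
type errReader struct{ err error }

func (e errReader) Read(p []byte) (int, error) { return 0, e.err }

// With a populated *Classifier c (assumed):
//
//	_, err := c.MatchFrom(errReader{err: io.ErrUnexpectedEOF})
//	// err is non-nil, propagated from the reader by tokenizeStream.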
diff --git a/v2/tokenizer_test.go b/v2/tokenizer_test.go
index 6ddab4c..4da91a6 100644
--- a/v2/tokenizer_test.go
+++ b/v2/tokenizer_test.go
@@ -15,6 +15,7 @@
package classifier
import (
+ "bytes"
"io"
"strings"
"testing"
@@ -144,7 +145,10 @@ The AWESOME Project`,
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
- d := tokenize([]byte(test.input), newDictionary(), true)
+ d, err := tokenizeStream(bytes.NewReader([]byte(test.input)), true, newDictionary(), true)
+ if err != nil {
+ t.Errorf("%s failed: got unexpected error %v", test.name, err)
+ }
if diff := cmp.Diff(d, test.output, cmpopts.IgnoreUnexported(indexedDocument{})); diff != "" {
t.Errorf("%s failed:\nDiff(+got,-want): %s", test.name, diff)
}
@@ -293,7 +297,10 @@ The FreeType Project`,
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
dict := newDictionary()
- d := tokenize([]byte(test.input), dict, true)
+ d, err := tokenizeStream(bytes.NewReader([]byte(test.input)), true, dict, true)
+ if err != nil {
+ t.Errorf("%s failed: got unexpected error %v", test.name, err)
+ }
var b strings.Builder
for _, tok := range d.Tokens {
b.WriteString(dict.getWord(tok.ID))