diff options
author | Bill Neubauer <wcn@google.com> | 2022-08-22 11:54:55 -0700 |
---|---|---|
committer | Bill Neubauer <bill.neubauer@gmail.com> | 2022-09-16 10:06:11 -0700 |
commit | 9af65096d7ceb845f034197c5f6b762894c680eb (patch) | |
tree | b64f8d5894a9b05cf285fda107a7ce487efd695e | |
parent | bbfad6347cd1f2f7e28fb20144f64d60c700181b (diff) | |
download | licenseclassifier-9af65096d7ceb845f034197c5f6b762894c680eb.tar.gz |
Make the public facing API be implemented in terms of io.Reader rather than
[]byte. This allows for larger files or other inputs to be handled without
requiring the full contents to be stored in memory at one time.
PiperOrigin-RevId: 474376181
-rw-r--r-- | v2/classifier.go | 23 | ||||
-rw-r--r-- | v2/document.go | 9 | ||||
-rw-r--r-- | v2/tokenizer.go | 9 | ||||
-rw-r--r-- | v2/tokenizer_test.go | 11 |
4 files changed, 29 insertions, 23 deletions
diff --git a/v2/classifier.go b/v2/classifier.go index f163030..4bb3e5c 100644 --- a/v2/classifier.go +++ b/v2/classifier.go @@ -65,8 +65,11 @@ func (d Matches) Less(i, j int) bool { } // Match reports instances of the supplied content in the corpus. -func (c *Classifier) match(in []byte) Results { - id := c.createTargetIndexedDocument(in) +func (c *Classifier) match(in io.Reader) (Results, error) { + id, err := tokenizeStream(in, true, c.dict, false) + if err != nil { + return Results{}, err + } firstPass := make(map[string]*indexedDocument) for l, d := range c.docs { @@ -85,7 +88,7 @@ func (c *Classifier) match(in []byte) Results { return Results{ Matches: nil, TotalInputLines: 0, - } + }, nil } // Perform the expensive work of generating a searchset to look for token runs. @@ -185,7 +188,7 @@ func (c *Classifier) match(in []byte) Results { return Results{ Matches: out, TotalInputLines: id.Tokens[len(id.Tokens)-1].Line, - } + }, nil } // Classifier provides methods for identifying open source licenses in text @@ -316,16 +319,16 @@ func (c *Classifier) SetTraceConfiguration(in *TraceConfiguration) { // Match finds matches within an unknown text. This will not modify the contents // of the supplied byte slice. func (c *Classifier) Match(in []byte) Results { - return c.match(in) + // Since bytes.NewReader().Read() will never return an error, tokenizeStream + // will never return an error so it's okay to ignore the return value in this + // case. + res, _ := c.MatchFrom(bytes.NewReader(in)) + return res } // MatchFrom finds matches within the read content. 
func (c *Classifier) MatchFrom(in io.Reader) (Results, error) { - b, err := ioutil.ReadAll(in) - if err != nil { - return Results{}, fmt.Errorf("classifier couldn't read: %w", err) - } - return c.Match(b), nil + return c.match(in) } func detectionType(in string) string { diff --git a/v2/document.go b/v2/document.go index 6f3c1b5..dcedb67 100644 --- a/v2/document.go +++ b/v2/document.go @@ -16,6 +16,7 @@ package classifier import ( + "bytes" "fmt" "os" "strings" @@ -95,7 +96,10 @@ func max(a, b int) int { // AddContent incorporates the provided textual content into the classifier for // matching. This will not modify the supplied content. func (c *Classifier) AddContent(category, name, variant string, content []byte) { - doc := tokenize(content, c.dict, true) + // Since bytes.NewReader().Read() will never return an error, tokenizeStream + // will never return an error so it's okay to ignore the return value in this + // case. + doc, _ := tokenizeStream(bytes.NewReader(content), true, c.dict, true) c.addDocument(category, name, variant, doc) } @@ -114,7 +118,8 @@ func (c *Classifier) addDocument(category, name, variant string, id *indexedDocu // words to the classifier dictionary. This should be used for matching targets, not // populating the corpus. 
func (c *Classifier) createTargetIndexedDocument(in []byte) *indexedDocument { - return tokenize(in, c.dict, false) + doc, _ := tokenizeStream(bytes.NewReader(in), true, c.dict, false) + return doc } func (c *Classifier) generateDocName(category, name, variant string) string { diff --git a/v2/tokenizer.go b/v2/tokenizer.go index 0d3917e..607b0d4 100644 --- a/v2/tokenizer.go +++ b/v2/tokenizer.go @@ -15,7 +15,6 @@ package classifier import ( - "bytes" "html" "io" "regexp" @@ -67,14 +66,6 @@ var ignorableTexts = []*regexp.Regexp{ regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`), } -func tokenize(in []byte, dict *dictionary, updateDict bool) *indexedDocument { - // Since bytes.NewReader().Read() will never return an error, tokenizeStream - // will never return an error so it's okay to ignore the return value in this - // case. - id, _ := tokenizeStream(bytes.NewReader(in), true, dict, updateDict) - return id -} - // tokenizeStream reads bytes from src and produces an indexedDocument of its // content. tokenizeStream will never return an error of its own, it can only // return an error from the provided Reader. 
If the provided Reader never diff --git a/v2/tokenizer_test.go b/v2/tokenizer_test.go index 6ddab4c..4da91a6 100644 --- a/v2/tokenizer_test.go +++ b/v2/tokenizer_test.go @@ -15,6 +15,7 @@ package classifier import ( + "bytes" "io" "strings" "testing" @@ -144,7 +145,10 @@ The AWESOME Project`, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - d := tokenize([]byte(test.input), newDictionary(), true) + d, err := tokenizeStream(bytes.NewReader([]byte(test.input)), true, newDictionary(), true) + if err != nil { + t.Errorf("%s failed: got unexpected error %v", test.name, err) + } if diff := cmp.Diff(d, test.output, cmpopts.IgnoreUnexported(indexedDocument{})); diff != "" { t.Errorf("%s failed:\nDiff(+got,-want): %s", test.name, diff) } @@ -293,7 +297,10 @@ The FreeType Project`, for _, test := range tests { t.Run(test.name, func(t *testing.T) { dict := newDictionary() - d := tokenize([]byte(test.input), dict, true) + d, err := tokenizeStream(bytes.NewReader([]byte(test.input)), true, dict, true) + if err != nil { + t.Errorf("%s failed: got unexpected error %v", test.name, err) + } var b strings.Builder for _, tok := range d.Tokens { b.WriteString(dict.getWord(tok.ID)) |