Adds Copyright detection to the report generated by the classifier.

Previously, copyright lines were silently dropped; now the classifier returns a match to indicate that the line contained an identified copyright statement. The matched text is still pruned from the normalized output; this change only affects the report produced by the classifier. PiperOrigin-RevId: 444592120
author: Bill Neubauer <wcn@google.com> 2022-04-26 10:08:12 -0700
committer: Bill Neubauer <bill.neubauer@gmail.com> 2022-09-16 10:06:11 -0700
commit: cb39e2ce0ae3978c8384a16d4ba4de007583e7fe (patch)
tree: 80c86f0101b666b62b5c2c8bb9c874175c5814b4
parent: dd64e698b98547386f13e9e683dc1fb3b6029c60 (diff)
download: licenseclassifier-cb39e2ce0ae3978c8384a16d4ba4de007583e7fe.tar.gz
35 files changed, 60 insertions, 51 deletions
diff --git a/v2/classifier.go b/v2/classifier.go
index 8e92da9..cc54d2e 100644
--- a/v2/classifier.go
+++ b/v2/classifier.go
@@ -92,6 +92,8 @@ func (c *Classifier) match(in []byte) Results {
 	id.generateSearchSet(c.q)
 
 	var candidates Matches
+	candidates = append(candidates, id.Matches...)
+
 	for l, d := range firstPass {
 		matches := c.findPotentialMatches(d.s, id.s, c.threshold)
 		for _, m := range matches {
@@ -228,8 +230,8 @@ func NewClassifier(threshold float64) *Classifier {
 // It is an invariant of the classifier that calling Match(Normalize(in)) will
 // return the same results as Match(in).
 func (c *Classifier) Normalize(in []byte) []byte {
-	text := normalizeDoc(in, false)
-	doc := extractDoc(text, false)
+	text, _ := normalizeDoc(in, false)
+	doc := extractDoc(text, false, nil)
 
 	var buf bytes.Buffer
 
diff --git a/v2/document.go b/v2/document.go
index 429c77f..fac5e65 100644
--- a/v2/document.go
+++ b/v2/document.go
@@ -33,7 +33,8 @@ type token struct {
 
 // document is the representation of the input text for downstream filtering and matching.
 type document struct {
-	Tokens []*token // ordered tokens of the document
+	Tokens  []*token // ordered tokens of the document
+	Matches Matches  // these are matches identified while processing the original, untokenized text via regexp matching
 }
 
 type indexedToken struct {
@@ -43,12 +44,13 @@ type indexedToken struct {
 }
 
 type indexedDocument struct {
-	Tokens []indexedToken  // ordered tokens of the document
-	f      *frequencyTable // frequencies computed for this document
-	dict   *dictionary     // The corpus dictionary for this document
-	s      *searchSet      // The searchset for this document
-	runes  []rune
-	norm   string // The normalized token sequence
+	Tokens  []indexedToken  // ordered tokens of the document
+	Matches Matches         // these are matches identified while processing the original, untokenized text via regexp matching
+	f       *frequencyTable // frequencies computed for this document
+	dict    *dictionary     // The corpus dictionary for this document
+	s       *searchSet      // The searchset for this document
+	runes   []rune
+	norm    string // The normalized token sequence
 }
 
 func (d *indexedDocument) generateSearchSet(q int) {
@@ -122,8 +124,9 @@ func (c *Classifier) addDocument(category, name, variant string, doc *document)
 // is true, the classifier dictionary is updated with new tokens encountered in the document.
 func (c *Classifier) generateIndexedDocument(d *document, addWords bool) *indexedDocument {
 	id := &indexedDocument{
-		Tokens: make([]indexedToken, 0, len(d.Tokens)),
-		dict:   c.dict,
+		Tokens:  make([]indexedToken, 0, len(d.Tokens)),
+		dict:    c.dict,
+		Matches: d.Matches,
 	}
 
 	for _, t := range d.Tokens {
diff --git a/v2/scenarios/114431182 b/v2/scenarios/114431182
index 2cb32de..2216765 100644
--- a/v2/scenarios/114431182
+++ b/v2/scenarios/114431182
@@ -1,5 +1,5 @@
 Legacy classifier didn't identify a license.
-EXPECTED:Apache-2.0
+EXPECTED:Apache-2.0,Copyright
 // Copyright 2018 Google LLC
 
 // Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/v2/scenarios/116495682_1 b/v2/scenarios/116495682_1
index 97dd6a8..d2a123e 100644
--- a/v2/scenarios/116495682_1
+++ b/v2/scenarios/116495682_1
@@ -1,5 +1,5 @@
 Legacy classifier doesn't recognize Ruby license.
-EXPECTED:Apache-2.0,MIT,Ruby
+EXPECTED:Apache-2.0,Copyright,MIT,Ruby
    Puppet - Automating Configuration Management.
 
    Copyright (C) 2005-2016 Puppet, Inc.
diff --git a/v2/scenarios/134172128 b/v2/scenarios/134172128
index 6aca0ae..7d1bf1c 100644
--- a/v2/scenarios/134172128
+++ b/v2/scenarios/134172128
@@ -1,5 +1,5 @@
 Legacy classifier doesn't detect header
-EXPECTED:Apache-2.0
+EXPECTED:Apache-2.0,Copyright
 /**
  * Copyright 2019 Google LLC
  *
diff --git a/v2/scenarios/143431863 b/v2/scenarios/143431863
index bd2e02e..650ec01 100644
--- a/v2/scenarios/143431863
+++ b/v2/scenarios/143431863
@@ -1,5 +1,5 @@
 New header for GPL-3.0 with Bison exception
-EXPECTED:GPL-3.0-with-bison-exception
+EXPECTED:Copyright,GPL-3.0-with-bison-exception
 * A Bison parser, made by GNU Bison 3.0.4.  */
 
 /* Bison interface for Yacc-like parsers in C
diff --git a/v2/scenarios/145684916 b/v2/scenarios/145684916
index 08c851c..e06d7d1 100644
--- a/v2/scenarios/145684916
+++ b/v2/scenarios/145684916
@@ -1,5 +1,5 @@
 Legacy classifier identifies GPL-1.0
-EXPECTED:LGPL-2.0
+EXPECTED:Copyright,LGPL-2.0
 /* Provide relocatable packages.
    Copyright (C) 2003 Free Software Foundation, Inc.
    Written by Bruno Haible <bruno@clisp.org>, 2003.
diff --git a/v2/scenarios/149757877 b/v2/scenarios/149757877
index 21cbc10..631020c 100644
--- a/v2/scenarios/149757877
+++ b/v2/scenarios/149757877
@@ -1,5 +1,5 @@
 Legacy classifier identifies GPL-1.0 or GPL-3.0.
-EXPECTED:GPL-2.0,MIT
+EXPECTED:Copyright,GPL-2.0,MIT
 /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) AND MIT) */
 /*
  *  compress_params.h - codec types and parameters for compressed data
diff --git a/v2/scenarios/149962816 b/v2/scenarios/149962816
index c9b68e9..56b018d 100644
--- a/v2/scenarios/149962816
+++ b/v2/scenarios/149962816
@@ -1,5 +1,5 @@
 Legacy classifier identifies GPL-2.1
-EXPECTED:LGPL-2.1
+EXPECTED:Copyright,LGPL-2.1
 /*
    Copyright (C) 2010 by Ronnie Sahlberg <ronniesahlberg@gmail.com>
 
diff --git a/v2/scenarios/149962877 b/v2/scenarios/149962877
index 37b7570..2388b29 100644
--- a/v2/scenarios/149962877
+++ b/v2/scenarios/149962877
@@ -1,5 +1,5 @@
 Legacy classifier identifies GPL-3.0
-EXPECTED:GPL-2.0
+EXPECTED:Copyright,GPL-2.0
 /*
    iscsi-test tool
 
diff --git a/v2/scenarios/150245846 b/v2/scenarios/150245846
index 7bc265e..e6d20f1 100644
--- a/v2/scenarios/150245846
+++ b/v2/scenarios/150245846
@@ -1,5 +1,5 @@
 Legacy classifier identifies GPL-3.0.
-EXPECTED:LGPL-3.0
+EXPECTED:Copyright,LGPL-3.0
 /*
  * Python bindings module for liblnk (pylnk)
  *
diff --git a/v2/scenarios/150280258 b/v2/scenarios/150280258
index 154e59a..75f11d6 100644
--- a/v2/scenarios/150280258
+++ b/v2/scenarios/150280258
@@ -1,5 +1,5 @@
 Legacy classifier identifies ImageMagick
-EXPECTED:
+EXPECTED:Copyright
 / Copyright 2020 Google Inc.
 //
 // Licensed under the BSD-3-Clause license; you may not use this file except in
diff --git a/v2/scenarios/150310130 b/v2/scenarios/150310130
index b534de6..7ca5b69 100644
--- a/v2/scenarios/150310130
+++ b/v2/scenarios/150310130
@@ -1,5 +1,5 @@
 Legacy classifier identifies LGPL-3.0.
-EXPECTED:GPL-3.0
+EXPECTED:Copyright,GPL-3.0
 // Copyright (C) 2011-2014 Free Software Foundation, Inc.
 //
 // This file is part of the GNU ISO C++ Library.  This library is free
diff --git a/v2/scenarios/151151704 b/v2/scenarios/151151704
index cf68e81..2b07bac 100644
--- a/v2/scenarios/151151704
+++ b/v2/scenarios/151151704
@@ -1,5 +1,5 @@
 Legacy classifier identifies BSD-2-Clause-NetBSD
-EXPECTED:Apache-2.0,BSD-2-Clause,BSD-3-Clause,MIT,NCSA,Unlicense,Zlib
+EXPECTED:Apache-2.0,BSD-2-Clause,BSD-3-Clause,Copyright,MIT,NCSA,Unlicense,Zlib
 Emscripten is available under 2 licenses, the MIT license and the
 University of Illinois/NCSA Open Source License.
 
diff --git a/v2/scenarios/151882032 b/v2/scenarios/151882032
index 35b7305..068962d 100644
--- a/v2/scenarios/151882032
+++ b/v2/scenarios/151882032
@@ -1,5 +1,5 @@
 Legacy classifier identifies BSD-4 clause.
-EXPECTED:BSD-3-Clause
+EXPECTED:BSD-3-Clause,Copyright
 ***
  * ASM: a very small and fast Java bytecode manipulation framework
  * Copyright (c) 2000-2011 INRIA, France Telecom
diff --git a/v2/scenarios/153757612 b/v2/scenarios/153757612
index e6950e7..4e11e89 100644
--- a/v2/scenarios/153757612
+++ b/v2/scenarios/153757612
@@ -1,5 +1,5 @@
 Legacy classifier identifies GPL-3.0.
-EXPECTED:GPL-2.0
+EXPECTED:Copyright,GPL-2.0
 /*
  * BIOS Decode
  *
diff --git a/v2/scenarios/153891703 b/v2/scenarios/153891703
index b863d61..82c6b63 100644
--- a/v2/scenarios/153891703
+++ b/v2/scenarios/153891703
@@ -1,5 +1,5 @@
 Legacy classifier identifies BSD-3-Clause
-EXPECTED:BSD-3-Clause
+EXPECTED:BSD-3-Clause,Copyright
 Copyright (c) 2009-2011, Mozilla Foundation and contributors
 All rights reserved.
 
diff --git a/v2/scenarios/154985893 b/v2/scenarios/154985893
index 496093a..f64ed69 100644
--- a/v2/scenarios/154985893
+++ b/v2/scenarios/154985893
@@ -1,5 +1,5 @@
 Legacy classifier identifies GPL-3.0
-EXPECTED:GPL-2.0
+EXPECTED:Copyright,GPL-2.0
 ## DO NOT EDIT - This file generated from ./build-aux/ltmain.in
 ##               by inline-source v2014-01-03.01
 
diff --git a/v2/scenarios/155506346 b/v2/scenarios/155506346
index 8e6f73f..8d1dc6a 100644
--- a/v2/scenarios/155506346
+++ b/v2/scenarios/155506346
@@ -1,5 +1,5 @@
 Legacy classifier identifies LGPL-3
-EXPECTED:LGPL-2.1
+EXPECTED:Copyright,LGPL-2.1
 /* Extended regular expression matching and search library.
    Copyright (C) 2002, 2003, 2005 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
diff --git a/v2/scenarios/155578201 b/v2/scenarios/155578201
index e078ee5..20c9aa0 100644
--- a/v2/scenarios/155578201
+++ b/v2/scenarios/155578201
@@ -1,5 +1,5 @@
 Legacy classifier identifies GPL-3.0
-EXPECTED:GPL-2.0
+EXPECTED:Copyright,GPL-2.0
 /*
  * ARM mach-virt emulation
  *
diff --git a/v2/scenarios/155689090 b/v2/scenarios/155689090
index 9e09472..905546d 100644
--- a/v2/scenarios/155689090
+++ b/v2/scenarios/155689090
@@ -1,5 +1,5 @@
 Legacy classifier identifies BSD-3-Clause-Attribution
-EXPECTED:BSD-3-Clause
+EXPECTED:BSD-3-Clause,Copyright
 """DatetimeIndex analog for cftime.datetime objects"""
 # The pandas.Index subclass defined here was copied and adapted for
 # use with cftime.datetime objects based on the source code defining
diff --git a/v2/scenarios/156414349 b/v2/scenarios/156414349
index fad1ce1..028c98b 100644
--- a/v2/scenarios/156414349
+++ b/v2/scenarios/156414349
@@ -1,5 +1,5 @@
 Legacy classifier identifies ImageMagick.
-EXPECTED:Apache-2.0
+EXPECTED:Apache-2.0,Copyright
 *
 Copyright 2018 Google LLC
 
diff --git a/v2/scenarios/157091836 b/v2/scenarios/157091836
index 1ad60a9..3b5cbb2 100644
--- a/v2/scenarios/157091836
+++ b/v2/scenarios/157091836
@@ -1,5 +1,5 @@
 Legacy classifier identifies LGPL-2.1
-EXPECTED:LGPL-3.0
+EXPECTED:Copyright,LGPL-3.0
 /**
  * @license
  *                    GNU LESSER GENERAL PUBLIC LICENSE
diff --git a/v2/scenarios/157921572 b/v2/scenarios/157921572
index 3a4a069..363d15c 100644
--- a/v2/scenarios/157921572
+++ b/v2/scenarios/157921572
@@ -1,5 +1,5 @@
 Legacy classifier identifies ImageMagick.
-EXPECTED: Apache-2.0
+EXPECTED: Apache-2.0,Copyright
 #!/bin/bash
 
 # Copyright 2020 Google LLC
diff --git a/v2/scenarios/159389429_08 b/v2/scenarios/159389429_08
index 70e574d..4b26e06 100644
--- a/v2/scenarios/159389429_08
+++ b/v2/scenarios/159389429_08
@@ -1,5 +1,5 @@
 Legacy classifier identifies LGPL-2.1
-EXPECTED:GPL-2.0
+EXPECTED:Copyright,GPL-2.0
 /*
  Copyright (c) 2002, 2012, Oracle and/or its affiliates. All rights reserved.
 
diff --git a/v2/scenarios/159389429_16 b/v2/scenarios/159389429_16
index 4d4c2cb..fdee26c 100644
--- a/v2/scenarios/159389429_16
+++ b/v2/scenarios/159389429_16
@@ -1,5 +1,5 @@
 Legacy classifier identifies GPL-1.0
-EXPECTED:GPL-3.0
+EXPECTED:Copyright,GPL-3.0
 ;;; package-lint-flymake.el --- A package-lint Flymake backend  -*- lexical-binding: t; -*-
 
 ;; Copyright (C) 2018 J. Alexander Branham (alex DOT branham AT gmail DOT com)
diff --git a/v2/scenarios/159475529_1 b/v2/scenarios/159475529_1
index 733f5d5..61476be 100644
--- a/v2/scenarios/159475529_1
+++ b/v2/scenarios/159475529_1
@@ -1,5 +1,5 @@
 Classifier tried to induce match with AGPL.
-EXPECTED:GPL-3.0
+EXPECTED:Copyright,GPL-3.0
 #!/bin/sh
 
 #  Build tools for testing GCC.
diff --git a/v2/scenarios/159475529_2 b/v2/scenarios/159475529_2
index f625f2e..9ca0511 100644
--- a/v2/scenarios/159475529_2
+++ b/v2/scenarios/159475529_2
@@ -1,5 +1,5 @@
 Classifier induced match with AGPL
-EXPECTED:GPL-2.0
+EXPECTED:Copyright,GPL-2.0
 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
 <html>
 
diff --git a/v2/scenarios/159478550 b/v2/scenarios/159478550
index b7d6fe5..097d482 100644
--- a/v2/scenarios/159478550
+++ b/v2/scenarios/159478550
@@ -1,5 +1,5 @@
 There are no recognizable license headers in this file. Classifiers matched to SISSL because of standard verbiage.
-EXPECTED:
+EXPECTED:Copyright
 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
diff --git a/v2/scenarios/160997543 b/v2/scenarios/160997543
index d6e7ea0..dd82090 100644
--- a/v2/scenarios/160997543
+++ b/v2/scenarios/160997543
@@ -1,5 +1,5 @@
 Test for an approximate match on libtiff
-EXPECTED:libtiff
+EXPECTED:Copyright,libtiff
 Copyright (c) 2000-2017 Todd M. Helfter
 
 Permission to use, copy, modify, distribute, and sell this software and
diff --git a/v2/scenarios/161236205 b/v2/scenarios/161236205
index 86ca6ff..63dd1ee 100644
--- a/v2/scenarios/161236205
+++ b/v2/scenarios/161236205
@@ -1,5 +1,5 @@
 Add the Sun variant of BSD-3-Clause to the corpus.
-EXPECTED:BSD-3-Clause
+EXPECTED:BSD-3-Clause,Copyright
 Copyright (c) 2003 Sun Microsystems, Inc.  All Rights Reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/v2/scenarios/168158647 b/v2/scenarios/168158647
index eca4cb1..fda44c4 100644
--- a/v2/scenarios/168158647
+++ b/v2/scenarios/168158647
@@ -1,5 +1,5 @@
 GPL versioning construct is different than existing templates.
-EXPECTED:GPL-2.0
+EXPECTED:Copyright,GPL-2.0
 File src/zone.c
 	Copyright © 2011 Mathijs Mohlmann
 	License: GNU General Public License
diff --git a/v2/scenarios/214120190 b/v2/scenarios/214120190
index 3910945..025894f 100644
--- a/v2/scenarios/214120190
+++ b/v2/scenarios/214120190
@@ -1,5 +1,5 @@
 Code to prevent LGPL/GPL misdetections only worked for LGPL-2.1, not LGPL-2.0
-EXPECTED:LGPL-2.0
+EXPECTED:Copyright,LGPL-2.0
 /* glib-unix.h - Unix specific integration
  * Copyright (C) 2011 Red Hat, Inc.
  *
diff --git a/v2/tokenizer.go b/v2/tokenizer.go
index 885eab3..2ab19ef 100644
--- a/v2/tokenizer.go
+++ b/v2/tokenizer.go
@@ -66,28 +66,29 @@ func cleanupToken(in string) string {
 	return out.String()
 }
 
-func normalizeDoc(in []byte, normWords bool) string {
+func normalizeDoc(in []byte, normWords bool) (string, Matches) {
 	// Apply the global transforms described in SPDX
 
 	norm := string(in)
 	norm = html.UnescapeString(norm)
 	norm = normalizePunctuation(norm)
-	norm = removeIgnorableTexts(norm)
+	norm, matches := removeIgnorableTexts(norm)
 
 	if normWords {
 		norm = normalizeWords(norm)
 	}
-	return norm
+	return norm, matches
 }
 
 func tokenize(in []byte) *document {
 	// tokenize produces a document from the input content.
-	text := normalizeDoc(in, true)
-	return extractDoc(text, true)
+	text, matches := normalizeDoc(in, true)
+	return extractDoc(text, true, matches)
 }
 
-func extractDoc(text string, removeEol bool) *document {
+func extractDoc(text string, removeEol bool, matches Matches) *document {
 	var doc document
+	doc.Matches = matches
 	// Iterate on a line-by-line basis.
 	i := 0
 	pos := 0
@@ -357,10 +358,11 @@ var ignorableTexts = []*regexp.Regexp{
 
 // removeIgnorableTexts removes common text, which is not important for
 // classification
-func removeIgnorableTexts(s string) string {
+func removeIgnorableTexts(s string) (string, Matches) {
 	var out []string
+	var matches Matches
 	lines := strings.Split(s, "\n")
-	for _, l := range lines {
+	for i, l := range lines {
 		line := strings.TrimSpace(l)
 		var match bool
 		for _, re := range ignorableTexts {
@@ -373,7 +375,8 @@ func removeIgnorableTexts(s string) string {
 		} else {
 			// We want to preserve line presence for the positional information
 			out = append(out, "")
+			matches = append(matches, &Match{Name: "Copyright", MatchType: "Copyright", Confidence: 1.0, StartLine: i + 1, EndLine: i + 1})
 		}
 	}
-	return strings.Join(out, "\n")
+	return strings.Join(out, "\n"), matches
 }
diff --git a/v2/tokenizer_test.go b/v2/tokenizer_test.go
index a01c43d..fec5b96 100644
--- a/v2/tokenizer_test.go
+++ b/v2/tokenizer_test.go
@@ -133,6 +133,7 @@ The AWESOME Project`,
 						Line:  10,
 					},
 				},
+				Matches: Matches{&Match{Name: "Copyright", Confidence: 1.0, MatchType: "Copyright", StartLine: 6, EndLine: 6}},
 			},
 		},
 	}
author	Bill Neubauer <wcn@google.com>	2022-04-26 10:08:12 -0700
committer	Bill Neubauer <bill.neubauer@gmail.com>	2022-09-16 10:06:11 -0700
commit	cb39e2ce0ae3978c8384a16d4ba4de007583e7fe (patch)
tree	80c86f0101b666b62b5c2c8bb9c874175c5814b4
parent	dd64e698b98547386f13e9e683dc1fb3b6029c60 (diff)
download	licenseclassifier-cb39e2ce0ae3978c8384a16d4ba4de007583e7fe.tar.gz