diff options
author | Bill Neubauer <wcn@google.com> | 2022-04-26 10:08:12 -0700 |
---|---|---|
committer | Bill Neubauer <bill.neubauer@gmail.com> | 2022-09-16 10:06:11 -0700 |
commit | cb39e2ce0ae3978c8384a16d4ba4de007583e7fe (patch) | |
tree | 80c86f0101b666b62b5c2c8bb9c874175c5814b4 | |
parent | dd64e698b98547386f13e9e683dc1fb3b6029c60 (diff) | |
download | licenseclassifier-cb39e2ce0ae3978c8384a16d4ba4de007583e7fe.tar.gz |
Adds Copyright detection to the report generated by the classifier.
Previously, copyright lines were just silently dropped, but now the classifier
returns a match to indicate that the line contained an identified copyright
statement. The matched text is still pruned from the normalized output; this
change only affects the output report from the classifier.
PiperOrigin-RevId: 444592120
35 files changed, 60 insertions, 51 deletions
diff --git a/v2/classifier.go b/v2/classifier.go index 8e92da9..cc54d2e 100644 --- a/v2/classifier.go +++ b/v2/classifier.go @@ -92,6 +92,8 @@ func (c *Classifier) match(in []byte) Results { id.generateSearchSet(c.q) var candidates Matches + candidates = append(candidates, id.Matches...) + for l, d := range firstPass { matches := c.findPotentialMatches(d.s, id.s, c.threshold) for _, m := range matches { @@ -228,8 +230,8 @@ func NewClassifier(threshold float64) *Classifier { // It is an invariant of the classifier that calling Match(Normalize(in)) will // return the same results as Match(in). func (c *Classifier) Normalize(in []byte) []byte { - text := normalizeDoc(in, false) - doc := extractDoc(text, false) + text, _ := normalizeDoc(in, false) + doc := extractDoc(text, false, nil) var buf bytes.Buffer diff --git a/v2/document.go b/v2/document.go index 429c77f..fac5e65 100644 --- a/v2/document.go +++ b/v2/document.go @@ -33,7 +33,8 @@ type token struct { // document is the representation of the input text for downstream filtering and matching. 
type document struct { - Tokens []*token // ordered tokens of the document + Tokens []*token // ordered tokens of the document + Matches Matches // these are matches identified while processing the original, untokenized text via regexp matching } type indexedToken struct { @@ -43,12 +44,13 @@ type indexedToken struct { } type indexedDocument struct { - Tokens []indexedToken // ordered tokens of the document - f *frequencyTable // frequencies computed for this document - dict *dictionary // The corpus dictionary for this document - s *searchSet // The searchset for this document - runes []rune - norm string // The normalized token sequence + Tokens []indexedToken // ordered tokens of the document + Matches Matches // these are matches identified while processing the original, untokenized text via regexp matching + f *frequencyTable // frequencies computed for this document + dict *dictionary // The corpus dictionary for this document + s *searchSet // The searchset for this document + runes []rune + norm string // The normalized token sequence } func (d *indexedDocument) generateSearchSet(q int) { @@ -122,8 +124,9 @@ func (c *Classifier) addDocument(category, name, variant string, doc *document) // is true, the classifier dictionary is updated with new tokens encountered in the document. func (c *Classifier) generateIndexedDocument(d *document, addWords bool) *indexedDocument { id := &indexedDocument{ - Tokens: make([]indexedToken, 0, len(d.Tokens)), - dict: c.dict, + Tokens: make([]indexedToken, 0, len(d.Tokens)), + dict: c.dict, + Matches: d.Matches, } for _, t := range d.Tokens { diff --git a/v2/scenarios/114431182 b/v2/scenarios/114431182 index 2cb32de..2216765 100644 --- a/v2/scenarios/114431182 +++ b/v2/scenarios/114431182 @@ -1,5 +1,5 @@ Legacy classifier didn't identify a license. 
-EXPECTED:Apache-2.0 +EXPECTED:Apache-2.0,Copyright // Copyright 2018 Google LLC // Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/v2/scenarios/116495682_1 b/v2/scenarios/116495682_1 index 97dd6a8..d2a123e 100644 --- a/v2/scenarios/116495682_1 +++ b/v2/scenarios/116495682_1 @@ -1,5 +1,5 @@ Legacy classifier doesn't recognize Ruby license. -EXPECTED:Apache-2.0,MIT,Ruby +EXPECTED:Apache-2.0,Copyright,MIT,Ruby Puppet - Automating Configuration Management. Copyright (C) 2005-2016 Puppet, Inc. diff --git a/v2/scenarios/134172128 b/v2/scenarios/134172128 index 6aca0ae..7d1bf1c 100644 --- a/v2/scenarios/134172128 +++ b/v2/scenarios/134172128 @@ -1,5 +1,5 @@ Legacy classifier doesn't detect header -EXPECTED:Apache-2.0 +EXPECTED:Apache-2.0,Copyright /** * Copyright 2019 Google LLC * diff --git a/v2/scenarios/143431863 b/v2/scenarios/143431863 index bd2e02e..650ec01 100644 --- a/v2/scenarios/143431863 +++ b/v2/scenarios/143431863 @@ -1,5 +1,5 @@ New header for GPL-3.0 with Bison exception -EXPECTED:GPL-3.0-with-bison-exception +EXPECTED:Copyright,GPL-3.0-with-bison-exception * A Bison parser, made by GNU Bison 3.0.4. */ /* Bison interface for Yacc-like parsers in C diff --git a/v2/scenarios/145684916 b/v2/scenarios/145684916 index 08c851c..e06d7d1 100644 --- a/v2/scenarios/145684916 +++ b/v2/scenarios/145684916 @@ -1,5 +1,5 @@ Legacy classifier identifies GPL-1.0 -EXPECTED:LGPL-2.0 +EXPECTED:Copyright,LGPL-2.0 /* Provide relocatable packages. Copyright (C) 2003 Free Software Foundation, Inc. Written by Bruno Haible <bruno@clisp.org>, 2003. diff --git a/v2/scenarios/149757877 b/v2/scenarios/149757877 index 21cbc10..631020c 100644 --- a/v2/scenarios/149757877 +++ b/v2/scenarios/149757877 @@ -1,5 +1,5 @@ Legacy classifier identifies GPL-1.0 or GPL-3.0. 
-EXPECTED:GPL-2.0,MIT +EXPECTED:Copyright,GPL-2.0,MIT /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) AND MIT) */ /* * compress_params.h - codec types and parameters for compressed data diff --git a/v2/scenarios/149962816 b/v2/scenarios/149962816 index c9b68e9..56b018d 100644 --- a/v2/scenarios/149962816 +++ b/v2/scenarios/149962816 @@ -1,5 +1,5 @@ Legacy classifier identifies GPL-2.1 -EXPECTED:LGPL-2.1 +EXPECTED:Copyright,LGPL-2.1 /* Copyright (C) 2010 by Ronnie Sahlberg <ronniesahlberg@gmail.com> diff --git a/v2/scenarios/149962877 b/v2/scenarios/149962877 index 37b7570..2388b29 100644 --- a/v2/scenarios/149962877 +++ b/v2/scenarios/149962877 @@ -1,5 +1,5 @@ Legacy classifier identifies GPL-3.0 -EXPECTED:GPL-2.0 +EXPECTED:Copyright,GPL-2.0 /* iscsi-test tool diff --git a/v2/scenarios/150245846 b/v2/scenarios/150245846 index 7bc265e..e6d20f1 100644 --- a/v2/scenarios/150245846 +++ b/v2/scenarios/150245846 @@ -1,5 +1,5 @@ Legacy classifier identifies GPL-3.0. -EXPECTED:LGPL-3.0 +EXPECTED:Copyright,LGPL-3.0 /* * Python bindings module for liblnk (pylnk) * diff --git a/v2/scenarios/150280258 b/v2/scenarios/150280258 index 154e59a..75f11d6 100644 --- a/v2/scenarios/150280258 +++ b/v2/scenarios/150280258 @@ -1,5 +1,5 @@ Legacy classifier identifies ImageMagick -EXPECTED: +EXPECTED:Copyright / Copyright 2020 Google Inc. // // Licensed under the BSD-3-Clause license; you may not use this file except in diff --git a/v2/scenarios/150310130 b/v2/scenarios/150310130 index b534de6..7ca5b69 100644 --- a/v2/scenarios/150310130 +++ b/v2/scenarios/150310130 @@ -1,5 +1,5 @@ Legacy classifier identifies LGPL-3.0. -EXPECTED:GPL-3.0 +EXPECTED:Copyright,GPL-3.0 // Copyright (C) 2011-2014 Free Software Foundation, Inc. // // This file is part of the GNU ISO C++ Library. 
This library is free diff --git a/v2/scenarios/151151704 b/v2/scenarios/151151704 index cf68e81..2b07bac 100644 --- a/v2/scenarios/151151704 +++ b/v2/scenarios/151151704 @@ -1,5 +1,5 @@ Legacy classifier identifies BSD-2-Clause-NetBSD -EXPECTED:Apache-2.0,BSD-2-Clause,BSD-3-Clause,MIT,NCSA,Unlicense,Zlib +EXPECTED:Apache-2.0,BSD-2-Clause,BSD-3-Clause,Copyright,MIT,NCSA,Unlicense,Zlib Emscripten is available under 2 licenses, the MIT license and the University of Illinois/NCSA Open Source License. diff --git a/v2/scenarios/151882032 b/v2/scenarios/151882032 index 35b7305..068962d 100644 --- a/v2/scenarios/151882032 +++ b/v2/scenarios/151882032 @@ -1,5 +1,5 @@ Legacy classifier identifies BSD-4 clause. -EXPECTED:BSD-3-Clause +EXPECTED:BSD-3-Clause,Copyright *** * ASM: a very small and fast Java bytecode manipulation framework * Copyright (c) 2000-2011 INRIA, France Telecom diff --git a/v2/scenarios/153757612 b/v2/scenarios/153757612 index e6950e7..4e11e89 100644 --- a/v2/scenarios/153757612 +++ b/v2/scenarios/153757612 @@ -1,5 +1,5 @@ Legacy classifier identifies GPL-3.0. -EXPECTED:GPL-2.0 +EXPECTED:Copyright,GPL-2.0 /* * BIOS Decode * diff --git a/v2/scenarios/153891703 b/v2/scenarios/153891703 index b863d61..82c6b63 100644 --- a/v2/scenarios/153891703 +++ b/v2/scenarios/153891703 @@ -1,5 +1,5 @@ Legacy classifier identifies BSD-3-Clause -EXPECTED:BSD-3-Clause +EXPECTED:BSD-3-Clause,Copyright Copyright (c) 2009-2011, Mozilla Foundation and contributors All rights reserved. 
diff --git a/v2/scenarios/154985893 b/v2/scenarios/154985893 index 496093a..f64ed69 100644 --- a/v2/scenarios/154985893 +++ b/v2/scenarios/154985893 @@ -1,5 +1,5 @@ Legacy classifier identifies GPL-3.0 -EXPECTED:GPL-2.0 +EXPECTED:Copyright,GPL-2.0 ## DO NOT EDIT - This file generated from ./build-aux/ltmain.in ## by inline-source v2014-01-03.01 diff --git a/v2/scenarios/155506346 b/v2/scenarios/155506346 index 8e6f73f..8d1dc6a 100644 --- a/v2/scenarios/155506346 +++ b/v2/scenarios/155506346 @@ -1,5 +1,5 @@ Legacy classifier identifies LGPL-3 -EXPECTED:LGPL-2.1 +EXPECTED:Copyright,LGPL-2.1 /* Extended regular expression matching and search library. Copyright (C) 2002, 2003, 2005 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/v2/scenarios/155578201 b/v2/scenarios/155578201 index e078ee5..20c9aa0 100644 --- a/v2/scenarios/155578201 +++ b/v2/scenarios/155578201 @@ -1,5 +1,5 @@ Legacy classifier identifies GPL-3.0 -EXPECTED:GPL-2.0 +EXPECTED:Copyright,GPL-2.0 /* * ARM mach-virt emulation * diff --git a/v2/scenarios/155689090 b/v2/scenarios/155689090 index 9e09472..905546d 100644 --- a/v2/scenarios/155689090 +++ b/v2/scenarios/155689090 @@ -1,5 +1,5 @@ Legacy classifier identifies BSD-3-Clause-Attribution -EXPECTED:BSD-3-Clause +EXPECTED:BSD-3-Clause,Copyright """DatetimeIndex analog for cftime.datetime objects""" # The pandas.Index subclass defined here was copied and adapted for # use with cftime.datetime objects based on the source code defining diff --git a/v2/scenarios/156414349 b/v2/scenarios/156414349 index fad1ce1..028c98b 100644 --- a/v2/scenarios/156414349 +++ b/v2/scenarios/156414349 @@ -1,5 +1,5 @@ Legacy classifier identifies ImageMagick. 
-EXPECTED:Apache-2.0 +EXPECTED:Apache-2.0,Copyright * Copyright 2018 Google LLC diff --git a/v2/scenarios/157091836 b/v2/scenarios/157091836 index 1ad60a9..3b5cbb2 100644 --- a/v2/scenarios/157091836 +++ b/v2/scenarios/157091836 @@ -1,5 +1,5 @@ Legacy classifier identifies LGPL-2.1 -EXPECTED:LGPL-3.0 +EXPECTED:Copyright,LGPL-3.0 /** * @license * GNU LESSER GENERAL PUBLIC LICENSE diff --git a/v2/scenarios/157921572 b/v2/scenarios/157921572 index 3a4a069..363d15c 100644 --- a/v2/scenarios/157921572 +++ b/v2/scenarios/157921572 @@ -1,5 +1,5 @@ Legacy classifier identifies ImageMagick. -EXPECTED: Apache-2.0 +EXPECTED: Apache-2.0,Copyright #!/bin/bash # Copyright 2020 Google LLC diff --git a/v2/scenarios/159389429_08 b/v2/scenarios/159389429_08 index 70e574d..4b26e06 100644 --- a/v2/scenarios/159389429_08 +++ b/v2/scenarios/159389429_08 @@ -1,5 +1,5 @@ Legacy classifier identifies LGPL-2.1 -EXPECTED:GPL-2.0 +EXPECTED:Copyright,GPL-2.0 /* Copyright (c) 2002, 2012, Oracle and/or its affiliates. All rights reserved. diff --git a/v2/scenarios/159389429_16 b/v2/scenarios/159389429_16 index 4d4c2cb..fdee26c 100644 --- a/v2/scenarios/159389429_16 +++ b/v2/scenarios/159389429_16 @@ -1,5 +1,5 @@ Legacy classifier identifies GPL-1.0 -EXPECTED:GPL-3.0 +EXPECTED:Copyright,GPL-3.0 ;;; package-lint-flymake.el --- A package-lint Flymake backend -*- lexical-binding: t; -*- ;; Copyright (C) 2018 J. Alexander Branham (alex DOT branham AT gmail DOT com) diff --git a/v2/scenarios/159475529_1 b/v2/scenarios/159475529_1 index 733f5d5..61476be 100644 --- a/v2/scenarios/159475529_1 +++ b/v2/scenarios/159475529_1 @@ -1,5 +1,5 @@ Classifier tried to induce match with AGPL. -EXPECTED:GPL-3.0 +EXPECTED:Copyright,GPL-3.0 #!/bin/sh # Build tools for testing GCC. 
diff --git a/v2/scenarios/159475529_2 b/v2/scenarios/159475529_2 index f625f2e..9ca0511 100644 --- a/v2/scenarios/159475529_2 +++ b/v2/scenarios/159475529_2 @@ -1,5 +1,5 @@ Classifier induced match with AGPL -EXPECTED:GPL-2.0 +EXPECTED:Copyright,GPL-2.0 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"> <html> diff --git a/v2/scenarios/159478550 b/v2/scenarios/159478550 index b7d6fe5..097d482 100644 --- a/v2/scenarios/159478550 +++ b/v2/scenarios/159478550 @@ -1,5 +1,5 @@ There are no recognizable license headers in this file. Classifiers matched to SISSL because of standard verbiage. -EXPECTED: +EXPECTED:Copyright // Copyright (c) 2012 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. diff --git a/v2/scenarios/160997543 b/v2/scenarios/160997543 index d6e7ea0..dd82090 100644 --- a/v2/scenarios/160997543 +++ b/v2/scenarios/160997543 @@ -1,5 +1,5 @@ Test for an approximate match on libtiff -EXPECTED:libtiff +EXPECTED:Copyright,libtiff Copyright (c) 2000-2017 Todd M. Helfter Permission to use, copy, modify, distribute, and sell this software and diff --git a/v2/scenarios/161236205 b/v2/scenarios/161236205 index 86ca6ff..63dd1ee 100644 --- a/v2/scenarios/161236205 +++ b/v2/scenarios/161236205 @@ -1,5 +1,5 @@ Add the Sun variant of BSD-3-Clause to the corpus. -EXPECTED:BSD-3-Clause +EXPECTED:BSD-3-Clause,Copyright Copyright (c) 2003 Sun Microsystems, Inc. All Rights Reserved. Redistribution and use in source and binary forms, with or without diff --git a/v2/scenarios/168158647 b/v2/scenarios/168158647 index eca4cb1..fda44c4 100644 --- a/v2/scenarios/168158647 +++ b/v2/scenarios/168158647 @@ -1,5 +1,5 @@ GPL versioning construct is different than existing templates. 
-EXPECTED:GPL-2.0 +EXPECTED:Copyright,GPL-2.0 File src/zone.c Copyright © 2011 Mathijs Mohlmann License: GNU General Public License diff --git a/v2/scenarios/214120190 b/v2/scenarios/214120190 index 3910945..025894f 100644 --- a/v2/scenarios/214120190 +++ b/v2/scenarios/214120190 @@ -1,5 +1,5 @@ Code to prevent LGPL/GPL misdetections only worked for LGPL-2.1, not LGPL-2.0 -EXPECTED:LGPL-2.0 +EXPECTED:Copyright,LGPL-2.0 /* glib-unix.h - Unix specific integration * Copyright (C) 2011 Red Hat, Inc. * diff --git a/v2/tokenizer.go b/v2/tokenizer.go index 885eab3..2ab19ef 100644 --- a/v2/tokenizer.go +++ b/v2/tokenizer.go @@ -66,28 +66,29 @@ func cleanupToken(in string) string { return out.String() } -func normalizeDoc(in []byte, normWords bool) string { +func normalizeDoc(in []byte, normWords bool) (string, Matches) { // Apply the global transforms described in SPDX norm := string(in) norm = html.UnescapeString(norm) norm = normalizePunctuation(norm) - norm = removeIgnorableTexts(norm) + norm, matches := removeIgnorableTexts(norm) if normWords { norm = normalizeWords(norm) } - return norm + return norm, matches } func tokenize(in []byte) *document { // tokenize produces a document from the input content. - text := normalizeDoc(in, true) - return extractDoc(text, true) + text, matches := normalizeDoc(in, true) + return extractDoc(text, true, matches) } -func extractDoc(text string, removeEol bool) *document { +func extractDoc(text string, removeEol bool, matches Matches) *document { var doc document + doc.Matches = matches // Iterate on a line-by-line basis. 
i := 0 pos := 0 @@ -357,10 +358,11 @@ var ignorableTexts = []*regexp.Regexp{ // removeIgnorableTexts removes common text, which is not important for // classification -func removeIgnorableTexts(s string) string { +func removeIgnorableTexts(s string) (string, Matches) { var out []string + var matches Matches lines := strings.Split(s, "\n") - for _, l := range lines { + for i, l := range lines { line := strings.TrimSpace(l) var match bool for _, re := range ignorableTexts { @@ -373,7 +375,8 @@ func removeIgnorableTexts(s string) string { } else { // We want to preserve line presence for the positional information out = append(out, "") + matches = append(matches, &Match{Name: "Copyright", MatchType: "Copyright", Confidence: 1.0, StartLine: i + 1, EndLine: i + 1}) } } - return strings.Join(out, "\n") + return strings.Join(out, "\n"), matches } diff --git a/v2/tokenizer_test.go b/v2/tokenizer_test.go index a01c43d..fec5b96 100644 --- a/v2/tokenizer_test.go +++ b/v2/tokenizer_test.go @@ -133,6 +133,7 @@ The AWESOME Project`, Line: 10, }, }, + Matches: Matches{&Match{Name: "Copyright", Confidence: 1.0, MatchType: "Copyright", StartLine: 6, EndLine: 6}}, }, }, } |