//
// Licensed to Apache Software Foundation (ASF) under one or more contributor
// license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright
// ownership. Apache Software Foundation (ASF) licenses this file to you under
// the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
package license
import (
"reflect"
"regexp"
"runtime"
"strings"
"github.com/apache/skywalking-eyes/license-eye/internal/logger"
)
// Normalizer is a function that takes a piece of text and returns a transformed
// copy of it. Normalizers are chained together to clean up license texts.
type Normalizer func(string) string
var (
	// normalizers is the chain of Normalizer applied to a license text. The
	// steps clean the text up without changing the meaning of the license,
	// following the matching guidelines at
	// https://spdx.dev/license-list/matching-guidelines.
	// The order matters.
	normalizers = []Normalizer{
		OneLineNormalizer,
		FlattenSpaceNormalizer,
		SubstantiveTextsNormalizer,
		FlattenSpaceNormalizer,
		strings.ToLower,
		strings.TrimSpace,
	}

	// commentIndicators matches code comment markers at the beginning of a line,
	// see "6. Code Comment Indicators" in
	// https://spdx.dev/license-list/matching-guidelines.
	//
	// CommentIndicatorNormalizer applies the patterns one after another, so the
	// order matters: a longer token must come before any pattern that matches a
	// prefix of it (e.g. "#}" before "#", "*/" and "*}" before "*", "-->" before
	// "--"), otherwise the prefix is stripped first and the rest of the token is
	// left behind in the text.
	commentIndicators = []*regexp.Regexp{
		regexp.MustCompile(`(?m)^\s*#+}`),      // #} (must precede #)
		regexp.MustCompile(`(?m)^\s*#+`),       // #
		regexp.MustCompile(`(?m)^\s*//+`),      // //
		regexp.MustCompile(`(?m)^\s*"""+`),     // """
		regexp.MustCompile(`(?m)^\s*\(\*+`),    // (*
		regexp.MustCompile(`(?m)^\s*;+`),       // ;
		regexp.MustCompile(`(?m)^\s*/\*+`),     // /*
		regexp.MustCompile(`(?m)^\s*\*+/`),     // */ (must precede *)
		regexp.MustCompile(`(?m)^\s*\*+}`),     // *} (must precede *)
		regexp.MustCompile(`(?m)^\s*\*+`),      // *
		regexp.MustCompile(`(?m)^\s*<!--+`),    // <!--
		regexp.MustCompile(`(?m)^\s*--+>`),     // --> (must precede --)
		regexp.MustCompile(`(?m)^\s*--+`),      // --
		regexp.MustCompile(`(?m)^\s*~+`),       // ~
		regexp.MustCompile(`(?m)^\s*{-+`),      // {-
		regexp.MustCompile(`(?m)^\s*-}+`),      // -}
		regexp.MustCompile(`(?m)^\s*::`),       // ::
		regexp.MustCompile(`(?m)^\s*\.\.`),     // ..
		regexp.MustCompile(`(?mi)^\s*@?REM\b`), // @REM; \b keeps words such as "remains" intact
		regexp.MustCompile(`(?mi)^\s*%+`),      // % e.g. matlab
		regexp.MustCompile(`(?m)^\s*{#+`),      // {#
		regexp.MustCompile(`(?m)^\s*{\*+`),     // {*
		regexp.MustCompile(`(?m)^\s*'+`),       // '
	}

	// flattenSpace matches a run of one or more whitespace characters.
	flattenSpace = regexp.MustCompile(`\s+`)

	// substitutableTexts lists the texts that can be replaced by an equivalent
	// canonical form (spelling variants, copyright symbols, quotes, well-known
	// license phrases). SubstantiveTextsNormalizer applies the entries in order.
	substitutableTexts = []struct {
		regex       *regexp.Regexp // text to look for
		replacement string         // replacement, may reference capture groups ($1, ...)
	}{
		// Equivalent words, normalized to a single spelling.
		{regexp.MustCompile(`(?i)\backnowledgement\b`), "acknowledgment"},
		{regexp.MustCompile(`(?i)\banalog\b`), "analogue"},
		{regexp.MustCompile(`(?i)\banalyze\b`), "analyse"},
		{regexp.MustCompile(`(?i)\bartifact\b`), "artefact"},
		{regexp.MustCompile(`(?i)\bauthorization\b`), "authorisation"},
		{regexp.MustCompile(`(?i)\bauthorized\b`), "authorised"},
		{regexp.MustCompile(`(?i)\bcaliber\b`), "calibre"},
		{regexp.MustCompile(`(?i)\bcanceled\b`), "cancelled"},
		{regexp.MustCompile(`(?i)\bcapitalizations\b`), "capitalisations"},
		{regexp.MustCompile(`(?i)\bcatalog\b`), "catalogue"},
		{regexp.MustCompile(`(?i)\bcategorize\b`), "categorise"},
		{regexp.MustCompile(`(?i)\bcenter\b`), "centre"},
		{regexp.MustCompile(`(?i)\bcopyright holder\b`), "copyright owner"},
		{regexp.MustCompile(`(?i)\bemphasized\b`), "emphasised"},
		{regexp.MustCompile(`(?i)\bfavor\b`), "favour"},
		{regexp.MustCompile(`(?i)\bfavorite\b`), "favourite"},
		{regexp.MustCompile(`(?i)\bfulfill\b`), "fulfil"},
		{regexp.MustCompile(`(?i)\bfulfillment\b`), "fulfilment"},
		{regexp.MustCompile(`(?i)\binitialize\b`), "initialise"},
		{regexp.MustCompile(`(?i)\bjudgement\b`), "judgment"},
		{regexp.MustCompile(`(?i)\blabeling\b`), "labelling"},
		{regexp.MustCompile(`(?i)\blabor\b`), "labour"},
		{regexp.MustCompile(`(?i)\blicence\b`), "license"},
		{regexp.MustCompile(`(?i)\bmaximize\b`), "maximise"},
		{regexp.MustCompile(`(?i)\bmodeled\b`), "modelled"},
		{regexp.MustCompile(`(?i)\bmodeling\b`), "modelling"},
		{regexp.MustCompile(`(?i)\bnoncommercial\b`), "non-commercial"},
		{regexp.MustCompile(`(?i)\boffense\b`), "offence"},
		{regexp.MustCompile(`(?i)\boptimize\b`), "optimise"},
		{regexp.MustCompile(`(?i)\borganization\b`), "organisation"},
		{regexp.MustCompile(`(?i)\borganize\b`), "organise"},
		{regexp.MustCompile(`(?i)\bpercent\b`), "per cent"},
		{regexp.MustCompile(`(?i)\bpractice\b`), "practise"},
		{regexp.MustCompile(`(?i)\bprogram\b`), "programme"},
		{regexp.MustCompile(`(?i)\brealize\b`), "realise"},
		{regexp.MustCompile(`(?i)\brecognize\b`), "recognise"},
		{regexp.MustCompile(`(?i)\bsignaling\b`), "signalling"},
		{regexp.MustCompile(`(?i)\bsub licen[sc]e\b`), "sublicense"},
		{regexp.MustCompile(`(?i)\bsub-licen[sc]e\b`), "sublicense"},
		{regexp.MustCompile(`(?i)\butilization\b`), "utilisation"},
		{regexp.MustCompile(`(?i)\bwhile\b`), "whilst"},
		{regexp.MustCompile(`(?i)\bwilfull\b`), "wilful"},

		// Copyright symbols.
		{regexp.MustCompile(`©`), "Copyright "},
		{regexp.MustCompile(`\(([cC])\)`), "Copyright "},

		// URL scheme.
		{regexp.MustCompile(`\bhttps://`), "http://"},

		// Every kind (and any run) of quote becomes one single quote.
		{regexp.MustCompile(`“+`), `'`},
		{regexp.MustCompile(`”+`), `'`},
		{regexp.MustCompile(`’+`), "'"},
		{regexp.MustCompile("`+"), "'"},
		{regexp.MustCompile(`"+`), "'"},
		{regexp.MustCompile(`'+`), "'"},

		{regexp.MustCompile(`(?i)\b(the )?Apache Software Foundation( \(ASF\))?`), "the ASF"},

		// Prettier chars
		{regexp.MustCompile(`[-=*]{3,}`), ""},

		// Mozilla Public License, Version 2.0
		// Mozilla Public License Version 2.0
		{
			regexp.MustCompile(`(?i)Mozilla Public License version 2\.0`),
			"Mozilla Public License, Version 2.0",
		},
		// Mozilla Public License, v. 2.0
		// ...
		{
			regexp.MustCompile(`(?i)Mozilla Public License,? v\. ?2\.0`),
			"Mozilla Public License, v. 2.0",
		},
		{
			regexp.MustCompile(`(?i)IN NO EVENT SHALL (.+?) BE LIABLE`),
			"IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE",
		},
		{
			regexp.MustCompile(`(?i)The names of (its|the) contributors may not be used to endorse`),
			"Neither the name of the copyright holder nor the names of its contributors may be used to endorse",
		},
		{
			regexp.MustCompile(`(?i)The name (.+?) may not be used to endorse`),
			"Neither the name of the copyright holder nor the names of its contributors may be used to endorse",
		},
		{
			regexp.MustCompile(`(?i)(neither the name of) (.+?) (nor the names of)`),
			"$1 the copyright holder $3",
		},
		{
			regexp.MustCompile(`(?i)you may not use this (file|library) except`),
			"you may not use this file except",
		},
		{
			regexp.MustCompile(`(?i)THIS SOFTWARE IS PROVIDED BY (.+?)'AS IS'`),
			`THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'`,
		},
		{
			regexp.MustCompile(`(?im)\(including the next paragraph\)`),
			"",
		},
	}

	// lineProcessors lists the line-oriented clean-ups that OneLineNormalizer
	// applies in order. The last entry joins all the lines into a single one, so
	// every pattern that relies on line anchors has to come before it.
	lineProcessors = []struct {
		regexp      *regexp.Regexp // text to look for
		replacement string         // replacement for every match
	}{
		// BSD-3-Clause
		// MIT
		{ // remove optional header
			regexp.MustCompile(`(?im)^\s*\(?(The )?MIT License( \((MIT|Expat)\))?\)?$`),
			"",
		},
		// ISC
		{ // remove optional header
			regexp.MustCompile(`(?im)^\s*(The )?ISC License:?$`),
			"",
		},
		// leading chars such as >, * just for pretty printing
		{
			regexp.MustCompile(`(?m)^[>*]\s+`),
			" ",
		},
		// Listing bullets such as a., b., 1., 2.
		{
			regexp.MustCompile(`(?m)^\s*[a-z0-9]\. `),
			" ",
		},
		// Listing bullets such as (a), (b), (1), (2)
		{
			regexp.MustCompile(`(?m)^\s*\([a-z0-9]\) `),
			" ",
		},
		// trailing * just for pretty printing
		{
			regexp.MustCompile(`(?m)\s+[*]$`),
			" ",
		},
		// Copyright (c) .....
		{
			regexp.MustCompile(`(?m)^\s*Copyright (\([cC©]\))?.+$`),
			"",
		},
		// This must be the last processor: it merges all the lines into one.
		{
			regexp.MustCompile("[\n\r]+"),
			" ",
		},
	}
)
// NormalizePattern cleans up a license pattern for identification by passing it
// through every entry of normalizers, in order, and returns the result.
func NormalizePattern(pattern string) string {
	normalized := pattern
	for i := 0; i < len(normalizers); i++ {
		normalized = normalizers[i](normalized)
	}
	return normalized
}
// NormalizeHeader cleans up a file header for identification. It runs
// CommentIndicatorNormalizer first and then every entry of normalizers, in
// order. Before each step a debug message with the normalizer's function name
// is logged, and after the step the intermediate text is logged.
func NormalizeHeader(header string) string {
	chain := make([]Normalizer, 0, len(normalizers)+1)
	chain = append(chain, CommentIndicatorNormalizer)
	chain = append(chain, normalizers...)

	for i := range chain {
		step := chain[i]
		// Resolve the function's name from its code pointer, for the log only.
		name := runtime.FuncForPC(reflect.ValueOf(step).Pointer()).Name()
		logger.Log.Debugf("After normalized by %+v:", name)
		header = step(header)
		logger.Log.Debugln(header)
	}
	return header
}
// Normalize applies a chain of Normalizers to the license text to make it cleaner for identification.
func Normalize(license string) string {
ns := append([]Normalizer{CommentIndicatorNormalizer}, normalizers...)
for _, normalize := range ns {
license = normalize(license)
}
return license
}
// OneLineNormalizer runs every entry of lineProcessors over the text, in order,
// replacing all matches of the entry's pattern with its replacement. The
// processors clean the text up line by line and finally merge the lines into
// one.
func OneLineNormalizer(text string) string {
	for i := range lineProcessors {
		processor := lineProcessors[i]
		text = processor.regexp.ReplaceAllString(text, processor.replacement)
	}
	return text
}
// SubstantiveTextsNormalizer normalizes the license text by applying every
// entry of substitutableTexts, in order: all matches of the entry's pattern are
// replaced with its replacement. The substitutions are meant not to change the
// meaning of the license.
func SubstantiveTextsNormalizer(text string) string {
	result := text
	for idx := 0; idx < len(substitutableTexts); idx++ {
		substitution := substitutableTexts[idx]
		result = substitution.regex.ReplaceAllString(result, substitution.replacement)
	}
	return result
}
// CommentIndicatorNormalizer trims the leading characters of comments, such as
// /*, <!--, --, (*, etc., by removing every match of each pattern in
// commentIndicators, one pattern after another.
func CommentIndicatorNormalizer(text string) string {
	stripped := text
	for i := range commentIndicators {
		stripped = commentIndicators[i].ReplaceAllString(stripped, "")
	}
	return stripped
}
// FlattenSpaceNormalizer collapses the text's whitespace: every match of
// flattenSpace is replaced with a single space.
func FlattenSpaceNormalizer(text string) string {
	const singleSpace = " "
	flattened := flattenSpace.ReplaceAllString(text, singleSpace)
	return flattened
}