pkg/license/norm.go - skywalking-eyes - Git at Google

 //
 // Licensed to Apache Software Foundation (ASF) under one or more contributor
 // license agreements. See the NOTICE file distributed with
 // this work for additional information regarding copyright
 // ownership. Apache Software Foundation (ASF) licenses this file to you under
 // the Apache License, Version 2.0 (the "License"); you may
 // not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
 //
 package license

 import (
 	"regexp"
 	"strings"
 )

 type Normalizer func(string) string

 var (
 	// normalizers is a list of Normalizer that can be applied to the license text, yet doesn't change the license's
 	// meanings, according to the matching guide in https://spdx.dev/license-list/matching-guidelines.
 	// The order matters.
 	normalizers = []Normalizer{
 		VariablesNormalizer,
 		OneLineNormalizer,
 		FlattenSpaceNormalizer,
 		SubstantiveTextsNormalizer,
 		FlattenSpaceNormalizer,
 		strings.ToLower,
 		strings.TrimSpace,
 	}

 	// 6. Code Comment Indicators (https://spdx.dev/license-list/matching-guidelines.)
 	commentIndicators = []*regexp.Regexp{
 		regexp.MustCompile(`(?m)^\s*#+`),    // #
 		regexp.MustCompile(`(?m)^\s*//+`),   // //
 		regexp.MustCompile(`(?m)^\s*"""+`),  // """
 		regexp.MustCompile(`(?m)^\s*\(\*+`), // (*

 		regexp.MustCompile(`(?m)^\s*/\*+`), // /*
 		regexp.MustCompile(`(?m)^\s*\*+/`), //  */
 		regexp.MustCompile(`(?m)^\s*\*+`),  //  *

 		regexp.MustCompile(`(?m)^\s*<!--+`), // <!--
 		regexp.MustCompile(`(?m)^\s*--+>`),  // -->
 		regexp.MustCompile(`(?m)^\s*--+`),   // --
 		regexp.MustCompile(`(?m)^\s*~+`),    //   ~

 		regexp.MustCompile(`(?m)^\s*{-+`), // {-
 		regexp.MustCompile(`(?m)^\s*-}+`), // -}

 		regexp.MustCompile(`(?m)^\s*::`),     // ::
 		regexp.MustCompile(`(?mi)^\s*@?REM`), // @REM
 	}

 	flattenSpace = regexp.MustCompile(`\s+`)

 	substitutableTexts = []struct {
 		regex       *regexp.Regexp
 		replacement string
 	}{
 		{regexp.MustCompile(`(?i)\backnowledgement\b`), "acknowledgment"},
 		{regexp.MustCompile(`(?i)\banalog\b`), "analogue"},
 		{regexp.MustCompile(`(?i)\banalyze\b`), "analyse"},
 		{regexp.MustCompile(`(?i)\bartifact\b`), "artefact"},
 		{regexp.MustCompile(`(?i)\bauthorization\b`), "authorisation"},
 		{regexp.MustCompile(`(?i)\bauthorized\b`), "authorised"},
 		{regexp.MustCompile(`(?i)\bcaliber\b`), "calibre"},
 		{regexp.MustCompile(`(?i)\bcanceled\b`), "cancelled"},
 		{regexp.MustCompile(`(?i)\bcapitalizations\b`), "capitalisations"},
 		{regexp.MustCompile(`(?i)\bcatalog\b`), "catalogue"},
 		{regexp.MustCompile(`(?i)\bcategorize\b`), "categorise"},
 		{regexp.MustCompile(`(?i)\bcenter\b`), "centre"},
 		{regexp.MustCompile(`(?i)\bcopyright holder\b`), "copyright owner"},
 		{regexp.MustCompile(`(?i)\bemphasized\b`), "emphasised"},
 		{regexp.MustCompile(`(?i)\bfavor\b`), "favour"},
 		{regexp.MustCompile(`(?i)\bfavorite\b`), "favourite"},
 		{regexp.MustCompile(`(?i)\bfulfill\b`), "fulfil"},
 		{regexp.MustCompile(`(?i)\bfulfillment\b`), "fulfilment"},
 		{regexp.MustCompile(`(?i)\binitialize\b`), "initialise"},
 		{regexp.MustCompile(`(?i)\bjudgement\b`), "judgment"},
 		{regexp.MustCompile(`(?i)\blabeling\b`), "labelling"},
 		{regexp.MustCompile(`(?i)\blabor\b`), "labour"},
 		{regexp.MustCompile(`(?i)\blicense\b`), "licence"},
 		{regexp.MustCompile(`(?i)\bmaximize\b`), "maximise"},
 		{regexp.MustCompile(`(?i)\bmodeled\b`), "modelled"},
 		{regexp.MustCompile(`(?i)\bmodeling\b`), "modelling"},
 		{regexp.MustCompile(`(?i)\bnoncommercial\b`), "non-commercial"},
 		{regexp.MustCompile(`(?i)\boffense\b`), "offence"},
 		{regexp.MustCompile(`(?i)\boptimize\b`), "optimise"},
 		{regexp.MustCompile(`(?i)\borganization\b`), "organisation"},
 		{regexp.MustCompile(`(?i)\borganize\b`), "organise"},
 		{regexp.MustCompile(`(?i)\bpercent\b`), "per cent"},
 		{regexp.MustCompile(`(?i)\bpractice\b`), "practise"},
 		{regexp.MustCompile(`(?i)\bprogram\b`), "programme"},
 		{regexp.MustCompile(`(?i)\brealize\b`), "realise"},
 		{regexp.MustCompile(`(?i)\brecognize\b`), "recognise"},
 		{regexp.MustCompile(`(?i)\bsignaling\b`), "signalling"},
 		{regexp.MustCompile(`(?i)\bsublicense\b`), "sub-license"},
 		{regexp.MustCompile(`(?i)\bsub-license\b`), "sub license"},
 		{regexp.MustCompile(`(?i)\bsublicense\b`), "sub license"},
 		{regexp.MustCompile(`(?i)\butilization\b`), "utilisation"},
 		{regexp.MustCompile(`(?i)\bwhile\b`), "whilst"},
 		{regexp.MustCompile(`(?i)\bwilfull\b`), "wilful"},

 		{regexp.MustCompile(`©`), "Copyright "},
 		{regexp.MustCompile(`\(c\)`), "Copyright "},
 		{regexp.MustCompile(`\bhttps://`), "http://"},

 		{regexp.MustCompile(`(?i)\b(the )?Apache Software Foundation( \(ASF\))?`), "the ASF"},
 	}

 	variables = []struct {
 		regexp      *regexp.Regexp
 		replacement string
 	}{
 		// BSD-3-Clause
 		{
 			regexp.MustCompile(`(?im)(^(Copyright \(c\)) (\d{4}) (.+?) (All rights reserved\.)?$\n?)+`),
 			"$2 [year] [owner]. $5",
 		},
 		{
 			regexp.MustCompile(`(?i)(neither the name of) (.+?) (nor the names of)`),
 			"$1 the copyright holder $3",
 		},
 		// MIT
 		{ // remove optional header
 			regexp.MustCompile(`(?im)^The MIT License \(MIT\)$`),
 			"",
 		},
 		{
 			regexp.MustCompile(`(?im)^(Copyright \(c\)) (\d{4}) (.+?)$`),
 			"$1 [year] [owner]",
 		},
 		{
 			regexp.MustCompile(`(?im)\(including the next paragraph\)`),
 			"",
 		},
 	}
 )

 // NormalizePattern applies a chain of Normalizers to the license pattern to make it cleaner for identification.
 func NormalizePattern(pattern string) string {
 	for _, normalize := range normalizers {
 		pattern = normalize(pattern)
 	}
 	return pattern
 }

 // NormalizeHeader applies a chain of Normalizers to the file header to make it cleaner for identification.
 func NormalizeHeader(header string) string {
 	ns := append([]Normalizer{CommentIndicatorNormalizer}, normalizers...)
 	for _, normalize := range ns {
 		header = normalize(header)
 	}
 	return header
 }

 // Normalize applies a chain of Normalizers to the license text to make it cleaner for identification.
 func Normalize(license string) string {
 	for _, normalize := range normalizers {
 		license = normalize(license)
 	}
 	return license
 }

 // OneLineNormalizer simply removes all line breaks to flatten the license text into one line.
 func OneLineNormalizer(text string) string {
 	return regexp.MustCompile("[\n\r]+").ReplaceAllString(text, " ")
 }

 // SubstantiveTextsNormalizer normalizes the license text by substituting some words that
 // doesn't change the meaning of the license.
 func SubstantiveTextsNormalizer(text string) string {
 	for _, s := range substitutableTexts {
 		text = s.regex.ReplaceAllString(text, s.replacement)
 	}
 	return text
 }

 // CommentIndicatorNormalizer trims the leading characters of comments, such as /*, <!--, --, (*, etc..
 func CommentIndicatorNormalizer(text string) string {
 	for _, leadingChars := range commentIndicators {
 		text = leadingChars.ReplaceAllString(text, "")
 	}
 	return text
 }

 // FlattenSpaceNormalizer flattens continuous spaces into a single space.
 func FlattenSpaceNormalizer(text string) string {
 	return flattenSpace.ReplaceAllString(text, " ")
 }

 // VariablesNormalizer replace the variables actual value into variable name.
 func VariablesNormalizer(text string) string {
 	for _, v := range variables {
 		text = v.regexp.ReplaceAllString(text, v.replacement)
 	}

 	return text
 }
	//
	// Licensed to Apache Software Foundation (ASF) under one or more contributor
	// license agreements. See the NOTICE file distributed with
	// this work for additional information regarding copyright
	// ownership. Apache Software Foundation (ASF) licenses this file to you under
	// the Apache License, Version 2.0 (the "License"); you may
	// not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.
	//
	package license

	import (
	"regexp"
	"strings"
	)

	type Normalizer func(string) string

	var (
	// normalizers is a list of Normalizer that can be applied to the license text, yet doesn't change the license's
	// meanings, according to the matching guide in https://spdx.dev/license-list/matching-guidelines.
	// The order matters.
	normalizers = []Normalizer{
	VariablesNormalizer,
	OneLineNormalizer,
	FlattenSpaceNormalizer,
	SubstantiveTextsNormalizer,
	FlattenSpaceNormalizer,
	strings.ToLower,
	strings.TrimSpace,
	}

	// 6. Code Comment Indicators (https://spdx.dev/license-list/matching-guidelines.)
	commentIndicators = []*regexp.Regexp{
	regexp.MustCompile(`(?m)^\s*#+`), // #
	regexp.MustCompile(`(?m)^\s*//+`), // //
	regexp.MustCompile(`(?m)^\s*"""+`), // """
	regexp.MustCompile(`(?m)^\s\(\+`), // (*

	regexp.MustCompile(`(?m)^\s/\+`), // /*
	regexp.MustCompile(`(?m)^\s\+/`), // */
	regexp.MustCompile(`(?m)^\s\+`), // *

	regexp.MustCompile(`(?m)^\s*<!--+`), // <!--
	regexp.MustCompile(`(?m)^\s*--+>`), // -->
	regexp.MustCompile(`(?m)^\s*--+`), // --
	regexp.MustCompile(`(?m)^\s*~+`), // ~

	regexp.MustCompile(`(?m)^\s*{-+`), // {-
	regexp.MustCompile(`(?m)^\s*-}+`), // -}

	regexp.MustCompile(`(?m)^\s*::`), // ::
	regexp.MustCompile(`(?mi)^\s*@?REM`), // @REM
	}

	flattenSpace = regexp.MustCompile(`\s+`)

	substitutableTexts = []struct {
	regex *regexp.Regexp
	replacement string
	}{
	{regexp.MustCompile(`(?i)\backnowledgement\b`), "acknowledgment"},
	{regexp.MustCompile(`(?i)\banalog\b`), "analogue"},
	{regexp.MustCompile(`(?i)\banalyze\b`), "analyse"},
	{regexp.MustCompile(`(?i)\bartifact\b`), "artefact"},
	{regexp.MustCompile(`(?i)\bauthorization\b`), "authorisation"},
	{regexp.MustCompile(`(?i)\bauthorized\b`), "authorised"},
	{regexp.MustCompile(`(?i)\bcaliber\b`), "calibre"},
	{regexp.MustCompile(`(?i)\bcanceled\b`), "cancelled"},
	{regexp.MustCompile(`(?i)\bcapitalizations\b`), "capitalisations"},
	{regexp.MustCompile(`(?i)\bcatalog\b`), "catalogue"},
	{regexp.MustCompile(`(?i)\bcategorize\b`), "categorise"},
	{regexp.MustCompile(`(?i)\bcenter\b`), "centre"},
	{regexp.MustCompile(`(?i)\bcopyright holder\b`), "copyright owner"},
	{regexp.MustCompile(`(?i)\bemphasized\b`), "emphasised"},
	{regexp.MustCompile(`(?i)\bfavor\b`), "favour"},
	{regexp.MustCompile(`(?i)\bfavorite\b`), "favourite"},
	{regexp.MustCompile(`(?i)\bfulfill\b`), "fulfil"},
	{regexp.MustCompile(`(?i)\bfulfillment\b`), "fulfilment"},
	{regexp.MustCompile(`(?i)\binitialize\b`), "initialise"},
	{regexp.MustCompile(`(?i)\bjudgement\b`), "judgment"},
	{regexp.MustCompile(`(?i)\blabeling\b`), "labelling"},
	{regexp.MustCompile(`(?i)\blabor\b`), "labour"},
	{regexp.MustCompile(`(?i)\blicense\b`), "licence"},
	{regexp.MustCompile(`(?i)\bmaximize\b`), "maximise"},
	{regexp.MustCompile(`(?i)\bmodeled\b`), "modelled"},
	{regexp.MustCompile(`(?i)\bmodeling\b`), "modelling"},
	{regexp.MustCompile(`(?i)\bnoncommercial\b`), "non-commercial"},
	{regexp.MustCompile(`(?i)\boffense\b`), "offence"},
	{regexp.MustCompile(`(?i)\boptimize\b`), "optimise"},
	{regexp.MustCompile(`(?i)\borganization\b`), "organisation"},
	{regexp.MustCompile(`(?i)\borganize\b`), "organise"},
	{regexp.MustCompile(`(?i)\bpercent\b`), "per cent"},
	{regexp.MustCompile(`(?i)\bpractice\b`), "practise"},
	{regexp.MustCompile(`(?i)\bprogram\b`), "programme"},
	{regexp.MustCompile(`(?i)\brealize\b`), "realise"},
	{regexp.MustCompile(`(?i)\brecognize\b`), "recognise"},
	{regexp.MustCompile(`(?i)\bsignaling\b`), "signalling"},
	{regexp.MustCompile(`(?i)\bsublicense\b`), "sub-license"},
	{regexp.MustCompile(`(?i)\bsub-license\b`), "sub license"},
	{regexp.MustCompile(`(?i)\bsublicense\b`), "sub license"},
	{regexp.MustCompile(`(?i)\butilization\b`), "utilisation"},
	{regexp.MustCompile(`(?i)\bwhile\b`), "whilst"},
	{regexp.MustCompile(`(?i)\bwilfull\b`), "wilful"},

	{regexp.MustCompile(`©`), "Copyright "},
	{regexp.MustCompile(`\(c\)`), "Copyright "},
	{regexp.MustCompile(`\bhttps://`), "http://"},

	{regexp.MustCompile(`(?i)\b(the )?Apache Software Foundation( \(ASF\))?`), "the ASF"},
	}

	variables = []struct {
	regexp *regexp.Regexp
	replacement string
	}{
	// BSD-3-Clause
	{
	regexp.MustCompile(`(?im)(^(Copyright \(c\)) (\d{4}) (.+?) (All rights reserved\.)?$\n?)+`),
	"$2 [year] [owner]. $5",
	},
	{
	regexp.MustCompile(`(?i)(neither the name of) (.+?) (nor the names of)`),
	"$1 the copyright holder $3",
	},
	// MIT
	{ // remove optional header
	regexp.MustCompile(`(?im)^The MIT License \(MIT\)$`),
	"",
	},
	{
	regexp.MustCompile(`(?im)^(Copyright \(c\)) (\d{4}) (.+?)$`),
	"$1 [year] [owner]",
	},
	{
	regexp.MustCompile(`(?im)\(including the next paragraph\)`),
	"",
	},
	}
	)

	// NormalizePattern applies a chain of Normalizers to the license pattern to make it cleaner for identification.
	func NormalizePattern(pattern string) string {
	for _, normalize := range normalizers {
	pattern = normalize(pattern)
	}
	return pattern
	}

	// NormalizeHeader applies a chain of Normalizers to the file header to make it cleaner for identification.
	func NormalizeHeader(header string) string {
	ns := append([]Normalizer{CommentIndicatorNormalizer}, normalizers...)
	for _, normalize := range ns {
	header = normalize(header)
	}
	return header
	}

	// Normalize applies a chain of Normalizers to the license text to make it cleaner for identification.
	func Normalize(license string) string {
	for _, normalize := range normalizers {
	license = normalize(license)
	}
	return license
	}

	// OneLineNormalizer simply removes all line breaks to flatten the license text into one line.
	func OneLineNormalizer(text string) string {
	return regexp.MustCompile("[\n\r]+").ReplaceAllString(text, " ")
	}

	// SubstantiveTextsNormalizer normalizes the license text by substituting some words that
	// doesn't change the meaning of the license.
	func SubstantiveTextsNormalizer(text string) string {
	for _, s := range substitutableTexts {
	text = s.regex.ReplaceAllString(text, s.replacement)
	}
	return text
	}

	// CommentIndicatorNormalizer trims the leading characters of comments, such as /, <!--, --, (, etc..
	func CommentIndicatorNormalizer(text string) string {
	for _, leadingChars := range commentIndicators {
	text = leadingChars.ReplaceAllString(text, "")
	}
	return text
	}

	// FlattenSpaceNormalizer flattens continuous spaces into a single space.
	func FlattenSpaceNormalizer(text string) string {
	return flattenSpace.ReplaceAllString(text, " ")
	}

	// VariablesNormalizer replace the variables actual value into variable name.
	func VariablesNormalizer(text string) string {
	for _, v := range variables {
	text = v.regexp.ReplaceAllString(text, v.replacement)
	}

	return text
	}