blob: a21f95c3e4ec1d9327aae14f3d565b5c10ec5343 [file] [log] [blame]
#!/usr/bin/perl -w
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#=========================================================
#
# For files that do not yet have an Apache License, insert the 2.0 license.
# Adds comment markers for the relevant file type.
#
# This can also be used to provide a summary of the current situation.
# It will detect the presence of various different license headers.
# Use the -p option for practice mode.
#
# Limitations:
# - Only developed and tested for certain file types. Others will be
# reported and skipped.
# Needs tweaks for other types (see "configuration" section below).
# - Only inserts missing licenses and detects and reports other license types.
# See ./update-AL20.pl to update to the current license style.
#
# Caveats:
# - As usual, make a backup of your tree first or be prepared to 'svn revert -R'
# your working copy if the script stuffs up.
#
# WARNING: Be sure to look at the output of this script for warnings.
# WARNING: Be sure to do the normal 'svn diff' and review.
# Attend to the warning in tools/copy2license.pl about "collective copyright".
#
# Developed only for UNIX, YMMV.
#
# Procedure:
# See ./relicense.txt for an example procedure.
# Use -p for practise mode.
# Run the script. It will descend the directory tree.
# Run with no parameters or -h to show usage.
#
#=========================================================
use strict;
use vars qw($opt_h $opt_p);
use Getopt::Std;
use File::Basename;
use File::Find;
#--------------------------------------------------
# ensure proper usage
getopts("hp");
if ((scalar @ARGV < 1) || defined($opt_h)) {
ShowUsage();
exit;
}
my $startDir = shift;
my $avoidList = shift;
if (!-e $startDir) {
print STDERR qq!
The start directory '$startDir' does not exist.
!;
ShowUsage();
exit;
}
if (defined($avoidList) && !-e $avoidList) {
print STDERR qq!
The list of files to avoid '$avoidList' does not exist.
!;
ShowUsage();
exit;
}
if ($opt_p) { print STDERR "\nDoing practice run. No files will be written\n"; }
print qq!
AL-20 = Apache License 2.0 with original Copyright line.
AL-20a = Apache License 2.0 with original Copyright line and "or its licensors".
AL-20b = Apache License 2.0 with no Copyright line, i.e. the current style.
----------------------
!;
#--------------------------------------------------
# do some configuration
my $license = qq!Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
!;
my @license = split(/\n/, $license);
# build a hash of filename extensions to be processed
# together with the particular style of comment marker to use.
my @xmlFileTypes = (
".xml", ".xsl", ".xslt", ".xmap", ".xcat",
".xconf", ".xroles", ".roles", ".xsp", ".rss",
".xinfo", ".xprofile", ".xsamples", ".xtest", ".xweb", ".xwelcome",
".samplesxconf", ".samplesxpipe", ".svg", ".xhtml", ".xhtml2", ".gt", ".jx", ".jmx",
".jdo", ".orm", ".jdoquery", ".jelly",
".jxt", ".meta", ".pagesheet", ".stx", ".xegrm", ".xgrm", ".xlex", ".xmi",
".xsd", ".rng", ".rdf", ".rdfs", ".xul", ".tld", ".xxe", ".ft", ".fv",
".wsdd", ".wsdl", ".xlog", ".pom", ".owl",
);
my @sgmlFileTypes = (
".dtd", ".mod", ".sgml", ".sgm",
);
my @htmlFileTypes = (
".html", ".htm", ".jsp", ".ihtml",
);
my @freemarkerFileTypes = (
".ftl",
);
my @cFileTypes = (
".java", ".js", ".c", ".h", ".cpp", ".cc", ".cs", ".css", ".egrm", ".grm",
".javascript", ".jj", ".gy", ".g",
);
my @shFileTypes = (
".sh", ".ccf", ".pl", ".py", ".sed", ".awk",
);
my @propertiesFileTypes = (
".properties", ".rnc", ".rnx", ".prefs", ".rb", ".handlers", ".schemas",
);
my @dosFileTypes = (
".bat", ".cmd",
);
my @sqlFileTypes = (
".script", ".sql",
);
my @vmFileTypes = (
".vm",
);
my @ignoreFileTypes = (
".txt", ".dcl", ".ent", ".pen", ".project"
);
my (%fileTypes, $fileType);
foreach $fileType (@xmlFileTypes) {
$fileTypes{$fileType}{type} = "xml";
$fileTypes{$fileType}{openComment} = "<!--\n";
$fileTypes{$fileType}{leaderComment} = " ";
$fileTypes{$fileType}{closeComment} = "-->\n";
# insert after line 1 which must be the xml declaration
$fileTypes{$fileType}{insertionPoint} = "1";
}
foreach $fileType (@sgmlFileTypes) {
$fileTypes{$fileType}{type} = "sgml";
$fileTypes{$fileType}{openComment} = "<!--\n";
$fileTypes{$fileType}{leaderComment} = " ";
$fileTypes{$fileType}{closeComment} = "-->\n";
# insert at very top of file
$fileTypes{$fileType}{insertionPoint} = "0";
}
foreach $fileType (@htmlFileTypes) {
$fileTypes{$fileType}{type} = "html";
$fileTypes{$fileType}{openComment} = "<!--\n";
$fileTypes{$fileType}{leaderComment} = " ";
$fileTypes{$fileType}{closeComment} = "-->\n";
# insert at very top of file
$fileTypes{$fileType}{insertionPoint} = "0";
}
foreach $fileType (@freemarkerFileTypes) {
$fileTypes{$fileType}{type} = "html";
$fileTypes{$fileType}{openComment} = "<#--\n";
$fileTypes{$fileType}{leaderComment} = " ";
$fileTypes{$fileType}{closeComment} = "-->\n";
# insert at very top of file
$fileTypes{$fileType}{insertionPoint} = "0";
}
foreach $fileType (@cFileTypes) {
$fileTypes{$fileType}{type} = "C";
$fileTypes{$fileType}{openComment} = "/*\n";
$fileTypes{$fileType}{leaderComment} = "* ";
$fileTypes{$fileType}{closeComment} = "*/\n";
# insert at very top of file
$fileTypes{$fileType}{insertionPoint} = "0";
}
foreach $fileType (@shFileTypes) {
$fileTypes{$fileType}{type} = "sh";
$fileTypes{$fileType}{openComment} = "\n";
$fileTypes{$fileType}{leaderComment} = "# ";
$fileTypes{$fileType}{closeComment} = "\n";
# insert after line 1 which must be #! script invocation
$fileTypes{$fileType}{insertionPoint} = "1";
}
foreach $fileType (@propertiesFileTypes) {
$fileTypes{$fileType}{type} = "properties";
$fileTypes{$fileType}{openComment} = "";
$fileTypes{$fileType}{leaderComment} = "# ";
$fileTypes{$fileType}{closeComment} = "\n";
# insert at very top of file
$fileTypes{$fileType}{insertionPoint} = "0";
}
foreach $fileType (@dosFileTypes) {
$fileTypes{$fileType}{type} = "dos";
$fileTypes{$fileType}{openComment} = "\@echo off\n";
$fileTypes{$fileType}{leaderComment} = "rem ";
$fileTypes{$fileType}{closeComment} = "\n";
# insert at very top of file
$fileTypes{$fileType}{insertionPoint} = "0";
}
foreach $fileType (@sqlFileTypes) {
$fileTypes{$fileType}{type} = "sql";
$fileTypes{$fileType}{openComment} = "";
$fileTypes{$fileType}{leaderComment} = "-- ";
$fileTypes{$fileType}{closeComment} = "\n";
# insert at very top of file
$fileTypes{$fileType}{insertionPoint} = "0";
}
foreach $fileType (@vmFileTypes) {
$fileTypes{$fileType}{type} = "vm";
$fileTypes{$fileType}{openComment} = "#*\n";
$fileTypes{$fileType}{leaderComment} = " ";
$fileTypes{$fileType}{closeComment} = "*#\n";
# insert after line 1 which must be the xml declaration
$fileTypes{$fileType}{insertionPoint} = "1";
}
my ($countTotal, $countUnknownType, $countIgnoreType) = (0, 0, 0);
my ($countXmlDeclMissing, $countInserted, $countAvoid) = (0, 0, 0);
my ($countLicense, $countLicense10, $countLicense11, $countLicense12) = (0, 0, 0, 0);
my ($countLicensePD, $countLicenseOther) = (0, 0);
my ($countLicense20, $countLicense20a, $countLicense20b) = (0, 0, 0);
# 3rdParty users of an Apache License
my ($countLicenseF20, $countLicenseF11, $countLicenseF12) = (0, 0, 0);
my $dualLicensesDetected = 0;
my %uniqueSuffixes;
my @avoidList;
# read the avoidList
if (defined($avoidList)) {
open(INPUT, "<$avoidList") or die "Could not open input file '$avoidList': $!";
while (<INPUT>) {
next if (/^#/);
chomp;
push(@avoidList, $_);
}
close INPUT;
}
#--------------------------------------------------
sub process_file {
return unless -f && -T; # process only text files
my $fileName = $File::Find::name;
my ($file, $dir, $ext) = fileparse($fileName, qr/\.[^.]*/);
return if ($dir =~ /\/CVS\//); # skip CVS directories
return if ($dir =~ /\/\.svn\//); # skip SVN directories
return if ($fileName =~ /.cvsignore/); # skip
return if ($file =~ /^\./); # skip hidden files
foreach my $avoidFn (@avoidList) {
if ($fileName =~ /$avoidFn/) {
$countAvoid++;
return;
}
}
$countTotal++;
if ($ext eq "") { $ext = "NoExtension"; }
$uniqueSuffixes{$ext}++;
print "$fileName, ";
my $tmpFile = $fileName . ".tmp";
open(INPUT, "<$fileName") or die "Could not open input file '$fileName': $!";
# First do some tests on the file to ensure it does not already have a license
# and ensure that XML files have an xml declaration.
my ($existsLicense, $warnDualLicense, $existsXmlDecl) = (0, 0, 0);
my ($warnAL20OldLicense) = 0;
my ($warnAL20aOldLicense) = 0;
my $licenseType = "";
undef $/; # slurp the whole file
my $content = <INPUT>;
# we want our matches to happen only in the top part of the file
# NOTE: You may want to relax this from time-to-time to find
# all possible dual-license issues.
my $headContent = substr($content, 0, 1500);
$headContent =~ s/[ \t]+/ /g;
# detect various existing licenses
LICENSE_CASE: {
if ($headContent =~ /Licensed to the Apache Software Foundation \(ASF\) under/) {
$existsLicense = 1; $countLicense++;
$countLicense20b++; $licenseType = "AL-20b";
last LICENSE_CASE;
}
if ($headContent =~ /Licensed under the Apache License.*Version 2.0/) {
$existsLicense = 1; $countLicense++;
if ($headContent =~ /Apache Software Foundation or its licensors/) {
$countLicense20a++; $licenseType = "AL-20a";
$warnAL20aOldLicense = 1;
}
else {
if ($headContent =~ /Copyright.*Apache Software Foundation/) {
$countLicense20++; $licenseType = "AL-20";
$warnAL20OldLicense = 1;
}
else {
$countLicenseF20++; $licenseType = "F-AL-20";
}
}
last LICENSE_CASE;
}
if ($headContent =~ /The Apache Software License.*Version 1.2/) {
$existsLicense = 1; $countLicense++;
if ($headContent =~ /Copyright.*Apache Software Foundation/) {
$countLicense12++; $licenseType = "AL-12";
}
else {
$countLicenseF12++; $licenseType = "F-AL-12";
}
last LICENSE_CASE;
}
if ($headContent =~ /The Apache Software License.*Version 1.1/) {
$existsLicense = 1; $countLicense++;
if ($headContent =~ /Copyright.*Apache Software Foundation/) {
$countLicense11++; $licenseType = "AL-11";
}
else {
$countLicenseF11++; $licenseType = "F-AL-11";
}
last LICENSE_CASE;
}
if ($headContent =~ /Copyright.*The Apache Group/) {
$countLicense10++; $licenseType = "AL-10";
$existsLicense = 1; $countLicense++;
last LICENSE_CASE;
}
if ($headContent =~ /Public Domain.*/i) {
$countLicensePD++; $licenseType = "PublicDomain";
$existsLicense = 1; $countLicense++;
last LICENSE_CASE;
}
# catchall
if ($headContent =~ /Copyright|\(c\)/i) {
# do process xml files that have a copyright attribute
last LICENSE_CASE if ($headContent =~ /copyright=/i);
# do process DTD files that have a copyright attribute
last LICENSE_CASE if ($headContent =~ /copyright CDATA/i);
# do process css files that have a .copyright section
last LICENSE_CASE if ($headContent =~ /\.copyright/i);
# do process files that just talk about copyright
last LICENSE_CASE if ($headContent =~ /copyright statement/i);
$countLicenseOther++; $licenseType = "Other";
$existsLicense = 1; $countLicense++;
last LICENSE_CASE;
}
# catchall
if ($headContent =~ /re[ -]*distribut/i) {
$countLicenseOther++; $licenseType = "Other";
$existsLicense = 1; $countLicense++;
last LICENSE_CASE;
}
}
# Try to detect if a new AL-20 license has been accidently inserted
# as well as having some other license.
# FIXME: If a practice run reveals more types of Foregin copyright
# then add patterns here.
if ($licenseType =~ /AL-20/) {
if (($headContent =~ /Rights Reserved/i) ||
($headContent =~ /Public Domain/i) ||
($headContent =~ /Copyright.*Copyright/i)) {
$warnDualLicense = 1; $dualLicensesDetected++;
}
}
# ensure that xml files have an xml declaration
if ($headContent =~ /^<\?xml/) { $existsXmlDecl = 1; }
$/ = "\n"; # reset input record separator
my $recognisedFileType = 0; my $thisFileType = "unknown";
foreach $fileType (keys %fileTypes) {
if ($fileType eq $ext) {
$recognisedFileType = 1;
$thisFileType = $fileTypes{$fileType}{type};
last;
}
}
print "extension=$ext, fileType=$thisFileType, ";
if (!$existsXmlDecl && ($thisFileType eq "xml")) {
print "XML file does not have XML Declaration so skipping\n";
$countXmlDeclMissing++;
return;
}
if ($existsLicense) {
if ($licenseType !~ /^AL/) { print "WARN: "; }
print "Found existing license (licenseType=$licenseType) so skipping";
if ($warnAL20OldLicense) { print ", WARN: old AL-20 copyright notice"; }
if ($warnAL20aOldLicense) { print ", WARN: old AL-20a copyright notice"; }
if ($warnDualLicense) { print ", WARN: dual license"; }
print "\n";
return;
}
foreach $fileType (@ignoreFileTypes) {
if ($fileType eq $ext) {
$countIgnoreType++;
print "ignored, ";
}
}
if (!$recognisedFileType) {
print "File type '$ext' is not recognised so skipping\n";
$countUnknownType++;
return;
}
# Now process the file.
my $insertionDone = 0; my ($line, $thisLine);
if (!$opt_p) {
open(OUTPUT, ">$tmpFile")
or die "Could not open output file '$tmpFile': $!";
}
$countInserted++;
if ($fileTypes{$ext}{insertionPoint} == 0) {
print "Insert new license\n";
if (!$opt_p) {
print OUTPUT $fileTypes{$ext}{openComment};
foreach $line (@license) {
$thisLine = $fileTypes{$ext}{leaderComment} . $line;
$thisLine =~ s/\s+$//;
print OUTPUT $thisLine, "\n";
}
print OUTPUT $fileTypes{$ext}{closeComment};
}
$insertionDone = 1;
}
seek(INPUT, 0, 0); $. = 0; # rewind to top of file
while (<INPUT>) {
if (!$opt_p) {
print OUTPUT $_ or die "Could not write output file '$fileName': $!";
}
if (!$insertionDone) {
if ($. == $fileTypes{$ext}{insertionPoint}) {
print "Insert new license\n";
if (!$opt_p) {
print OUTPUT $fileTypes{$ext}{openComment};
foreach $line (@license) {
$thisLine = $fileTypes{$ext}{leaderComment} . $line;
$thisLine =~ s/\s+$//;
print OUTPUT $thisLine, "\n";
}
print OUTPUT $fileTypes{$ext}{closeComment};
}
$insertionDone = 1;
}
}
}
close INPUT or die "Could not close input file '$fileName': $!";
if (!$opt_p) {
close OUTPUT or die "Could not close output file '$tmpFile': $!";
rename($tmpFile, $fileName);
}
}
find(\&process_file, $startDir);
#--------------------------------------------------
# Report some statistics
my $statsMsg = "were";
if ($opt_p) { $statsMsg = "would be"; }
$countUnknownType -= $countIgnoreType;
print STDERR qq!
Total $countTotal text files were investigated.
New licenses $statsMsg inserted in $countInserted files.
Skipped $countLicense files with an existing license:
(Apache v2.0=$countLicense20, v2.0a=$countLicense20a, v2.0b=$countLicense20b)
(Apache v1.2=$countLicense12, v1.1=$countLicense11, v1.0=$countLicense10)
(Other=$countLicenseOther, PublicDomain=$countLicensePD)
(3rdParty using AL v2.0=$countLicenseF20, v1.2=$countLicenseF12, v1.1=$countLicenseF11)
Skipped $countXmlDeclMissing XML files with missing XML Declaration.
!;
if (defined($avoidList)) {
print STDERR "Avoided $countAvoid files as specified in the avoidList\n";
}
print STDERR qq!
Ignored $countIgnoreType files of specified type (@ignoreFileTypes)
Skipped $countUnknownType files of unknown type.
!;
if ($dualLicensesDetected) {
print STDERR qq!
WARNING: $dualLicensesDetected files had another license as well as the new
Apache v2.0 license. (Scan the log output for lines with "WARN: dual".)
!;
}
my $suffix;
if ($countUnknownType > 0) {
print STDERR qq!
List of unknown filename extensions and ignored filename extensions:
(Add new fileTypes to this script if you want them to be catered for.)
!;
foreach $suffix ( sort keys %uniqueSuffixes) {
my $suffixKnown = 0;
foreach $fileType (keys %fileTypes) {
if ($suffix eq $fileType) { $suffixKnown = 1; }
}
if (!$suffixKnown) {
print STDERR "$suffix=$uniqueSuffixes{$suffix} ";
}
}
print STDERR "\n\n";
}
print STDERR "List of all unique filename extensions:\n";
foreach $suffix ( sort keys %uniqueSuffixes) {
print STDERR "$suffix=$uniqueSuffixes{$suffix} ";
}
print STDERR "\n\n";
if ($opt_p) { print STDERR "Finished practice run.\n"; }
#==================================================
# ShowUsage
#==================================================
sub ShowUsage {
print STDERR qq!
Usage: $0 [-h] [-p] startDir [avoidList] > logfile
where:
startDir = The SVN directory (pathname) to start processing. Will descend.
avoidList = List of files and directories to avoid, one per line.
option:
h = Show this help message.
p = Do a practice run. Do not write any files.
!;
}