chore(license): add license checker for copyright and fix license files (cherry-pick #1674) (#1677)

https://github.com/apache/incubator-pegasus/issues/1676

There are several tasks/steps for this PR:

- develop a script to check the consistency between `.licenserc.yaml` and all files of the project.
- according to the check result, fix `.licenserc.yaml`.
- according to the fixed `.licenserc.yaml`, amend `LICENSE`.

The license checker can be run with `python3 scripts/check_license.py`.

This PR is to cherry-pick #1674 into v2.5 to solve issue #1676.
diff --git a/.licenserc.yaml b/.licenserc.yaml
index c6f63af..543a7c0 100644
--- a/.licenserc.yaml
+++ b/.licenserc.yaml
@@ -20,64 +20,37 @@
     copyright-owner: Apache Software Foundation
 
   paths-ignore:
+    # Configuration files that cannot carry copyright info (adding it would lead to errors).
+    - '.rat-excludes'
+    - '**/*.csv'
+    - '**/*.json'
     # All the type of licenses of this project should be added to LICENSE.
+    - 'DISCLAIMER-WIP'
     - 'LICENSE'
     - 'NOTICE'
-    - '.github/pull_request_template.md'
+    # Template files of issues and pull requests for Github.
     - '.github/ISSUE_TEMPLATE/bug_report.md'
     - '.github/ISSUE_TEMPLATE/feature-request.md'
     - '.github/ISSUE_TEMPLATE/general_question.md'
-    - '.rat-excludes'
-    - 'DISCLAIMER-WIP'
-    - 'python-client/requirement.txt'
-    - '.devcontainer/devcontainer.json'
-    # TODO(yingchun): shell/* files are import from thirdparties, we can move them to thirdparty later.
-    - 'src/shell/argh.h'
-    - 'src/shell/linenoise/linenoise.c'
-    - 'src/shell/linenoise/linenoise.h'
-    - 'src/shell/sds/sds.c'
-    - 'src/shell/sds/sds.h'
-    - 'src/shell/sds/sdsalloc.h'
+    - '.github/pull_request_template.md'
+    # Image files for docs.
+    - '**/*.jpg'
+    - '**/*.png'
+    # Files in pdf format.
+    - '**/*.pdf'
+    # Special files for golang.
     - '**/go.sum'
-    - '**/*.csv'
-    - '**/*.json'
+    # TODO(wangdan): Generated files for the Go client; could they be generated dynamically?
+    - 'go-client/idl/base/GoUnusedProtection__.go'
     - 'go-client/idl/base/dsn_err_string.go'
     - 'go-client/idl/base/rocskdb_err_string.go'
-    - 'go-client/idl/base/GoUnusedProtection__.go'
+    # Special files for nodejs.
     - '**/.npmigonre'
-    # Copyright (c) Facebook, Inc
-    - 'src/utils/TokenBucket.h'
-    - 'src/utils/test/TokenBucketTest.cpp'
-    - 'src/utils/test/TokenBucketTest.h'
-    # https://github.com/preshing/cpp11-on-multicore/blob/master/LICENSE
-    - 'src/utils/hpc_locks/autoresetevent.h'
-    - 'src/utils/hpc_locks/autoreseteventcondvar.h'
-    - 'src/utils/hpc_locks/benaphore.h'
-    - 'src/utils/hpc_locks/bitfield.h'
-    - 'src/utils/hpc_locks/readme.txt'
-    - 'src/utils/hpc_locks/rwlock.h'
-    - 'src/utils/hpc_locks/sema.h'
-    # Copyright (c) xxxx The Chromium Authors
-    - 'src/utils/safe_strerror_posix.h'
-    - 'src/runtime/build_config.h'
-    - 'src/utils/test/autoref_ptr_test.cpp'
-    - 'src/utils/safe_strerror_posix.cpp'
-    # Copyright 2017 The Abseil Authors
-    - 'src/utils/absl/base/internal/invoke.h'
-    - 'src/utils/absl/utility/utility.h'
-    - 'src/utils/smart_pointers.h'
-    - 'src/utils/string_view.h'
-    - 'src/utils/test/memutil_test.cpp'
-    - 'src/utils/test/string_view_test.cpp'
-    - 'src/utils/test/smart_pointers_test.cpp'
-    - 'src/utils/memutil.h'
-    - 'src/utils/string_view.cpp'
-    # Copyright (c) 2010-2011, Rob Jansen
-    - 'cmake_modules/FindRT.cmake'
-    - 'cmake_modules/FindDL.cmake'
-    # Copyright (c) 2017 Guillaume Papin
-    - 'scripts/run-clang-format.py'
-    # need manual fix
+    # Special files for python.
+    - 'python-client/requirement.txt'
+    # Text files used for tests that cannot carry copyright info (adding it would lead to errors).
+    - 'src/aio/test/copy_source.txt'
+    - 'src/runtime/test/command.txt'
     - 'src/failure_detector/test/gtest.filter'
     - 'src/meta/test/meta_state/gtest.filter'
     - 'src/meta/test/suite1'
@@ -85,6 +58,13 @@
     - 'src/nfs/test/nfs_test_file1'
     - 'src/nfs/test/nfs_test_file2'
     - 'src/runtime/test/gtest.filter'
+    # Used for tests and should be empty, or ignore all comment lines (otherwise would lead to error).
+    - 'src/utils/test/config-empty.ini'
+    # Binary files used for tests that cannot carry copyright info (adding it would lead to errors).
+    - 'src/replica/duplication/test/log.1.0.handle_real_private_log'
+    - 'src/replica/duplication/test/log.1.0.handle_real_private_log2'
+    - 'src/replica/duplication/test/log.1.0.all_loaded_are_write_empties'
+    # Patches for third-party dependencies.
     - 'thirdparty/fix_fds_for_macos.patch'
     - 'thirdparty/fix_jemalloc_for_m1_on_macos.patch'
     - 'thirdparty/fix_libevent_for_macos.patch'
@@ -92,8 +72,52 @@
     - 'thirdparty/fix_s2_for_aarch64.patch'
     - 'thirdparty/fix_thrift_for_cpp11.patch'
     - 'thirdparty/rocksdb_fix_atomic_flush_0879c240.patch'
-    # should be empty, or ignore all comment lines
-    - 'src/utils/test/config-empty.ini'
+    # TODO(yingchun): shell/* files are imported from third parties; we can move them to thirdparty later.
+    # Copyright (c) 2016, Adi Shavit
+    - 'src/shell/argh.h'
+    # Copyright (c) 2010-2016, Salvatore Sanfilippo, etc.
+    - 'src/shell/linenoise/linenoise.c'
+    # Copyright (c) 2010-2014, Salvatore Sanfilippo, etc.
+    - 'src/shell/linenoise/linenoise.h'
+    # Copyright (c) 2006-2015, Salvatore Sanfilippo, etc.
+    - 'src/shell/sds/sds.c'
+    - 'src/shell/sds/sds.h'
+    - 'src/shell/sds/sdsalloc.h'
+    # Copyright (c) Facebook, Inc
+    - 'src/utils/TokenBucket.h'
+    - 'src/utils/test/TokenBucketTest.cpp'
+    - 'src/utils/test/TokenBucketTest.h'
+    # https://github.com/preshing/modern-cpp-threading/blob/master/LICENSE
+    - 'src/utils/hpc_locks/autoreseteventcondvar.h'
+    # https://github.com/preshing/cpp11-on-multicore/blob/master/LICENSE
+    - 'src/utils/hpc_locks/autoresetevent.h'
+    - 'src/utils/hpc_locks/benaphore.h'
+    - 'src/utils/hpc_locks/bitfield.h'
+    - 'src/utils/hpc_locks/readme.txt'
+    - 'src/utils/hpc_locks/rwlock.h'
+    - 'src/utils/hpc_locks/sema.h'
+    # Copyright (c) 2011 The Chromium Authors
+    - 'src/utils/safe_strerror_posix.h'
+    # Copyright (c) 2012 The Chromium Authors
+    - 'src/runtime/build_config.h'
+    - 'src/utils/test/autoref_ptr_test.cpp'
+    # Copyright (c) 2006-2009 The Chromium Authors
+    - 'src/utils/safe_strerror_posix.cpp'
+    # Copyright 2017 The Abseil Authors
+    - 'src/utils/absl/base/internal/invoke.h'
+    - 'src/utils/absl/utility/utility.h'
+    - 'src/utils/memutil.h'
+    - 'src/utils/smart_pointers.h'
+    - 'src/utils/string_view.cpp'
+    - 'src/utils/string_view.h'
+    - 'src/utils/test/memutil_test.cpp'
+    - 'src/utils/test/smart_pointers_test.cpp'
+    - 'src/utils/test/string_view_test.cpp'
+    # Copyright (c) 2010-2011, Rob Jansen
+    - 'cmake_modules/FindRT.cmake'
+    - 'cmake_modules/FindDL.cmake'
+    # Copyright (c) 2017 Guillaume Papin
+    - 'scripts/run-clang-format.py'
     # The MIT License (MIT), Copyright (c) 2015 Microsoft Corporation
     - 'cmake_modules/BaseFunctions.cmake'
     - 'docs/rdsn-README.md'
@@ -105,7 +129,6 @@
     - 'idl/replica_admin.thrift'
     - 'scripts/compile_thrift.py'
     - 'scripts/learn_stat.py'
-    - 'src/common/api_common.h'
     - 'src/runtime/api_layer1.h'
     - 'src/runtime/api_task.h'
     - 'src/utils/api_utilities.h'
@@ -113,7 +136,6 @@
     - 'src/common/json_helper.h'
     - 'src/runtime/rpc/rpc_stream.h'
     - 'src/runtime/rpc/serialization.h'
-    - 'src/common/serialization_helper/dsn.layer2_types.h'
     - 'src/common/serialization_helper/dsn_types.h'
     - 'src/common/serialization_helper/thrift_helper.h'
     - 'src/runtime/serverlet.h'
@@ -131,7 +153,6 @@
     - 'src/client/partition_resolver.h'
     - 'src/replica/replica_base.h'
     - 'src/common/replica_envs.h'
-    - 'src/replica/replica_test_utils.h'
     - 'src/common/replication.codes.h'
     - 'src/replica/replication_app_base.h'
     - 'src/client/replication_ddl_client.h'
@@ -187,7 +208,6 @@
     - 'src/utils/configuration.h'
     - 'src/utils/crc.h'
     - 'src/utils/customizable_id.h'
-    - 'src/utils/dlib.h'
     - 'src/utils/enum_helper.h'
     - 'src/utils/error_code.h'
     - 'src/utils/errors.h'
@@ -220,7 +240,6 @@
     - 'src/aio/test/aio.cpp'
     - 'src/aio/test/clear.sh'
     - 'src/aio/test/config.ini'
-    - 'src/aio/test/copy_source.txt'
     - 'src/aio/test/run.sh'
     - 'src/block_service/test/config-test.ini'
     - 'src/client/CMakeLists.txt'
@@ -363,7 +382,6 @@
     - 'src/replica/replica_learn.cpp'
     - 'src/replica/replica_stub.cpp'
     - 'src/replica/replica_stub.h'
-    - 'src/replica/replica_test_utils.cpp'
     - 'src/replica/replication_app_base.cpp'
     - 'src/replica/replication_service_app.cpp'
     - 'src/replica/split/test/config-test.ini'
@@ -514,7 +532,6 @@
     - 'src/replica/test/run.sh'
     - 'src/runtime/CMakeLists.txt'
     - 'src/runtime/core_main.cpp'
-    - 'src/runtime/dsn.layer2_types.cpp'
     - 'src/runtime/env.sim.cpp'
     - 'src/runtime/env.sim.h'
     - 'src/runtime/fault_injector.cpp'
@@ -569,7 +586,6 @@
     - 'src/runtime/test/address_test.cpp'
     - 'src/runtime/test/async_call.cpp'
     - 'src/runtime/test/clear.sh'
-    - 'src/runtime/test/command.txt'
     - 'src/runtime/test/config-test-corrupt-message.ini'
     - 'src/runtime/test/config-test-sim.ini'
     - 'src/runtime/test/config-test.ini'
diff --git a/LICENSE b/LICENSE
index f686259..06453d0 100644
--- a/LICENSE
+++ b/LICENSE
@@ -231,7 +231,8 @@
 
 --------------------------------------------------------------------------------
 
-src/shell/linenoise/* - BSD-2-Clause License
+src/shell/linenoise/linenoise.h - BSD-2-Clause License
+src/shell/linenoise/LICENSE
 
   Copyright (c) 2010-2014, Salvatore Sanfilippo <antirez at gmail dot com>
   Copyright (c) 2010-2013, Pieter Noordhuis <pcnoordhuis at gmail dot com>
@@ -263,6 +264,38 @@
 
 --------------------------------------------------------------------------------
 
+src/shell/linenoise/linenoise.c - BSD-2-Clause License
+
+  Copyright (c) 2010-2016, Salvatore Sanfilippo <antirez at gmail dot com>
+  Copyright (c) 2010-2013, Pieter Noordhuis <pcnoordhuis at gmail dot com>
+
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+   *  Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   *  Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+--------------------------------------------------------------------------------
+
 src/shell/sds/* - BSD-2-Clause License
 
   Copyright (c) 2006-2015, Salvatore Sanfilippo <antirez at gmail dot com>
@@ -327,12 +360,12 @@
 
 --------------------------------------------------------------------------------
 
-src/utils/smart_pointers.h - Apache 2.0 License
-src/utils/string_view.h
-src/utils/absl/base/internal/invoke.h
+src/utils/absl/base/internal/invoke.h - Apache 2.0 License
 src/utils/absl/utility/utility.h
 src/utils/memutil.h
+src/utils/smart_pointers.h
 src/utils/string_view.cpp
+src/utils/string_view.h
 src/utils/test/memutil_test.cpp
 src/utils/test/smart_pointers_test.cpp
 src/utils/test/string_view_test.cpp
@@ -406,11 +439,12 @@
 --------------------------------------------------------------------------------
 
 src/utils/hpc_locks/autoreseteventcondvar.h - zlib License
-src/utils/hpc_locks/rwlock.h
 src/utils/hpc_locks/autoresetevent.h
-src/utils/hpc_locks/sema.h
-src/utils/hpc_locks/bitfield.h
 src/utils/hpc_locks/benaphore.h
+src/utils/hpc_locks/bitfield.h
+src/utils/hpc_locks/readme.txt
+src/utils/hpc_locks/rwlock.h
+src/utils/hpc_locks/sema.h
 
 Copyright (c) 2015 Jeff Preshing
 
diff --git a/scripts/check_license.py b/scripts/check_license.py
new file mode 100755
index 0000000..8151979
--- /dev/null
+++ b/scripts/check_license.py
@@ -0,0 +1,213 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+
+import os
+import pprint
+
+PRJ_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # Parent of the directory holding this script.
+YML_PATH = os.path.join(PRJ_PATH, '.licenserc.yaml')
+# NOTE(review): '.npmigonre' matches the spelling in .licenserc.yaml; confirm '.npmignore' was not intended.
+IGNORED_STARTS_WITH = ['.git/', '.idea/']  # Prefixes of project-relative paths to skip.
+IGNORED_ENDS_WITH = ['.swp', '.npmigonre', 'go.sum', '.csv', '.json', '.pdf', '.jpg', '.png']  # Path suffixes to skip.
+IGNORED_NAMES = {'.licenserc.yaml', 'LICENSE', 'tags'}  # Bare file names to skip.
+# Substrings looked up both in the lines of project files and in the lines of .licenserc.yaml; first match wins.
+COPYRIGHT_MARKERS = [
+    "Copyright (c) 2016, Adi Shavit",
+    "Copyright (c) 2010-2016, Salvatore Sanfilippo",
+    "Copyright (c) 2010-2014, Salvatore Sanfilippo",
+    "Copyright (c) 2006-2015, Salvatore Sanfilippo",
+    "Copyright (c) Facebook, Inc",
+    "https://github.com/preshing/modern-cpp-threading",
+    "https://github.com/preshing/cpp11-on-multicore",
+    "Copyright (c) 2011 The Chromium Authors",
+    "Copyright (c) 2012 The Chromium Authors",
+    "Copyright (c) 2006-2009 The Chromium Authors",
+    "Copyright 2017 The Abseil Authors",
+    "Copyright (c) 2010-2011, Rob Jansen",
+    "Copyright (c) 2017 Guillaume Papin",
+    "Copyright (c) 2015 Microsoft Corporation",
+]
+IGNORED_COPYRIGHT_MARKERS = ["http://www.apache.org/licenses/LICENSE-2.0"]
+# Sentinel results of mark_file(): no listed marker was found / an ignored marker was found.
+NO_COPYRIGHT_MARKER_KEY = "NO_COPYRIGHT_MARKER"
+IGNORED_COPYRIGHT_MARKER_KEY = "IGNORED_COPYRIGHT_MARKER"
+
+
+def mark_file(path):
+    with open(path) as f:
+        try:
+            for line in f:
+                for marker in IGNORED_COPYRIGHT_MARKERS:
+                    if marker in line:
+                        return IGNORED_COPYRIGHT_MARKER_KEY
+
+                for marker in COPYRIGHT_MARKERS:
+                    if marker in line:
+                        return marker
+        except UnicodeDecodeError:
+            # Ignore UnicodeDecodeError, since some files might be binary.
+            pass
+
+    # No marker was found, thus marked with no copyright.
+    return NO_COPYRIGHT_MARKER_KEY
+
+
+def is_path_ignored(path):
+    for header in IGNORED_STARTS_WITH:
+        if path.startswith(header):
+            return True
+
+    for trailer in IGNORED_ENDS_WITH:
+        if path.endswith(trailer):
+            return True
+
+    return False
+
+
+def is_name_ignored(name):
+    return name in IGNORED_NAMES  # Exact match on the bare file name, not on the path.
+
+
+def classify_files():
+    """
+    Scan all the files of the project, mark the ones that have copyright info.
+    """
+    marked_files = {}
+
+    for abs_dir, sub_dirs, file_names in os.walk(PRJ_PATH):
+        rel_dir = os.path.relpath(abs_dir, PRJ_PATH)
+        if rel_dir == '.':
+            # Drop the possible prefixed './' for the relative paths.
+            rel_dir = ''
+
+        for name in file_names:
+            # Some kinds of files should be ignored.
+            if is_name_ignored(name):
+                continue
+
+            rel_path = os.path.join(rel_dir, name)
+
+            # Some kinds of dirs/files should be ignored.
+            if is_path_ignored(rel_path):
+                continue
+
+            path = os.path.join(abs_dir, name)
+            marker = mark_file(path)
+
+            # Some kinds of copyright could be ignored, such as Apache LICENSE-2.0.
+            if marker == IGNORED_COPYRIGHT_MARKER_KEY:
+                continue
+
+            if marker not in marked_files:
+                marked_files[marker] = set()
+            marked_files[marker].add(rel_path)
+
+    return marked_files
+
+
+def parse_yml():
+    """
+    Scan all the files in .licenserc.yaml, mark the ones that have copyright info.
+    """
+    marked_files = {}
+
+    with open(YML_PATH) as f:
+        # The files without copyright info are marked with the specific key.
+        current_marker = NO_COPYRIGHT_MARKER_KEY
+        for line in f:
+            for marker in COPYRIGHT_MARKERS:
+                if marker in line:
+                    # Files in following lines would belong to this copyright.
+                    current_marker = marker
+                    break
+            else:
+                begin_idx = line.find("'")
+                if begin_idx < 0:
+                    # There's no file in this line, thus copyright would be reset.
+                    current_marker = NO_COPYRIGHT_MARKER_KEY
+                    continue
+
+                begin_idx += 1
+                end_idx = line.find("'", begin_idx)
+                if end_idx < 0:
+                    raise ValueError("Invalid file path line in {yml_path}".format(yml_path=YML_PATH))
+
+                path = line[begin_idx:end_idx]
+
+                # Some kinds of dirs/files should be ignored.
+                if is_name_ignored(os.path.basename(path)):
+                    continue
+                if is_path_ignored(path):
+                    continue
+
+                if current_marker not in marked_files:
+                    marked_files[current_marker] = set()
+                marked_files[current_marker].add(path)
+
+    return marked_files
+
+
+def check_diff():
+    """
+    Check if .licenserc.yaml is consistent with all real files of the project.
+    """
+    yml_marked_files = parse_yml()
+    marked_files = classify_files()
+    for yml_marker, yml_files in yml_marked_files.items():
+        if yml_marker not in marked_files:
+            print(
+                "marker {yml_marker} in {yml_path} not found in any file of the project".format(yml_marker=yml_marker,
+                                                                                                yml_path=YML_PATH))
+            continue
+
+        files = marked_files[yml_marker]
+        yml_plus = yml_files - files
+        yml_minus = files - yml_files
+        if not yml_plus and not yml_minus:
+            # .licenserc.yaml is consistent with the project.
+            print(
+                "No diff found for marker '{yml_marker}' in {yml_path}".format(yml_marker=yml_marker,
+                                                                               yml_path=YML_PATH))
+            del marked_files[yml_marker]
+            continue
+
+        print("Diff found for marker '{yml_marker}' in {yml_path}:".format(yml_marker=yml_marker, yml_path=YML_PATH))
+        if yml_plus:
+            # Files in .licenserc.yaml, but not in the project.
+            print("{plus}: {yml_plus}".format(plus='+' * len(yml_plus), yml_marker=yml_marker, yml_plus=yml_plus))
+        if yml_minus:
+            # Files in the project, but not in .licenserc.yaml.
+            print("{minus}: {yml_minus}".format(minus='-' * len(yml_minus), yml_minus=yml_minus))
+
+        del marked_files[yml_marker]
+
+    if not marked_files:
+        return
+
+    print("markers in some files of the project not found in {yml_path}:".format(yml_path=YML_PATH))
+    pprint.pprint(marked_files)
+
+
+def main():
+    check_diff()  # Findings are only printed; no exit status is derived from them.
+
+
+if __name__ == '__main__':
+    main()