From 20d294a10be5be9dfe7a16d07a60e44f7bf3be39 Mon Sep 17 00:00:00 2001 From: Dianjin Wang Date: Mon, 16 Mar 2026 14:36:06 +0800 Subject: [PATCH] ASF: updates for Apache license compliance - Add `.github/workflows/apache-rat-audit.yml` for automated checking. - Configure `apache-rat-plugin` in `pom.xml` with necessary exclusions. - Update `LICENSE` file with correct open-source attributions. - Add `README.apache.md` to document the binary files allowlist. - Add `*.log` to `.gitignore` to prevent false positives during audits. --- .github/workflows/apache-rat-audit.yml | 345 +++++++++++++++++++++++++ .gitignore | 3 + LICENSE | 26 +- README.apache.md | 63 +++++ pom.xml | 328 +++++++++++++++++++++++ 5 files changed, 764 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/apache-rat-audit.yml create mode 100644 README.apache.md create mode 100644 pom.xml diff --git a/.github/workflows/apache-rat-audit.yml b/.github/workflows/apache-rat-audit.yml new file mode 100644 index 00000000..8a251c11 --- /dev/null +++ b/.github/workflows/apache-rat-audit.yml @@ -0,0 +1,345 @@ +# -------------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to You under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. 
+# +# -------------------------------------------------------------------- +# Apache Cloudberry Backup (Incubating) Compliance Workflow +# +# Comprehensive compliance checks for Apache Cloudberry Backup: +# 1. Apache RAT license header validation +# 2. Copyright year verification (NOTICE) +# 3. Binary file presence detection with approved allowlist +# +# Based on Apache Rat tool, run locally with: +# `mvn clean verify -Drat.consoleOutput=true` +# -------------------------------------------------------------------- + +name: Apache Rat License Check + +on: + push: + branches: [main, REL_2_STABLE] + pull_request: + branches: [main, REL_2_STABLE] + types: [opened, synchronize, reopened, edited] + workflow_dispatch: + +permissions: + contents: read + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + rat-check: + runs-on: ubuntu-latest + timeout-minutes: 10 + + steps: + - name: Check out repository + uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Set up Java and Maven + uses: actions/setup-java@v4 + with: + distribution: 'temurin' + java-version: '11' + cache: maven + + - name: Run Apache Rat check + run: | + echo "Running Apache Rat license check..." + mvn clean verify -Drat.consoleOutput=true | tee rat-output.log + + # Check for build failure: Maven's own exit status (which the pipe into tee would otherwise hide) or the BUILD FAILURE marker in the log + if [ "${PIPESTATUS[0]}" -ne 0 ] || grep -q "\[INFO\] BUILD FAILURE" rat-output.log; then + echo "::error::Apache Rat check failed - build failure detected" + echo "RAT_CHECK=fail" >> $GITHUB_ENV + else + echo "RAT_CHECK=pass" >> $GITHUB_ENV + echo "Apache Rat check passed successfully" + fi + + - name: Check copyright years are up-to-date + run: | + echo "Checking copyright years..." + current_year=$(date -u +"%Y") + echo "CURRENT_YEAR=$current_year" >> $GITHUB_ENV + + # Initialize to pass, will be updated if checks fail + echo "NOTICE_CHECK=pass" >> $GITHUB_ENV + echo "PSQL_HELP_CHECK=pass" >> $GITHUB_ENV + + # Check NOTICE file + echo "Checking NOTICE file..." + if ! 
grep -q "Copyright 2024-$current_year The Apache Software Foundation" NOTICE; then + echo "::error::NOTICE file does not contain the current year ($current_year)" + echo "NOTICE_CHECK=fail" >> $GITHUB_ENV + else + echo "PASS: NOTICE file contains the current year ($current_year)" + fi + + # Continue execution even if checks fail. Values written to GITHUB_ENV only become visible in later steps, so re-test NOTICE here instead of reading $NOTICE_CHECK + if grep -q "Copyright 2024-$current_year The Apache Software Foundation" NOTICE; then + echo "All copyright year checks passed" + else + echo "Copyright year checks completed with errors" + fi + + - name: Check for binary files + run: | + echo "Checking for binary files..." + echo "Checking extensions: class, jar, tar, tgz, zip, exe, dll, so, gz, bz2" + echo "----------------------------------------------------------------------" + + # Binary file allowlist, see README.apache.md + ALLOWLIST=( + "end_to_end/resources/1-segment-db-filter.tar.gz" + "end_to_end/resources/1-segment-db-replicated.tar.gz" + "end_to_end/resources/1-segment-db-single-data-file.tar.gz" + "end_to_end/resources/1-segment-db.tar.gz" + "end_to_end/resources/2-segment-db-1_24_0.tar.gz" + "end_to_end/resources/2-segment-db-1_26_0.tar.gz" + "end_to_end/resources/2-segment-db-filter.tar.gz" + "end_to_end/resources/2-segment-db-incremental.tar.gz" + "end_to_end/resources/2-segment-db-single-data-file-filter.tar.gz" + "end_to_end/resources/2-segment-db-single-data-file.tar.gz" + "end_to_end/resources/2-segment-db.tar.gz" + "end_to_end/resources/3-segment-db-replicated.tar.gz" + "end_to_end/resources/3-segment-db.tar.gz" + "end_to_end/resources/5-segment-db.tar.gz" + "end_to_end/resources/7-segment-db-filter.tar.gz" + "end_to_end/resources/7-segment-db-single-data-file-filter.tar.gz" + "end_to_end/resources/7-segment-db-single-data-file.tar.gz" + "end_to_end/resources/7-segment-db.tar.gz" + "end_to_end/resources/9-segment-db-incremental.tar.gz" + "end_to_end/resources/9-segment-db-replicated.tar.gz" + "end_to_end/resources/9-segment-db-single-data-file.tar.gz" + "end_to_end/resources/9-segment-db.tar.gz" + 
"end_to_end/resources/corrupt-db.tar.gz" + "end_to_end/resources/corrupt-metadata-db.tar.gz" + "end_to_end/resources/no-segment-count-db.tar.gz" + ) + + # Check for specific binary file extensions; the list is also written to GITHUB_ENV as BINARY_EXTENSIONS so that later steps can report it + binary_extensions="class jar tar tgz zip exe dll so gz bz2" + echo "BINARY_EXTENSIONS=${binary_extensions}" >> $GITHUB_ENV + binary_results="" + binaryfiles_found=false + + for extension in ${binary_extensions}; do + printf "Checking *.%-4s files..." "${extension}" + found=$(find . -name "*.${extension}" -type f || true) + + # Filter out allowed files: find prints paths with a leading "./", so each ALLOWLIST entry is compared as "./<entry>"; matches are recorded in binary_allowlist.txt + if [ -n "$found" ]; then + filtered_found="" + while IFS= read -r file; do + is_allowed=false + for allowlist_file in "${ALLOWLIST[@]}"; do + if [ "$file" = "./$allowlist_file" ]; then + is_allowed=true + echo "Allowed: $file" >> binary_allowlist.txt + break + fi + done + if [ "$is_allowed" = false ]; then + filtered_found+="$file"$'\n' + fi + done <<< "$found" + + filtered_found=$(echo "$filtered_found" | sed '/^$/d') + + if [ -n "$filtered_found" ]; then + echo "FOUND" + echo "::error::${extension} files should not exist" + echo "For ASF compatibility: the source tree should not contain" + echo "binary files as users have a hard time verifying their contents." 
+ echo "Found files:" + echo "$filtered_found" | sed 's/^/ /' + echo "${extension}:$(printf '%s' "$filtered_found" | tr '\n' ' ')" >> binary_results.txt # one "ext:paths" record per line; multiple paths are joined with spaces so the summary reader sees a single record + binaryfiles_found=true + else + echo "NONE (all allowed)" + echo "${extension}:none" >> binary_results.txt + fi + else + echo "NONE" + echo "${extension}:none" >> binary_results.txt + fi + done + + echo "----------------------------------------------------------------------" + if [ "$binaryfiles_found" = true ]; then + echo "ERROR: Non-allowed binary files were found in the source tree" + echo "BINARY_CHECK=fail" >> $GITHUB_ENV + else + echo "PASS: No non-allowed binary files found" + echo "BINARY_CHECK=pass" >> $GITHUB_ENV + fi + + # Show allowlist summary if any allowed files were found + if [ -f binary_allowlist.txt ]; then + echo "" + echo "Allowed binary files (approved):" + cat binary_allowlist.txt | sed 's/^/ /' + fi + + - name: Upload Rat check results + if: always() + uses: actions/upload-artifact@v4 + with: + name: rat-check-results + path: rat-output.log + retention-days: 7 + + - name: Generate Job Summary + if: always() + run: | + { + echo "## Apache Cloudberry Backup (Incubating) Compliance Audit Results" + echo "- Run Time: $(date -u +'%Y-%m-%d %H:%M:%S UTC')" + echo "" + + # Copyright Year Check Summary + echo "### Copyright Year Checks" + echo "**NOTICE file:**" + if [ "$NOTICE_CHECK" = "pass" ]; then + echo "PASS: Contains current year ($CURRENT_YEAR)" + else + echo "ERROR: Does not contain current year ($CURRENT_YEAR)" + fi + echo "" + + # Binary Files Check Summary + echo "### Binary Files Check" + echo "Checked extensions: \`${BINARY_EXTENSIONS}\`" + echo "" + echo "Results:" + echo "\`\`\`" + if [ -f binary_results.txt ]; then + while IFS=: read -r ext files; do + if [ "$files" = "none" ]; then + echo "PASS: No .${ext} files found" + else + echo "ERROR: Found .${ext} files:" + echo "$files" | sed 's/^/ /' + fi + done < binary_results.txt + fi + echo "\`\`\`" + echo "" + + # Allowlist summary + if [ -f binary_allowlist.txt ]; then + echo
"### Allowed Binary Files" + echo "The following binary files are approved for testing purposes:" + echo "You can see \`README.apache.md\` for details." + echo "\`\`\`" + cat binary_allowlist.txt | sed 's/Allowed: //' + echo "\`\`\`" + echo "" + fi + + # Rat check summary + if [[ -f rat-output.log ]]; then + # First extract and display summary statistics (only once) + if grep -q "Rat check: Summary over all files" rat-output.log; then + echo "### License Header Check" + summary_line=$(grep "Rat check: Summary over all files" rat-output.log) + echo "\`\`\`" + echo "$summary_line" + echo "\`\`\`" + echo "" + fi + + # Then determine the result status + if [ "$RAT_CHECK" = "fail" ]; then + echo "#### Check Failed - License Compliance Issues Detected" + echo "" + + # Extract and display files with unapproved licenses + if grep -q "Files with unapproved licenses:" rat-output.log; then + echo "##### Files with Unapproved Licenses" + echo "\`\`\`" + # Get the line with "Files with unapproved licenses:" and all following lines until the dashed line + sed -n '/Files with unapproved licenses:/,/\[INFO\] ------------------------------------------------------------------------/p' rat-output.log | \ + grep -v "\[INFO\] ------------------------------------------------------------------------" | \ + grep -v "^$" | \ + head -20 + echo "\`\`\`" + echo "" + fi + + echo "**How to fix:**" + echo "" + echo "**For new original files you created:**" + echo "- Add the standard Apache License header to each file" + echo "" + echo "**For third-party files with different licenses:**" + echo "- Add the file to exclusion list in \`pom.xml\` under the apache-rat-plugin configuration" + echo "- Ensure the license is compatible with Apache License 2.0" + echo "- Avoid introducing components with incompatible licenses" + echo "" + echo "**Need help?**" + echo "- Run \`mvn clean verify -Drat.consoleOutput=true\` locally for the full report" + echo "- Email dev@cloudberry.apache.org if you have 
questions about license compatibility" + + elif [ "$RAT_CHECK" = "pass" ]; then + echo "#### Check Passed - All Files Comply with Apache License Requirements" + fi + fi + } >> "$GITHUB_STEP_SUMMARY" + + - name: Report Status + if: always() + shell: bash {0} + run: | + # Aggregate the RAT_CHECK, NOTICE_CHECK and BINARY_CHECK results into one exit status; overall_status stays 0 unless one of them equals "fail" + overall_status=0 + + # Check Apache RAT status + if [ "$RAT_CHECK" = "fail" ]; then + echo "ERROR: Apache Rat check failed" + overall_status=1 + elif [ "$RAT_CHECK" = "pass" ]; then + echo "Apache Rat check passed" + fi + + # Check copyright year status + if [ -n "$NOTICE_CHECK" ] && [ "$NOTICE_CHECK" = "fail" ]; then + echo "ERROR: NOTICE file copyright year check failed" + overall_status=1 + fi + + # Check binary files status (if this variable exists) + if [ -n "$BINARY_CHECK" ] && [ "$BINARY_CHECK" = "fail" ]; then + echo "ERROR: Binary files check failed" + overall_status=1 + fi + + # Exit with appropriate status: 0 when no check reported "fail", 1 otherwise + if [ $overall_status -eq 0 ]; then + echo "SUCCESS: All checks passed" + exit 0 + else + echo "FAILURE: One or more checks failed" + exit 1 + fi diff --git a/.gitignore b/.gitignore index 0f08c244..3bef3410 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,6 @@ _testmain.go gpbackup gprestore gpbackup_helper + +# Logs +*.log diff --git a/LICENSE b/LICENSE index a5a1387e..6c9e5eb8 100644 --- a/LICENSE +++ b/LICENSE @@ -217,4 +217,28 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the License. \ No newline at end of file +limitations under the License. 
+ +The Greenplum Database Backup software includes: +- backup/ +- end_to_end/ +- filepath/ +- gppkg/ +- helper/ +- history/ +- integration/ +- options/ +- plugins/ +- report/ +- restore/ +- testutils/ +- toc/ +- utils/ +- go.mod +- go.sum +- gpbackup.go +- gprestore.go +- gpbackup_helper.go +- gpbackup_s3_plugin.go +- tools.go +- show_coverage.sh diff --git a/README.apache.md b/README.apache.md new file mode 100644 index 00000000..3981eff1 --- /dev/null +++ b/README.apache.md @@ -0,0 +1,63 @@ + + +# Apache Cloudberry Backup (Incubating) License Audit Notes + +This file documents licensing clarifications and exceptions as part of ASF release readiness for Apache Cloudberry Backup (Incubating). + +## Historical Attribution Under Apache License 2.0 + +The following entities have contributed to the Greenplum Backup source code under the Apache License 2.0: + +- Greenplum, Inc. +- EMC Corporation +- VMware, Inc. +- Pivotal Software + +RAT matchers are used to classify their license headers accordingly. + +## Compressed Files in Source + +The following compressed files are included in the source tree. These files are archives of text files used for testing purposes and do not contain binary executables. They are not used during the build process. 
+ +- end_to_end/resources/1-segment-db-filter.tar.gz +- end_to_end/resources/1-segment-db-replicated.tar.gz +- end_to_end/resources/1-segment-db-single-data-file.tar.gz +- end_to_end/resources/1-segment-db.tar.gz +- end_to_end/resources/2-segment-db-1_24_0.tar.gz +- end_to_end/resources/2-segment-db-1_26_0.tar.gz +- end_to_end/resources/2-segment-db-filter.tar.gz +- end_to_end/resources/2-segment-db-incremental.tar.gz +- end_to_end/resources/2-segment-db-single-data-file-filter.tar.gz +- end_to_end/resources/2-segment-db-single-data-file.tar.gz +- end_to_end/resources/2-segment-db.tar.gz +- end_to_end/resources/3-segment-db-replicated.tar.gz +- end_to_end/resources/3-segment-db.tar.gz +- end_to_end/resources/5-segment-db.tar.gz +- end_to_end/resources/7-segment-db-filter.tar.gz +- end_to_end/resources/7-segment-db-single-data-file-filter.tar.gz +- end_to_end/resources/7-segment-db-single-data-file.tar.gz +- end_to_end/resources/7-segment-db.tar.gz +- end_to_end/resources/9-segment-db-incremental.tar.gz +- end_to_end/resources/9-segment-db-replicated.tar.gz +- end_to_end/resources/9-segment-db-single-data-file.tar.gz +- end_to_end/resources/9-segment-db.tar.gz +- end_to_end/resources/corrupt-db.tar.gz +- end_to_end/resources/corrupt-metadata-db.tar.gz +- end_to_end/resources/no-segment-count-db.tar.gz diff --git a/pom.xml b/pom.xml new file mode 100644 index 00000000..b3302e0e --- /dev/null +++ b/pom.xml @@ -0,0 +1,328 @@ + + + + + 4.0.0 + org.apache.cloudberry + apache-cloudberry-backup-incubating + 2.1.0-incubating + pom + + Apache Cloudberry Backup (Incubating) + Backup and Restore for Apache Cloudberry (Incubating) + + + + + org.apache.rat + apache-rat-plugin + 0.16.1 + + true + + + options/flag.go + options/options.go + options/options_suite_test.go + options/flag_test.go + options/options_test.go + + toc/toc.go + toc/toc_test.go + + end_to_end/special_characters_test.go + end_to_end/incremental_test.go + end_to_end/signal_handler_test.go + 
end_to_end/resources/8-segment-db-rowcounts.txt + end_to_end/resources/test_db_ddl.sql + end_to_end/resources/4-segment-db-rowcounts.txt + end_to_end/resources/test_tables_ddl.sql + end_to_end/resources/gpdb6_objects.sql + end_to_end/resources/test_rowcount_ddl.sql + end_to_end/resources/gpdb4_compatible_objects_after_gpdb7.sql + end_to_end/resources/gpdb5_objects.sql + end_to_end/resources/7-segment-db-incremental-rowcounts.txt + end_to_end/resources/implicit_casts.sql + end_to_end/resources/replicated_table.sql + end_to_end/resources/9-segment-db-rowcounts.txt + end_to_end/resources/5-segment-db-rowcounts.txt + end_to_end/resources/1-segment-db-rowcounts.txt + end_to_end/resources/test_db_incremental_ddl.sql + end_to_end/resources/3-segment-db-incremental-rowcounts.txt + end_to_end/resources/3-segment-db-rowcounts.txt + end_to_end/resources/gpdb4_compatible_objects_before_gpdb7.sql + end_to_end/resources/7-segment-db-rowcounts.txt + end_to_end/resources/10-segment-db-rowcounts.txt + end_to_end/resources/2-segment-db-incremental-rowcounts.txt + end_to_end/resources/6-segment-db-rowcounts.txt + end_to_end/resources/test_tables_data.sql + end_to_end/resources/2-segment-db-rowcounts.txt + end_to_end/resources/gpdb4_objects.sql + end_to_end/locks_test.go + end_to_end/plugin_test.go + end_to_end/filtered_test.go + end_to_end/end_to_end_suite_test.go + + filepath/filepath_test.go + filepath/filepath.go + + plugins/plugin_test.sh + plugins/example_plugin.bash + plugins/README.md + plugins/example_plugin_config.yaml + plugins/generate_minio_config.sh + plugins/plugin_test_scale.sh + plugins/s3plugin/backup.go + plugins/s3plugin/s3plugin_test.go + plugins/s3plugin/README.md + plugins/s3plugin/restore.go + plugins/s3plugin/s3plugin.go + + integration/predata_acl_queries_test.go + integration/statistics_queries_test.go + integration/predata_acl_create_test.go + integration/postdata_queries_test.go + integration/helper_test.go + integration/utils_test.go + 
integration/predata_shared_queries_test.go + integration/parallel_queries_test.go + integration/statistics_create_test.go + integration/options_integration_test.go + integration/predata_textsearch_queries_test.go + integration/dependency_queries_test.go + integration/predata_types_queries_test.go + integration/predata_externals_queries_test.go + integration/incremental_queries_test.go + integration/predata_types_create_test.go + integration/agent_remote_test.go + integration/predata_shared_create_test.go + integration/wrappers_test.go + integration/integration_suite_test.go + integration/predata_relations_create_test.go + integration/predata_relations_queries_test.go + integration/snapshot_test.go + integration/metadata_globals_queries_test.go + integration/postdata_create_test.go + integration/inheritance_test.go + integration/predata_operators_queries_test.go + integration/predata_operators_create_test.go + integration/metadata_globals_create_test.go + integration/predata_table_defs_queries_test.go + integration/predata_textsearch_create_test.go + integration/data_backup_test.go + integration/predata_externals_create_test.go + integration/gpexpand_not_running_test.go + integration/predata_functions_queries_test.go + integration/predata_functions_create_test.go + + utils/compression_test.go + utils/agent_remote.go + utils/util_test.go + utils/io.go + utils/utils_suite_test.go + utils/agent_remote_test.go + utils/progress_bar_test.go + utils/compression.go + utils/util.go + utils/plugin_test.go + utils/set_test.go + utils/plugin.go + utils/gpexpand_sensor.go + utils/gpexpand_sensor_test.go + utils/set.go + utils/io_test.go + utils/progress_bar.go + + history/history_test.go + history/history.go + + report/report.go + report/report_test.go + + gppkg/gpbackup_control.in + gppkg/gppkg_spec.yml.in + gppkg/gppkg_v2_spec.yml.in + gppkg/gpbackup_tools.spec.in + + backup/predata_operators.go + backup/predata_relations_tables_test.go + backup/global_variables.go + 
backup/predata_acl.go + backup/queries_postdata.go + backup/backup_suite_test.go + backup/postdata_test.go + backup/backup.go + backup/incremental_test.go + backup/predata_types.go + backup/queries_acl.go + backup/predata_shared.go + backup/data_test.go + backup/queries_textsearch.go + backup/validate_test.go + backup/predata_textsearch_test.go + backup/queries_globals.go + backup/queries_operators.go + backup/predata_acl_test.go + backup/queries_table_defs.go + backup/queries_functions_test.go + backup/queries_shared.go + backup/predata_operators_test.go + backup/predata_relations.go + backup/predata_shared_test.go + backup/predata_externals.go + backup/predata_functions_test.go + backup/wrappers.go + backup/predata_externals_test.go + backup/metadata_globals.go + backup/queries_functions.go + backup/validate.go + backup/queries_shared_test.go + backup/queries_acl_test.go + backup/incremental.go + backup/dependencies.go + backup/snapshot.go + backup/data.go + backup/dependencies_test.go + backup/predata_textsearch.go + backup/predata_types_test.go + backup/queries_postdata_test.go + backup/predata_functions.go + backup/queries_externals.go + backup/queries_relations.go + backup/queries_incremental.go + backup/statistics_test.go + backup/queries_types.go + backup/queries_statistics.go + backup/queries_relation_test.go + backup/postdata.go + backup/backup_internal_test.go + backup/statistics.go + backup/metadata_globals_test.go + backup/predata_relations_other_test.go + + restore/restore_internal_test.go + restore/global_variables.go + restore/parallel_test.go + restore/data_test.go + restore/validate_test.go + restore/wrappers_test.go + restore/remote.go + restore/wrappers.go + restore/remote_test.go + restore/validate.go + restore/data.go + restore/restore_suite_test.go + restore/restore.go + restore/parallel.go + + helper/backup_helper.go + helper/restore_helper.go + helper/helper.go + helper/backup_helper_pipes.go + + testutils/functions.go + 
testutils/functions_test.go + + Makefile + gpbackup_s3_plugin.go + gprestore.go + show_coverage.sh + go.mod + gometalinter.config + .github/workflows/build_and_unit_test.yml + gpbackup_helper.go + tools.go + go.sum + gpbackup.go + + + + + .github/pull_request_template.md + VERSION + + + + + + + Apache License (Greenplum-derived) + GRPM + + copyright (c) 2007, 2008, 2009 GreenPlum. All rights reserved. + + + + + Apache License (VMware-derived) + VMW + + Copyright 2018-Present VMware, Inc. or its affiliates. + Portions Copyright (c) 2023 VMware, Inc. or its affiliates. + + + + + + + Apache License (Greenplum-derived) + + + Apache License (VMware-derived) + + + + + + + verify + + check + + + + + + +