diff --git a/.github/workflows/weekly-export-audit-data-to-csv-scheduled.yml b/.github/workflows/weekly-export-audit-data-to-csv-scheduled.yml
index 59b7bad08c..30b55d63fe 100644
--- a/.github/workflows/weekly-export-audit-data-to-csv-scheduled.yml
+++ b/.github/workflows/weekly-export-audit-data-to-csv-scheduled.yml
@@ -29,3 +29,7 @@ jobs:
           cf_org: gsa-tts-oros-fac
           cf_space: ${{ env.space }}
           command: cf run-task gsa-fac -k 4G -m 6G --name export_audit_data_to_csv_scheduled --command "python manage.py export_data_audit --year ${{ matrix.years }}" --wait
+          # This will be the new command; it runs the bash script instead of the sling management command.
+          # We will no longer need the matrix, because the bash script handles the years itself.
+          # Or: we can just run one command, un-matrixed.
+          # command: cf run-task gsa-fac -k 4G -m 6G --name export_audit_data_to_csv_scheduled --command "./util/csv-export-to-s3/csv-export-to-s3.bash" --wait
diff --git a/backend/util/csv-export-to-s3/csv-export-to-s3.bash b/backend/util/csv-export-to-s3/csv-export-to-s3.bash
new file mode 100755
index 0000000000..7306041b80
--- /dev/null
+++ b/backend/util/csv-export-to-s3/csv-export-to-s3.bash
@@ -0,0 +1,236 @@
+#!/bin/bash
+
+# This script exports the API tables to CSV and uploads the results to S3:
+# 1. A copy of each table as a single CSV
+# 2. A copy of each table, broken out by year (calendar/audit year)
+# 3. A copy of each table, broken out by federal fiscal year
+
+# https://stackoverflow.com/questions/1120109/how-to-export-table-as-csv-with-headings-on-postgresql
+# https://stackoverflow.com/questions/1517635/save-pl-pgsql-output-from-postgresql-to-a-csv-file
+
+####################################################
+# This must match a schema exposed by the API.
+API_VERSION="api_v1_1_0"
+
+endpoints=(
+    "additional_eins"
+    "additional_ueis"
+    "corrective_action_plans"
+    "federal_awards"
+    "findings_text"
+    "findings"
+    "general"
+    "notes_to_sefa"
+    "passthrough"
+    "resubmission"
+    "secondary_auditors"
+)
+
+####################################################
+# Get the bucket variables for use with the AWS CLI.
+# These come out of VCAP_SERVICES if we are in the cloud.
+# If we are local, we need to set up our env vars differently.
+export AWSCLI="/tmp/aws-cli/v2/current/bin/aws"
+# Where we'll store the temporary CSVs
+export ROOT="/tmp/csv"
+
+if [[ "$ENV" = "LOCAL" ]]; then
+    echo "WE ARE LOCAL"
+    export AWS_ACCESS_KEY_ID="minioadmin"
+    export AWS_SECRET_ACCESS_KEY="minioadmin"
+    export BUCKET="gsa-fac-private-s3"
+    export AWS_DEFAULT_REGION="us-east-1"
+    export AWS_ENDPOINT="http://minio:9000"
+    export AWSCLI="${AWSCLI} --endpoint-url ${AWS_ENDPOINT}"
+    export DAS_DB_HOST="db"
+    export DAS_DB_USER="postgres"
+    export DAS_DB_NAME="postgres"
+    export DAS_DB_PASSWORD=""
+    export PGPASSWORD="${DAS_DB_PASSWORD}"
+
+else
+    echo "WE ARE ON CGOV"
+    export AWS_ACCESS_KEY_ID=$(echo $VCAP_SERVICES | jq -rc '.s3[] | select(.name | contains("fac-private-s3")) | .credentials.access_key_id')
+    export AWS_SECRET_ACCESS_KEY=$(echo $VCAP_SERVICES | jq -rc '.s3[] | select(.name | contains("fac-private-s3")) | .credentials.secret_access_key')
+    export BUCKET=$(echo $VCAP_SERVICES | jq -rc '.s3[] | select(.name | contains("fac-private-s3")) | .credentials.bucket')
+    export AWS_REGION=$(echo $VCAP_SERVICES | jq -rc '.s3[] | select(.name | contains("fac-private-s3")) | .credentials.region')
+    export AWS_DEFAULT_REGION=${AWS_REGION}
+
+    export DAS_DB_HOST=$(echo $VCAP_SERVICES | jq -rc '."aws-rds"[] | select(.name | contains("fac-db")) | .credentials.host')
+    export DAS_DB_USER=$(echo $VCAP_SERVICES | jq -rc '."aws-rds"[] | select(.name | contains("fac-db")) | .credentials.username')
+    export DAS_DB_NAME=$(echo $VCAP_SERVICES | jq -rc '."aws-rds"[] | select(.name | contains("fac-db")) | .credentials.db_name')
+    export DAS_DB_PASSWORD=$(echo $VCAP_SERVICES | jq -rc '."aws-rds"[] | select(.name | contains("fac-db")) | .credentials.password')
+    export PGPASSWORD="${DAS_DB_PASSWORD}"
+
+    # Note: aliases do not expand in non-interactive scripts; the PATH export
+    # below is what actually makes psql available here.
+    alias psql='/home/vcap/deps/0/apt/usr/lib/postgresql/*/bin/psql'
+    export PATH=/home/vcap/deps/0/apt/usr/lib/postgresql/15/bin:$PATH
+fi
+
+####################################################
+# FUNCTIONS
+# These keep things further down more readable.
+install_aws_cli() {
+    rm -rf /tmp/aws-cli
+    cd /tmp
+    curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
+    unzip -u -q awscliv2.zip
+    /tmp/aws/install -i /tmp/aws-cli -b /tmp/sym --update
+    if [ $? -ne 0 ]; then
+        echo "FAILED TO INSTALL AWS CLI"
+        exit 1
+    fi
+    rm -f awscliv2.zip
+    rm -rf aws
+}
+
+cleanup_aws() {
+    rm -rf /tmp/aws-cli
+}
+
+SAVED_PROXY=""
+unset_proxy() {
+    SAVED_PROXY="${https_proxy}"
+    unset https_proxy
+    # echo "PROXY AFTER UNSET: ${https_proxy}"
+}
+
+restore_proxy() {
+    export https_proxy="${SAVED_PROXY}"
+    # echo "PROXY AFTER RESTORE: ${https_proxy}"
+}
+
+download_full_csv() {
+    local endpoint=$1
+
+    echo "downloading csv for ${endpoint}"
+    env PGOPTIONS='-c client_min_messages=WARNING' psql \
+        -d "${DAS_DB_NAME}" \
+        -h "${DAS_DB_HOST}" \
+        -U "${DAS_DB_USER}" \
+        -t -A \
+        -c "\COPY (SELECT * FROM ${API_VERSION}.${endpoint}) TO '${ROOT}/${endpoint}.csv' WITH (FORMAT CSV, HEADER, DELIMITER ',');" \
+        |& grep -v "has_tribal"
+
+    # $? is grep's exit status here; PIPESTATUS[0] holds psql's.
+    if [ "${PIPESTATUS[0]}" -ne 0 ]; then
+        echo "PSQL FAILED IN FULL TABLE DOWNLOAD"
+        exit 1
+    fi
+}
+
+# NEAT TRICK
+# Note the pipe-ampersand (|&) in the psql calls above and below: it pipes
+# stderr along with stdout, and the "grep -v" then drops every line that
+# matches. https://stackoverflow.com/a/16321435
+# We do this to quiet some INFO messages from psql.
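+# A minimal sketch of that pattern, using a hypothetical command name, purely
+# for reference:
+#
+#   some_command |& grep -v "NOISE"
+#   if [ "${PIPESTATUS[0]}" -ne 0 ]; then
+#       echo "some_command failed"
+#   fi
+#
+# A plain pipe (|) would only forward stdout; |& forwards stderr as well, which
+# is where psql prints its chatter. Because grep ends the pipeline, $? holds
+# grep's status, so the functions check PIPESTATUS[0] to see whether psql
+# itself failed.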
+download_date_range_csv () {
+    local endpoint=$1
+    local start_date=$2
+    local end_date=$3
+
+    local query="\COPY "
+    query+="( SELECT * FROM ${API_VERSION}.${endpoint} "
+    query+="  WHERE report_id in ( "
+    query+="    SELECT report_id from ${API_VERSION}.general "
+    query+="    WHERE fac_accepted_date >= '${start_date}' "
+    query+="    AND fac_accepted_date <= '${end_date}' "
+    query+=")) "
+    query+="TO '${ROOT}/${endpoint}.csv' "
+    query+="WITH (FORMAT CSV, HEADER, DELIMITER ',');"
+
+    echo "${query}"
+
+    echo "downloading date range csv for ${endpoint}"
+    # TODO: Consider adding a quiet flag or routing output to /dev/null to suppress
+    # warnings like "INFO: api_v1_1_0 has_tribal f". We will see a lot of them
+    # because we're using the API in an un-authenticated way (which is intentional!).
+    env PGOPTIONS='-c client_min_messages=WARNING' psql \
+        -d "${DAS_DB_NAME}" \
+        -h "${DAS_DB_HOST}" \
+        -U "${DAS_DB_USER}" \
+        -t -A \
+        -c "${query}" |& \
+        grep -v "has_tribal"
+
+    # $? is grep's exit status here; PIPESTATUS[0] holds psql's.
+    if [ "${PIPESTATUS[0]}" -ne 0 ]; then
+        echo "PSQL FAILED IN DATE RANGE DOWNLOAD"
+        exit 1
+    fi
+}
+
+copy_to_s3() {
+    local endpoint=$1
+    local path_in_s3=$2
+
+    if [[ "$ENV" = "LOCAL" ]]; then
+        # Use the low-level API to avoid multipart uploads.
+        # (MinIO does not like multipart.)
+        cmd=$(echo ${AWSCLI} s3api put-object --bucket ${BUCKET} --key "${path_in_s3}" --body "${ROOT}/${endpoint}.csv")
+        echo $cmd
+        eval $cmd
+    else
+        cmd=$(echo ${AWSCLI} s3 cp "${ROOT}/${endpoint}.csv" "s3://${BUCKET}/${path_in_s3}")
+        # cmd=$(echo ${AWSCLI} s3api put-object --bucket ${BUCKET} --key "${path_in_s3}" --body "${ROOT}/${endpoint}.csv")
+        echo $cmd
+        eval $cmd
+
+        if [ $? -ne 0 ]; then
+            echo "S3 COPY FAILED FOR ${path_in_s3}"
+        fi
+    fi
+}
+
+####################################################
+# Install the AWS CLI
+install_aws_cli
+
+####################################################
+# Unset the proxy (so we can upload to S3)
+unset_proxy
+
+####################################################
+# Create a destination directory (in the container/cloud env.)
+mkdir -p "${ROOT}"
+
+
+####################################################
+# In a loop, download, upload, and delete the full CSVs.
+# public-data/gsa/full/{table_name}.csv
+for endpoint in "${endpoints[@]}"; do
+    # Download the endpoint from Postgres as a CSV.
+    download_full_csv ${endpoint}
+    copy_to_s3 ${endpoint} "public-data/gsa/full/${endpoint}.csv"
+    rm -f "${ROOT}/${endpoint}.csv"
+done
+
+####################################################
+# Federal fiscal year CSVs.
+# public-data/gsa/federal-fiscal-year/{audit_year}-ffy-{table_name}.csv
+# Note: the file labeled {year}-ffy covers Oct 1 of {year} through Sep 30 of
+# the following year.
+for endpoint in "${endpoints[@]}"; do
+    for year in $(seq 2015 $(date -d "+2 year" +'%Y')); do
+        # Download the endpoint from Postgres as a CSV.
+        start_date="${year}-10-01"
+        next_year=$((year + 1))
+        end_date="${next_year}-09-30"
+        download_date_range_csv ${endpoint} ${start_date} ${end_date}
+        copy_to_s3 ${endpoint} "public-data/gsa/federal-fiscal-year/${year}-ffy-${endpoint}.csv"
+        rm -f "${ROOT}/${endpoint}.csv"
+    done
+done
+
+####################################################
+# Audit year CSVs.
+# public-data/gsa/audit-year/{audit_year}-ay-{table_name}.csv
+for endpoint in "${endpoints[@]}"; do
+    for year in $(seq 2015 $(date -d "+2 year" +'%Y')); do
+        # Download the endpoint from Postgres as a CSV.
+        start_date="${year}-01-01"
+        end_date="${year}-12-31"
+        download_date_range_csv ${endpoint} ${start_date} ${end_date}
+        copy_to_s3 ${endpoint} "public-data/gsa/audit-year/${year}-ay-${endpoint}.csv"
+        rm -f "${ROOT}/${endpoint}.csv"
+    done
+done
+
+
+####################################################
+# Don't leave things lying around when you're done.
+cleanup_aws