From 6f70c1e307944820a13c0238f62508bbada373c6 Mon Sep 17 00:00:00 2001 From: skye rogers Date: Tue, 2 Dec 2025 17:12:31 -0800 Subject: [PATCH 1/4] robust clean up strategy and added dedicated sample.properties file --- .github/resources/sample.properties | 39 ++++++++++++++++++++++++ .github/scripts/clean_up_stream_table.sh | 34 ++++++++++++++------- .github/scripts/manipulate_properties.sh | 18 ++++------- .github/scripts/start_kcl.sh | 6 ++-- 4 files changed, 71 insertions(+), 26 deletions(-) create mode 100644 .github/resources/sample.properties diff --git a/.github/resources/sample.properties b/.github/resources/sample.properties new file mode 100644 index 00000000..3e1a3356 --- /dev/null +++ b/.github/resources/sample.properties @@ -0,0 +1,39 @@ +# The script that abides by the multi-language protocol. This script will +# be executed by the MultiLangDaemon, which will communicate with this script +# over STDIN and STDOUT according to the multi-language protocol. +executableName = sample_kclpy_app.py + +# The Stream arn: arn:aws:kinesis:::stream/ +# Important: streamArn takes precedence over streamName if both are set +streamArn = arn:aws:kinesis:us-east-1:000000000000:stream/kclpysample + +# The name of an Amazon Kinesis stream to process. +# Important: streamArn takes precedence over streamName if both are set +streamName = kclpysample + +# Used by the KCL as the name of this application. Will be used as the name +# of an Amazon DynamoDB table which will store the lease and checkpoint +# information for workers with this application name +applicationName = PythonKCLSample + +# Users can change the credentials provider the KCL will use to retrieve credentials. +# Expected key name (case-sensitive): +# AwsCredentialsProvider / AwsCredentialsProviderDynamoDB / AwsCredentialsProviderCloudWatch +# The DefaultCredentialsProvider checks several other providers, which is +# described here: +# https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/auth/credentials/DefaultCredentialsProvider.html +AwsCredentialsProvider = DefaultCredentialsProvider + +# Appended to the user agent of the KCL. Does not impact the functionality of the +# KCL in any other way. +processingLanguage = python/3.8 + +# Valid options at TRIM_HORIZON or LATEST. +# See http://docs.aws.amazon.com/kinesis/latest/APIReference/API_GetShardIterator.html#API_GetShardIterator_RequestSyntax +initialPositionInStream = TRIM_HORIZON + +# The KCL defaults to us-east-1 +regionName = us-east-1 + +# Idle time between record reads in milliseconds. +idleTimeBetweenReadsInMillis = 250 diff --git a/.github/scripts/clean_up_stream_table.sh b/.github/scripts/clean_up_stream_table.sh index ee8967f6..a5ff3a13 100644 --- a/.github/scripts/clean_up_stream_table.sh +++ b/.github/scripts/clean_up_stream_table.sh @@ -1,19 +1,31 @@ #!/bin/bash -aws kinesis delete-stream --stream-name $STREAM_NAME || true +# Delete stream +if aws kinesis describe-stream --stream-name $STREAM_NAME &>/dev/null; then + echo "Deleting stream $STREAM_NAME" + for i in {1..10}; do + aws kinesis delete-stream --stream-name $STREAM_NAME && break || + echo "Stream deletion failed, attempt $i/10. Retrying Stream deletion in $((i * 3))s" && sleep $((i * 3)) + done +else + echo "Stream $STREAM_NAME does not exist and does not need to be cleaned up" +fi -# Delete all tables -for i in {1..10}; do - echo "Deleting table $APP_NAME" - aws dynamodb delete-table --table-name $APP_NAME && break || - echo "Table deletion failed, attempt $i/10. Retrying DynamoDB Table deletion in $((i * 3)) seconds" && sleep $((i * 3)) -done -for SUFFIX in "-CoordinatorState" "-WorkerMetricStats"; do - if aws dynamodb describe-table --table-name $APP_NAME$SUFFIX &>/dev/null; then - echo "Deleting table $APP_NAME$SUFFIX" +# Delete table +delete_table() { + table_name=$1 + if aws dynamodb describe-table --table-name $table_name &>/dev/null; then + echo "Deleting table $table_name" for i in {1..10}; do - aws dynamodb delete-table --table-name $APP_NAME$SUFFIX && break || + aws dynamodb delete-table --table-name $table_name && break || echo "Table deletion failed, attempt $i/10. Retrying DynamoDB Table deletion in $((i * 3))s" && sleep $((i * 3)) done + else + echo "Table $table_name does not exist and does not need to be cleaned up" fi +} + +# Delete all tables +for SUFFIX in "" "-CoordinatorState" "-WorkerMetricStats"; do + delete_table "$APP_NAME$SUFFIX" done \ No newline at end of file diff --git a/.github/scripts/manipulate_properties.sh b/.github/scripts/manipulate_properties.sh index 92b07f44..a461b373 100644 --- a/.github/scripts/manipulate_properties.sh +++ b/.github/scripts/manipulate_properties.sh @@ -4,26 +4,20 @@ set -e # Manipulate sample.properties file that the KCL application pulls properties from (ex: streamName, applicationName) # Depending on the OS, different properties need to be changed if [[ "$RUNNER_OS" == "macOS" ]]; then - sed -i "" "s/kclpysample/$STREAM_NAME/g" samples/sample.properties - sed -i "" "s/PythonKCLSample/$APP_NAME/g" samples/sample.properties - sed -i "" 's/us-east-5/us-east-1/g' samples/sample.properties - grep -v "idleTimeBetweenReadsInMillis" samples/sample.properties > samples/temp.properties - echo "idleTimeBetweenReadsInMillis = 250" >> samples/temp.properties - mv samples/temp.properties samples/sample.properties + sed -i "" "s/kclpysample/$STREAM_NAME/g" .github/resources/sample.properties + sed -i "" "s/PythonKCLSample/$APP_NAME/g" .github/resources/sample.properties elif [[ "$RUNNER_OS" == "Linux" || "$RUNNER_OS" == "Windows" ]]; then - sed -i "s/kclpysample/$STREAM_NAME/g" samples/sample.properties - sed -i "s/PythonKCLSample/$APP_NAME/g" samples/sample.properties - sed -i 's/us-east-5/us-east-1/g' samples/sample.properties - sed -i "/idleTimeBetweenReadsInMillis/c\idleTimeBetweenReadsInMillis = 250" samples/sample.properties + sed -i "s/kclpysample/$STREAM_NAME/g" .github/resources/sample.properties + sed -i "s/PythonKCLSample/$APP_NAME/g" .github/resources/sample.properties if [[ "$RUNNER_OS" == "Windows" ]]; then echo '@echo off' > samples/run_script.bat echo 'python %~dp0\sample_kclpy_app.py %*' >> samples/run_script.bat - sed -i 's/executableName = sample_kclpy_app.py/executableName = samples\/run_script.bat/' samples/sample.properties + sed -i 's/executableName = sample_kclpy_app.py/executableName = samples\/run_script.bat/' .github/resources/sample.properties fi else echo "Unknown OS: $RUNNER_OS" exit 1 fi -cat samples/sample.properties \ No newline at end of file +cat .github/resources/sample.properties \ No newline at end of file diff --git a/.github/scripts/start_kcl.sh b/.github/scripts/start_kcl.sh index e8f8c273..097dda25 100644 --- a/.github/scripts/start_kcl.sh +++ b/.github/scripts/start_kcl.sh @@ -2,7 +2,7 @@ set -e set -o pipefail -chmod +x samples/sample.properties +chmod +x .github/resources/sample.properties chmod +x samples/sample_kclpy_app.py # Get records from stream to verify they exist before continuing @@ -14,10 +14,10 @@ echo "Found $RECORD_COUNT_BEFORE records in stream before KCL start" if [[ "$RUNNER_OS" == "macOS" ]]; then brew install coreutils - KCL_COMMAND=$(amazon_kclpy_helper.py --print_command --java $(which java) --properties samples/sample.properties) + KCL_COMMAND=$(amazon_kclpy_helper.py --print_command --java $(which java) --properties .github/resources/sample.properties) gtimeout $RUN_TIME_SECONDS $KCL_COMMAND 2>&1 | tee kcl_output.log || [ $? -eq 124 ] elif [[ "$RUNNER_OS" == "Linux" || "$RUNNER_OS" == "Windows" ]]; then - KCL_COMMAND=$(amazon_kclpy_helper.py --print_command --java $(which java) --properties samples/sample.properties) + KCL_COMMAND=$(amazon_kclpy_helper.py --print_command --java $(which java) --properties .github/resources/sample.properties) timeout $RUN_TIME_SECONDS $KCL_COMMAND 2>&1 | tee kcl_output.log || [ $? -eq 124 ] else echo "Unknown OS: $RUNNER_OS" From fb0a67135fb2cac26de2fc77fc2309716a0b281c Mon Sep 17 00:00:00 2001 From: skye rogers Date: Tue, 2 Dec 2025 17:19:01 -0800 Subject: [PATCH 2/4] echo table contents before deletion --- .github/scripts/clean_up_stream_table.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/scripts/clean_up_stream_table.sh b/.github/scripts/clean_up_stream_table.sh index a5ff3a13..cabaf3a5 100644 --- a/.github/scripts/clean_up_stream_table.sh +++ b/.github/scripts/clean_up_stream_table.sh @@ -15,6 +15,8 @@ fi delete_table() { table_name=$1 if aws dynamodb describe-table --table-name $table_name &>/dev/null; then + echo "=== Table contents for $table_name before deletion ===" + aws dynamodb scan --table-name $table_name --output json echo "Deleting table $table_name" for i in {1..10}; do aws dynamodb delete-table --table-name $table_name && break || From a6f89164ffb0fb7ad8f9488a387b44dea88120a4 Mon Sep 17 00:00:00 2001 From: skye rogers Date: Tue, 2 Dec 2025 17:43:06 -0800 Subject: [PATCH 3/4] remove redundant records check --- .github/scripts/clean_up_stream_table.sh | 2 -- .github/scripts/start_kcl.sh | 7 ------- 2 files changed, 9 deletions(-) diff --git a/.github/scripts/clean_up_stream_table.sh b/.github/scripts/clean_up_stream_table.sh index cabaf3a5..a5ff3a13 100644 --- a/.github/scripts/clean_up_stream_table.sh +++ b/.github/scripts/clean_up_stream_table.sh @@ -15,8 +15,6 @@ fi delete_table() { table_name=$1 if aws dynamodb describe-table --table-name $table_name &>/dev/null; then - echo "=== Table contents for $table_name before deletion ===" - aws dynamodb scan --table-name $table_name --output json echo "Deleting table $table_name" for i in {1..10}; do aws dynamodb delete-table --table-name $table_name && break || diff --git a/.github/scripts/start_kcl.sh b/.github/scripts/start_kcl.sh index 097dda25..64c3500f 100644 --- a/.github/scripts/start_kcl.sh +++ b/.github/scripts/start_kcl.sh @@ -5,13 +5,6 @@ set -o pipefail chmod +x .github/resources/sample.properties chmod +x samples/sample_kclpy_app.py -# Get records from stream to verify they exist before continuing -SHARD_ITERATOR=$(aws kinesis get-shard-iterator --stream-name $STREAM_NAME --shard-id shardId-000000000000 --shard-iterator-type TRIM_HORIZON --query 'ShardIterator' --output text) -INITIAL_RECORDS=$(aws kinesis get-records --shard-iterator $SHARD_ITERATOR) -RECORD_COUNT_BEFORE=$(echo $INITIAL_RECORDS | jq '.Records | length') - -echo "Found $RECORD_COUNT_BEFORE records in stream before KCL start" - if [[ "$RUNNER_OS" == "macOS" ]]; then brew install coreutils KCL_COMMAND=$(amazon_kclpy_helper.py --print_command --java $(which java) --properties .github/resources/sample.properties) From 3f1fd534c60f21c04955dd72ec9d2b8d73c984a5 Mon Sep 17 00:00:00 2001 From: skye rogers Date: Thu, 4 Dec 2025 09:55:01 -0800 Subject: [PATCH 4/4] refactor variables and file names and remove streamArn --- ...ample.properties => github_workflow.properties} | 8 ++------ .github/scripts/manipulate_properties.sh | 14 +++++++------- .github/scripts/start_kcl.sh | 6 +++--- .github/workflows/python.yml | 4 ++-- 4 files changed, 14 insertions(+), 18 deletions(-) rename .github/resources/{sample.properties => github_workflow.properties} (84%) diff --git a/.github/resources/sample.properties b/.github/resources/github_workflow.properties similarity index 84% rename from .github/resources/sample.properties rename to .github/resources/github_workflow.properties index 3e1a3356..aa039e8e 100644 --- a/.github/resources/sample.properties +++ b/.github/resources/github_workflow.properties @@ -3,18 +3,14 @@ # over STDIN and STDOUT according to the multi-language protocol. executableName = sample_kclpy_app.py -# The Stream arn: arn:aws:kinesis:::stream/ -# Important: streamArn takes precedence over streamName if both are set -streamArn = arn:aws:kinesis:us-east-1:000000000000:stream/kclpysample - # The name of an Amazon Kinesis stream to process. # Important: streamArn takes precedence over streamName if both are set -streamName = kclpysample +streamName = STREAM_NAME_PLACEHOLDER # Used by the KCL as the name of this application. Will be used as the name # of an Amazon DynamoDB table which will store the lease and checkpoint # information for workers with this application name -applicationName = PythonKCLSample +applicationName = APP_NAME_PLACEHOLDER # Users can change the credentials provider the KCL will use to retrieve credentials. # Expected key name (case-sensitive): diff --git a/.github/scripts/manipulate_properties.sh b/.github/scripts/manipulate_properties.sh index a461b373..1b50fab2 100644 --- a/.github/scripts/manipulate_properties.sh +++ b/.github/scripts/manipulate_properties.sh @@ -1,23 +1,23 @@ #!/bin/bash set -e -# Manipulate sample.properties file that the KCL application pulls properties from (ex: streamName, applicationName) +# Manipulate github_workflow.properties file that the KCL application pulls properties from (ex: streamName, applicationName) # Depending on the OS, different properties need to be changed if [[ "$RUNNER_OS" == "macOS" ]]; then - sed -i "" "s/kclpysample/$STREAM_NAME/g" .github/resources/sample.properties - sed -i "" "s/PythonKCLSample/$APP_NAME/g" .github/resources/sample.properties + sed -i "" "s/STREAM_NAME_PLACEHOLDER/$STREAM_NAME/g" .github/resources/github_workflow.properties + sed -i "" "s/APP_NAME_PLACEHOLDER/$APP_NAME/g" .github/resources/github_workflow.properties elif [[ "$RUNNER_OS" == "Linux" || "$RUNNER_OS" == "Windows" ]]; then - sed -i "s/kclpysample/$STREAM_NAME/g" .github/resources/sample.properties - sed -i "s/PythonKCLSample/$APP_NAME/g" .github/resources/sample.properties + sed -i "s/STREAM_NAME_PLACEHOLDER/$STREAM_NAME/g" .github/resources/github_workflow.properties + sed -i "s/APP_NAME_PLACEHOLDER/$APP_NAME/g" .github/resources/github_workflow.properties if [[ "$RUNNER_OS" == "Windows" ]]; then echo '@echo off' > samples/run_script.bat echo 'python %~dp0\sample_kclpy_app.py %*' >> samples/run_script.bat - sed -i 's/executableName = sample_kclpy_app.py/executableName = samples\/run_script.bat/' .github/resources/sample.properties + sed -i 's/executableName = sample_kclpy_app.py/executableName = samples\/run_script.bat/' .github/resources/github_workflow.properties fi else echo "Unknown OS: $RUNNER_OS" exit 1 fi -cat .github/resources/sample.properties \ No newline at end of file +cat .github/resources/github_workflow.properties \ No newline at end of file diff --git a/.github/scripts/start_kcl.sh b/.github/scripts/start_kcl.sh index 64c3500f..2082c70d 100644 --- a/.github/scripts/start_kcl.sh +++ b/.github/scripts/start_kcl.sh @@ -2,15 +2,15 @@ set -e set -o pipefail -chmod +x .github/resources/sample.properties +chmod +x .github/resources/github_workflow.properties chmod +x samples/sample_kclpy_app.py if [[ "$RUNNER_OS" == "macOS" ]]; then brew install coreutils - KCL_COMMAND=$(amazon_kclpy_helper.py --print_command --java $(which java) --properties .github/resources/sample.properties) + KCL_COMMAND=$(amazon_kclpy_helper.py --print_command --java $(which java) --properties .github/resources/github_workflow.properties) gtimeout $RUN_TIME_SECONDS $KCL_COMMAND 2>&1 | tee kcl_output.log || [ $? -eq 124 ] elif [[ "$RUNNER_OS" == "Linux" || "$RUNNER_OS" == "Windows" ]]; then - KCL_COMMAND=$(amazon_kclpy_helper.py --print_command --java $(which java) --properties .github/resources/sample.properties) + KCL_COMMAND=$(amazon_kclpy_helper.py --print_command --java $(which java) --properties .github/resources/github_workflow.properties) timeout $RUN_TIME_SECONDS $KCL_COMMAND 2>&1 | tee kcl_output.log || [ $? -eq 124 ] else echo "Unknown OS: $RUNNER_OS" diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 0d05ac08..43f93e63 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -112,8 +112,8 @@ jobs: echo "STREAM_NAME=$STREAM_NAME" >> $GITHUB_ENV echo "APP_NAME=$APP_NAME" >> $GITHUB_ENV - # Manipulate sample.properties file to use unique stream name, application name, and OS specific program changes - - name: Manipulate sample.properties file + # Manipulate github_workflow.properties file to use unique stream name, application name, and OS specific program changes + - name: Manipulate github_workflow.properties file run: | chmod +x .github/scripts/manipulate_properties.sh .github/scripts/manipulate_properties.sh