From 01f33cd013fcfbef1f8169a37a82594ca09220a7 Mon Sep 17 00:00:00 2001 From: ljstella Date: Mon, 13 Oct 2025 09:39:19 -0400 Subject: [PATCH] Add runner group label --- .github/workflows/mirror_data_archive.yml | 67 +++---- .github/workflows/replay-datasets.yml | 209 +++++++++++----------- 2 files changed, 139 insertions(+), 137 deletions(-) diff --git a/.github/workflows/mirror_data_archive.yml b/.github/workflows/mirror_data_archive.yml index bf49b3311..f1f2f3f9a 100644 --- a/.github/workflows/mirror_data_archive.yml +++ b/.github/workflows/mirror_data_archive.yml @@ -2,44 +2,45 @@ name: mirror-archive-on-merge-to-default-branch on: push: - branches: - - master + branches: + - master jobs: mirror-archive: - runs-on: ubuntu-latest + runs-on: + group: attack-data-runners env: - BUCKET: attack-range-attack-data - ATTACK_DATA_ARCHIVE_FILE: attack_data.tar.zstd + BUCKET: attack-range-attack-data + ATTACK_DATA_ARCHIVE_FILE: attack_data.tar.zstd steps: - - name: Checkout Repo - uses: actions/checkout@v4 - # We must EXPLICITLY specificy lfs: true. It defaults to false - with: - lfs: true - - - name: Setup AWS CLI and Credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - aws-access-key-id: ${{ secrets.ACCESS_KEY}} - aws-secret-access-key: ${{ secrets.SECRET_ACCESS_KEY }} - aws-region: us-west-2 + - name: Checkout Repo + uses: actions/checkout@v4 + # We must EXPLICITLY specify lfs: true. It defaults to false + with: + lfs: true - - name: Create archive of ONLY the datasets folder - run: | - # The structure of the tar + zstd archive should mirror that of checking out the repo directly - mkdir attack_data - mv datasets/ attack_data/. 
- - #Build some metadata about the archive for documentation purposes - git rev-parse HEAD > attack_data/git_hash.txt - date -u > attack_data/cache_build_date.txt + - name: Setup AWS CLI and Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-access-key-id: ${{ secrets.ACCESS_KEY }} + aws-secret-access-key: ${{ secrets.SECRET_ACCESS_KEY }} + aws-region: us-west-2 - # Compress with number of threads equal to number of CPU cores. - # Compression level 10 is a great compromise of speed and file size. - # File size reductions are diminishing returns after this - determined experimentally. - tar -c attack_data | zstd --compress -T0 -10 -o $ATTACK_DATA_ARCHIVE_FILE + - name: Create archive of ONLY the datasets folder + run: | + # The structure of the tar + zstd archive should mirror that of checking out the repo directly + mkdir attack_data + mv datasets/ attack_data/. - - name: Upload Attack data archive file to S3 Bucket - run: | - aws s3 cp $ATTACK_DATA_ARCHIVE_FILE s3://$BUCKET/ \ No newline at end of file + # Build some metadata about the archive for documentation purposes + git rev-parse HEAD > attack_data/git_hash.txt + date -u > attack_data/cache_build_date.txt + + # Compress with number of threads equal to number of CPU cores. + # Compression level 10 is a great compromise of speed and file size. + # File size reductions are diminishing returns after this - determined experimentally. 
+ tar -c attack_data | zstd --compress -T0 -10 -o $ATTACK_DATA_ARCHIVE_FILE + + - name: Upload Attack data archive file to S3 Bucket + run: | + aws s3 cp $ATTACK_DATA_ARCHIVE_FILE s3://$BUCKET/ diff --git a/.github/workflows/replay-datasets.yml b/.github/workflows/replay-datasets.yml index e3ab77aa5..dc7084d71 100644 --- a/.github/workflows/replay-datasets.yml +++ b/.github/workflows/replay-datasets.yml @@ -2,7 +2,7 @@ name: Replay Changed Datasets to Splunk on: push: - branches: [ master ] + branches: [master] paths: - 'datasets/**' workflow_dispatch: @@ -14,115 +14,116 @@ on: jobs: replay-datasets: - runs-on: ubuntu-latest - + runs-on: + group: attack-data-runners + steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 # Fetch full history for file change detection - - - name: Pull Git LFS files - run: git lfs pull - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.9' - - - name: Install dependencies - run: | - cd bin - pip install -r requirements.txt - - - name: Find changed YAML files - id: changed-files - if: github.event_name != 'workflow_dispatch' || github.event.inputs.dataset_path == '' - run: | - # Get list of changed YAML files in datasets directory - if [ "${{ github.event_name }}" = "pull_request" ]; then - # For PR, compare against base branch - BASE_SHA="${{ github.event.pull_request.base.sha }}" - HEAD_SHA="${{ github.event.pull_request.head.sha }}" - echo "Comparing PR: $BASE_SHA...$HEAD_SHA" - YAML_FILES=$(python bin/find_changed_datasets.py --base-sha $BASE_SHA --head-sha $HEAD_SHA --output files 2>/dev/null || echo "") - else - # For push, compare against previous commit - BASE_SHA="${{ github.event.before }}" - HEAD_SHA="${{ github.sha }}" - echo "Comparing push: $BASE_SHA...$HEAD_SHA" - YAML_FILES=$(python bin/find_changed_datasets.py --base-sha $BASE_SHA --head-sha $HEAD_SHA --output files 2>/dev/null || echo "") - fi - - if [ -z "$YAML_FILES" ]; then - echo "No YAML 
dataset files changed" - echo "yaml_files=" >> $GITHUB_OUTPUT - else - echo "Changed YAML files:" - echo "$YAML_FILES" - # Convert newlines to spaces for easier handling - YAML_FILES_SPACE=$(echo "$YAML_FILES" | tr '\n' ' ') - echo "yaml_files=$YAML_FILES_SPACE" >> $GITHUB_OUTPUT - fi - - - name: Set manual dataset path - id: manual-path - if: github.event_name == 'workflow_dispatch' && github.event.inputs.dataset_path != '' - run: | - # For manual dispatch, find YAML files in the specified path - if [ -f "${{ github.event.inputs.dataset_path }}" ]; then - # Single file provided - echo "yaml_files=${{ github.event.inputs.dataset_path }}" >> $GITHUB_OUTPUT - else - # Directory provided - find YAML files - YAML_FILES=$(python bin/find_changed_datasets.py --directory "${{ github.event.inputs.dataset_path }}" --output files 2>/dev/null || echo "") - if [ -n "$YAML_FILES" ]; then + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Fetch full history for file change detection + + - name: Pull Git LFS files + run: git lfs pull + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.9' + + - name: Install dependencies + run: | + cd bin + pip install -r requirements.txt + + - name: Find changed YAML files + id: changed-files + if: github.event_name != 'workflow_dispatch' || github.event.inputs.dataset_path == '' + run: | + # Get list of changed YAML files in datasets directory + if [ "${{ github.event_name }}" = "pull_request" ]; then + # For PR, compare against base branch + BASE_SHA="${{ github.event.pull_request.base.sha }}" + HEAD_SHA="${{ github.event.pull_request.head.sha }}" + echo "Comparing PR: $BASE_SHA...$HEAD_SHA" + YAML_FILES=$(python bin/find_changed_datasets.py --base-sha $BASE_SHA --head-sha $HEAD_SHA --output files 2>/dev/null || echo "") + else + # For push, compare against previous commit + BASE_SHA="${{ github.event.before }}" + HEAD_SHA="${{ github.sha }}" + echo "Comparing push: 
$BASE_SHA...$HEAD_SHA" + YAML_FILES=$(python bin/find_changed_datasets.py --base-sha $BASE_SHA --head-sha $HEAD_SHA --output files 2>/dev/null || echo "") + fi + + if [ -z "$YAML_FILES" ]; then + echo "No YAML dataset files changed" + echo "yaml_files=" >> $GITHUB_OUTPUT + else + echo "Changed YAML files:" + echo "$YAML_FILES" + # Convert newlines to spaces for easier handling YAML_FILES_SPACE=$(echo "$YAML_FILES" | tr '\n' ' ') echo "yaml_files=$YAML_FILES_SPACE" >> $GITHUB_OUTPUT + fi + + - name: Set manual dataset path + id: manual-path + if: github.event_name == 'workflow_dispatch' && github.event.inputs.dataset_path != '' + run: | + # For manual dispatch, find YAML files in the specified path + if [ -f "${{ github.event.inputs.dataset_path }}" ]; then + # Single file provided + echo "yaml_files=${{ github.event.inputs.dataset_path }}" >> $GITHUB_OUTPUT else - echo "yaml_files=" >> $GITHUB_OUTPUT + # Directory provided - find YAML files + YAML_FILES=$(python bin/find_changed_datasets.py --directory "${{ github.event.inputs.dataset_path }}" --output files 2>/dev/null || echo "") + if [ -n "$YAML_FILES" ]; then + YAML_FILES_SPACE=$(echo "$YAML_FILES" | tr '\n' ' ') + echo "yaml_files=$YAML_FILES_SPACE" >> $GITHUB_OUTPUT + else + echo "yaml_files=" >> $GITHUB_OUTPUT + fi fi - fi - - - name: Replay datasets to Splunk - if: steps.changed-files.outputs.yaml_files != '' || steps.manual-path.outputs.yaml_files != '' - env: - SPLUNK_HOST: ${{ secrets.SPLUNK_HOST }} - SPLUNK_HEC_TOKEN: ${{ secrets.SPLUNK_HEC_TOKEN }} - run: | - # Get the YAML files to process - if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then - YAML_FILES="${{ steps.manual-path.outputs.yaml_files }}" - else - YAML_FILES="${{ steps.changed-files.outputs.yaml_files }}" - fi - - if [ -z "$YAML_FILES" ]; then - echo "No YAML files to process" - exit 0 - fi - - echo "Processing YAML files: $YAML_FILES" - - # Run replay script with all YAML files - # The replay script now reads all metadata from 
the YAML files themselves - python bin/replay.py $YAML_FILES || echo "Failed to replay some datasets" - - - name: Summary - if: always() - run: | - if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then - YAML_FILES="${{ steps.manual-path.outputs.yaml_files }}" - if [ -n "$YAML_FILES" ]; then - echo "Manual replay completed for YAML files: $YAML_FILES" + + - name: Replay datasets to Splunk + if: steps.changed-files.outputs.yaml_files != '' || steps.manual-path.outputs.yaml_files != '' + env: + SPLUNK_HOST: ${{ secrets.SPLUNK_HOST }} + SPLUNK_HEC_TOKEN: ${{ secrets.SPLUNK_HEC_TOKEN }} + run: | + # Get the YAML files to process + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + YAML_FILES="${{ steps.manual-path.outputs.yaml_files }}" else - echo "No YAML files found in specified path: ${{ github.event.inputs.dataset_path }}" + YAML_FILES="${{ steps.changed-files.outputs.yaml_files }}" fi - else - YAML_FILES="${{ steps.changed-files.outputs.yaml_files }}" - if [ -n "$YAML_FILES" ]; then - echo "Automated replay completed for changed YAML files: $YAML_FILES" + + if [ -z "$YAML_FILES" ]; then + echo "No YAML files to process" + exit 0 + fi + + echo "Processing YAML files: $YAML_FILES" + + # Run replay script with all YAML files + # The replay script now reads all metadata from the YAML files themselves + python bin/replay.py $YAML_FILES || echo "Failed to replay some datasets" + + - name: Summary + if: always() + run: | + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + YAML_FILES="${{ steps.manual-path.outputs.yaml_files }}" + if [ -n "$YAML_FILES" ]; then + echo "Manual replay completed for YAML files: $YAML_FILES" + else + echo "No YAML files found in specified path: ${{ github.event.inputs.dataset_path }}" + fi else - echo "No YAML dataset changes detected, no replay needed" + YAML_FILES="${{ steps.changed-files.outputs.yaml_files }}" + if [ -n "$YAML_FILES" ]; then + echo "Automated replay completed for changed YAML files: 
$YAML_FILES" + else + echo "No YAML dataset changes detected, no replay needed" + fi fi - fi