From 473501320fd184890e4da8a40d67cd335f85083b Mon Sep 17 00:00:00 2001 From: Patrick Bareiss Date: Thu, 21 Aug 2025 10:16:44 +0200 Subject: [PATCH 1/2] replay improvements --- .github/workflows/replay-datasets.yml | 129 ++++++++++++++++ bin/find_changed_datasets.py | 204 ++++++++++++++++++++++++++ bin/replay.py | 138 ++++++++++------- 3 files changed, 421 insertions(+), 50 deletions(-) create mode 100644 .github/workflows/replay-datasets.yml create mode 100644 bin/find_changed_datasets.py diff --git a/.github/workflows/replay-datasets.yml b/.github/workflows/replay-datasets.yml new file mode 100644 index 00000000..ea575308 --- /dev/null +++ b/.github/workflows/replay-datasets.yml @@ -0,0 +1,129 @@ +name: Replay Changed Datasets to Splunk + +on: + push: + branches: [ main, master ] + paths: + - 'datasets/**' + pull_request: + branches: [ main, master ] + paths: + - 'datasets/**' + workflow_dispatch: + inputs: + dataset_path: + description: 'Specific dataset path to replay (optional, defaults to all changed files)' + required: false + type: string + +jobs: + replay-datasets: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Fetch full history for file change detection + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.9' + + - name: Install dependencies + run: | + cd bin + pip install -r requirements.txt + + - name: Find changed YAML files + id: changed-files + if: github.event_name != 'workflow_dispatch' || github.event.inputs.dataset_path == '' + run: | + # Get list of changed YAML files in datasets directory + if [ "${{ github.event_name }}" = "pull_request" ]; then + # For PR, compare against base branch + BASE_SHA="${{ github.event.pull_request.base.sha }}" + HEAD_SHA="${{ github.event.pull_request.head.sha }}" + echo "Comparing PR: $BASE_SHA...$HEAD_SHA" + YAML_FILES=$(python bin/find_changed_datasets.py --base-sha $BASE_SHA --head-sha $HEAD_SHA --output 
files 2>/dev/null || echo "") + else + # For push, compare against previous commit + BASE_SHA="${{ github.event.before }}" + HEAD_SHA="${{ github.sha }}" + echo "Comparing push: $BASE_SHA...$HEAD_SHA" + YAML_FILES=$(python bin/find_changed_datasets.py --base-sha $BASE_SHA --head-sha $HEAD_SHA --output files 2>/dev/null || echo "") + fi + + if [ -z "$YAML_FILES" ]; then + echo "No YAML dataset files changed" + echo "yaml_files=" >> $GITHUB_OUTPUT + else + echo "Changed YAML files:" + echo "$YAML_FILES" + # Convert newlines to spaces for easier handling + YAML_FILES_SPACE=$(echo "$YAML_FILES" | tr '\n' ' ') + echo "yaml_files=$YAML_FILES_SPACE" >> $GITHUB_OUTPUT + fi + + - name: Set manual dataset path + id: manual-path + if: github.event_name == 'workflow_dispatch' && github.event.inputs.dataset_path != '' + run: | + # For manual dispatch, find YAML files in the specified path + if [ -f "${{ github.event.inputs.dataset_path }}" ]; then + # Single file provided + echo "yaml_files=${{ github.event.inputs.dataset_path }}" >> $GITHUB_OUTPUT + else + # Directory provided - find YAML files + YAML_FILES=$(python bin/find_changed_datasets.py --directory "${{ github.event.inputs.dataset_path }}" --output files 2>/dev/null || echo "") + if [ -n "$YAML_FILES" ]; then + YAML_FILES_SPACE=$(echo "$YAML_FILES" | tr '\n' ' ') + echo "yaml_files=$YAML_FILES_SPACE" >> $GITHUB_OUTPUT + else + echo "yaml_files=" >> $GITHUB_OUTPUT + fi + fi + + - name: Replay datasets to Splunk + if: steps.changed-files.outputs.yaml_files != '' || steps.manual-path.outputs.yaml_files != '' + env: + SPLUNK_HOST: ${{ secrets.SPLUNK_HOST }} + SPLUNK_HEC_TOKEN: ${{ secrets.SPLUNK_HEC_TOKEN }} + run: | + # Get the YAML files to process + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + YAML_FILES="${{ steps.manual-path.outputs.yaml_files }}" + else + YAML_FILES="${{ steps.changed-files.outputs.yaml_files }}" + fi + + if [ -z "$YAML_FILES" ]; then + echo "No YAML files to process" + exit 0 + fi 
+ + echo "Processing YAML files: $YAML_FILES" + + # Run replay script with all YAML files + # The replay script now reads all metadata from the YAML files themselves + python bin/replay.py $YAML_FILES || echo "Failed to replay some datasets" + + - name: Summary + if: always() + run: | + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + YAML_FILES="${{ steps.manual-path.outputs.yaml_files }}" + if [ -n "$YAML_FILES" ]; then + echo "Manual replay completed for YAML files: $YAML_FILES" + else + echo "No YAML files found in specified path: ${{ github.event.inputs.dataset_path }}" + fi + else + YAML_FILES="${{ steps.changed-files.outputs.yaml_files }}" + if [ -n "$YAML_FILES" ]; then + echo "Automated replay completed for changed YAML files: $YAML_FILES" + else + echo "No YAML dataset changes detected, no replay needed" + fi + fi diff --git a/bin/find_changed_datasets.py b/bin/find_changed_datasets.py new file mode 100644 index 00000000..253a11dd --- /dev/null +++ b/bin/find_changed_datasets.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python3 +""" +Script to identify changed or added YAML dataset files for replay. +This script simplifies the bash logic from the GitHub Actions workflow. 
# NOTE(review): in the original patch a closing module-docstring `"""` sat
# immediately above this import block; it is omitted here because the
# surrounding patch bytes are mangled and this unit is reconstructed as
# standalone, properly formatted code.
import sys
import argparse  # used by main() further down in this script
import shlex
import subprocess
from pathlib import Path


def run_git_command(cmd):
    """Run a git command given as a single string; return its stripped stdout.

    On any failure an explanation is printed to *stderr* and "" is returned.
    stdout must stay clean: the CI workflow captures this script's stdout
    with $(...) to build the list of YAML files, so diagnostics on stdout
    would be mistaken for file paths.
    """
    try:
        # shlex.split + shell=False executes git directly, so SHAs / branch
        # names coming from CI event payloads cannot inject shell syntax.
        result = subprocess.run(
            shlex.split(cmd), capture_output=True, text=True, check=True
        )
        return result.stdout.strip()
    except subprocess.CalledProcessError as exc:
        print(f"Git command failed: {cmd}", file=sys.stderr)
        print(f"Error: {exc.stderr}", file=sys.stderr)
        return ""
    except OSError as exc:  # e.g. git binary not installed
        print(f"Git command failed: {cmd}", file=sys.stderr)
        print(f"Error: {exc}", file=sys.stderr)
        return ""


def find_changed_files(base_sha, head_sha):
    """Return paths under datasets/ that changed between two commits.

    Uses `git diff --name-only base...head` (three-dot: diff against the
    merge base). Returns [] when either SHA is missing, the git command
    fails, or nothing under datasets/ changed.
    """
    if not base_sha or not head_sha:
        # stderr, not stdout — see run_git_command for why.
        print("Error: Both base and head SHA are required", file=sys.stderr)
        return []

    output = run_git_command(f"git diff --name-only {base_sha}...{head_sha}")
    if not output:
        return []

    # Keep only non-empty lines that live in the datasets/ tree.
    return [
        line.strip()
        for line in output.split('\n')
        if line.strip() and line.startswith('datasets/')
    ]


def find_yaml_files_in_directories(changed_files):
    """Map each changed file to the nearest ancestor directory holding YAML.

    Starting from the changed file's directory, walk up the tree (stopping
    before the datasets/ root or ".") and record the first directory that
    directly contains *.yml or *.yaml files. Returns a sorted, de-duplicated
    list of directory paths as strings.
    """
    yaml_dirs = set()

    for file_path in changed_files:
        current_dir = Path(file_path).parent

        while current_dir not in (Path("datasets"), Path(".")):
            # any(...) short-circuits without materializing the glob results.
            if any(current_dir.glob("*.yml")) or any(current_dir.glob("*.yaml")):
                yaml_dirs.add(str(current_dir))
                break
            current_dir = current_dir.parent

    return sorted(yaml_dirs)


def find_all_yaml_files(directories):
    """Return the sorted *.yml / *.yaml files directly inside each directory.

    Non-recursive by design (one dataset directory at a time); paths that do
    not exist or are not directories are silently skipped.
    """
    yaml_files = []

    for dir_path in map(Path, directories):
        if dir_path.is_dir():  # is_dir() is False for nonexistent paths too
            yaml_files.extend(dir_path.glob("*.yml"))
            yaml_files.extend(dir_path.glob("*.yaml"))

    return [str(f) for f in sorted(yaml_files)]
sorted(yaml_files)] + + +def main(): + parser = argparse.ArgumentParser( + description="Find changed dataset YAML files for replay", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Find changes between two commits + python find_changed_datasets.py --base-sha abc123 --head-sha def456 + + # Find changes in current branch vs main + python find_changed_datasets.py --compare-branch main + + # List all YAML files in a specific directory + python find_changed_datasets.py --directory datasets/attack_techniques/T1003.003 + +Output formats: + --output directories : Print directories containing YAML files (default) + --output files : Print individual YAML file paths + """ + ) + + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument( + '--base-sha', + help='Base commit SHA to compare from' + ) + group.add_argument( + '--compare-branch', + help='Compare current HEAD against this branch (e.g., main, origin/main)' + ) + group.add_argument( + '--directory', + help='Specific directory to find YAML files in' + ) + + parser.add_argument( + '--head-sha', + help='Head commit SHA to compare to (defaults to HEAD if using --base-sha)' + ) + parser.add_argument( + '--output', + choices=['directories', 'files'], + default='directories', + help='Output format: directories or individual files' + ) + + args = parser.parse_args() + + try: + if args.directory: + # Direct directory mode + if not Path(args.directory).exists(): + print(f"Error: Directory {args.directory} does not exist") + sys.exit(1) + + if args.output == 'files': + yaml_files = find_all_yaml_files([args.directory]) + for f in yaml_files: + print(f) + else: + if find_all_yaml_files([args.directory]): + print(args.directory) + + elif args.compare_branch: + # Compare against a branch + head_sha = run_git_command("git rev-parse HEAD") + base_sha = run_git_command(f"git merge-base HEAD {args.compare_branch}") + + if not head_sha or not base_sha: + print("Error: Could not 
determine commit SHAs") + sys.exit(1) + + changed_files = find_changed_files(base_sha, head_sha) + if not changed_files: + print("No dataset files changed") + sys.exit(0) + + print(f"Changed files: {len(changed_files)}", file=sys.stderr) + for f in changed_files: + print(f" {f}", file=sys.stderr) + + yaml_dirs = find_yaml_files_in_directories(changed_files) + + if args.output == 'files': + yaml_files = find_all_yaml_files(yaml_dirs) + for f in yaml_files: + print(f) + else: + for d in yaml_dirs: + print(d) + + else: + # Base/head SHA mode + head_sha = args.head_sha or run_git_command("git rev-parse HEAD") + + changed_files = find_changed_files(args.base_sha, head_sha) + if not changed_files: + print("No dataset files changed") + sys.exit(0) + + print(f"Changed files: {len(changed_files)}", file=sys.stderr) + for f in changed_files: + print(f" {f}", file=sys.stderr) + + yaml_dirs = find_yaml_files_in_directories(changed_files) + + if args.output == 'files': + yaml_files = find_all_yaml_files(yaml_dirs) + for f in yaml_files: + print(f) + else: + for d in yaml_dirs: + print(d) + + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/bin/replay.py b/bin/replay.py index 1898c152..e0a55d9d 100644 --- a/bin/replay.py +++ b/bin/replay.py @@ -25,24 +25,26 @@ def load_environment_variables(): def find_data_yml_files(folder_path): - """Find all data.yml files recursively in folder and subfolders.""" + """Find all YAML files recursively in folder and subfolders.""" data_yml_files = [] folder_path = Path(folder_path) - # Use pathlib to recursively find all data.yml files - for yml_file in folder_path.rglob("data.yml"): + # Use pathlib to recursively find all .yml and .yaml files + for yml_file in folder_path.rglob("*.yml"): data_yml_files.append(str(yml_file)) + for yaml_file in folder_path.rglob("*.yaml"): + data_yml_files.append(str(yaml_file)) if not data_yml_files: - print(f"Warning: No data.yml 
files found in {folder_path}") + print(f"Warning: No YAML files found in {folder_path}") else: - print(f"Found {len(data_yml_files)} data.yml files") + print(f"Found {len(data_yml_files)} YAML files") return data_yml_files def parse_data_yml(yml_file_path): - """Parse a data.yml file and extract dataset information.""" + """Parse a YAML file and extract dataset information.""" try: with open(yml_file_path, 'r') as file: data = yaml.safe_load(file) @@ -51,12 +53,21 @@ def parse_data_yml(yml_file_path): file_id = data.get('id', str(uuid.uuid4())) datasets = data.get('datasets', []) - # Return tuple of (id, datasets_list) - return file_id, datasets + # Extract default metadata from YAML file + default_index = data.get('index', 'attack_data') # Default to attack_data index + default_source = data.get('source', 'attack_data') + default_sourcetype = data.get('sourcetype', '_json') + + # Return tuple of (id, datasets_list, default_metadata) + return file_id, datasets, { + 'index': default_index, + 'source': default_source, + 'sourcetype': default_sourcetype + } except Exception as e: print(f"Error parsing {yml_file_path}: {e}") - return None, [] + return None, [], {} def find_data_files(folder_path): @@ -106,51 +117,55 @@ def send_data_to_splunk(file_path, splunk_host, hec_token, event_host_uuid, def main(): parser = argparse.ArgumentParser( - description="Recursively find and replay datasets from data.yml files " - "to Splunk via HTTP Event Collector (HEC)", + description="Replay datasets from YAML files to Splunk via HTTP Event Collector (HEC). 
" + "All metadata (source, sourcetype, index) is read from the YAML files.", epilog=""" Environment Variables Required: SPLUNK_HOST - Splunk server hostname/IP SPLUNK_HEC_TOKEN - Splunk HEC token Example usage: - python replay_all.py /path/to/datasets/folder - python replay_all.py datasets/attack_techniques --host-uuid 12345678-abcd-efgh + # Replay from specific YAML files + python replay.py datasets/attack_techniques/T1003.003/atomic_red_team/atomic_red_team.yml + python replay.py file1.yml file2.yml file3.yml + + # Replay from directories (finds all YAML files) + python replay.py datasets/attack_techniques/T1003.003/ + python replay.py datasets/attack_techniques/T1003.003/ datasets/attack_techniques/T1005/ + +Environment setup: export SPLUNK_HOST="192.168.1.100" export SPLUNK_HEC_TOKEN="your-hec-token" This script will: -1. Recursively find all data.yml files in the specified directory -2. Parse each data.yml file to extract dataset information -3. Replay each dataset using the source and sourcetype from the yml file -4. Use the id field from data.yml as the host field for Splunk events +1. Process YAML files directly or find all YAML files in specified directories +2. Parse each YAML file to extract all metadata (source, sourcetype, index, etc.) +3. Replay each dataset using the metadata from the YAML file +4. 
Use the id field from YAML file as the host field for Splunk events """, formatter_class=argparse.RawDescriptionHelpFormatter, ) parser.add_argument( - 'path', - help='Path to a directory containing data.yml files ' - '(searches recursively)' + 'paths', + nargs='+', + help='Paths to YAML files or directories containing YAML files' ) parser.add_argument( - '--source', - default='test', - help='Source field for Splunk events (default: test)' + '--index-override', + help='Override the index specified in YAML files (optional)' ) parser.add_argument( - '--sourcetype', - default='test', - help='Sourcetype field for Splunk events (default: test)' + '--source-override', + help='Override the source specified in YAML files (optional)' ) parser.add_argument( - '--index', - default='test', - help='Splunk index to send events to (default: test)' + '--sourcetype-override', + help='Override the sourcetype specified in YAML files (optional)' ) parser.add_argument( '--host-uuid', help='UUID to use as the host field for Splunk events ' - '(generates random UUID if not provided)' + '(uses id from YAML file if not provided)' ) args = parser.parse_args() @@ -159,36 +174,58 @@ def main(): splunk_host = env_vars['host'] hec_token = env_vars['hec_token'] - if not os.path.isdir(args.path): - print(f"Error: {args.path} is not a valid directory") - sys.exit(1) - - # Find all data.yml files recursively - data_yml_files = find_data_yml_files(args.path) + # Collect all YAML files from paths (files or directories) + all_yaml_files = [] + for path in args.paths: + path_obj = Path(path) - if not data_yml_files: - print(f"No data.yml files found in {args.path}") + if path_obj.is_file(): + # Direct YAML file + if path_obj.suffix.lower() in ['.yml', '.yaml']: + all_yaml_files.append(str(path_obj)) + else: + print(f"Warning: {path} is not a YAML file, skipping") + elif path_obj.is_dir(): + # Directory - find YAML files + yaml_files = find_data_yml_files(str(path_obj)) + 
all_yaml_files.extend(yaml_files) + else: + print(f"Warning: {path} does not exist, skipping") + + if not all_yaml_files: + print("No YAML files found to process") sys.exit(1) - # Process each data.yml file - for yml_file in data_yml_files: + print(f"Found {len(all_yaml_files)} YAML files to process") + + # Process each YAML file + for yml_file in all_yaml_files: print(f"\nProcessing {yml_file}...") - file_id, datasets = parse_data_yml(yml_file) + file_id, datasets, defaults = parse_data_yml(yml_file) if not file_id or not datasets: print(f"Skipping {yml_file} - no valid data found") continue - # Use the id from data.yml as host field (unless user provided one) + # Use the id from YAML file as host field (unless user provided one) event_host_uuid = args.host_uuid or file_id print(f"Using host UUID: {event_host_uuid}") - # Process each dataset in the data.yml file + # Process each dataset in the YAML file for dataset in datasets: dataset_name = dataset.get('name', 'unknown') dataset_path = dataset.get('path', '') - dataset_source = dataset.get('source', args.source) - dataset_sourcetype = dataset.get('sourcetype', args.sourcetype) + + # Use dataset-specific metadata, fall back to YAML defaults + dataset_source = (args.source_override or + dataset.get('source') or + defaults.get('source', 'attack_data')) + dataset_sourcetype = (args.sourcetype_override or + dataset.get('sourcetype') or + defaults.get('sourcetype', '_json')) + dataset_index = (args.index_override or + dataset.get('index') or + defaults.get('index', 'attack_data')) if not dataset_path: print(f"Warning: No path specified for dataset " @@ -198,10 +235,10 @@ def main(): # Handle relative paths - relative to attack_data root if dataset_path.startswith('/datasets/'): # Convert to absolute path based on project structure - if Path(args.path).name == 'datasets': - base_dir = Path(args.path).parent - else: - base_dir = Path(args.path) + current_path = Path(yml_file).parent + base_dir = current_path + + # Walk 
up to find attack_data root while (base_dir.name != 'attack_data' and base_dir.parent != base_dir): base_dir = base_dir.parent @@ -221,6 +258,7 @@ def main(): continue print(f" Sending dataset '{dataset_name}' from {full_path}") + print(f" index: {dataset_index}") print(f" source: {dataset_source}") print(f" sourcetype: {dataset_sourcetype}") @@ -229,7 +267,7 @@ def main(): splunk_host=splunk_host, hec_token=hec_token, event_host_uuid=event_host_uuid, - index=args.index, + index=dataset_index, source=dataset_source, sourcetype=dataset_sourcetype, ) From d0b6f9903d8bf0c00f046e642207ee7948b47ad1 Mon Sep 17 00:00:00 2001 From: Patrick Bareiss Date: Thu, 21 Aug 2025 10:34:10 +0200 Subject: [PATCH 2/2] CI/CD upload attack datasets to Splunk --- bin/replay.py | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/bin/replay.py b/bin/replay.py index e0a55d9d..dd1b0615 100644 --- a/bin/replay.py +++ b/bin/replay.py @@ -117,8 +117,8 @@ def send_data_to_splunk(file_path, splunk_host, hec_token, event_host_uuid, def main(): parser = argparse.ArgumentParser( - description="Replay datasets from YAML files to Splunk via HTTP Event Collector (HEC). " - "All metadata (source, sourcetype, index) is read from the YAML files.", + description="Replay datasets from YAML files to Splunk via HEC. 
" + "All metadata (source, sourcetype, index) is read from YAML files.", epilog=""" Environment Variables Required: SPLUNK_HOST - Splunk server hostname/IP @@ -126,12 +126,13 @@ def main(): Example usage: # Replay from specific YAML files - python replay.py datasets/attack_techniques/T1003.003/atomic_red_team/atomic_red_team.yml + python replay.py datasets/attack_techniques/T1003.003/atomic_red_team/\ + atomic_red_team.yml python replay.py file1.yml file2.yml file3.yml # Replay from directories (finds all YAML files) python replay.py datasets/attack_techniques/T1003.003/ - python replay.py datasets/attack_techniques/T1003.003/ datasets/attack_techniques/T1005/ + python replay.py datasets/attack_techniques/T1003.003/ Environment setup: export SPLUNK_HOST="192.168.1.100" @@ -232,22 +233,33 @@ def main(): f"'{dataset_name}', skipping") continue - # Handle relative paths - relative to attack_data root + # Handle relative paths - relative to git project root if dataset_path.startswith('/datasets/'): # Convert to absolute path based on project structure + # Find git project root by looking for .git directory current_path = Path(yml_file).parent - base_dir = current_path + project_root = current_path - # Walk up to find attack_data root - while (base_dir.name != 'attack_data' and - base_dir.parent != base_dir): - base_dir = base_dir.parent + # Walk up to find git project root (directory containing .git) + while (not (project_root / '.git').exists() and + project_root.parent != project_root): + project_root = project_root.parent - if base_dir.name == 'attack_data': - full_path = base_dir / dataset_path.lstrip('/') + if (project_root / '.git').exists(): + # Found git project root, construct path relative to it + full_path = project_root / dataset_path.lstrip('/') else: - # Fallback: assume current working directory structure - full_path = Path.cwd() / dataset_path.lstrip('/') + # Fallback: try to find project root using current working dir + cwd = Path.cwd() + while (not 
(cwd / '.git').exists() and + cwd.parent != cwd): + cwd = cwd.parent + + if (cwd / '.git').exists(): + full_path = cwd / dataset_path.lstrip('/') + else: + # Last resort: assume current working directory structure + full_path = Path.cwd() / dataset_path.lstrip('/') else: # Assume relative to yml file location yml_dir = Path(yml_file).parent