From 473501320fd184890e4da8a40d67cd335f85083b Mon Sep 17 00:00:00 2001 From: Patrick Bareiss Date: Thu, 21 Aug 2025 10:16:44 +0200 Subject: [PATCH 1/2] replay improvements --- .github/workflows/replay-datasets.yml | 129 ++++++++++++++++ bin/find_changed_datasets.py | 204 ++++++++++++++++++++++++++ bin/replay.py | 138 ++++++++++------- 3 files changed, 421 insertions(+), 50 deletions(-) create mode 100644 .github/workflows/replay-datasets.yml create mode 100644 bin/find_changed_datasets.py diff --git a/.github/workflows/replay-datasets.yml b/.github/workflows/replay-datasets.yml new file mode 100644 index 00000000..ea575308 --- /dev/null +++ b/.github/workflows/replay-datasets.yml @@ -0,0 +1,129 @@ +name: Replay Changed Datasets to Splunk + +on: + push: + branches: [ main, master ] + paths: + - 'datasets/**' + pull_request: + branches: [ main, master ] + paths: + - 'datasets/**' + workflow_dispatch: + inputs: + dataset_path: + description: 'Specific dataset path to replay (optional, defaults to all changed files)' + required: false + type: string + +jobs: + replay-datasets: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Fetch full history for file change detection + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.9' + + - name: Install dependencies + run: | + cd bin + pip install -r requirements.txt + + - name: Find changed YAML files + id: changed-files + if: github.event_name != 'workflow_dispatch' || github.event.inputs.dataset_path == '' + run: | + # Get list of changed YAML files in datasets directory + if [ "${{ github.event_name }}" = "pull_request" ]; then + # For PR, compare against base branch + BASE_SHA="${{ github.event.pull_request.base.sha }}" + HEAD_SHA="${{ github.event.pull_request.head.sha }}" + echo "Comparing PR: $BASE_SHA...$HEAD_SHA" + YAML_FILES=$(python bin/find_changed_datasets.py --base-sha $BASE_SHA --head-sha $HEAD_SHA --output 
files 2>/dev/null || echo "") + else + # For push, compare against previous commit + BASE_SHA="${{ github.event.before }}" + HEAD_SHA="${{ github.sha }}" + echo "Comparing push: $BASE_SHA...$HEAD_SHA" + YAML_FILES=$(python bin/find_changed_datasets.py --base-sha $BASE_SHA --head-sha $HEAD_SHA --output files 2>/dev/null || echo "") + fi + + if [ -z "$YAML_FILES" ]; then + echo "No YAML dataset files changed" + echo "yaml_files=" >> $GITHUB_OUTPUT + else + echo "Changed YAML files:" + echo "$YAML_FILES" + # Convert newlines to spaces for easier handling + YAML_FILES_SPACE=$(echo "$YAML_FILES" | tr '\n' ' ') + echo "yaml_files=$YAML_FILES_SPACE" >> $GITHUB_OUTPUT + fi + + - name: Set manual dataset path + id: manual-path + if: github.event_name == 'workflow_dispatch' && github.event.inputs.dataset_path != '' + run: | + # For manual dispatch, find YAML files in the specified path + if [ -f "${{ github.event.inputs.dataset_path }}" ]; then + # Single file provided + echo "yaml_files=${{ github.event.inputs.dataset_path }}" >> $GITHUB_OUTPUT + else + # Directory provided - find YAML files + YAML_FILES=$(python bin/find_changed_datasets.py --directory "${{ github.event.inputs.dataset_path }}" --output files 2>/dev/null || echo "") + if [ -n "$YAML_FILES" ]; then + YAML_FILES_SPACE=$(echo "$YAML_FILES" | tr '\n' ' ') + echo "yaml_files=$YAML_FILES_SPACE" >> $GITHUB_OUTPUT + else + echo "yaml_files=" >> $GITHUB_OUTPUT + fi + fi + + - name: Replay datasets to Splunk + if: steps.changed-files.outputs.yaml_files != '' || steps.manual-path.outputs.yaml_files != '' + env: + SPLUNK_HOST: ${{ secrets.SPLUNK_HOST }} + SPLUNK_HEC_TOKEN: ${{ secrets.SPLUNK_HEC_TOKEN }} + run: | + # Get the YAML files to process + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + YAML_FILES="${{ steps.manual-path.outputs.yaml_files }}" + else + YAML_FILES="${{ steps.changed-files.outputs.yaml_files }}" + fi + + if [ -z "$YAML_FILES" ]; then + echo "No YAML files to process" + exit 0 + fi 
+ + echo "Processing YAML files: $YAML_FILES" + + # Run replay script with all YAML files + # The replay script now reads all metadata from the YAML files themselves + python bin/replay.py $YAML_FILES || echo "Failed to replay some datasets" + + - name: Summary + if: always() + run: | + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + YAML_FILES="${{ steps.manual-path.outputs.yaml_files }}" + if [ -n "$YAML_FILES" ]; then + echo "Manual replay completed for YAML files: $YAML_FILES" + else + echo "No YAML files found in specified path: ${{ github.event.inputs.dataset_path }}" + fi + else + YAML_FILES="${{ steps.changed-files.outputs.yaml_files }}" + if [ -n "$YAML_FILES" ]; then + echo "Automated replay completed for changed YAML files: $YAML_FILES" + else + echo "No YAML dataset changes detected, no replay needed" + fi + fi diff --git a/bin/find_changed_datasets.py b/bin/find_changed_datasets.py new file mode 100644 index 00000000..253a11dd --- /dev/null +++ b/bin/find_changed_datasets.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python3 +""" +Script to identify changed or added YAML dataset files for replay. +This script simplifies the bash logic from the GitHub Actions workflow. 
# NOTE(review): in the original patch a closing module-docstring `"""` sat
# immediately above this import block; it is omitted here because the
# surrounding patch bytes are mangled and this unit is reconstructed as
# standalone, properly formatted code.
import sys
import argparse  # used by main() further down in this script
import shlex
import subprocess
from pathlib import Path


def run_git_command(cmd):
    """Run a git command given as a single string; return its stripped stdout.

    On any failure an explanation is printed to *stderr* and "" is returned.
    stdout must stay clean: the CI workflow captures this script's stdout
    with $(...) to build the list of YAML files, so diagnostics on stdout
    would be mistaken for file paths.
    """
    try:
        # shlex.split + shell=False executes git directly, so SHAs / branch
        # names coming from CI event payloads cannot inject shell syntax.
        result = subprocess.run(
            shlex.split(cmd), capture_output=True, text=True, check=True
        )
        return result.stdout.strip()
    except subprocess.CalledProcessError as exc:
        print(f"Git command failed: {cmd}", file=sys.stderr)
        print(f"Error: {exc.stderr}", file=sys.stderr)
        return ""
    except OSError as exc:  # e.g. git binary not installed
        print(f"Git command failed: {cmd}", file=sys.stderr)
        print(f"Error: {exc}", file=sys.stderr)
        return ""


def find_changed_files(base_sha, head_sha):
    """Return paths under datasets/ that changed between two commits.

    Uses `git diff --name-only base...head` (three-dot: diff against the
    merge base). Returns [] when either SHA is missing, the git command
    fails, or nothing under datasets/ changed.
    """
    if not base_sha or not head_sha:
        # stderr, not stdout — see run_git_command for why.
        print("Error: Both base and head SHA are required", file=sys.stderr)
        return []

    output = run_git_command(f"git diff --name-only {base_sha}...{head_sha}")
    if not output:
        return []

    # Keep only non-empty lines that live in the datasets/ tree.
    return [
        line.strip()
        for line in output.split('\n')
        if line.strip() and line.startswith('datasets/')
    ]


def find_yaml_files_in_directories(changed_files):
    """Map each changed file to the nearest ancestor directory holding YAML.

    Starting from the changed file's directory, walk up the tree (stopping
    before the datasets/ root or ".") and record the first directory that
    directly contains *.yml or *.yaml files. Returns a sorted, de-duplicated
    list of directory paths as strings.
    """
    yaml_dirs = set()

    for file_path in changed_files:
        current_dir = Path(file_path).parent

        while current_dir not in (Path("datasets"), Path(".")):
            # any(...) short-circuits without materializing the glob results.
            if any(current_dir.glob("*.yml")) or any(current_dir.glob("*.yaml")):
                yaml_dirs.add(str(current_dir))
                break
            current_dir = current_dir.parent

    return sorted(yaml_dirs)


def find_all_yaml_files(directories):
    """Return the sorted *.yml / *.yaml files directly inside each directory.

    Non-recursive by design (one dataset directory at a time); paths that do
    not exist or are not directories are silently skipped.
    """
    yaml_files = []

    for dir_path in map(Path, directories):
        if dir_path.is_dir():  # is_dir() is False for nonexistent paths too
            yaml_files.extend(dir_path.glob("*.yml"))
            yaml_files.extend(dir_path.glob("*.yaml"))

    return [str(f) for f in sorted(yaml_files)]
sorted(yaml_files)] + + +def main(): + parser = argparse.ArgumentParser( + description="Find changed dataset YAML files for replay", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Find changes between two commits + python find_changed_datasets.py --base-sha abc123 --head-sha def456 + + # Find changes in current branch vs main + python find_changed_datasets.py --compare-branch main + + # List all YAML files in a specific directory + python find_changed_datasets.py --directory datasets/attack_techniques/T1003.003 + +Output formats: + --output directories : Print directories containing YAML files (default) + --output files : Print individual YAML file paths + """ + ) + + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument( + '--base-sha', + help='Base commit SHA to compare from' + ) + group.add_argument( + '--compare-branch', + help='Compare current HEAD against this branch (e.g., main, origin/main)' + ) + group.add_argument( + '--directory', + help='Specific directory to find YAML files in' + ) + + parser.add_argument( + '--head-sha', + help='Head commit SHA to compare to (defaults to HEAD if using --base-sha)' + ) + parser.add_argument( + '--output', + choices=['directories', 'files'], + default='directories', + help='Output format: directories or individual files' + ) + + args = parser.parse_args() + + try: + if args.directory: + # Direct directory mode + if not Path(args.directory).exists(): + print(f"Error: Directory {args.directory} does not exist") + sys.exit(1) + + if args.output == 'files': + yaml_files = find_all_yaml_files([args.directory]) + for f in yaml_files: + print(f) + else: + if find_all_yaml_files([args.directory]): + print(args.directory) + + elif args.compare_branch: + # Compare against a branch + head_sha = run_git_command("git rev-parse HEAD") + base_sha = run_git_command(f"git merge-base HEAD {args.compare_branch}") + + if not head_sha or not base_sha: + print("Error: Could not 
determine commit SHAs") + sys.exit(1) + + changed_files = find_changed_files(base_sha, head_sha) + if not changed_files: + print("No dataset files changed") + sys.exit(0) + + print(f"Changed files: {len(changed_files)}", file=sys.stderr) + for f in changed_files: + print(f" {f}", file=sys.stderr) + + yaml_dirs = find_yaml_files_in_directories(changed_files) + + if args.output == 'files': + yaml_files = find_all_yaml_files(yaml_dirs) + for f in yaml_files: + print(f) + else: + for d in yaml_dirs: + print(d) + + else: + # Base/head SHA mode + head_sha = args.head_sha or run_git_command("git rev-parse HEAD") + + changed_files = find_changed_files(args.base_sha, head_sha) + if not changed_files: + print("No dataset files changed") + sys.exit(0) + + print(f"Changed files: {len(changed_files)}", file=sys.stderr) + for f in changed_files: + print(f" {f}", file=sys.stderr) + + yaml_dirs = find_yaml_files_in_directories(changed_files) + + if args.output == 'files': + yaml_files = find_all_yaml_files(yaml_dirs) + for f in yaml_files: + print(f) + else: + for d in yaml_dirs: + print(d) + + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/bin/replay.py b/bin/replay.py index 1898c152..e0a55d9d 100644 --- a/bin/replay.py +++ b/bin/replay.py @@ -25,24 +25,26 @@ def load_environment_variables(): def find_data_yml_files(folder_path): - """Find all data.yml files recursively in folder and subfolders.""" + """Find all YAML files recursively in folder and subfolders.""" data_yml_files = [] folder_path = Path(folder_path) - # Use pathlib to recursively find all data.yml files - for yml_file in folder_path.rglob("data.yml"): + # Use pathlib to recursively find all .yml and .yaml files + for yml_file in folder_path.rglob("*.yml"): data_yml_files.append(str(yml_file)) + for yaml_file in folder_path.rglob("*.yaml"): + data_yml_files.append(str(yaml_file)) if not data_yml_files: - print(f"Warning: No data.yml 
files found in {folder_path}") + print(f"Warning: No YAML files found in {folder_path}") else: - print(f"Found {len(data_yml_files)} data.yml files") + print(f"Found {len(data_yml_files)} YAML files") return data_yml_files def parse_data_yml(yml_file_path): - """Parse a data.yml file and extract dataset information.""" + """Parse a YAML file and extract dataset information.""" try: with open(yml_file_path, 'r') as file: data = yaml.safe_load(file) @@ -51,12 +53,21 @@ def parse_data_yml(yml_file_path): file_id = data.get('id', str(uuid.uuid4())) datasets = data.get('datasets', []) - # Return tuple of (id, datasets_list) - return file_id, datasets + # Extract default metadata from YAML file + default_index = data.get('index', 'attack_data') # Default to attack_data index + default_source = data.get('source', 'attack_data') + default_sourcetype = data.get('sourcetype', '_json') + + # Return tuple of (id, datasets_list, default_metadata) + return file_id, datasets, { + 'index': default_index, + 'source': default_source, + 'sourcetype': default_sourcetype + } except Exception as e: print(f"Error parsing {yml_file_path}: {e}") - return None, [] + return None, [], {} def find_data_files(folder_path): @@ -106,51 +117,55 @@ def send_data_to_splunk(file_path, splunk_host, hec_token, event_host_uuid, def main(): parser = argparse.ArgumentParser( - description="Recursively find and replay datasets from data.yml files " - "to Splunk via HTTP Event Collector (HEC)", + description="Replay datasets from YAML files to Splunk via HTTP Event Collector (HEC). 
" + "All metadata (source, sourcetype, index) is read from the YAML files.", epilog=""" Environment Variables Required: SPLUNK_HOST - Splunk server hostname/IP SPLUNK_HEC_TOKEN - Splunk HEC token Example usage: - python replay_all.py /path/to/datasets/folder - python replay_all.py datasets/attack_techniques --host-uuid 12345678-abcd-efgh + # Replay from specific YAML files + python replay.py datasets/attack_techniques/T1003.003/atomic_red_team/atomic_red_team.yml + python replay.py file1.yml file2.yml file3.yml + + # Replay from directories (finds all YAML files) + python replay.py datasets/attack_techniques/T1003.003/ + python replay.py datasets/attack_techniques/T1003.003/ datasets/attack_techniques/T1005/ + +Environment setup: export SPLUNK_HOST="192.168.1.100" export SPLUNK_HEC_TOKEN="your-hec-token" This script will: -1. Recursively find all data.yml files in the specified directory -2. Parse each data.yml file to extract dataset information -3. Replay each dataset using the source and sourcetype from the yml file -4. Use the id field from data.yml as the host field for Splunk events +1. Process YAML files directly or find all YAML files in specified directories +2. Parse each YAML file to extract all metadata (source, sourcetype, index, etc.) +3. Replay each dataset using the metadata from the YAML file +4. 
Use the id field from YAML file as the host field for Splunk events """, formatter_class=argparse.RawDescriptionHelpFormatter, ) parser.add_argument( - 'path', - help='Path to a directory containing data.yml files ' - '(searches recursively)' + 'paths', + nargs='+', + help='Paths to YAML files or directories containing YAML files' ) parser.add_argument( - '--source', - default='test', - help='Source field for Splunk events (default: test)' + '--index-override', + help='Override the index specified in YAML files (optional)' ) parser.add_argument( - '--sourcetype', - default='test', - help='Sourcetype field for Splunk events (default: test)' + '--source-override', + help='Override the source specified in YAML files (optional)' ) parser.add_argument( - '--index', - default='test', - help='Splunk index to send events to (default: test)' + '--sourcetype-override', + help='Override the sourcetype specified in YAML files (optional)' ) parser.add_argument( '--host-uuid', help='UUID to use as the host field for Splunk events ' - '(generates random UUID if not provided)' + '(uses id from YAML file if not provided)' ) args = parser.parse_args() @@ -159,36 +174,58 @@ def main(): splunk_host = env_vars['host'] hec_token = env_vars['hec_token'] - if not os.path.isdir(args.path): - print(f"Error: {args.path} is not a valid directory") - sys.exit(1) - - # Find all data.yml files recursively - data_yml_files = find_data_yml_files(args.path) + # Collect all YAML files from paths (files or directories) + all_yaml_files = [] + for path in args.paths: + path_obj = Path(path) - if not data_yml_files: - print(f"No data.yml files found in {args.path}") + if path_obj.is_file(): + # Direct YAML file + if path_obj.suffix.lower() in ['.yml', '.yaml']: + all_yaml_files.append(str(path_obj)) + else: + print(f"Warning: {path} is not a YAML file, skipping") + elif path_obj.is_dir(): + # Directory - find YAML files + yaml_files = find_data_yml_files(str(path_obj)) + 
all_yaml_files.extend(yaml_files) + else: + print(f"Warning: {path} does not exist, skipping") + + if not all_yaml_files: + print("No YAML files found to process") sys.exit(1) - # Process each data.yml file - for yml_file in data_yml_files: + print(f"Found {len(all_yaml_files)} YAML files to process") + + # Process each YAML file + for yml_file in all_yaml_files: print(f"\nProcessing {yml_file}...") - file_id, datasets = parse_data_yml(yml_file) + file_id, datasets, defaults = parse_data_yml(yml_file) if not file_id or not datasets: print(f"Skipping {yml_file} - no valid data found") continue - # Use the id from data.yml as host field (unless user provided one) + # Use the id from YAML file as host field (unless user provided one) event_host_uuid = args.host_uuid or file_id print(f"Using host UUID: {event_host_uuid}") - # Process each dataset in the data.yml file + # Process each dataset in the YAML file for dataset in datasets: dataset_name = dataset.get('name', 'unknown') dataset_path = dataset.get('path', '') - dataset_source = dataset.get('source', args.source) - dataset_sourcetype = dataset.get('sourcetype', args.sourcetype) + + # Use dataset-specific metadata, fall back to YAML defaults + dataset_source = (args.source_override or + dataset.get('source') or + defaults.get('source', 'attack_data')) + dataset_sourcetype = (args.sourcetype_override or + dataset.get('sourcetype') or + defaults.get('sourcetype', '_json')) + dataset_index = (args.index_override or + dataset.get('index') or + defaults.get('index', 'attack_data')) if not dataset_path: print(f"Warning: No path specified for dataset " @@ -198,10 +235,10 @@ def main(): # Handle relative paths - relative to attack_data root if dataset_path.startswith('/datasets/'): # Convert to absolute path based on project structure - if Path(args.path).name == 'datasets': - base_dir = Path(args.path).parent - else: - base_dir = Path(args.path) + current_path = Path(yml_file).parent + base_dir = current_path + + # Walk 
up to find attack_data root while (base_dir.name != 'attack_data' and base_dir.parent != base_dir): base_dir = base_dir.parent @@ -221,6 +258,7 @@ def main(): continue print(f" Sending dataset '{dataset_name}' from {full_path}") + print(f" index: {dataset_index}") print(f" source: {dataset_source}") print(f" sourcetype: {dataset_sourcetype}") @@ -229,7 +267,7 @@ def main(): splunk_host=splunk_host, hec_token=hec_token, event_host_uuid=event_host_uuid, - index=args.index, + index=dataset_index, source=dataset_source, sourcetype=dataset_sourcetype, ) From d0b6f9903d8bf0c00f046e642207ee7948b47ad1 Mon Sep 17 00:00:00 2001 From: Patrick Bareiss Date: Thu, 21 Aug 2025 10:34:10 +0200 Subject: [PATCH 2/2] CI/CD upload attack datasets to Splunk --- bin/replay.py | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/bin/replay.py b/bin/replay.py index e0a55d9d..dd1b0615 100644 --- a/bin/replay.py +++ b/bin/replay.py @@ -117,8 +117,8 @@ def send_data_to_splunk(file_path, splunk_host, hec_token, event_host_uuid, def main(): parser = argparse.ArgumentParser( - description="Replay datasets from YAML files to Splunk via HTTP Event Collector (HEC). " - "All metadata (source, sourcetype, index) is read from the YAML files.", + description="Replay datasets from YAML files to Splunk via HEC. 
" + "All metadata (source, sourcetype, index) is read from YAML files.", epilog=""" Environment Variables Required: SPLUNK_HOST - Splunk server hostname/IP @@ -126,12 +126,13 @@ def main(): Example usage: # Replay from specific YAML files - python replay.py datasets/attack_techniques/T1003.003/atomic_red_team/atomic_red_team.yml + python replay.py datasets/attack_techniques/T1003.003/atomic_red_team/\ + atomic_red_team.yml python replay.py file1.yml file2.yml file3.yml # Replay from directories (finds all YAML files) python replay.py datasets/attack_techniques/T1003.003/ - python replay.py datasets/attack_techniques/T1003.003/ datasets/attack_techniques/T1005/ + python replay.py datasets/attack_techniques/T1003.003/ Environment setup: export SPLUNK_HOST="192.168.1.100" @@ -232,22 +233,33 @@ def main(): f"'{dataset_name}', skipping") continue - # Handle relative paths - relative to attack_data root + # Handle relative paths - relative to git project root if dataset_path.startswith('/datasets/'): # Convert to absolute path based on project structure + # Find git project root by looking for .git directory current_path = Path(yml_file).parent - base_dir = current_path + project_root = current_path - # Walk up to find attack_data root - while (base_dir.name != 'attack_data' and - base_dir.parent != base_dir): - base_dir = base_dir.parent + # Walk up to find git project root (directory containing .git) + while (not (project_root / '.git').exists() and + project_root.parent != project_root): + project_root = project_root.parent - if base_dir.name == 'attack_data': - full_path = base_dir / dataset_path.lstrip('/') + if (project_root / '.git').exists(): + # Found git project root, construct path relative to it + full_path = project_root / dataset_path.lstrip('/') else: - # Fallback: assume current working directory structure - full_path = Path.cwd() / dataset_path.lstrip('/') + # Fallback: try to find project root using current working dir + cwd = Path.cwd() + while (not 
(cwd / '.git').exists() and + cwd.parent != cwd): + cwd = cwd.parent + + if (cwd / '.git').exists(): + full_path = cwd / dataset_path.lstrip('/') + else: + # Last resort: assume current working directory structure + full_path = Path.cwd() / dataset_path.lstrip('/') else: # Assume relative to yml file location yml_dir = Path(yml_file).parent